def _neighbourhood_location(record: Record, core_location: FeatureLocation,
                            rule: rule_parser.DetectionRule) -> FeatureLocation:
    """ Extends a core location by the rule's neighbourhood on both sides,
        clamped so that it never leaves the record
    """
    return FeatureLocation(max(0, core_location.start - rule.neighbourhood),
                           min(core_location.end + rule.neighbourhood, len(record)))


def _build_protocluster(core_location: FeatureLocation, surrounds: FeatureLocation,
                        rule: rule_parser.DetectionRule, cluster_type: str) -> Protocluster:
    """ Constructs a rule-based Protocluster for the given core and surrounding locations """
    return Protocluster(core_location, surrounding_location=surrounds,
                        tool="rule-based-clusters", cutoff=rule.cutoff,
                        neighbourhood_range=rule.neighbourhood, product=cluster_type,
                        detection_rule=str(rule.conditions))


def find_protoclusters(record: Record, cds_by_cluster_type: Dict[str, Set[str]],
                       rules_by_name: Dict[str, rule_parser.DetectionRule]) -> List[Protocluster]:
    """ Detects gene clusters based on the identified core genes

        For each cluster type, the core genes are walked in sorted order and
        merged into a single core location for as long as the next gene overlaps
        with the current core extended by the rule's cutoff. Each finished core
        becomes one Protocluster, surrounded by the rule's neighbourhood.

        Arguments:
            record: the Record containing the CDS features
            cds_by_cluster_type: a dictionary mapping cluster type to the set of
                                 names of the core CDS features for that type
            rules_by_name: a dictionary mapping cluster type to its DetectionRule

        Returns:
            a list of Protoclusters, after redundant ones have been removed by
            remove_redundant_protoclusters()
    """
    clusters: List[Protocluster] = []
    cds_feature_by_name = record.get_cds_name_mapping()

    for cluster_type, cds_names in cds_by_cluster_type.items():
        # without any core genes there is nothing to build a protocluster from,
        # and indexing the first feature below would raise an IndexError
        if not cds_names:
            continue
        cds_features = sorted(cds_feature_by_name[cds] for cds in cds_names)
        rule = rules_by_name[cluster_type]
        cutoff = rule.cutoff
        core_location = cds_features[0].location
        for cds in cds_features[1:]:
            # only the start needs clamping here, overlap checks are
            # unaffected by an end that runs past the record
            reach = FeatureLocation(max(0, core_location.start - cutoff),
                                    core_location.end + cutoff)
            if cds.overlaps_with(reach):
                # close enough to the current core, so grow the core to include it
                core_location = FeatureLocation(min(cds.location.start, core_location.start),
                                                max(cds.location.end, core_location.end))
                assert core_location.start >= 0 and core_location.end <= len(record)
                continue
            # create the previous cluster and start a new location
            surrounds = _neighbourhood_location(record, core_location, rule)
            # shrink the surroundings to the outermost CDS features fully inside them
            surrounding_cdses = record.get_cds_features_within_location(surrounds, with_overlapping=False)
            real_start = min(contained.location.start for contained in surrounding_cdses)
            real_end = max(contained.location.end for contained in surrounding_cdses)
            surrounds = FeatureLocation(real_start, real_end)
            clusters.append(_build_protocluster(core_location, surrounds, rule, cluster_type))
            core_location = cds.location

        # finalise the last cluster
        # NOTE(review): unlike the clusters created inside the loop, the
        # surroundings of this one are not shrunk to the contained CDS
        # features; kept as-is to preserve existing results - confirm whether
        # the difference is intentional
        surrounds = _neighbourhood_location(record, core_location, rule)
        clusters.append(_build_protocluster(core_location, surrounds, rule, cluster_type))

    # fit to record if outside
    for cluster in clusters:
        contained = FeatureLocation(max(0, cluster.location.start),
                                    min(cluster.location.end, len(record)))
        if contained != cluster.location:
            cluster.location = contained

    clusters = remove_redundant_protoclusters(clusters, rules_by_name)

    logging.debug("%d rule-based cluster(s) found in record", len(clusters))
    return clusters
Exemple #2
0
def run_lanthi_on_genes(record: Record, focus: CDSFeature,
                        cluster: Protocluster, genes: List[CDSFeature],
                        results: LanthiResults) -> None:
    """ Runs the lanthipeptide prediction for each candidate precursor gene
        around a single focus gene, a core biosynthetic enzyme for
        lanthipeptides.
        Every precursor found is added to the results object; nothing is
        changed if there are no candidates or no class can be predicted.

        Arguments:
            record: the Record instance containing the genes
            focus: a core lanthipeptide gene
            cluster: the Protocluster being analysed
            genes: a list of candidate precursor genes
            results: a LanthiResults object to update

        Returns:
            None
    """
    if not genes:
        return

    detected_domains = get_detected_domains(cluster.cds_children)
    neighbours = find_neighbours_in_range(focus, cluster.cds_children)

    # check the neighbours of the focus gene for features with specific domains
    has_flavoprotein = contains_feature_with_single_domain(neighbours, {"Flavoprotein"})
    has_halogenase = contains_feature_with_single_domain(neighbours, {"Trp_halogenase"})
    has_oxygenase = contains_feature_with_single_domain(neighbours, {"p450"})
    has_dehydrogenase = contains_feature_with_single_domain(neighbours,
                                                            {"adh_short", "adh_short_C2"})

    predicted_class = predict_class_from_genes(focus, cluster.cds_children)
    if not predicted_class:
        return

    for gene in genes:
        prediction = run_lanthipred(record, gene, predicted_class, detected_domains)
        if prediction is not None:
            # annotate the prediction with what the neighbouring genes suggest
            prediction.aminovinyl_group = has_flavoprotein
            prediction.chlorinated = has_halogenase
            prediction.oxygenated = has_oxygenase
            prediction.lactonated = has_dehydrogenase and prediction.core.startswith('S')

            feature = result_vec_to_feature(gene, prediction)
            results.motifs_by_locus[focus.get_name()].append(feature)
            protocluster_number = cluster.get_protocluster_number()
            results.clusters[protocluster_number].add(focus.get_name())

            # a candidate without a region is tracked as a new CDSFeature,
            # e.g. one found with all_orfs
            if gene.region is None:
                results.new_cds_features.add(gene)
    def from_json(json: Dict[str, Any], record: Record) -> Optional["RuleDetectionResults"]:
        """ Rebuilds a RuleDetectionResults instance from its JSON representation

            Arguments:
                json: the JSON representation to read from
                record: the Record handed on to every CDSResults.from_json() call

            Returns:
                a new RuleDetectionResults instance, or None if the schema
                version in the JSON does not match the current schema version
        """
        # JSON without an explicit schema version is treated as version 1
        stored_version = json.get("schema_version", 1)
        if RuleDetectionResults.schema_version != stored_version:
            return None

        def load_cds_results(chunks: Any) -> list:
            """ Converts each JSON chunk into a CDSResults instance """
            return [CDSResults.from_json(chunk, record) for chunk in chunks]

        results_by_protocluster = {}
        # each entry is a pair of (protocluster feature JSON, list of CDS result JSON)
        for feature_json, results_json in json["cds_by_protocluster"]:
            feature = serialiser.feature_from_json(feature_json)
            protocluster = Protocluster.from_biopython(feature)
            results_by_protocluster[protocluster] = load_cds_results(results_json)

        outside = load_cds_results(json["outside_protoclusters"])

        return RuleDetectionResults(results_by_protocluster, json["tool"], outside)