Esempio n. 1
0
def find_clusters(record, cds_by_cluster_type,
                  rules_by_name) -> List[ClusterBorder]:
    """ Builds rule-based cluster borders around the identified core genes.

        Arguments:
            record: the record containing the CDS features; it must provide
                    get_cds_name_mapping(),
                    get_cds_features_within_location() and len()
            cds_by_cluster_type: a mapping of cluster type (i.e. rule name)
                                 to the names of the core CDS features
                                 found for that type
            rules_by_name: a mapping of rule name to a rule object with
                           cutoff, extent and conditions attributes

        Returns:
            the borders remaining after remove_redundant_borders() has
            been applied to everything that was built
    """
    def new_border(left, right, product, cutoff, extent):
        """ Creates a border covering exactly left..right for one rule """
        return ClusterBorder(FeatureLocation(left, right),
                             tool="rule-based-clusters",
                             cutoff=cutoff,
                             extent=extent,
                             product=product)

    borders = []  # type: List[ClusterBorder]
    name_to_cds = record.get_cds_name_mapping()

    for product, core_names in cds_by_cluster_type.items():
        cores = sorted(name_to_cds[name] for name in core_names)
        rule = rules_by_name[product]
        cutoff = rule.cutoff
        extent = rule.extent

        # the first (sorted) core gene always opens a new border
        first = cores[0]
        start, end = sorted([first.location.start, first.location.end])
        current = new_border(start, end, product, cutoff, extent)
        assert first.is_contained_by(current)
        assert first in record.get_cds_features_within_location(
            current.location)
        borders.append(current)

        for core in cores[1:]:
            core_start, core_end = sorted(
                [core.location.start, core.location.end])
            # the current border widened by the rule's cutoff on both sides
            reach = FeatureLocation(current.location.start - cutoff,
                                    current.location.end + cutoff)
            if not core.is_contained_by(reach):
                # too far away from the current border: open a fresh one
                start, end = core_start, core_end
                current = new_border(start, end, product, cutoff, extent)
                borders.append(current)
                continue
            # close enough: stretch the current border over this core gene
            start = min(core_start, start)
            end = max(core_end, end)
            current.location = FeatureLocation(start, end)

    for border in borders:
        border.rule = str(rules_by_name[border.product].conditions)
        # trim anything that sticks out of the record and flag it as such
        if border.location.start < 0:
            border.location = FeatureLocation(0, border.location.end)
            border.contig_edge = True
        record_length = len(record)
        if border.location.end > record_length:
            border.location = FeatureLocation(border.location.start,
                                              record_length)
            border.contig_edge = True

    borders = remove_redundant_borders(borders, rules_by_name)

    logging.debug("%d rule-based cluster(s) found in record", len(borders))
    return borders
Esempio n. 2
0
 def test_products_from_borders(self):
     """ Checks how _build_products_from_borders() combines the products
         of borders with differing product priorities
     """
     location = FeatureLocation(1, 10)
     # borders built without an explicit priority must be high priority
     border1 = ClusterBorder(location, "toolA", product="A")
     assert border1.high_priority_product
     border2 = ClusterBorder(location, "toolB", product="B")
     assert border2.high_priority_product
     # with equal priorities, all products are kept in border order
     assert _build_products_from_borders([border1, border2]) == ["A", "B"]
     assert _build_products_from_borders([border2, border1]) == ["B", "A"]
     # a low priority product is dropped if a high priority one is present
     border2 = ClusterBorder(location, "toolB", product="B", high_priority_product=False)
     assert _build_products_from_borders([border1, border2]) == ["A"]
     assert _build_products_from_borders([border2, border1]) == ["A"]
     # if every product is low priority, all of them are kept again
     border1.high_priority_product = False
     assert _build_products_from_borders([border1, border2]) == ["A", "B"]
     assert _build_products_from_borders([border2, border1]) == ["B", "A"]
Esempio n. 3
0
    def from_json(json: Dict[str, Any],
                  record: Record) -> Optional["CassisResults"]:
        """ Rebuilds a CassisResults instance from its JSON representation.

            Arguments:
                json: the JSON-style dictionary to rebuild from; it must have
                      the keys "record_id", "max_percentage",
                      "max_gap_length", "borders" and "promoters"
                record: the record the results are supposed to belong to

            Returns:
                the rebuilt results, or None if the stored record id or
                either of the stored CASSIS settings differ from the
                current ones
        """
        # previous results are only reusable if none of these have changed
        reuse_conditions = (
            ("record_id", record.id,
             "Record identifiers don't match, discarding previous results"),
            ("max_percentage", MAX_PERCENTAGE,
             "CASSIS commonality threshold changed, discarding previous results"),
            ("max_gap_length", MAX_GAP_LENGTH,
             "CASSIS maximum island length changed, discarding previous results"),
        )
        for key, current_value, message in reuse_conditions:
            if json[key] != current_value:
                logging.debug(message)
                return None

        borders = [
            ClusterBorder.from_biopython(feature_from_json(raw_border))
            for raw_border in json["borders"]
        ]
        promoters = [
            (CombinedPromoter if raw_promoter["type"] == "CombinedPromoter"
             else Promoter).from_json(raw_promoter)
            for raw_promoter in json["promoters"]
        ]  # type: List[Promoter]

        restored = CassisResults(record.id)
        restored.borders = borders
        restored.promoters = promoters
        return restored
Esempio n. 4
0
def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Create the predicted ClusterBorders

        Arguments:
            anchor: the name of the anchor gene the predictions were made for
            clusters: the cluster predictions to convert, the first of which
                      is annotated as the best prediction
            record: the record containing the genes named by the predictions

        Returns:
            a list with one ClusterBorder per prediction, in the same order
            as the predictions

        Raises:
            ValueError: if a gene named as the start or end of a prediction
                        can't be found in the record
    """
    if not clusters:
        return []
    borders = []  # type: List[ClusterBorder]
    for i, cluster in enumerate(clusters):
        # cluster borders returned by hmmdetect are based on CDS features
        # in contrast, cluster borders returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived clusters may have fuzzy locations, like the genes have
        left_name = cluster.start.gene
        right_name = cluster.end.gene
        left = None
        right = None
        for gene in record.get_genes():
            gene_name = gene.get_name()
            if gene_name == left_name:
                left = gene
            if gene_name == right_name:
                right = gene
            if left and right:
                break

        # without both genes there is no location to build, so fail with a
        # clear message instead of an AttributeError on None
        if left is None or right is None:
            missing = [name for name, gene in ((left_name, left),
                                               (right_name, right))
                       if gene is None]
            raise ValueError(
                "cassis border gene(s) not found in record: {}".format(
                    ", ".join(str(name) for name in missing)))

        new_feature = SeqFeature(FeatureLocation(left.location.start,
                                                 right.location.end),
                                 type="cluster_border")
        new_feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [cluster.start.abundance + cluster.end.abundance],
            "motif_score":
            ["{:.1e}".format(cluster.start.score + cluster.end.score)],
            "gene_left": [cluster.start.gene],
            "promoter_left": [cluster.start.promoter],
            "abundance_left": [cluster.start.abundance],
            "motif_left": [cluster.start.pairing_string],
            "motif_score_left": ["{:.1e}".format(cluster.start.score)],
            "gene_right": [cluster.end.gene],
            "promoter_right": [cluster.end.promoter],
            "abundance_right": [cluster.end.abundance],
            "motif_right": [cluster.end.pairing_string],
            "motif_score_right": ["{:.1e}".format(cluster.end.score)],
            "genes": [cluster.genes],
            "promoters": [cluster.promoters],
        }

        # only the first prediction is labelled as the best one
        if i == 0:
            note = "best prediction (most abundant) for anchor gene {}".format(
                anchor)
        else:
            note = "alternative prediction ({}) for anchor gene {}".format(
                i, anchor)
        new_feature.qualifiers["note"] = [note]

        borders.append(ClusterBorder.from_biopython(new_feature))
    return borders
Esempio n. 5
0
 def create_border(self, rule_name, start, end):
     """ Builds a border for testing that spans start..end and takes its
         cutoff and extent from the named rule in self.rules_by_name
     """
     rule = self.rules_by_name[rule_name]
     location = FeatureLocation(start, end)
     return ClusterBorder(location, tool="testing", cutoff=rule.cutoff,
                          extent=rule.extent, product=rule_name)
Esempio n. 6
0
def generate_results(record, options) -> "ClusterFinderResults":
    """ Find and construct cluster borders

        Rule-based borders are used as found, while each probabilistic
        cluster is converted into a low priority border with the putative
        product.

        Arguments:
            record: the record to search, which also receives the borders
                    if options.cf_create_clusters is set
            options: the options to pass on to both detection functions and
                     from which cf_create_clusters is read

        Returns:
            a ClusterFinderResults instance for the record, containing the
            rule-based borders followed by the probabilistic ones
    """
    rule_clusters = find_rule_based_clusters(record, options)
    prob_clusters = find_probabilistic_clusters(record, options)
    new_clusters = list(rule_clusters)
    new_clusters.extend(
        ClusterBorder(cluster.location,
                      tool="clusterfinder",
                      probability=cluster.probability,
                      product=PUTATIVE_PRODUCT,
                      high_priority_product=False)
        for cluster in prob_clusters)
    # the record is only modified when cluster creation was requested
    if options.cf_create_clusters:
        for border in new_clusters:
            record.add_cluster_border(border)
    return ClusterFinderResults(record.id,
                                new_clusters,
                                create=options.cf_create_clusters)
Esempio n. 7
0
    def test_merges(self):
        """ Checks the clusters created from the borders clusterfinder adds
            to the record plus three manually added borders
        """
        clusterfinder.generate_results(self.record, self.config)
        assert len(self.record.get_cluster_borders()) == 2

        extra_spans = [(10, 40), (1040, 1050), (110, 400)]
        for start, end in extra_spans:
            border = ClusterBorder(FeatureLocation(start, end), "testtool",
                                   product=str(start))
            self.record.add_cluster_border(border)

        # nothing may exist as a cluster until explicitly created
        assert not self.record.get_clusters()

        self.record.create_clusters_from_borders()

        merged = self.record.get_clusters()
        assert len(merged) == 2

        first = merged[0]
        assert first.location.start == 10
        assert first.location.end == 400
        assert first.products == ("10", "110")

        second = merged[1]
        assert second.location.start == 1030
        assert second.location.end == 1120
        assert second.products == ("1040",)