Exemple #1
0
    def apply(self):
        """Concatenate the input tables and send the result to the data output.

        When primary data is present, its domain is used and the additional
        tables follow it. Otherwise the domain is the union or the
        intersection of the additional tables' domains, depending on
        ``merge_type``. If ``append_source_column`` is set, a discrete
        variable telling which table each row came from is added in the role
        selected by ``source_column_role``. ``None`` is sent when there are
        no input tables.
        """
        tables, domain, source_var = [], None, None
        if self.primary_data is not None:
            tables = [self.primary_data] + list(self.more_data.values())
            domain = self.primary_data.domain
        elif self.more_data:
            # Materialise the view so `tables` is a list in both branches.
            tables = list(self.more_data.values())
            if self.merge_type == OWConcatenate.MergeUnion:
                merge = domain_union
            else:
                merge = domain_intersection
            domain = reduce(merge, (table.domain for table in tables))

        if tables and self.append_source_column:
            assert domain is not None
            names = [getattr(t, 'name', '') for t in tables]
            if len(names) != len(set(names)):
                # Table names repeat: suffix each name with its position so
                # that the values of the source variable stay distinct.
                names = ['{} ({})'.format(name, i)
                         for i, name in enumerate(names)]
            source_var = Orange.data.DiscreteVariable(
                self.source_attr_name,
                values=names
            )
            # `source_column_role` indexes into this list of domain parts.
            places = ["class_vars", "attributes", "metas"]
            domain = add_columns(
                domain,
                **{places[self.source_column_role]: (source_var,)})

        if not tables:
            self.Outputs.data.send(None)
            return

        tables = [table.transform(domain) for table in tables]
        data = type(tables[0]).concatenate(tables, axis=0)
        if source_var is not None:
            # One table index per row, repeated by table length, as a column.
            source_ids = np.repeat(
                np.arange(len(tables)),
                [len(table) for table in tables]).reshape((-1, 1))
            data[:, source_var] = source_ids

        self.Outputs.data.send(data)
Exemple #2
0
    def send_data(self):
        """Send the cluster-annotated data and the centroids downstream.

        The clustering is looked up for the chosen ``k``: the selected row
        offset by ``k_from`` when ``optimize_k`` is on, ``self.k`` otherwise.
        Both outputs receive ``None`` when there is no data, no clustering
        stored for that ``k``, or the stored entry is a string.
        """
        if not self.optimize_k:
            k = self.k
        else:
            row = self.selected_row()
            k = None if row is None else self.k_from + row

        km = self.clusterings.get(k)
        if self.data is None or km is None or isinstance(km, str):
            # Nothing usable to output: reset both outputs.
            self.Outputs.annotated_data.send(None)
            self.Outputs.centroids.send(None)
            return

        domain = self.data.domain
        cluster_name = get_unique_names(domain, "Cluster")
        cluster_labels = ["C%d" % number for number in range(1, km.k + 1)]
        cluster_var = DiscreteVariable(cluster_name, values=cluster_labels)
        predicted = km(self.data)
        silhouette_name = get_unique_names(domain, "Silhouette")
        silhouette_var = ContinuousVariable(silhouette_name)

        samples = km.silhouette_samples
        if samples is None:
            self.Warning.no_silhouettes()
            scores = np.nan
        else:
            self.Warning.no_silhouettes.clear()
            # arctan(x) / pi + 0.5 maps any real score into (0, 1).
            scores = np.arctan(samples) / np.pi + 0.5

        annotated = self.data.transform(
            add_columns(domain, metas=[cluster_var, silhouette_var]))
        cluster_column = annotated.get_column_view(cluster_var)[0]
        cluster_column[:] = predicted.X.ravel()
        silhouette_column = annotated.get_column_view(silhouette_var)[0]
        silhouette_column[:] = scores

        centroid_domain = Domain(km.pre_domain.attributes)
        centroids = Table(centroid_domain, km.centroids)

        self.Outputs.annotated_data.send(annotated)
        self.Outputs.centroids.send(centroids)
    def _send_data(self):
        """Send the data annotated with a cluster column, and the graph.

        Cluster indices are renumbered by decreasing cluster size before
        being written into a new "Cluster" meta column. Nothing is sent when
        either the partition or the data is missing. The graph output is
        sent only when ``Graph`` is available.
        """
        if self.partition is None or self.data is None:
            return
        domain = self.data.domain

        # Old cluster indices ordered from the most to the least frequent.
        sizes = np.bincount(self.partition)
        by_size = np.argsort(sizes)[::-1]
        rank_of = {old: new for new, old in enumerate(by_size)}
        new_partition = [rank_of.get(label) for label in self.partition]

        cluster_var = DiscreteVariable(
            get_unique_names(domain, 'Cluster'),
            values=['C%d' % number
                    for number in range(1, len(np.unique(new_partition)) + 1)]
        )

        annotated = self.data.transform(
            add_columns(domain, metas=[cluster_var]))
        cluster_column = annotated.get_column_view(cluster_var)[0]
        cluster_column[:] = new_partition
        self.Outputs.annotated_data.send(annotated)

        if Graph is None:
            return
        graph = Graph(self.graph)
        graph.set_items(annotated)
        self.Outputs.graph.send(graph)
Exemple #4
0
    def send_data(self):
        """Output the data with cluster and silhouette metas, and centroids.

        ``k`` comes from the selected row (offset by ``k_from``) when
        ``optimize_k`` is enabled and from ``self.k`` otherwise. If there is
        no data, no clustering stored under that ``k``, or the stored entry
        is a string, ``None`` is sent on both outputs.
        """
        if not self.optimize_k:
            k = self.k
        else:
            row = self.selected_row()
            k = None if row is None else self.k_from + row

        km = self.clusterings.get(k)
        if self.data is None or km is None or isinstance(km, str):
            self.Outputs.annotated_data.send(None)
            self.Outputs.centroids.send(None)
            return

        domain = self.data.domain
        cluster_name = get_next_name(domain, "Cluster")
        cluster_var = DiscreteVariable(
            cluster_name,
            values=["C%d" % number for number in range(1, km.k + 1)])
        assigned = km(self.data)
        silhouette_var = ContinuousVariable(
            get_next_name(domain, "Silhouette"))

        raw_silhouettes = km.silhouette_samples
        if raw_silhouettes is None:
            self.Warning.no_silhouettes()
            scores = np.nan
        else:
            self.Warning.no_silhouettes.clear()
            # Squash the scores into (0, 1) with a shifted, scaled arctan.
            scores = np.arctan(raw_silhouettes) / np.pi + 0.5

        extended_domain = add_columns(
            domain, metas=[cluster_var, silhouette_var])
        annotated = self.data.transform(extended_domain)
        cluster_column = annotated.get_column_view(cluster_var)[0]
        cluster_column[:] = assigned.X.ravel()
        silhouette_column = annotated.get_column_view(silhouette_var)[0]
        silhouette_column[:] = scores

        centroids = Table(Domain(km.pre_domain.attributes), km.centroids)

        self.Outputs.annotated_data.send(annotated)
        self.Outputs.centroids.send(centroids)
Exemple #5
0
    def send_data(self):
        """Send the annotated data and a table of centroids.

        ``k`` is the selected row offset by ``k_from`` when ``optimize_k``
        is on, and ``self.k`` otherwise. Both outputs receive ``None`` when
        there is no data, no clustering stored for ``k``, or the stored
        entry is a string.

        The annotated data gets two extra metas: the cluster index and the
        rescaled silhouette score (NaN when silhouettes are unavailable).
        The centroids table carries the input metas filled with NaN, the
        cluster index, and the mean rescaled silhouette of each cluster
        (0 for a cluster without members).
        """
        if self.optimize_k:
            row = self.selected_row()
            k = self.k_from + row if row is not None else None
        else:
            k = self.k

        km = self.clusterings.get(k)
        if self.data is None or km is None or isinstance(km, str):
            self.Outputs.annotated_data.send(None)
            self.Outputs.centroids.send(None)
            return

        domain = self.data.domain
        cluster_var = DiscreteVariable(
            get_unique_names(domain, "Cluster"),
            values=["C%d" % (x + 1) for x in range(km.k)])
        clust_ids = km(self.data)
        clust_col = clust_ids.X.ravel()
        silhouette_var = ContinuousVariable(
            get_unique_names(domain, "Silhouette"))
        if km.silhouette_samples is not None:
            self.Warning.no_silhouettes.clear()
            # arctan(x) / pi + 0.5 maps any real score into (0, 1).
            scores = np.arctan(km.silhouette_samples) / np.pi + 0.5
            cluster_means = []
            for i in range(km.k):
                in_clust = clust_col == i
                # An empty cluster has no scores to average; report 0.
                cluster_means.append(
                    np.mean(scores[in_clust]) if in_clust.any() else 0.)
            clust_scores = np.atleast_2d(cluster_means).T
        else:
            self.Warning.no_silhouettes()
            scores = np.nan
            clust_scores = np.full((km.k, 1), np.nan)

        new_domain = add_columns(domain, metas=[cluster_var, silhouette_var])
        new_table = self.data.transform(new_domain)
        new_table.get_column_view(cluster_var)[0][:] = clust_col
        new_table.get_column_view(silhouette_var)[0][:] = scores

        # Build the lookup once; testing against the attributes tuple would
        # rescan it for every centroid attribute.
        domain_attributes = set(domain.attributes)
        # Where an attribute only replaces unknowns in an input attribute,
        # use the input attribute itself in the centroids' domain.
        centroid_attributes = [
            attr.compute_value.variable
            if isinstance(attr.compute_value, ReplaceUnknowns)
            and attr.compute_value.variable in domain_attributes else attr
            for attr in km.pre_domain.attributes
        ]
        centroid_domain = add_columns(Domain(centroid_attributes, [],
                                             domain.metas),
                                      metas=[cluster_var, silhouette_var])
        # Metas: NaN for the input metas, then cluster index and mean score.
        centroids = Table(
            centroid_domain, km.centroids, None,
            np.hstack((np.full((km.k, len(domain.metas)), np.nan),
                       np.arange(km.k).reshape(km.k, 1), clust_scores)))
        if self.data.name == Table.name:
            centroids.name = "centroids"
        else:
            centroids.name = f"{self.data.name} centroids"

        self.Outputs.annotated_data.send(new_table)
        self.Outputs.centroids.send(centroids)
Exemple #6
0
    def send_data(self):
        """Send the annotated data and a table of centroids.

        ``k`` is taken from the selected row (offset by ``k_from``) when
        ``optimize_k`` is on, and from ``self.k`` otherwise. ``None`` is sent
        on both outputs when there is no data, no clustering stored for
        ``k``, or the stored entry is a string.

        The annotated data gets the cluster index and the rescaled
        silhouette score as extra metas. The centroids table carries the
        input metas filled with NaN, the cluster index, and the mean
        rescaled silhouette per cluster (0 for a cluster without members,
        NaN when silhouettes are unavailable).
        """
        if not self.optimize_k:
            k = self.k
        else:
            row = self.selected_row()
            k = None if row is None else self.k_from + row

        km = self.clusterings.get(k)
        if self.data is None or km is None or isinstance(km, str):
            self.Outputs.annotated_data.send(None)
            self.Outputs.centroids.send(None)
            return

        domain = self.data.domain
        cluster_name = get_unique_names(domain, "Cluster")
        cluster_var = DiscreteVariable(
            cluster_name,
            values=["C%d" % number for number in range(1, km.k + 1)])
        labels = km.labels
        silhouette_var = ContinuousVariable(
            get_unique_names(domain, "Silhouette"))

        samples = km.silhouette_samples
        if samples is None:
            self.Warning.no_silhouettes()
            scores = np.nan
            clust_scores = np.full((km.k, 1), np.nan)
        else:
            self.Warning.no_silhouettes.clear()
            # arctan(x) / pi + 0.5 maps any real score into (0, 1).
            scores = np.arctan(samples) / np.pi + 0.5
            per_cluster = []
            for index in range(km.k):
                members = labels == index
                # A cluster without members has nothing to average.
                mean_score = np.mean(scores[members]) if members.any() else 0.
                per_cluster.append(mean_score)
            clust_scores = np.atleast_2d(per_cluster).T

        annotated = self.data.transform(
            add_columns(domain, metas=[cluster_var, silhouette_var]))
        with annotated.unlocked(annotated.metas):
            cluster_column = annotated.get_column_view(cluster_var)[0]
            cluster_column[:] = labels
            silhouette_column = annotated.get_column_view(silhouette_var)[0]
            silhouette_column[:] = scores

        # Where an attribute only replaces unknowns in an input attribute,
        # use the input attribute itself in the centroids' domain.
        input_attributes = set(domain.attributes)
        centroid_attributes = []
        for attr in km.domain.attributes:
            transform = attr.compute_value
            if isinstance(transform, ReplaceUnknowns) \
                    and transform.variable in input_attributes:
                centroid_attributes.append(transform.variable)
            else:
                centroid_attributes.append(attr)
        centroid_domain = add_columns(
            Domain(centroid_attributes, [], domain.metas),
            metas=[cluster_var, silhouette_var])

        # The table is built from a copy of the centroids: data stored in
        # the widget can be modified, so the widget should preferably output
        # a copy. There are few centroids, hence copying them is cheap.
        centroid_metas = np.hstack((
            np.full((km.k, len(domain.metas)), np.nan),
            np.arange(km.k).reshape(km.k, 1),
            clust_scores))
        centroids = Table(
            centroid_domain, km.centroids.copy(), None, centroid_metas)
        if self.data.name != Table.name:
            centroids.name = f"{self.data.name} centroids"
        else:
            centroids.name = "centroids"

        self.Outputs.annotated_data.send(annotated)
        self.Outputs.centroids.send(centroids)
Exemple #7
0
    def send_data(self):
        """Send the annotated data and a table of centroids.

        ``k`` is the selected row offset by ``k_from`` when ``optimize_k``
        is on, and ``self.k`` otherwise. Both outputs receive ``None`` when
        there is no data, no clustering stored for ``k``, or the stored
        entry is a string.

        The annotated data gets two extra metas: the cluster index and the
        rescaled silhouette score (NaN when silhouettes are unavailable).
        The centroids table carries the input metas filled with NaN, the
        cluster index, and the mean rescaled silhouette of each cluster
        (0 for a cluster without members).
        """
        if self.optimize_k:
            row = self.selected_row()
            k = self.k_from + row if row is not None else None
        else:
            k = self.k

        km = self.clusterings.get(k)
        if self.data is None or km is None or isinstance(km, str):
            self.Outputs.annotated_data.send(None)
            self.Outputs.centroids.send(None)
            return

        domain = self.data.domain
        cluster_var = DiscreteVariable(
            get_unique_names(domain, "Cluster"),
            values=["C%d" % (x + 1) for x in range(km.k)]
        )
        clust_ids = km(self.data)
        clust_col = clust_ids.X.ravel()
        silhouette_var = ContinuousVariable(
            get_unique_names(domain, "Silhouette"))
        if km.silhouette_samples is not None:
            self.Warning.no_silhouettes.clear()
            # arctan(x) / pi + 0.5 maps any real score into (0, 1).
            scores = np.arctan(km.silhouette_samples) / np.pi + 0.5
            cluster_means = []
            for i in range(km.k):
                in_clust = clust_col == i
                # An empty cluster has no scores to average; report 0.
                cluster_means.append(
                    np.mean(scores[in_clust]) if in_clust.any() else 0.)
            clust_scores = np.atleast_2d(cluster_means).T
        else:
            self.Warning.no_silhouettes()
            scores = np.nan
            clust_scores = np.full((km.k, 1), np.nan)

        new_domain = add_columns(domain, metas=[cluster_var, silhouette_var])
        new_table = self.data.transform(new_domain)
        new_table.get_column_view(cluster_var)[0][:] = clust_col
        new_table.get_column_view(silhouette_var)[0][:] = scores

        # Build the lookup once; testing against the attributes tuple would
        # rescan it for every centroid attribute.
        domain_attributes = set(domain.attributes)
        # Where an attribute only replaces unknowns in an input attribute,
        # use the input attribute itself in the centroids' domain.
        centroid_attributes = [
            attr.compute_value.variable
            if isinstance(attr.compute_value, ReplaceUnknowns)
            and attr.compute_value.variable in domain_attributes
            else attr
            for attr in km.pre_domain.attributes]
        centroid_domain = add_columns(
            Domain(centroid_attributes, [], domain.metas),
            metas=[cluster_var, silhouette_var])
        # Metas: NaN for the input metas, then cluster index and mean score.
        centroids = Table(
            centroid_domain, km.centroids, None,
            np.hstack((np.full((km.k, len(domain.metas)), np.nan),
                       np.arange(km.k).reshape(km.k, 1),
                       clust_scores))
        )
        if self.data.name == Table.name:
            centroids.name = "centroids"
        else:
            centroids.name = f"{self.data.name} centroids"

        self.Outputs.annotated_data.send(new_table)
        self.Outputs.centroids.send(centroids)