Ejemplo n.º 1
0
    def make_line(self, x, n_images=50):
        """
        Returns tuple of lambda function for query line and line segment (which has shape (n_images, n_features))
        :param x: query point in scaled space
        :param n_images: how many images the line segment will consist of.
        :return:
        """
        x = to_vector(x)  # in scaled space
        x_p = project_point_on_decision_boundary(self.model.w, self.model.b,
                                                 x)  # in scaled space
        line = lambda t: x_p + (x - x_p) * t  # in scaled space

        all_train_data_scaled = self.dataset.scaling_transformation.transform(
            self.dataset.data["features"])  # Get all data in scaled space
        all_data_center = to_vector(all_train_data_scaled.mean(
            axis=0))  # Mean in scaled space (probably, near zero)
        if self.r is None:  # Compute r only the first time
            self.r = compute_radius_sphere(scaled_data=all_train_data_scaled)
            print "Radius", self.r
        del all_train_data_scaled

        if self.base_precision is not None:
            n_images = int(
                math.ceil(self.longest_line_possible /
                          float(self.base_precision)) + 1)

        line_segment = make_line_segment(radius=self.r,
                                         mu=all_data_center,
                                         a=to_vector(x_p),
                                         b=to_vector(x),
                                         n_points_line=n_images)
        return line, line_segment, to_vector(x_p)
Ejemplo n.º 2
0
 def label(self,
           sample,
           line=None,
           line_segment=None,
           sample_already_scaled=False,
           intersection_point_cdb=None):
     """
     Label a query line with its decision boundary point (the intersection of
     the line with the decision boundary) and the label of the query point
     from which the query line was created.

     :param sample: query point: (n_features, 1) in original dataspace.
     :param line: query line (lambda) in scaled data space.
     :param line_segment: can be used to convert to a human understandable
         line query; in scaled data space.
     :return: tuple (label of the query point, decision boundary point).
     """
     sample = to_vector(sample).T
     if self.ideal_labeler is not None:
         label = self.ideal_labeler.label(sample.squeeze())
     else:
         if not sample_already_scaled:
             # Map from original data space to ground truth (= scaled) space.
             sample = self.dataset.scaling_transformation.transform(sample)
         label = self.predict(sample)
     # Endpoints of the query line: a + (b-a)*0 = a and a + (b-a)*1 = b.
     A, B = line(0), line(1)
     db_point = compute_intersection_line_decision_boundary(
         A, B, self.w, self.b0)
     assert db_point.shape == A.shape
     return label, db_point
Ejemplo n.º 3
0
 def label(self, sample, sample_already_scaled=False, *args):
     """Return the model's prediction for ``sample``, scaling it first if needed."""
     query = to_vector(sample).T
     if not sample_already_scaled:
         # Map from original data space to ground truth (= scaled) space.
         query = self.dataset.scaling_transformation.transform(query)
     return self.predict(query)
Ejemplo n.º 4
0
    def make_query(self):
        try:
            unlabeled_train_data = self.dataset.get_unlabeled_train_data()
        except ValueError:
            raise IndexError("No more unlabeled train samples")
        unlabeled_entry_ids, X_pool = unlabeled_train_data[
            "entry_ids"], unlabeled_train_data["features"]
        del unlabeled_train_data
        if len(X_pool) > 0:
            start_time = time.time()
            # Cluster centroids!
            if self.n_queries % self.batch_size == 0:
                # Cluster again!
                self.clustered = KMeans(n_clusters=self.batch_size).fit(X_pool)
            query_image = self.clustered.cluster_centers_[self.n_queries %
                                                          self.batch_size]
            query_image_original_space = self.dataset._scaling_transformation.inverse_transform(
                to_vector(query_image).T).T  # (n_features, 1)
            print "Found new query using %d unlabeled clustered samples in %.2f seconds" % (
                X_pool.shape[0], time.time() - start_time)
            self.n_queries += 1
            if self.save_path_queries is not None:
                self.save_query_to_hdf5_point(self.save_path_queries_hdf5, -1,
                                              query_image_original_space.T)

            return None, query_image
        else:
            raise IndexError("No more unlabeled train samples")
Ejemplo n.º 5
0
    def save_query_to_hdf5_point(self, save_path_queries_hdf5, entry_id,
                                 sample_original_space):
        """
        Append a point query (sample in original space) and its entry id to an
        HDF5 file, creating the file with resizable datasets on first use.

        :param save_path_queries_hdf5: path of the HDF5 file to append to.
        :param entry_id: entry id of the saved query point.
        :param sample_original_space: query point in original data space;
            normalized below to a (1, n_features) row.
        :return:
        """
        sample_original_space = to_vector(sample_original_space).T
        if os.path.isfile(save_path_queries_hdf5):
            # File exists: grow both datasets along axis 0 and append at the end.
            with h5py.File(save_path_queries_hdf5, 'r+') as hf:
                points_dataset = hf.get('point_queries')
                already_in_points_ds = points_dataset.shape[0]
                points_dataset.resize(already_in_points_ds +
                                      sample_original_space.shape[0],
                                      axis=0)
                points_dataset[
                    already_in_points_ds:already_in_points_ds +
                    sample_original_space.shape[0], :] = sample_original_space

                entryids_dataset = hf.get('entry_ids')
                already_in_entryids_ds = entryids_dataset.len()
                entryids_dataset.resize(already_in_entryids_ds + 1, axis=0)
                entryids_dataset[
                    already_in_entryids_ds:already_in_entryids_ds +
                    1] = entry_id

                # Keep the split attribute (read by H5PYDataset) in sync with
                # the new dataset sizes.
                split_dict = {
                    "data": {
                        "point_queries": (0, already_in_points_ds +
                                          sample_original_space.shape[0]),
                        "entry_ids": (0, already_in_entryids_ds + 1)
                    }
                }
                hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
        else:
            # HDF5 query line save file does not exist yet!
            f = h5py.File(save_path_queries_hdf5, "w")

            # maxshape=(None, ...) makes axis 0 resizable for later appends.
            points_dataset = f.create_dataset(
                'point_queries',
                sample_original_space.shape,
                maxshape=(None, sample_original_space.shape[1]),
                dtype="float32")
            points_dataset[...] = sample_original_space
            entryids_dataset = f.create_dataset('entry_ids', (1, ),
                                                maxshape=(None, ),
                                                dtype=int)
            entryids_dataset[...] = entry_id

            split_dict = {
                "data": {
                    "point_queries": (0, sample_original_space.shape[0]),
                    "entry_ids": (0, 1)
                }
            }
            f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            f.flush()
            f.close()
Ejemplo n.º 6
0
    def generate_images_line_save(self,
                                  line_segment,
                                  query_id,
                                  image_original_space=None):
        """
        ID of query point from which query line was generated is
        added to the filename of the saved line query.
        :param line_segment:
        :param query_id:
        :return:
        """
        try:
            if image_original_space is not None:
                x = self.generative_model.decode(image_original_space.T)
            else:
                x = self.generative_model.decode(
                    to_vector(self.dataset.data["features"][query_id]).T
                )  # comes from dataset.data["features"], so is already in original space in which ALI operates.
            save_path = os.path.join(
                self.save_path_queries,
                "pointquery_%d_%d.png" % (self.n_queries + 1, query_id))
            if x.shape[1] == 1:
                plt.imsave(save_path, x[0, 0, :, :], cmap=cm.Greys)
            else:
                plt.imsave(save_path,
                           x[0, :, :, :].transpose(1, 2, 0),
                           cmap=cm.Greys_r)

            decoded_images = self.generative_model.decode(
                self.dataset.scaling_transformation.inverse_transform(
                    line_segment)
            )  # Transform to original space, in which ALI operates.
            figure = plt.figure()
            grid = ImageGrid(figure,
                             111, (1, decoded_images.shape[0]),
                             axes_pad=0.1)
            for image, axis in zip(decoded_images, grid):
                if image.shape[0] == 1:
                    axis.imshow(image[0, :, :].squeeze(),
                                cmap=cm.Greys,
                                interpolation='nearest')
                else:
                    axis.imshow(image.transpose(1, 2, 0).squeeze(),
                                cmap=cm.Greys_r,
                                interpolation='nearest')
                axis.set_yticklabels(['' for _ in range(image.shape[1])])
                axis.set_xticklabels(['' for _ in range(image.shape[2])])
                axis.axis('off')
            save_path = os.path.join(
                self.save_path_queries,
                "linequery_%d_%d.pdf" % (self.n_queries + 1, query_id))
            plt.savefig(save_path, transparent=True, bbox_inches='tight')
        except Exception as e:
            print "EXCEPTION:", traceback.format_exc()
            raise e
Ejemplo n.º 7
0
 def label(self, sample, sample_already_scaled=False, *args):
     """
     Ask the human oracle to label the decoded image of ``sample``.

     :param sample: query point; when already scaled it is first mapped back
         to the original data space in which the ALI model operates.
     :return: the label chosen in the point-label interface.
     """
     query = to_vector(sample)
     if sample_already_scaled:
         # ALI decodes in the original data space, so undo the scaling first.
         query = self.dataset.scaling_transformation.inverse_transform(
             query.T).T
     point_query_image = self.ali.decode(query.T)
     oracle = PointLabelInterface(
         point_query_image,
         list(self.dataset.classes),
         classes_dictionary=self.dataset.classes_dictionary)
     return oracle.label_point_query
Ejemplo n.º 8
0
 def make_query(self, n_images=50):
     try:
         unlabeled_train_data = self.dataset.get_unlabeled_train_data()
     except ValueError:
         raise IndexError("No more unlabeled train samples")
     unlabeled_entry_ids, X_pool = unlabeled_train_data[
         "entry_ids"], unlabeled_train_data["features"]
     del unlabeled_train_data
     if len(X_pool) > 0:
         start_time = time.time()
         # Cluster centroids!
         if self.n_queries % self.batch_size == 0:
             # Cluster again!
             self.clustered = KMeans(n_clusters=self.batch_size).fit(X_pool)
         query_image = self.clustered.cluster_centers_[self.n_queries %
                                                       self.batch_size]
         query_image_original_space = self.dataset._scaling_transformation.inverse_transform(
             to_vector(query_image).T).T  # (n_features, 1)
         print "Found new query using %d unlabeled clustered samples in %.2f seconds" % (
             X_pool.shape[0], time.time() - start_time)
         # Make line. Project query_image on current decision boundary
         start_time = time.time()
         line, line_segment, intersection_point = self.make_line(
             query_image, n_images)  # scaled space
         print "Made line from found query in %.2f seconds" % (time.time() -
                                                               start_time)
         if not self.human_experiment and self.generative_model is not None:
             # Change 1 to higher number for faster algorithm (less generating and plotting)
             if self.n_queries % 1 == 0:
                 start_time = time.time()
                 self.generate_images_line_save(
                     line_segment,
                     None,
                     image_original_space=query_image_original_space)
                 print "Plotted query line in %.2f seconds" % (time.time() -
                                                               start_time)
         else:
             self.save_query_to_hdf5(
                 query_image_original_space.T,  # original space
                 -1,
                 self.dataset.scaling_transformation.inverse_transform(
                     line_segment),
                 self.dataset.scaling_transformation.inverse_transform(
                     intersection_point.T))
         self.n_queries += 1
         return None, line, line_segment, query_image, intersection_point
     else:
         raise IndexError("No more unlabeled train samples")
Ejemplo n.º 9
0
 def update(self, entry_id, new_label, sample=None):
     """
     Update the entry with ``entry_id`` with the given label, or append a
     brand new labeled sample when ``entry_id`` is None.

     :param entry_id: entry id of the sample to update (None to append a new
         sample instead).
     :param new_label: label of the sample to be updated; plain and numpy
         integers are normalized to a (1, 1) array.
     :param sample: feature vector of the new sample to append; only used
         when ``entry_id`` is None.
     """
     if isinstance(new_label, (int, np.integer)):
         # Also accept numpy integer labels (e.g. the result of np.argmax),
         # not just builtin ints, so appending them concatenates correctly.
         new_label = np.array(new_label).reshape(1, 1)
     if entry_id is None and sample is not None:
         # Unknown entry: append the sample and its label to the dataset.
         self.data["features"] = np.concatenate(
             (self.data["features"], to_vector(sample).T), axis=0)
         self.data["targets"] = np.concatenate(
             (self.data["targets"], new_label), axis=0)
     else:
         self.data["targets"][entry_id] = new_label
     # Notify registered observers of the change.
     for callback in self._update_callback:
         callback(entry_id, new_label)
Ejemplo n.º 10
0
 def make_query(self, n_images=50):
     try:
         unlabeled_train_data = self.dataset.get_unlabeled_train_data()
     except ValueError:
         raise IndexError("No more unlabeled train samples")
     unlabeled_entry_ids, X_pool = unlabeled_train_data[
         "entry_ids"], unlabeled_train_data["features"]
     del unlabeled_train_data
     if len(X_pool) > 0:
         # least confident and most representative of data
         start_time = time.time()
         uncertainties = np.max(self.model.predict_real(X_pool), axis=1)
         ask_id = self.get_most_uncertainty_dense(uncertainties,
                                                  self.similarity_matrix,
                                                  beta=1)
         self.delete_index_similarity_matrix(ask_id)
         print "Found new query amongst %d unlabeled samples in %.2f seconds" % (
             X_pool.shape[0], time.time() - start_time)
         # Make line. Project query_image on current decision boundary
         start_time = time.time()
         line, line_segment, intersection_point = self.make_line(
             X_pool[ask_id], n_images)
         print "Made line from found query in %.2f seconds" % (time.time() -
                                                               start_time)
         if not self.human_experiment and self.generative_model is not None:
             # Change 1 to higher number for faster algorithm (less generating and plotting)
             if self.n_queries % 1 == 0:
                 start_time = time.time()
                 self.generate_images_line_save(line_segment,
                                                unlabeled_entry_ids[ask_id])
                 print "Plotted query line in %.2f seconds" % (time.time() -
                                                               start_time)
         else:
             self.save_query_to_hdf5(
                 to_vector(self.dataset.data["features"][
                     unlabeled_entry_ids[ask_id]]).T,
                 unlabeled_entry_ids[ask_id],
                 self.dataset.scaling_transformation.inverse_transform(
                     line_segment),
                 self.dataset.scaling_transformation.inverse_transform(
                     intersection_point.T))
         self.n_queries += 1
         return unlabeled_entry_ids[
             ask_id], line, line_segment, None, intersection_point
     else:
         raise IndexError("No more unlabeled train samples")
Ejemplo n.º 11
0
    def save_decision_boundary(self, w, b):
        """
        Append the decision boundary parameters (w, b) to the HDF5 file at
        ``self.save_path_boundaries``, creating it with resizable datasets on
        first use.

        :param w: weight vector of the decision boundary; normalized below to
            a (1, n_features) row.
        :param b: bias/intercept of the decision boundary (scalar).
        :return:
        """
        w = to_vector(w).T
        if os.path.isfile(self.save_path_boundaries):
            # File exists: grow both datasets along axis 0 and append at the end.
            with h5py.File(self.save_path_boundaries, 'r+') as hf:
                w_dataset = hf.get('w')
                already_in_w_ds = w_dataset.shape[0]
                w_dataset.resize(already_in_w_ds + w.shape[0], axis=0)
                w_dataset[already_in_w_ds:already_in_w_ds + w.shape[0], :] = w

                b_dataset = hf.get('b')
                already_in_b_ds = b_dataset.len()
                b_dataset.resize(already_in_b_ds + 1, axis=0)
                b_dataset[already_in_b_ds:already_in_b_ds + 1] = b

                # Keep the split attribute (read by H5PYDataset) in sync with
                # the new dataset sizes.
                split_dict = {
                    "data": {
                        "w": (0, already_in_w_ds + w.shape[0]),
                        "b": (0, already_in_b_ds + 1)
                    }
                }
                hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
        else:
            # HDF5 query line save file does not exist yet!
            f = h5py.File(self.save_path_boundaries, "w")

            # maxshape=(None, ...) makes axis 0 resizable for later appends.
            w_dataset = f.create_dataset('w',
                                         w.shape,
                                         maxshape=(None, w.shape[1]),
                                         dtype="float32")
            w_dataset[...] = w

            b_dataset = f.create_dataset('b', (1, ),
                                         maxshape=(None, ),
                                         dtype="float32")
            b_dataset[...] = b

            split_dict = {"data": {"w": (0, w.shape[0]), "b": (0, 1)}}
            f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            f.flush()
            f.close()
Ejemplo n.º 12
0
 def make_query(self, n_images=50):
     try:
         unlabeled_train_data = self.dataset.get_unlabeled_train_data()
     except ValueError:
         raise IndexError("No more unlabeled train samples")
     unlabeled_entry_ids, X_pool = unlabeled_train_data[
         "entry_ids"], unlabeled_train_data["features"]
     del unlabeled_train_data
     if len(X_pool) > 0:
         # least confident
         start_time = time.time()
         ask_id = np.random.randint(0, len(unlabeled_entry_ids))
         print "Found new query amongst %d unlabeled samples in %.2f seconds" % (
             X_pool.shape[0], time.time() - start_time)
         # Make line. Project query_image on current decision boundary
         start_time = time.time()
         line, line_segment, intersection_point = self.make_line(
             X_pool[ask_id], n_images)
         print "Made line from found query in %.2f seconds" % (time.time() -
                                                               start_time)
         if not self.human_experiment and self.generative_model is not None:
             # Change 1 to higher number for faster algorithm (less generating and plotting)
             if self.n_queries % 1 == 0:
                 start_time = time.time()
                 self.generate_images_line_save(line_segment,
                                                unlabeled_entry_ids[ask_id])
                 print "Plotted query line in %.2f seconds" % (time.time() -
                                                               start_time)
         else:
             self.save_query_to_hdf5(
                 to_vector(self.dataset.data["features"][
                     unlabeled_entry_ids[ask_id]]).T,
                 # Already in original space
                 unlabeled_entry_ids[ask_id],
                 self.dataset.scaling_transformation.inverse_transform(
                     line_segment),  # Transform to original space,
                 self.dataset.scaling_transformation.inverse_transform(
                     intersection_point.T))
         self.n_queries += 1
         return unlabeled_entry_ids[
             ask_id], line, line_segment, None, intersection_point
     else:
         raise IndexError("No more unlabeled train samples")
Ejemplo n.º 13
0
 def label(self,
           sample,
           line=None,
           line_segment=None,
           sample_already_scaled=False,
           intersection_point_cdb=None):
     """
     NB ONLY HANDLES LINES FROM UNCERTAINTY STRATEGY (for clustercentroids, check if everything is in correct space!)
     :param sample: for uncertainty strategy in original ALI space
     :param line: lambda function in scaled space
     :param line_segment: scaled space
     :param sample_already_scaled:
     :return:
     """
     sample = to_vector(sample)  # original space if uncertainty strategy
     if sample_already_scaled:
         sample = self.dataset.scaling_transformation.inverse_transform(
             sample.T).T
     line_segment_original_space = self.dataset.scaling_transformation.inverse_transform(
         line_segment)
     line_images = self.generate_images(line_segment_original_space)
     point_query_image = self.generate_images(sample.T)
     intersection_point_cdb_original_space = self.dataset.scaling_transformation.inverse_transform(
         intersection_point_cdb.T).T
     oracle = LabelInterface(
         line_segment_original_space,
         line_images,
         point_query=sample,
         point_query_image=point_query_image,
         intersection_point_cdb=intersection_point_cdb_original_space,
         classes=list(self.dataset.classes),
         classes_dictionary=self.dataset.classes_dictionary)
     db_point_original_space = oracle.chosen_point
     if db_point_original_space is not None:
         db_point_scaled_space = self.dataset.scaling_transformation.transform(
             db_point_original_space.T).T
     else:
         db_point_scaled_space = None
     label_point_query = oracle.label_point_query
     print "Chosen label", label_point_query
     return label_point_query, db_point_scaled_space