Example no. 1
 def fit(self, x, y=None, classifier=None, **kwargs):
     if (not is_none(classifier)): self.classifier = classifier
     elif (not is_none(y)) and (not is_numeric(y[0])):
         self.classifier = True
     if ((type(x) != np.ndarray) or (len(x.shape) != 2)):
         raise (UnexpectedType("Provided 'x' should be a 2D numpy array."))
     if (not is_none(y)):
         if (not hasattr(y, "__len__")):
             raise (MissingOperator(
                 "Provided 'y' to fit must have defined '__len__' operator."
             ))
         elif (not hasattr(y, "__getitem__")):
             raise (MissingOperator(
                 "Provided 'y' to fit must have defined '__getitem__' operator."
             ))
         elif (not hasattr(y[0], "__add__")):
             raise (MissingOperator(
                 "Elements of provided 'y' must have defined '__add__' operator."
             ))
         elif (not hasattr(y[0], "__mul__")):
             raise (MissingOperator(
                 "Elements of provided 'y' must have defined '__mul__' operator."
             ))
         self.y = y
     # Fit the provided x values.
     return self._fit(x.copy(), **kwargs)
Example no. 2
 def fit(self, x, y, *args, num_comps=None, **kwargs):
     # Set the number of components appropriately.
     if is_none(num_comps): num_comps = min(x.shape)
     if not is_none(dim): num_comps = min(dim, num_comps)
     # Compute the components and the values.
     if method == "PCA":
         # Compute the principal components as the new axes.
         components, values = pca(x,
                                  num_components=num_comps,
                                  **cond_kwargs)
         # Compute the values so that the transformed points have unit metric slope.
         values = normalize_error(np.matmul(x, components.T), y, metric,
                                  display)
     elif method == "MPCA":
         # Use metric PCA to compute components and values.
         components, values = mpca(x,
                                   y,
                                   metric=metric,
                                   num_components=num_comps,
                                   num_vecs=samples,
                                   **cond_kwargs)
     # Reset the values if scale should not be used.
     if not scale: values[:] = 1.
     if display:
         np.set_printoptions(precision=3, sign=" ")
         print("\nComponents and values:")
         for (c, v) in zip(components, values):
             print(f" {v:.2f}  {c}")
         print()
         np.set_printoptions(precision=8, sign="-")
     # Generate the conditioning matrix.
     self.conditioner = np.matmul(np.diag(values), components).T
     # Return the normal fit operation.
     return super().fit(np.matmul(x, self.conditioner), y, *args,
                        **kwargs)
Example no. 3
 def _fit(self, control_points, k=None, display=True, **kwargs):
     if (not is_none(k)): self.num_neighbors = k
     # Process and store local information
     self.points = control_points.copy()
     self.tree = KDTree(self.points)
     # Automatically select the value for "k" if appropriate and
     # the response values are available for the points.
     if is_none(self.num_neighbors):
         if (not is_none(self.y)):
             self.auto_kwargs.update(kwargs)
             # If "mean" was not provided, pick based on problem type.
             if "mean" not in self.auto_kwargs:
                 self.auto_kwargs["mean"] = not self.classifier
             # Begin the estimation of best value for 'k'.
             end = ("\n" if display else "\r")
             print("Nearest neighbor, estimating best value for 'k'..",
                   end=end,
                   flush=True)
             self.num_neighbors = auto(self.points, self.y,
                                       **self.auto_kwargs)
             print(f"  chose k = {self.num_neighbors}", end=end, flush=True)
             print("                                                 ",
                   end=end,
                   flush=True)
         else:
             self.num_neighbors = 1
Example no. 4
 def fit(self, x, y, classifier=None, *args, **kwargs):
     if (not is_none(classifier)): self.classifier = classifier
     elif (not is_numeric(y[0])): self.classifier = True
     if ((type(x) != np.ndarray) or (len(x.shape) != 2)):
         raise (UnexpectedType("Provided 'x' should be a 2D numpy array."))
     # If this is a classification problem, convert y values into
     # vertices of a regular simplex.
     if (self.classifier):
         from util.data import regular_simplex
         from util.system import sorted_unique
         self.class_map = sorted_unique(y)
         values = regular_simplex(len(self.class_map))
         y = np.array([values[self.class_map.index(v)] for v in y])
     if (type(y) == list): y = np.array(y)
     if (type(y) != np.ndarray):
         raise (UnexpectedType(
             "Provided 'y' should be a 1D or 2D numpy array."))
     elif (len(y.shape) == 1):
         y = np.reshape(y, (y.shape[0], 1))
         self._response_dim = 1
     elif (len(y.shape) == 2):
         pass
     else:
         raise (UnexpectedShape(
             "Provided 'y' should be a 1D or 2D numpy array."))
     return self._fit(x.copy(), y.copy(), *args, **kwargs)
Example no. 5
 def predict(self, x, *args, **kwargs):
     if ((type(x) != np.ndarray) or (len(x.shape) not in (1, 2))):
         raise (UnexpectedType(
             "Provided 'x' should be a 1D or 2D numpy array."))
     single_response = len(x.shape) == 1
     if single_response:
         x = np.reshape(x, (1, len(x)))
     indices, weights = self._predict(x.copy(), *args, **kwargs)
     # Return the indices and weights if no y values were provided.
     if (is_none(self.y)):
         response = [(ids, wts) for (ids, wts) in zip(indices, weights)]
     else:
         # Collect response values via weighted sums of self.y values
         response = []
         for ids, wts in zip(indices, weights):
             if self.classifier:
                 val_weights = {}
                 # Sum the weights associated with each category.
                 for i, w in zip(ids, wts):
                     val_weights[self.y[i]] = val_weights.get(
                         self.y[i], 0.) + w
                 # Return the category with the largest sum of weight.
                 response.append(
                     max(val_weights.items(), key=lambda i: i[-1])[0])
             else:
                 # Return the weighted sum of predictions.
                 response.append(
                     sum(self.y[i] * w for (i, w) in zip(ids, wts)))
     # Reduce to one approximation point if that's what was provided.
     if single_response: response = response[0]
     # Return the response
     return response
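As a brief illustrative sketch (not taken from the source), the fit/predict pattern above might be exercised as follows, assuming these methods belong to the NearestNeighbor-style approximator referenced in Example no. 9; the class name, constructor defaults, and data here are assumptions.

import numpy as np
x = np.random.random((100, 3))     # 'fit' requires a 2D numpy array.
y = np.random.random(100)          # Numeric responses keep regression mode.
model = NearestNeighbor()          # Hypothetical no-argument construction.
model.fit(x, y)                    # Raises UnexpectedType if 'x' is not 2D.
approx = model.predict(x[:5])      # Weighted sums of neighboring 'y' values.
single = model.predict(x[0])       # A 1D input yields a single response.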
Example no. 6
 def predict(self, points, *args, **kwargs):
     if ((type(points) != np.ndarray) or (len(points.shape) not in (1, 2))):
         raise (UnexpectedType(
             "Provided 'points' should be a 1D or 2D numpy array."))
     # If values were provided, return usual prediction.
     elif not is_none(self.original_values):
         return super().predict(points)
     # Otherwise, map predictions back to indices and weights in the original data.
     single_response = len(points.shape) == 1
     if single_response:
         points = np.reshape(points, (1, len(points)))
     # Predict indices and weights over the unique points.
     indices, weights = self._predict(points, *args, **kwargs)
     response = []
     for ids, wts in zip(indices, weights):
         orig_ids = []
         orig_wts = []
         for (i, w) in zip(ids, wts):
             pt = tuple(self.original_points[self.unique_indices[i]])
             orig_ids += self.unique_points[pt]
             # w /= len(self.unique_points[pt]) # <- Equally weight unique points.
             orig_wts += [w] * len(self.unique_points[pt])
         # Normalize sum of weights, giving repeated points higher 'weight'
         orig_wts_sum = sum(orig_wts)
         orig_wts = [w / orig_wts_sum for w in orig_wts]
         response.append((orig_ids, orig_wts))
     if single_response: response = response[0]
     return response
Example no. 7
 def fit(self, points, values=None, *args, **kwargs):
     if ((type(points) != np.ndarray) or (len(points.shape) != 2)):
         raise (UnexpectedType(
             "Expected 2D numpy array as first argument."))
     self.original_points = points
     self.unique_points = {}
     for i, pt in enumerate(points):
         pt = tuple(pt)
         self.unique_points[pt] = self.unique_points.get(pt, []) + [i]
     # Store the indices of the first occurrence of each unique point.
     self.unique_indices = np.array(
         sorted(self.unique_points[pt][0] for pt in self.unique_points))
     # Average the response value for the points that are identical.
     if (not is_none(values)):
         self.original_values = values
         to_add = set(self.unique_points)
         avg_values = []
         for pt in self.original_points:
             pt = tuple(pt)
             if pt in to_add:
                 indices = self.unique_points[pt]
                 wt = 1. / len(indices)
                 avg_values.append(sum(values[i] * wt for i in indices))
                 to_add.remove(pt)
         args = args + (avg_values, )
     # Call the fit method on parent with unique points only.
     return super().fit(self.original_points[self.unique_indices, :],
                        *args, **kwargs)
Example no. 8
 def _fit(self, points):
     from util.math import is_none
     # Sort points by their distance from the center of the data.
     center = (np.max(points, axis=0) + np.min(points, axis=0)) / 2
     dists = np.linalg.norm(points - center, axis=1)
     indices = np.argsort(dists)
     if not is_none(self.y): self.y = [self.y[i] for i in indices]
     # Get points in a specific order.
     self.points = np.asarray(points[indices].T, order="F")
     self.box_sizes = np.ones((self.points.shape[0]*2,self.points.shape[1]),
                               dtype=np.float64, order="F") * -1
     self.meshes.build_ibm(self.points, self.box_sizes)
Example no. 9
def auto(points,
         values,
         metric=abs_diff,
         max_k=None,
         samples=100,
         mean=True,
         k_step=1,
         model=NearestNeighbor,
         display=False):
    from math import floor, log2
    from util.random import random_range
    # Set the maximum value for "k" to the largest power of 2 that
    # covers no more than half of the provided data.
    if is_none(max_k):
        max_k = 2**floor(log2(len(points) // 2))
    # Compute up to 'max_k' nearest neighbors about selected points.
    # Add "+1" because each point is returned as its own nearest
    # neighbor and is excluded below.
    model = model(k=max_k + 1)
    model.fit(points)
    # Randomly pick a set of points as the "checks".
    indices = [i for i in random_range(len(points), count=samples)]
    neighbors = np.array([i[1:] for (i, w) in model(points[indices])])
    differences = np.array(
        [[metric(values[i1], values[i2]) for i2 in neighbors[i]]
         for i, i1 in enumerate(indices)])
    k_values = {}
    # Estimate the error for each candidate power-of-two value of "k".
    for k_pow in range(0, int(log2(max_k)) + 1, k_step):
        k = 2**k_pow
        if mean: k_values[k] = np.mean(differences[:, :k])
        else: k_values[k] = np.mean(np.min(differences[:, :k], axis=1))
    if (2**k_pow != max_k):
        k = 2**int(log2(max_k))
        if mean: k_values[k] = np.mean(differences[:, :k])
        else: k_values[k] = np.mean(np.min(differences[:, :k], axis=1))
    # Find the k with the lowest mean error.
    best_k = min(k_values.items(), key=lambda i: i[1])[0]
    if display:
        name = "mean" if mean else "minimum"
        from math import log10, ceil
        print('-' * 52)
        print(" Estimated " + name + " error for various choices of 'k':")
        for k in sorted(k_values):
            extra = "  <-- chosen 'k'" if k == best_k else ""
            print(f"  k = {k:{ceil(log10(max_k))}d} ~ {k_values[k]:.4e}" +
                  extra)
        print('-' * 52)
    # Return the "k" with the minimum mean difference
    return best_k
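A short sketch (synthetic data, not from the source) of how 'auto' above might be called to choose 'k' before fitting; the NearestNeighbor constructor call mirrors the one used inside the body of 'auto'.

import numpy as np
points = np.random.random((512, 4))
values = np.random.random(512)
k = auto(points, values, samples=100, mean=True, display=True)
model = NearestNeighbor(k=k)   # Same constructor usage as inside 'auto'.
model.fit(points, values)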
Example no. 10
def pca(points, num_components=None, display=True):
    from util.math import is_none
    from sklearn.decomposition import PCA
    if is_none(num_components): num_components = min(*points.shape)
    else: num_components = min(num_components, *points.shape)
    pca = PCA(n_components=num_components)
    if display:
        print(f"Computing {num_components} principal components..",
              end="\r",
              flush=True)
    pca.fit(points)
    if display:
        print("                                                          ",
              end="\r",
              flush=True)
    principal_components = pca.components_
    magnitudes = pca.singular_values_
    # Normalize the component magnitudes to have sum 1.
    magnitudes /= np.sum(magnitudes)
    return principal_components, magnitudes
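A brief sketch of calling 'pca' above (illustrative data): since it wraps sklearn's PCA, 'components' uses the row-per-component convention and 'magnitudes' is normalized to sum to 1.

import numpy as np
points = np.random.random((200, 10))
components, magnitudes = pca(points, num_components=3, display=False)
projected = np.matmul(points, components.T)   # Project data onto the 3 components.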
Example no. 11
def samples(size=None, error=None, confidence=None, at=None):
    # Determine what to calculate based on what was provided.
    from util.math import is_none, choose, Fraction
    if is_none(size): to_calculate = "samples"
    elif is_none(error): to_calculate = "error"
    elif is_none(confidence): to_calculate = "confidence"
    else: to_calculate = "verify"
    # Default evaluation point is at (1/2), where the error is greatest.
    if is_none(at): at = Fraction(1, 2)
    else: at = Fraction(at)
    # Set the default values for other things that were not provided.
    if error is None: error = Fraction(10, 100)
    if confidence is None: confidence = Fraction(95, 100)
    # Convert error and confidence to Fraction types if necessary.
    if type(error) != Fraction: error = Fraction(error)
    if type(confidence) != Fraction: confidence = Fraction(confidence)
    # If the user provided something with a length, use that number.
    if hasattr(size, "__len__"): size = len(size)
    # \sum_{i=0}^n choose(n, i) * ( at^i (1-at)^(n-i) )
    if not is_none(size):
        # Compute the probability of any given observed EDF value.
        prob = lambda i: choose(size, i) * (at**i * (1 - at)**(size - i))
        # If we are calculating the confidence or verifying, compute confidence.
        if to_calculate in {"confidence", "verify"}:
            if (at == 1 / 2): conf = _half_confidence(size, error)
            else:
                conf = Fraction()
                steps = 0
                # Sum those probabilities that are closer than "error" distance.
                for i in range(size + 1):
                    p = Fraction(i, size)
                    if (abs(p - at) <= error):
                        steps += 1
                        conf += prob(i)
            # Return the total confidence.
            if to_calculate == "confidence": return float(conf)
            else: return conf >= confidence
        elif to_calculate == "error":
            # Store the "contained" outcomes by "allowed error".
            error = Fraction()
            contained = Fraction()
            # Sort the percentiles by their distance from "at".
            i_p = sorted(enumerate(
                Fraction(i, size, _normalize=False) for i in range(size + 1)),
                         key=lambda ip: abs(ip[1] - at))
            # Cycle through percentiles, starting closest to "at" and moving out.
            for step in range(len(i_p)):
                # If this step has the same probability as the last, skip.
                if (i_p[step][1] == i_p[step - 1][1]): continue
                i, p = i_p[step]
                # Compute the amount of data contained by this step away.
                next_contained = contained + prob(i)
                # If the distance from "at" is the same for two steps, take two.
                if (step + 1 < len(i_p)) and (abs(at - i_p[step][1])
                                              == abs(at - i_p[step + 1][1])):
                    next_contained += prob(i_p[step + 1][0])
                # Only update the "allowed error" if confidence is maintained.
                if next_contained < confidence:
                    contained = next_contained
                    error = abs(i_p[step][1] - at)
                else:
                    break
            return float(error)
    else:
        # Compute the number of samples required.
        size, step = 2**10, 2**9
        # print("Desired ----------------")
        # print("error:      ",error)
        # print("confidence: ",confidence)
        # for size in range(2, 500):
        #     conf_below = samples(size-1, error=error, at=at)
        #     conf_at = samples(size, error=error, at=at)
        #     print("", "size: ",size, float(f"{conf_below:.2e}"), float(f"{conf_at:.2e}"))
        # exit()

        under, over = 0, None
        # We have the right size when any smaller size is not passable.
        conf_below = samples(size=size - 1, error=error, at=at)
        conf_at = samples(size=size, error=error, at=at)
        print("", "size: ", size, float(f"{conf_below:.2e}"),
              float(f"{conf_at:.2e}"))
        while not (conf_below < confidence <= conf_at):
            if conf_at < confidence:
                # Update "under". Scale up if we haven't found "over".
                under = max(under, size)
                if (over is None): step *= 2
                # Take the step.
                size += step
            else:
                # Update "over". Take step. Scale down.
                over = min(over if (over is not None) else float('inf'), size)
                size = size - step
                step = step // 2
            # Recompute the confidence at and below this step size.
            conf_below = samples(size - 1, error=error, at=at)
            conf_at = samples(size, error=error, at=at)
            print("", "size: ", size, float(f"{conf_below:.2e}"),
                  float(f"{conf_at:.2e}"))
            # Correct for strange sample size error that can happen bc
            # of alignment of "at" and one of the sample values.
            if conf_at < conf_below:
                size = size - 1
                conf_at = conf_below
                conf_below = samples(size - 1, error=error, at=at)
                print("", "size: ", size, float(f"{conf_below:.2e}"),
                      float(f"{conf_at:.2e}"))
        # Return the computed best sample size.
        return size
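A small sketch of the four modes implied by the argument handling at the top of 'samples'; the numeric values here are arbitrary examples.

n = samples(error=0.05, confidence=0.99)              # Required sample size.
e = samples(size=1000, confidence=0.99)               # Achievable EDF error at that size.
c = samples(size=1000, error=0.05)                    # Confidence of staying within that error.
ok = samples(size=1000, error=0.05, confidence=0.99)  # Verify: is the confidence met?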