def fit(self, x, y=None, classifier=None, **kwargs):
    if (not is_none(classifier)):
        self.classifier = classifier
    elif (not is_none(y)) and (not is_numeric(y[0])):
        self.classifier = True
    if ((type(x) != np.ndarray) or (len(x.shape) != 2)):
        raise (UnexpectedType("Provided 'x' should be a 2D numpy array."))
    if (not is_none(y)):
        if (not hasattr(y, "__len__")):
            raise (MissingOperator(
                "Provided 'y' to fit must have defined '__len__' operator."))
        elif (not hasattr(y, "__getitem__")):
            raise (MissingOperator(
                "Provided 'y' to fit must have defined '__getitem__' operator."))
        elif (not hasattr(y[0], "__add__")):
            raise (MissingOperator(
                "Elements of provided 'y' must have defined '__add__' operator."))
        elif (not hasattr(y[0], "__mul__")):
            raise (MissingOperator(
                "Elements of provided 'y' must have defined '__mul__' operator."))
    self.y = y
    # Fit the provided x values.
    return self._fit(x.copy(), **kwargs)
def fit(self, x, y, *args, num_comps=None, **kwargs):
    # NOTE: "dim", "method", "metric", "samples", "scale", "display", and
    #       "cond_kwargs" are assumed to be conditioning configuration values
    #       available from the enclosing scope (or set at construction).
    # Set the number of components appropriately.
    if is_none(num_comps):
        num_comps = min(x.shape)
    if not is_none(dim):
        num_comps = min(dim, num_comps)
    # Compute the components and the values.
    if method == "PCA":
        # Compute the principal components as the new axes.
        components, values = pca(x, num_components=num_comps, **cond_kwargs)
        # Compute the values so that the transformed points have unit metric slope.
        values = normalize_error(np.matmul(x, components.T), y, metric, display)
    elif method == "MPCA":
        # Use metric PCA to compute components and values.
        components, values = mpca(x, y, metric=metric, num_components=num_comps,
                                  num_vecs=samples, **cond_kwargs)
    # Reset the values if scale should not be used.
    if not scale:
        values[:] = 1.
    if display:
        np.set_printoptions(precision=3, sign=" ")
        print("\nComponents and values:")
        for (c, v) in zip(components, values):
            print(f" {v:.2f} {c}")
        print()
        np.set_printoptions(precision=8, sign="-")
    # Generate the conditioning matrix.
    self.conditioner = np.matmul(np.diag(values), components).T
    # Return the normal fit operation.
    return super().fit(np.matmul(x, self.conditioner), y, *args, **kwargs)
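# Illustrative sketch (not part of the original module): shows how a
# conditioning matrix like the one built in the fit above transforms data,
# assuming the components come from a plain SVD. All names and values here
# are hypothetical and for demonstration only.
def _example_conditioning_transform():
    import numpy as np
    x = np.random.random((50, 4))
    # Two "components" (rows) and their relative "values" (weights).
    components = np.linalg.svd(x - x.mean(axis=0), full_matrices=False)[2][:2]
    values = np.array([0.7, 0.3])
    # Same construction as "self.conditioner" above.
    conditioner = np.matmul(np.diag(values), components).T
    # The conditioned data has one column per retained component.
    return np.matmul(x, conditioner)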
def _fit(self, control_points, k=None, display=True, **kwargs):
    if (not is_none(k)):
        self.num_neighbors = k
    # Process and store local information.
    self.points = control_points.copy()
    self.tree = KDTree(self.points)
    # Automatically select the value for "k" if appropriate and
    # the response values are available for the points.
    if is_none(self.num_neighbors):
        if (not is_none(self.y)):
            self.auto_kwargs.update(kwargs)
            # If "mean" was not provided, pick based on problem type.
            if "mean" not in self.auto_kwargs:
                self.auto_kwargs["mean"] = not self.classifier
            # Begin the estimation of best value for 'k'.
            end = ("\n" if display else "\r")
            print("Nearest neighbor, estimating best value for 'k'..",
                  end=end, flush=True)
            self.num_neighbors = auto(self.points, self.y, **self.auto_kwargs)
            print(f" chose k = {self.num_neighbors}", end=end, flush=True)
            print(" ", end=end, flush=True)
        else:
            self.num_neighbors = 1
def fit(self, x, y, classifier=None, *args, **kwargs):
    if (not is_none(classifier)):
        self.classifier = classifier
    elif (not is_numeric(y[0])):
        self.classifier = True
    if ((type(x) != np.ndarray) or (len(x.shape) != 2)):
        raise (UnexpectedType("Provided 'x' should be a 2D numpy array."))
    # If this is a classification problem, convert y values into
    # vertices of a regular simplex.
    if (self.classifier):
        from util.data import regular_simplex
        from util.system import sorted_unique
        self.class_map = sorted_unique(y)
        values = regular_simplex(len(self.class_map))
        y = np.array([values[self.class_map.index(v)] for v in y])
    if (type(y) == list):
        y = np.array(y)
    if (type(y) != np.ndarray):
        raise (UnexpectedType(
            "Provided 'y' should be a 1D or 2D numpy array."))
    elif (len(y.shape) == 1):
        y = np.reshape(y, (y.shape[0], 1))
        self._response_dim = 1
    elif (len(y.shape) == 2):
        pass
    else:
        raise (UnexpectedShape(
            "Provided 'y' should be a 1D or 2D numpy array."))
    return self._fit(x.copy(), y.copy(), *args, **kwargs)
def predict(self, x, *args, **kwargs):
    if ((type(x) != np.ndarray) or (len(x.shape) not in (1, 2))):
        raise (UnexpectedType(
            "Provided 'x' should be a 1D or 2D numpy array."))
    single_response = len(x.shape) == 1
    if single_response:
        x = np.reshape(x, (1, len(x)))
    indices, weights = self._predict(x.copy(), *args, **kwargs)
    # Return the indices and weights if no y values were provided.
    if (is_none(self.y)):
        response = [(ids, wts) for (ids, wts) in zip(indices, weights)]
    else:
        # Collect response values via weighted sums of self.y values.
        response = []
        for ids, wts in zip(indices, weights):
            if self.classifier:
                val_weights = {}
                # Sum the weights associated with each category.
                for i, w in zip(ids, wts):
                    val_weights[self.y[i]] = val_weights.get(self.y[i], 0.) + w
                # Return the category with the largest sum of weight.
                response.append(
                    max(val_weights.items(), key=lambda i: i[-1])[0])
            else:
                # Return the weighted sum of predictions.
                response.append(
                    sum(self.y[i] * w for (i, w) in zip(ids, wts)))
    # Reduce to one approximation point if that's what was provided.
    if single_response:
        response = response[0]
    # Return the response.
    return response
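# Illustrative sketch (not part of the original module) of the two
# aggregation modes used in the predict above: a weighted vote over
# categories versus a weighted sum of numeric responses. The indices,
# weights, and response values are hypothetical.
def _example_neighbor_aggregation():
    ids, wts = [3, 7, 9], [0.5, 0.3, 0.2]
    # Classification: sum the weight given to each category, keep the heaviest.
    y_cls = {3: "a", 7: "b", 9: "a"}
    val_weights = {}
    for i, w in zip(ids, wts):
        val_weights[y_cls[i]] = val_weights.get(y_cls[i], 0.) + w
    category = max(val_weights.items(), key=lambda item: item[-1])[0]  # -> "a"
    # Regression: weighted sum of the neighbors' responses.
    y_num = {3: 2.0, 7: 4.0, 9: 6.0}
    estimate = sum(y_num[i] * w for (i, w) in zip(ids, wts))  # -> 3.4
    return category, estimate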
def predict(self, points, *args, **kwargs):
    if ((type(points) != np.ndarray) or (len(points.shape) not in (1, 2))):
        raise (UnexpectedType(
            "Provided 'points' should be a 1D or 2D numpy array."))
    # If values were provided, return the usual prediction.
    elif not is_none(self.original_values):
        return super().predict(points)
    # Otherwise we are getting the points and indices in the original data.
    single_response = len(points.shape) == 1
    if single_response:
        points = np.reshape(points, (1, len(points)))
    # Return points and weights in terms of the original indices.
    indices, weights = self._predict(points, *args, **kwargs)
    response = []
    for ids, wts in zip(indices, weights):
        orig_ids = []
        orig_wts = []
        for (i, w) in zip(ids, wts):
            pt = tuple(self.original_points[self.unique_indices[i]])
            orig_ids += self.unique_points[pt]
            # w /= len(self.unique_points[pt]) # <- Equally weight unique points.
            orig_wts += [w] * len(self.unique_points[pt])
        # Normalize the sum of weights, giving repeated points higher 'weight'.
        orig_wts_sum = sum(orig_wts)
        orig_wts = [w / orig_wts_sum for w in orig_wts]
        response.append((orig_ids, orig_wts))
    if single_response:
        response = response[0]
    return response
def fit(self, points, values=None, *args, **kwargs):
    if ((type(points) != np.ndarray) or (len(points.shape) != 2)):
        raise (UnexpectedType(
            "Expected 2D numpy array as first argument."))
    self.original_points = points
    self.unique_points = {}
    for i, pt in enumerate(points):
        pt = tuple(pt)
        self.unique_points[pt] = self.unique_points.get(pt, []) + [i]
    # Store the indices of the first occurrence of each unique point.
    self.unique_indices = np.array(
        sorted(self.unique_points[pt][0] for pt in self.unique_points))
    # Average the response value for the points that are identical.
    if (not is_none(values)):
        self.original_values = values
        to_add = set(self.unique_points)
        avg_values = []
        for pt in self.original_points:
            pt = tuple(pt)
            if pt in to_add:
                indices = self.unique_points[pt]
                wt = 1. / len(indices)
                avg_values.append(sum(values[i] * wt for i in indices))
                to_add.remove(pt)
        args = args + (avg_values,)
    # Call the fit method on the parent with unique points only.
    return super().fit(self.original_points[self.unique_indices, :],
                       *args, **kwargs)
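# Illustrative sketch (not part of the original module) of the duplicate
# handling performed by the fit above: identical rows are grouped, the first
# occurrence of each is kept, and their response values are averaged. Uses
# only numpy; all data here is hypothetical.
def _example_collapse_duplicates():
    import numpy as np
    pts = np.array([[0., 0.], [1., 1.], [0., 0.]])
    vals = np.array([1., 5., 3.])
    groups = {}
    for i, p in enumerate(map(tuple, pts)):
        groups.setdefault(p, []).append(i)
    unique_indices = np.array(sorted(ix[0] for ix in groups.values()))
    avg_vals = [np.mean(vals[ix]) for ix in
                sorted(groups.values(), key=lambda ix: ix[0])]
    # Unique rows -> [[0,0], [1,1]]; averaged values -> [2.0, 5.0].
    return pts[unique_indices], avg_vals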
def _fit(self, points):
    from util.math import is_none
    # Sort points by their distance from the center of the data.
    center = (np.max(points, axis=0) + np.min(points, axis=0)) / 2
    dists = np.linalg.norm(points - center, axis=1)
    indices = np.argsort(dists)
    if not is_none(self.y):
        self.y = [self.y[i] for i in indices]
    # Store the points in a specific order (transposed, Fortran-contiguous).
    self.points = np.asarray(points[indices].T, order="F")
    self.box_sizes = np.ones((self.points.shape[0] * 2, self.points.shape[1]),
                             dtype=np.float64, order="F") * -1
    self.meshes.build_ibm(self.points, self.box_sizes)
def auto(points, values, metric=abs_diff, max_k=None, samples=100,
         mean=True, k_step=1, model=NearestNeighbor, display=False):
    from math import floor, log2
    from util.random import random_range
    # Make the maximum value for "k" the nearest power of 2 that
    # contains less than or equal to half of the provided data.
    if is_none(max_k):
        max_k = 2**floor(log2(len(points) // 2))
    # Compute up to 'max_k' nearest neighbors about selected points.
    # Add "+1" to exclude the current active point as a neighbor.
    model = model(k=max_k + 1)
    model.fit(points)
    # Randomly pick a set of points as the "checks".
    indices = [i for i in random_range(len(points), count=samples)]
    neighbors = np.array([i[1:] for (i, w) in model(points[indices])])
    differences = np.array(
        [[metric(values[i1], values[i2]) for i2 in neighbors[i]]
         for i, i1 in enumerate(indices)])
    k_values = {}
    # Pick the function for identifying the best selection of "k".
    for k_pow in range(0, int(log2(max_k)) + 1, k_step):
        k = 2**k_pow
        if mean:
            k_values[k] = np.mean(differences[:, :k])
        else:
            k_values[k] = np.mean(np.min(differences[:, :k], axis=1))
    # Make sure the largest power of 2 under 'max_k' is evaluated, even if
    # the step size skipped over it.
    if (2**k_pow != max_k):
        k = 2**int(log2(max_k))
        if mean:
            k_values[k] = np.mean(differences[:, :k])
        else:
            k_values[k] = np.mean(np.min(differences[:, :k], axis=1))
    # Find the k with the lowest mean error.
    best_k = min(k_values.items(), key=lambda i: i[1])[0]
    if display:
        name = "mean" if mean else "minimum"
        from math import log10, ceil
        print('-' * 52)
        print(" Estimated " + name + " error for various choices of 'k':")
        for k in sorted(k_values):
            extra = " <-- chosen 'k'" if k == best_k else ""
            print(f" k = {k:{ceil(log10(max_k))}d} ~ {k_values[k]:.4e}" + extra)
        print('-' * 52)
    # Return the "k" with the minimum mean difference.
    return best_k
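# Illustrative usage sketch (not part of the original module): picks a value
# of 'k' for a small synthetic regression problem using the "auto" routine
# above. The data, sample count, and model construction are hypothetical.
def _example_auto_usage():
    import numpy as np
    x = np.random.random((256, 3))
    y = x[:, 0]**2 + np.sin(x[:, 1])
    best_k = auto(x, y, samples=50, display=True)
    model = NearestNeighbor(k=best_k)
    model.fit(x, y)
    return model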
def pca(points, num_components=None, display=True):
    from util.math import is_none
    from sklearn.decomposition import PCA
    # Determine the number of components to compute.
    if is_none(num_components):
        num_components = min(*points.shape)
    else:
        num_components = min(num_components, *points.shape)
    pca = PCA(n_components=num_components)
    if display:
        print(f"Computing {num_components} principal components..",
              end="\r", flush=True)
    pca.fit(points)
    if display:
        print(" ", end="\r", flush=True)
    principal_components = pca.components_
    magnitudes = pca.singular_values_
    # Normalize the component magnitudes to have sum 1.
    magnitudes /= np.sum(magnitudes)
    return principal_components, magnitudes
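# Illustrative usage sketch (not part of the original module): computes the
# leading principal components of a small random matrix with the "pca"
# routine above. The shapes and component count are hypothetical.
def _example_pca_usage():
    import numpy as np
    points = np.random.random((100, 5))
    components, magnitudes = pca(points, num_components=3, display=False)
    # "components" has one row per component; "magnitudes" sums to 1.
    return components, magnitudes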
def samples(size=None, error=None, confidence=None, at=None):
    # Determine what to calculate based on what was provided.
    from util.math import is_none, choose, Fraction
    if is_none(size):
        to_calculate = "samples"
    elif is_none(error):
        to_calculate = "error"
    elif is_none(confidence):
        to_calculate = "confidence"
    else:
        to_calculate = "verify"
    # Default evaluation point is at (1/2), where the error is greatest.
    if is_none(at):
        at = Fraction(1, 2)
    else:
        at = Fraction(at)
    # Set the default values for other things that were not provided.
    if type(error) == type(None):
        error = Fraction(10, 100)
    if type(confidence) == type(None):
        confidence = Fraction(95, 100)
    # Convert error and confidence to Fraction types if necessary.
    if not type(error) == Fraction:
        error = Fraction(error)
    if not type(confidence) == Fraction:
        confidence = Fraction(confidence)
    # If the user provided something with a length, use that number.
    if hasattr(size, "__len__"):
        size = len(size)
    # \sum_{i=0}^n choose(n, i) * ( at^i (1-at)^(n-i) )
    if not is_none(size):
        # Compute the probability of any given observed EDF value.
        prob = lambda i: choose(size, i) * (at**i * (1 - at)**(size - i))
        # If we are calculating the confidence or verifying, compute confidence.
        if to_calculate in {"confidence", "verify"}:
            if (at == 1 / 2):
                conf = _half_confidence(size, error)
            else:
                conf = Fraction()
                # Sum those probabilities that are closer than "error" distance.
                for i in range(size + 1):
                    p = Fraction(i, size)
                    if (abs(p - at) <= error):
                        conf += prob(i)
            # Return the total confidence.
            if to_calculate == "confidence":
                return float(conf)
            else:
                return conf >= confidence
        elif to_calculate == "error":
            # Store the "contained" outcomes by "allowed error".
            error = Fraction()
            contained = Fraction()
            # Sort the percentiles by their distance from "at".
            i_p = sorted(enumerate(
                Fraction(i, size, _normalize=False) for i in range(size + 1)),
                key=lambda ip: abs(ip[1] - at))
            # Cycle through percentiles, starting closest to "at" and moving out.
            for step in range(len(i_p)):
                # If this step has the same probability as the last, skip.
                if (i_p[step][1] == i_p[step - 1][1]):
                    continue
                i, p = i_p[step]
                # Compute the amount of data contained by this step away.
                next_contained = contained + prob(i)
                # If the distance from "at" is the same for two steps, take two.
                if (step + 1 < len(i_p)) and (abs(at - i_p[step][1]) ==
                                              abs(at - i_p[step + 1][1])):
                    next_contained += prob(i_p[step + 1][0])
                # Only update the "allowed error" if confidence is maintained.
                if next_contained < confidence:
                    contained = next_contained
                    error = abs(i_p[step][1] - at)
                else:
                    break
            return float(error)
    else:
        # Compute the number of samples required with a doubling search
        # followed by bisection.
        size, step = 2**10, 2**9
        under, over = 0, None
        # We have the right size when any smaller size is not passable.
        conf_below = samples(size=size - 1, error=error, at=at)
        conf_at = samples(size=size, error=error, at=at)
        while not (conf_below < confidence <= conf_at):
            if conf_at < confidence:
                # Update "under". Scale up if we haven't found "over".
                under = max(under, size)
                if (over == None):
                    step *= 2
                # Take the step.
                size += step
            else:
                # Update "over". Take step. Scale down.
                over = min(over if (over != None) else float('inf'), size)
                size = size - step
                step = step // 2
            # Recompute the confidence at and below this step size.
            conf_below = samples(size - 1, error=error, at=at)
            conf_at = samples(size, error=error, at=at)
        # Correct for strange sample size error that can happen because of
        # alignment of "at" and one of the sample values.
        if conf_at < conf_below:
            size = size - 1
            conf_at = conf_below
            conf_below = samples(size - 1, error=error, at=at)
        # Return the computed best sample size.
        return size
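# Illustrative usage sketch (not part of the original module): each call
# below exercises one branch of the "samples" routine above. The numeric
# arguments are hypothetical.
def _example_samples_usage():
    # Number of samples needed to bound the EDF error by 0.05 with 99% confidence.
    n = samples(error=0.05, confidence=0.99)
    # Achievable error bound when using 100 samples at 95% confidence.
    err = samples(size=100, confidence=0.95)
    # Confidence that 100 samples keep the EDF error within 0.1.
    conf = samples(size=100, error=0.1)
    return n, err, conf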