def get_dimwise_prob_metrics(X_real: np.array, X_fake: np.array,
                             y_real: np.array = None, y_fake: np.array = None,
                             measure='mean', n_num_cols: int = 0):
    if measure in ['mean', 'avg']:
        real = X_real.mean(axis=0)
        fake = X_fake.mean(axis=0)
    elif measure == 'std':
        real = X_real.std(axis=0)
        fake = X_fake.std(axis=0)
    else:
        raise ValueError(
            f'"measure" must be "mean" or "std" but "{measure}" was specified.')
    corr_value = pearsonr(real, fake)[0]
    rmse_value = np.sqrt(mean_squared_error(real, fake))
    if n_num_cols > 0:
        num_corr_value = pearsonr(real[:n_num_cols], fake[:n_num_cols])[0]
        num_rmse_value = np.sqrt(
            mean_squared_error(real[:n_num_cols], fake[:n_num_cols]))
    else:
        num_rmse_value, num_corr_value = -1, -1
    if X_real.shape[1] - n_num_cols > 0:
        cat_corr_value = pearsonr(real[n_num_cols:], fake[n_num_cols:])[0]
        cat_rmse_value = np.sqrt(
            mean_squared_error(real[n_num_cols:], fake[n_num_cols:]))
    else:
        cat_rmse_value, cat_corr_value = -1, -1
    return (rmse_value, corr_value, num_rmse_value, num_corr_value,
            cat_rmse_value, cat_corr_value)
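# A minimal usage sketch for get_dimwise_prob_metrics, assuming the pearsonr
# and mean_squared_error imports the function relies on; the 3 numeric +
# 3 categorical column split below is purely illustrative.
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(0)
X_real = rng.random((100, 6))
X_fake = X_real + rng.normal(0, 0.05, (100, 6))
print(get_dimwise_prob_metrics(X_real, X_fake, measure='mean', n_num_cols=3))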
def update_parameters(self, epsilons: np.array, r_plus: np.array,
                      r_minus: np.array):
    """
    Updates the internal mu and sigma according to the evaluation results
    of the perturbed parameters. mu, sigma, baseline and maximum_reward
    are updated.

    epsilons: parameter_number x sample_number
    r_plus: sample_number
    r_minus: sample_number
    """
    self.__mu += self.mu_delta(epsilons, r_plus, r_minus)
    self.__mu = np.minimum(
        self.__mu, self.__parameter_bound["upper"])  # prevent too large
    self.__mu = np.maximum(
        self.__mu, self.__parameter_bound["lower"])  # prevent too small
    self.__sigma += self.sigma_delta(epsilons, r_plus, r_minus)
    self.__sigma = np.minimum(
        self.__sigma, self.__sigma_upper_bound)  # prevent too large sigma
    assert np.all(self.__sigma > 0), "got negative sigma\n{}".format(
        self.__sigma)
    print("updated sigma", self.__sigma)
    self.__maximum_reward = max(self.__maximum_reward,
                                r_plus.max(), r_minus.max())
    b = self.__baseline.add_new_value((r_plus.mean() + r_minus.mean()) / 2.)
def fit(self, X: np.array, y: np.array):
    EPS = 1e-10
    if self.scale:
        X, self.X_offset, self.X_scale = scale(X)
    n_samples, n_features = X.shape
    _, self.n_classes = y.shape
    self.p_classes = np.zeros(self.n_classes)
    self.mean_class = np.zeros((self.n_classes, n_features))
    self.cov_class = np.zeros((n_features, n_features))
    # within-class scatter
    for i in range(self.n_classes):
        self.p_classes[i] = y[:, i].sum() / n_samples
        Xi = X[y[:, i] == 1]
        self.mean_class[i] = Xi.mean(0)
        self.cov_class += np.cov(Xi.T)
    self.S_within = self.cov_class
    # between-class scatter
    self.S_between = (self.mean_class - X.mean(0)).T @ (self.mean_class - X.mean(0))
    pinv = np.linalg.pinv(self.S_within)
    if self.solver == "svd":
        u, s, v = np.linalg.svd(pinv @ self.S_between)
        self.proj = v.T
        # self.bias = TODO
    elif self.solver == "eig":
        xc = pinv @ self.S_between
        evalue, evector = np.linalg.eigh(xc)
        # eigh returns eigenvalues in ascending order with eigenvectors as
        # columns, so reverse the columns to put leading directions first
        self.proj = evector[:, ::-1].T
        # self.bias = TODO
    else:
        raise NotImplementedError
def gpr_distance(x: np.array, y: np.array, theta: float) -> float:
    """
    Calculates the distance between two Gaussians under the Generic
    Parametric Representation (GPR) approach.

    According to the original work https://www.researchgate.net/publication/322714557
    (p. 70): "This is a fast and good proxy for distance d_theta when the
    first two moments ... predominate". But it's not a good metric for
    heavy-tailed distributions.

    Parameter theta defines what type of information dependency is being tested:
    - for theta = 0 the distribution information is tested
    - for theta = 1 the dependence information is tested
    - for theta = 0.5 a mix of both information types is tested

    With theta in [0, 1] the distance lies in range [0, 1] and is a metric.
    (See original work for proof, p. 71)

    :param x: (np.array/pd.Series) X vector.
    :param y: (np.array/pd.Series) Y vector (same number of observations as X).
    :param theta: (float) Type of information being tested. Falls in range [0, 1].
    :return: (float) Distance under GPR approach.
    """
    # Calculating the GPR distance
    distance = theta * (1 - spearmans_rho(x, y)) / 2 + \
               (1 - theta) * (1 - ((2 * x.std() * y.std()) /
                                   (x.std()**2 + y.std()**2))**(1 / 2) *
                              np.exp(-(1 / 4) * (x.mean() - y.mean())**2 /
                                     (x.std()**2 + y.std()**2)))
    return distance**(1 / 2)
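# A usage sketch for gpr_distance, under the assumption that spearmans_rho is
# a Spearman correlation helper defined elsewhere in the module; a
# hypothetical stand-in based on scipy.stats.spearmanr is shown here.
import numpy as np
from scipy.stats import spearmanr

def spearmans_rho(x, y):
    return spearmanr(x, y)[0]

rng = np.random.default_rng(0)
x = rng.normal(0, 1, 1000)
y = 0.5 * x + rng.normal(0, 1, 1000)
print(gpr_distance(x, y, theta=0.5))  # lies in [0, 1]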
def _mean_learning_curve_profile(self, sampled_nlls: np.array,
                                 training_nlls: np.array):
    # np.float is deprecated; a plain float() cast is sufficient here
    learning_curves = {
        "sampled": float(sampled_nlls.mean()),
        "training": float(training_nlls.mean())
    }
    return learning_curves
def b_formula(x_list: np.array, y_list: np.array, denominator: float):
    """Intercept of a simple least-squares line fit:
    b = (mean(y) * sum(x^2) - mean(x) * sum(x*y)) / denominator,
    where the precomputed denominator is sum((x - mean(x))^2)."""
    b = (y_list.mean() * x_list.dot(x_list) -
         x_list.mean() * x_list.dot(y_list)) / denominator
    return b
def _correlation(x: np.array, vals: np.array):
    x = x[:, np.newaxis]
    mu_x = x.mean()  # scalar mean over cells
    mu_vals = vals.mean(axis=0)  # (cells, genes) --> (genes,)
    sigma_x = x.std()
    sigma_vals = vals.std(axis=0)
    return ((vals * x).mean(axis=0) - mu_vals * mu_x) / (sigma_vals * sigma_x)
def _compute_p_value(cls, serie_1: np.array, serie_2: np.array) -> float:
    total_std = cls._get_total_std(serie_1, serie_2)
    stat = (serie_1.mean() - serie_2.mean()) / (
        total_std * math.sqrt(1 / len(serie_1) + 1 / len(serie_2)))
    pvalue = stats.norm.cdf(stat)
    if np.isnan(pvalue):
        return 1.0
    return pvalue
def periodogram_covar(x: np.array, y: np.array, tau: int, p: int):
    '''Takes in numpy arrays x and y, an int value tau for the lag and
    another int p for the maximum lag, and returns the periodogram estimate
    of the covariance.
    '''
    assert np.allclose(x.mean(), 0), 'Signal x must be 0 mean!'
    assert np.allclose(y.mean(), 0), 'Signal y must be 0 mean!'
    T = len(x) - p
    if tau == 0:
        return (1 / T) * np.dot(x[p:], y[p:])
    else:
        return (1 / T) * np.dot(x[p:], y[p - tau:-tau])
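# A quick sanity check for periodogram_covar, assuming nothing beyond numpy:
# at lag 0 on a zero-mean unit-variance signal it should approximate the
# variance, and for white noise the lag-1 estimate should be near zero.
import numpy as np

x = np.random.default_rng(0).normal(0, 1, 10000)
x -= x.mean()  # the function asserts zero-mean inputs
print(periodogram_covar(x, x, tau=0, p=10))  # close to 1.0
print(periodogram_covar(x, x, tau=1, p=10))  # close to 0.0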
def pearson_similarity(x: np.array, y: np.array) -> float:
    """
    Calculate a Pearson correlation coefficient given 1-D data arrays x and y

    Args:
        x, y: two points in n-space

    Returns:
        Pearson correlation between x and y
    """
    x = x - x.mean()
    y = y - y.mean()
    return (x * y).sum() / np.sqrt(np.square(x).sum()) / np.sqrt(
        np.square(y).sum())
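# Cross-checking pearson_similarity against numpy's own np.corrcoef; both
# should agree up to floating-point error.
import numpy as np

a = np.random.default_rng(0).random(50)
b = 2 * a + np.random.default_rng(1).normal(0, 0.1, 50)
print(pearson_similarity(a, b), np.corrcoef(a, b)[0, 1])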
def preprocess_data(data: np.array):
    data_mean = data.mean()
    data_std = data.std()
    print("The data mean value is", data_mean)
    print("The data std value is", data_std)
    # note: the in-place operations below require a float array
    data -= data_mean
    data /= data_std
    # sanity check after normalization
    print("After normalization the data has mean value", data.mean())
    print("After normalization the data has standard deviation", data.std())
    return data
def vector(x: np.array, y: np.array):
    """
    Correlate each column in y with a vector x

    :param x: np.ndarray vector of length n
    :param y: np.ndarray matrix of shape (n, k)
    :returns: vector of length k (one correlation per column of y)
    """
    x = x[:, np.newaxis]  # reshape so x broadcasts against the (n, k) matrix
    mu_x = x.mean()
    mu_y = y.mean(axis=0)  # (n, k) --> (k,)
    sigma_x = x.std()
    sigma_y = y.std(axis=0)
    return ((y * x).mean(axis=0) - mu_y * mu_x) / (sigma_y * sigma_x)
def CCC(self, predictions: np.array, labels: np.array):
    """
    Concordance Correlation Coefficient (CCC) metric.

    Args:
        predictions (np.array): Model predictions.
        labels (np.array): Data labels.
    """
    predictions = np.concatenate(predictions).reshape(-1, )
    labels = np.concatenate(labels).reshape(-1, )
    mean_cent_prod = ((predictions - predictions.mean()) *
                      (labels - labels.mean())).mean()
    return (2 * mean_cent_prod) / (predictions.var() + labels.var() +
                                   (predictions.mean() - labels.mean())**2)
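# A sketch of calling CCC outside its class (self is unused in the body),
# with predictions and labels passed as lists of per-batch arrays, which is
# what the np.concatenate calls expect.
import numpy as np

preds = [np.array([0.1, 0.4]), np.array([0.8])]
labels = [np.array([0.0, 0.5]), np.array([1.0])]
print(CCC(None, preds, labels))  # perfect agreement would give 1.0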
def fit(self, X: np.array, y: Optional[np.array] = None,
        epochs: int = 50) -> None:
    """
    Fit the model to the data.

    Args:
        X (np.array): point cloud
        y (np.array): for compatibility
        epochs (int): the number of epochs

    Returns:
        None
    """
    self._reinit(X.mean(), X.std(), X.shape[1:])
    for epoch in range(1, epochs + 1):  # 1-based, so epochs iterations run
        for point in X:
            # error is the distance from the closest neuron to the point
            closest_neuron, second_closest, error = self._get_winners(point)
            # accumulate the local error
            self._errors[closest_neuron] += error**2
            # move neurons closer to the point
            self._move_neurons(closest_neuron, second_closest, point)
            # delete inactive edges
            for dead_edge in [edge for edge, age in self._edge_age.items()
                              if age > self.max_age]:
                self._remove_edge(dead_edge)
        if epoch % self.birth_period == 0:
            self._create_neuron()
def calc_mean_color(img: np.array) -> Tuple[int]:
    """
    :param img: image array of shape (height, width, channels)
    :return: per-channel mean color
    """
    return img.mean(axis=0).mean(axis=0)
def return_high_pass_filtered_depth(z: np.array, max_period: float,
                                    numtaps: int = 101):
    """
    Take in a two dim array of depth (pings, beams) and return a high pass
    filtered mean depth for each ping. Following the JHC 'Dynamic Motion
    Residuals...' paper, which suggests using a 4 * max period cutoff. I've
    found a 6 * max period seems to retain more of the signal that we want,
    but I don't really know what I'm doing here yet.

    Parameters
    ----------
    z
        numpy array (ping, beam) for depth
    max_period
        float, max period of the attitude arrays (roll, pitch, heave)
    numtaps
        filter length, must be odd

    Returns
    -------
    np.array
        HPF ping-wise mean depth
    """
    meandepth = z.mean(axis=1)
    zerocentered_meandepth = meandepth - meandepth.mean()
    # butterworth filter I never quite got to work in a way I understood
    # sos = butter(numtaps, 1 / max_period, btype='highpass', output='sos')
    # filt = sosfilt(sos, meandepth)
    coef = build_highpass_filter_coeff(1 / (max_period * 4), numtaps=numtaps)
    filt_depth = lfilter(coef, 1.0, zerocentered_meandepth)
    # trim the bad sections from the start of the filtered depth
    trimfilt_depth = filt_depth[int(numtaps / 2):]
    return trimfilt_depth
def _compute_number_of_replicas(self, distance_to_center: np.array) -> np.array:
    mean_dist = distance_to_center.mean()

    def to_weight(d):
        if "single" in self.exploration_type:
            return np.exp(-(d / mean_dist)**2)
        elif "multi" in self.exploration_type:
            return expit((d / mean_dist)**2)
        else:
            raise ValueError("{} is not a valid exploration type".format(
                self.exploration_type))

    weights = np.array([to_weight(d) for d in distance_to_center])
    n_replicas = np.zeros(self.swarm_size, dtype=int)
    replica = self.swarm_size
    res = []
    while replica > 0:
        replica = int(replica)
        weights[0:replica] /= weights[0:replica].sum()
        for weight_order, idx in enumerate(reversed(np.argsort(weights))):
            if weight_order >= replica:
                break
            fractional_replicas = replica * weights[idx]
            # round half up to the nearest int; np.rint rounds halves to even,
            # see https://stackoverflow.com/questions/28617841/rounding-to-nearest-int-with-numpy-rint-not-consistent-for-5
            n_replicas[idx] += int(np.floor(fractional_replicas + 0.5))
            if n_replicas.sum() >= self.swarm_size:
                n_replicas[idx] -= n_replicas.sum() - self.swarm_size
        replica = self.swarm_size - n_replicas.sum()
        res.append(replica)
    return n_replicas
def normalize(array: np.array) -> np.array:
    '''
    Input:
        array: a numpy array of dimension m x n, where m := number of samples
        and n := vector size.
    '''
    return (array - array.mean(axis=1).reshape(len(array), 1)) / \
        (array.std(axis=1).reshape(len(array), 1))
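# Verifying the row-wise contract of normalize: every row of the output
# should have (approximately) zero mean and unit standard deviation.
import numpy as np

arr = np.random.default_rng(0).random((4, 10))
out = normalize(arr)
print(out.mean(axis=1))  # ~0 per row
print(out.std(axis=1))   # ~1 per row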
def pca(X: array) -> array:
    """Principal Component Analysis
    input: X, matrix with training data stored as flattened arrays, in rows
    return: projection matrix (with important dimensions first), variance
    and mean."""

    # get dimensions
    num_data, dim = X.shape

    # centre data
    mean_X = X.mean(axis=0)
    X = X - mean_X

    if dim > num_data:
        # PCA - compact trick used
        M = dot(X, X.T)  # covariance matrix
        e, EV = linalg.eigh(M)  # eigenvalues and eigenvectors
        tmp = dot(X.T, EV).T  # this is the compact trick
        V = tmp[::-1]  # reverse since last eigenvectors are the ones we want
        S = sqrt(e)[::-1]  # reverse since eigenvalues are in increasing order
        for i in range(V.shape[1]):
            V[:, i] /= S
    else:
        # PCA - SVD used
        U, S, V = linalg.svd(X)
        V = V[:num_data]  # only makes sense to return the first num_data

    # return the projection matrix, the variance and the mean
    return V, S, mean_X
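# A usage sketch for pca, assuming the `from numpy import *` style that the
# function's bare dot/linalg/sqrt calls imply.
from numpy import *

random.seed(0)
X = random.rand(100, 20)  # 100 flattened samples of dimension 20
V, S, mean_X = pca(X)     # takes the SVD branch, since num_data > dim
projected = dot(X - mean_X, V[:3].T)  # project onto the first 3 components
print(projected.shape)  # (100, 3)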
def local_autoscale_ms(img: np.array) -> np.array:
    '''
    :return: Linearly normalized image. The output image will have 0 mean
        and 1 std.
    '''
    return (img - img.mean()) / img.std()
def point_is_outlier(point: np.array, lastn: np.array, var_threshold: float,
                     length: int = -1,
                     output_current_ratio: bool = False) -> tuple:
    """
    Determines whether a data point is an outlier through variance and adds
    it to a fixed-size set. For the first point, the function expects
    lastn=None and length=desired set size. For the others, it expects lastn
    to be the set returned by the last iteration.

    If (point - mean)**2 / (len(set) * variance(set)) > var_threshold,
    returns (True, newset); otherwise, returns (False, newset).
    """
    if lastn is None:
        if output_current_ratio:
            return (False, np.array([point]), 0)
        return (False, np.array([point]))
    else:
        if len(lastn) < length:
            lastn = np.insert(lastn, 0, np.zeros(point.shape), 0)
            if output_current_ratio:
                return (False, lastn, 0)
            else:
                return (False, lastn)
        # Shift the current values left
        lastn[0:-1] = lastn[1:]
        # Put our last point in the rightmost position
        lastn[-1] = point
        # Calculate the variance
        var = lastn.var()
        if var == 0:  # to avoid NaNs
            var = 1e-10
        # Mean
        mean = lastn.mean(0)
        ratio = ((point - mean)**2).sum() / (len(lastn) * var)
        if output_current_ratio:
            return (ratio > var_threshold, lastn, ratio)
        else:
            return (ratio > var_threshold, lastn)
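# A streaming sketch for point_is_outlier: feed points one at a time,
# carrying the rolling window returned by the previous call.
import numpy as np

window = None
for value in [1.0, 1.1, 0.9, 1.0, 1.05, 5.0]:
    point = np.array([value])
    is_outlier, window = point_is_outlier(point, window, var_threshold=3.0,
                                          length=5)
    print(value, is_outlier)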
def calculate_metrics(scores: np.array, true_labels: np.array,
                      score_name: str, verbose=True):
    # ROC-AUC & APS
    roc_auc = roc_auc_score(true_labels, scores)
    aps = average_precision_score(true_labels, scores)
    # Mean score on validation
    mean_score = scores.mean()
    # F1-score & optimal threshold
    # if opt_threshold is None:  # validation
    #     precision, recall, thresholds = precision_recall_curve(y_true=true_labels, probas_pred=scores)
    #     f1_scores = (2 * precision * recall / (precision + recall))
    #     f1 = np.nanmax(f1_scores)
    #     opt_threshold = thresholds[np.nanargmax(f1_scores)]
    # else:  # testing
    #     y_pred = (scores > opt_threshold).astype(int)
    #     f1 = f1_score(y_true=true_labels, y_pred=y_pred)
    if verbose:
        print(f'ROC-AUC on {score_name}: {roc_auc}. APS on {score_name}: {aps}. '
              f'Mean {score_name}: {mean_score}')
        # print(f'F1-score on {type}: {f1}. Optimal threshold on {type}: {opt_threshold}')
    return {f"roc-auc_{score_name}": roc_auc,
            f"aps_{score_name}": aps,
            f"mean_{score_name}": mean_score}
def transform_using_values(arr_in: np.array, values: list, cval=-1,
                           cval_mean=False):
    '''
    Applies an affine transformation to `arr_in` using the parameter values
    in `values`.
    '''
    assert len(values) == 6
    scale_x = values[0]
    scale_y = values[1]
    shear_radians = values[2]
    rotate_radians = values[3]
    offset_x = values[4]
    offset_y = values[5]
    # The image must be shifted by minus half of each dimension, then
    # transformed, then shifted back. This way, rotations and shears will be
    # about the centre of the image rather than the top-left corner.
    shift_x = -0.5 * arr_in.shape[1]
    shift_y = -0.5 * arr_in.shape[0]
    a0 = scale_x * math.cos(rotate_radians)
    a1 = -scale_y * math.sin(rotate_radians + shear_radians)
    a2 = a0 * shift_x + a1 * shift_y + offset_x - shift_x
    b0 = scale_x * math.sin(rotate_radians)
    b1 = scale_y * math.cos(rotate_radians + shear_radians)
    b2 = b0 * shift_x + b1 * shift_y + offset_y - shift_y
    tform = skimage.transform.AffineTransform(
        matrix=np.array([[a0, a1, a2],
                         [b0, b1, b2],
                         [0, 0, 1]]))
    if cval_mean:
        cval = arr_in.mean()
    arr_out = skimage.transform.warp(arr_in.astype(float), tform.inverse,
                                     cval=cval)
    return arr_out
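# A sketch for transform_using_values: rotate a test image 30 degrees about
# its centre, padding with the image mean.
import math
import numpy as np
import skimage.transform
from skimage import data

img = data.camera().astype(float)
out = transform_using_values(img, [1.0, 1.0, 0.0, math.radians(30), 0.0, 0.0],
                             cval_mean=True)
print(out.shape)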
def x_radius(x: np.array) -> (int, np.array):
    """
    Return the mean radius for a matrix of radius distributions.

    Example (radius bins 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ...):

        [[0 ..... 0.4 0.2 0.5  0 .......]
         [0 ..... 0.2 0   0.8  0 .......]]

    its column-wise mean is

        mean_x = [0 ..... 0.3 0.2 0.65 0 .......]

    multiplied by the radius sizes [1 2 3 4 5 6 7 8 9 ...]:

        5*0.3 + 6*0.2 + 7*0.65 = 7.25

    is the mean radius over the whole matrix of radius distributions.

    Args:
        x: np.array - matrix of normalized radius distributions of the train
            image, of shape (x, 29); cell [i, j] shows which fraction of all
            Hough circles on image #i have a radius of j pixels

    Returns:
        int: mean radius in pixels
        np.array: mean radius bin array (mean_x of shape (29,))
    """
    mean_x = x.mean(axis=0)
    return np.inner(mean_x, list(range(1, 30))), mean_x
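# Reproducing the docstring example for x_radius, with row values chosen so
# the column means match the example's [0.3, 0.2, 0.65]; bins are 1-indexed,
# so column 4 corresponds to radius 5.
import numpy as np

x = np.zeros((2, 29))
x[0, 4:7] = [0.4, 0.2, 0.5]
x[1, 4:7] = [0.2, 0.2, 0.8]
mean_radius, mean_x = x_radius(x)
print(mean_radius)  # 5*0.3 + 6*0.2 + 7*0.65 = 7.25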
def calculate_p_value(samples: np.array):
    mean = samples.mean()
    std = samples.std()
    mu = 1 / 3
    z_score = (mean - mu) / std
    return stats.norm.sf(z_score)
def gen_features_way_1(x: np.array):
    x = x / np.abs(x).sum()
    features = [x.mean(), (x**2).mean(), x.var()]
    features += [
        np.percentile(x, p) for p in np.linspace(0, 100, PERCENTILE_COUNT)
    ]
    return features
def single_sample_t_test(observations: np.array, mean0, alpha):
    """
    This is an implementation of a single-sided, single-sample test of the
    mean of a normal distribution with unknown variance. In the context of
    this project, observations are some form of monthly returns or monthly
    return difference.

    NOTE: Requires that observations are pre-computed.
    """
    mean_obs = observations.mean()
    sample_std = math.sqrt(
        square(observations - mean_obs).sum() / (len(observations) - 1))
    t_statistic = (mean_obs - mean0) / (sample_std / math.sqrt(len(observations)))
    # test if t_statistic > t(alpha, n-1)
    # ppf(q, df, loc=0, scale=1): percent point function (inverse of cdf).
    critical_value = t.ppf(1 - alpha, df=len(observations) - 1)
    p_value = 1.0 - t.cdf(t_statistic, df=len(observations) - 1)
    if t_statistic > critical_value:
        return ("Reject H0 (mean of observations is greater) with "
                "t_statistic={}, p-value={}, critical_value={} and "
                "alpha={}".format(t_statistic, p_value, critical_value, alpha))
    else:
        return ("Failed to reject H0 (mean of observations is not greater) "
                "with t_statistic={}, p-value={}, critical_value={} and "
                "alpha={}".format(t_statistic, p_value, critical_value, alpha))
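# A usage sketch for single_sample_t_test, assuming the math, numpy.square
# and scipy.stats.t imports the function relies on; the synthetic returns
# below are illustrative only.
import math
import numpy as np
from numpy import square
from scipy.stats import t

monthly_returns = np.random.default_rng(0).normal(0.01, 0.02, 36)
print(single_sample_t_test(monthly_returns, mean0=0.0, alpha=0.05))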
def zoom_out(image: np.array, boxes: np.array):
    """
    Perform zooming out of an image by placing the image in a larger canvas
    of filler values. The filler is the mean of the image. Helps with
    learning smaller objects.

    :param image: np.array, depth image (h, w)
    :param boxes: np.array, bounding boxes of the objects
    :return: expanded image, updated coordinates of bounding boxes
    """
    h = image.shape[0]
    w = image.shape[1]
    max_scale = const.MAX_ZOOM_OUT
    scale = random.uniform(1, max_scale)
    new_h = int(h * scale)
    new_w = int(w * scale)
    filler = image.mean()
    # np.float is deprecated; use the builtin float dtype instead
    new_image = np.ones((new_h, new_w), dtype=float) * filler
    left = random.randint(0, new_w - w)
    right = left + w
    top = random.randint(0, new_h - h)
    bottom = top + h
    new_image[top:bottom, left:right] = image
    new_boxes = boxes + np.array([left, top, left, top], dtype=np.float32)
    return new_image, new_boxes
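# A sketch for zoom_out, assuming a small const namespace providing
# MAX_ZOOM_OUT (hypothetical value below) and a single box in
# (x_min, y_min, x_max, y_max) order, matching the (left, top, left, top)
# offset applied to the boxes.
import random
import types
import numpy as np

const = types.SimpleNamespace(MAX_ZOOM_OUT=2.0)  # stand-in for the real config
depth = np.random.default_rng(0).random((64, 64))
boxes = np.array([[10, 12, 30, 40]], dtype=np.float32)
new_img, new_boxes = zoom_out(depth, boxes)
print(new_img.shape, new_boxes)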
def autocorr(x: np.array, lags: range) -> np.array:
    """Make an autocorrelation curve"""
    mean = x.mean()
    var = np.var(x)
    xp = x - mean
    corr = np.correlate(xp, xp, "full")[len(x) - 1:] / var / len(x)
    return corr[:len(lags)]
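# Sanity check for autocorr: white noise should give exactly 1.0 at lag 0
# and values near zero at all other lags.
import numpy as np

x = np.random.default_rng(0).normal(0, 1, 5000)
print(autocorr(x, range(5)))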
def print_valurange_summary(onebigline: np.array, username: str):
    n = len(onebigline)
    nh = n // 2
    print('user: ', username, ' vals: ', n, ' mean: ', onebigline.mean(),
          ' std: ', onebigline.std(), ' min: ', onebigline.min(),
          ' max: ', onebigline.max())
    print(onebigline[:4], "...", onebigline[nh:nh + 4], "...", onebigline[-4:])
    print("")