def linear_resist_fit_robust(x, y, p=0.5):
    """Fit ``y ~ slope * x + intercept`` with an iterated, trimmed Siegel fit.

    Each round fits a Siegel repeated-median line to the current subset,
    scores *all* points by squared residual, and keeps the
    ``int(len(x) * p) + 1`` best-fitting points as the subset for the next
    round.  Iteration stops when the trimmed loss changes by less than 1e-5
    or after 3000 rounds.

    Parameters
    ----------
    x : numpy.ndarray
        1-D array of the independent variable.
    y : numpy.ndarray
        1-D array of the dependent variable, same length as ``x``.
    p : float, optional
        Fraction of the points retained between rounds.

    Returns
    -------
    numpy.ndarray
        The fitted line evaluated at every element of ``x``.

    Notes
    -----
    ``np.seterr(all='raise')`` is called and not restored here, so NumPy
    floating-point warnings become ``FloatingPointError`` afterwards.
    Progress is printed to stdout on every round.
    """
    np.seterr(all='raise')
    n_gene = x.shape[0]
    # Number of best-fitting points carried over to the next round.
    n_keep = int(n_gene * p) + 1

    x_reg = x
    y_reg = y
    loss_pre = float('Inf')
    for i in range(3000):
        # siegelslopes expects the dependent variable first: (y, x).
        # The residuals below are measured in y, so the fit must be y on x.
        slope, intercept = stats.siegelslopes(y_reg, x_reg)
        abline_values = slope * x + intercept
        square_list = np.square(y - abline_values)
        sub_index = np.argsort(square_list)[:n_keep]
        loss = np.sum(square_list[sub_index])
        delta_loss = abs(loss_pre - loss)
        if delta_loss < 0.00001:
            print('convergence')
            break
        loss_pre = loss
        # Restrict the next regression to the current best-fitting points.
        x_reg = x[sub_index]
        y_reg = y[sub_index]
        print(i, loss, delta_loss)
    return abline_values
def linear_fit(tt, xx, yy, eyy, method='ls'):
    """Fit a straight line to ``log10(yy)`` versus ``log10(xx)``.

    The line ``log10(y) = a * log10(x) + b`` is estimated with the chosen
    method and reported as ``[a, 10**b]`` (slope and de-logged intercept).

    Parameters
    ----------
    tt : array_like
        Points handed to ``pow_law_func`` to evaluate the fitted model.
    xx, yy : array_like
        Data; both must be positive because their ``log10`` is taken.
    eyy : array_like
        Uncertainties on ``yy``.
    method : str, optional
        One of ``'ls'`` (``curve_fit`` with ``lin_func``), ``'bces'``,
        ``'siegel_h'``, ``'siegel_s'``, ``'theil_sen'`` or ``'rlm'``.

    Returns
    -------
    par : list
        ``[a, 10**b]``.
    per : list
        ``[ea, 10**b * ln(10) * eb]``, the uncertainties on ``par``.
        Both are 0 for the Siegel methods; for ``'theil_sen'`` only the
        slope uncertainty is estimated.
    fit : object
        ``pow_law_func(tt, par[0], par[1])``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    log_x = np.log10(xx)
    log_y = np.log10(yy)
    # Error propagation into log10 space: d(log10 y) = dy / (y * ln 10).
    log_e_y = eyy / yy / np.log(10.)

    if method == 'ls':
        # NOTE(review): assumes lin_func(x, a, b) has the slope as its first
        # fitted parameter, matching the other branches - confirm.
        popt, pcov = curve_fit(lin_func, log_x, log_y, sigma=log_e_y)
        a, b = popt
        ea, eb = np.sqrt(np.diag(pcov))
    elif method == 'bces':
        # BCES needs x errors; negligible ones are derived from eyy.
        log_e_x = eyy * 1.e-20 / np.log(10.) / yy
        cov = np.zeros_like(log_x)
        a_bces, b_bces, aerr_bces, berr_bces, covab = bces.bces.bces(
            log_x, log_e_x, log_y, log_e_y, cov)
        # Element 3 of the estimators returned by bces is used
        # (presumably the orthogonal regression - confirm).
        a = a_bces[3]
        ea = aerr_bces[3]
        b = b_bces[3]
        eb = berr_bces[3]
    elif method == 'siegel_h':
        a, b = stats.siegelslopes(log_y, log_x)
        ea, eb = 0, 0
    elif method == 'siegel_s':
        a, b = stats.siegelslopes(log_y, log_x, method='separate')
        ea, eb = 0, 0
    elif method == 'theil_sen':
        a, b, am, _ = stats.theilslopes(log_y, log_x, 0.68)
        # ``a - am`` is the distance from the slope to its lower confidence
        # bound, so it is the *slope* uncertainty (ea), not the intercept's.
        ea, eb = a - am, 0
    elif method == 'rlm':
        log_X = sm.add_constant(log_x)
        resrlm = sm.RLM(log_y, log_X).fit()
        # add_constant puts the constant first: params = (intercept, slope).
        b, a = resrlm.params
        eb, ea = resrlm.bse
    else:
        raise ValueError(
            f"unknown method {method!r}; expected one of 'ls', 'bces', "
            "'siegel_h', 'siegel_s', 'theil_sen', 'rlm'")

    par = [a, 10**b]
    # d(10**b) = 10**b * ln(10) * db
    per = [ea, 10.**b * np.log(10) * eb]
    fit = pow_law_func(tt, par[0], par[1])
    return par, per, fit
def timing_parameters(geom, image, peak_time, hillas_parameters, cleaning_mask=None):
    """
    Compute the timing parameters of a (cleaned) camera image.

    The pixel peak times are fitted with a straight line along the
    longitudinal shower coordinate.

    Parameters
    ----------
    geom: ctapipe.instrument.CameraGeometry
        Camera geometry
    image : array_like
        Pixel values
    peak_time : array_like
        Time of the pulse extracted from each pixels waveform
    hillas_parameters: ctapipe.containers.HillasParametersContainer
        Result of hillas_parameters
    cleaning_mask: optional, array, dtype=bool
        The pixels that survived cleaning, e.g. tailcuts_clean.
        The non-masked pixels must verify signal >= 0

    Returns
    -------
    timing_parameters: TimingParametersContainer

    Raises
    ------
    ValueError
        If any of the selected pixels has a negative signal.
    """
    unit = geom.pix_x.unit

    # Restrict every per-pixel quantity to the pixels that survived cleaning.
    if cleaning_mask is not None:
        image = image[cleaning_mask]
        geom = geom[cleaning_mask]
        peak_time = peak_time[cleaning_mask]

    if (image < 0).any():
        raise ValueError("The non-masked pixels must verify signal >= 0")

    # Strip the units (all expressed in the camera pixel unit); only the
    # positions are needed below.
    pix_x, pix_y, cog_x, cog_y, _, _ = all_to_value(
        geom.pix_x,
        geom.pix_y,
        hillas_parameters.x,
        hillas_parameters.y,
        hillas_parameters.length,
        hillas_parameters.width,
        unit=unit,
    )
    psi_rad = hillas_parameters.psi.to_value(u.rad)
    longi, _ = camera_to_shower_coordinates(pix_x, pix_y, cog_x, cog_y, psi_rad)

    # The weighted least-squares fit is only used for its covariance matrix,
    # which provides the uncertainties on slope and intercept.
    _, cov = np.polyfit(longi, peak_time, deg=1, w=np.sqrt(image), cov=True)
    slope_err, intercept_err = np.sqrt(np.diag(cov))

    # The reported slope and intercept come from an outlier-robust fit.
    slope, intercept = siegelslopes(x=longi, y=peak_time)
    residuals = peak_time - polyval(longi, (intercept, slope))
    deviation = np.sqrt(np.sum(residuals ** 2) / peak_time.size)

    return TimingParametersContainer(
        slope=slope / unit,
        intercept=intercept,
        deviation=deviation,
        slope_err=slope_err / unit,
        intercept_err=intercept_err,
    )
def linear_resist_fit_robust_mix(x, y, p_default=0.5, verbose=False):
    """Resistant linear fit: Siegel initialisation, then trimmed least squares.

    The points selected by ``preprocess`` are first fitted with a Siegel
    repeated-median line.  The ``int(n_select * p) + 1`` points with the
    smallest squared residuals are then refitted with ordinary least squares
    and re-selected, for at most 20 rounds or until the trimmed loss changes
    by less than 1e-5.

    Parameters
    ----------
    x : numpy.ndarray
        1-D array of the independent variable.
    y : numpy.ndarray
        1-D array of the dependent variable, same length as ``x``.
    p_default : float, optional
        Accepted for interface compatibility but not read: the retained
        fraction ``p`` is the one returned by ``preprocess``.
    verbose : bool, optional
        Print diagnostic messages to stdout.

    Returns
    -------
    abline_values : numpy.ndarray
        ``slope * x + intercept`` for every element of ``x``, forced to 0
        wherever ``x == 0``.
    slope : float
        Slope of the final fit.
    intercept : float
        Intercept of the final fit.

    Notes
    -----
    ``np.seterr(all='raise')`` is called and not restored here.
    """
    iter_limit = 20
    np.seterr(all='raise')

    # NOTE(review): preprocess is assumed to return a selector usable to
    # index x/y (its sum is used as the number of selected points) together
    # with the fraction p of points to keep - confirm against its definition.
    select_list, p = preprocess(x, y)
    n_select = np.sum(select_list)
    x_select = x[select_list].copy()
    y_select = y[select_list].copy()
    # Number of best-fitting points used for each least-squares refit.
    n_keep = int(n_select * p) + 1

    # Initial robust regression on all selected points to ignore outliers.
    slope, intercept = stats.siegelslopes(y_select, x_select)
    square_list = np.square(y_select - (slope * x_select + intercept))
    sub_index = np.argsort(square_list)[:n_keep]
    x_reg = x_select[sub_index]
    y_reg = y_select[sub_index]
    if verbose:
        print(f'[siegelslopes] slope:{slope}, intercept:{intercept}')

    loss_pre = float('Inf')
    for _iteration in range(iter_limit):
        slope, intercept, *_ = stats.linregress(x_reg, y_reg)
        square_list = np.square(y_select - (slope * x_select + intercept))
        # Keep the n_keep smallest residuals, starting from the very best
        # point (index 0), exactly like the initial selection above.
        sub_index = np.argsort(square_list)[:n_keep]
        loss = np.sum(square_list[sub_index])
        delta_loss = abs(loss_pre - loss)
        if delta_loss < 0.00001:
            if verbose:
                print('convergence')
            break
        loss_pre = loss
        # Update the regression subset for the next round.
        x_reg = x_select[sub_index]
        y_reg = y_select[sub_index]
    else:
        # Runs only when the loop finished without hitting ``break``.
        if verbose:
            print('[Resist Fit] Reach iteration limit')

    abline_values = slope * x + intercept
    # A zero input is mapped to zero, not to the fitted intercept.
    abline_values[x == 0] = 0

    if verbose:
        print(f'[Resist Fit] slope: {slope}, intercept: {intercept}')
        print(f'depth(y_select): {np.sum(y_select)}, \n'
              f'depth(x_select): {np.sum(x_select)}, \n'
              f'depth(x): {np.sum(x)},\n'
              f'depth(norm): {np.sum(abline_values)},\n'
              f'depth(y): {np.sum(y)}\n'
              f'y_select/x_select: {np.sum(y_select) / np.sum(x_select)}')
    return abline_values, slope, intercept
def _resistant_fit_linear(
        source: np.ndarray,
        target: np.ndarray,
        p: np.float32 = 0.75,
        verbose: bool = False,
        init_step: str = "ransac"
) -> Tuple[np.ndarray, np.float32, np.float32]:
    """
    Use resistant fit to normalize cell x (source) to the reference cell y
    (target).

    A robust regression on all selected genes initialises the model; the
    ``int(n_select * p) + 1`` best-fitting genes (the biological feature set,
    BFS) are then refitted with ordinary least squares and re-selected for at
    most 20 rounds, or until the trimmed loss changes by less than 1e-5.

    Side effect: ``np.seterr(all='raise')`` is called and not restored here.

    :param source: the gene expression of the cell x
    :param target: the gene expression of the cell y, reference cell
    :param p: the size of biological feature set, as a fraction of the
        selected genes
    :param verbose: verbose flag for debug
    :param init_step: robust regression used for the initial fit, one of
        'ransac', 'siegel' or 'theil'
    :return:
        y_regression: the normalized 1d array (float32), 0 where source is 0
        slope: the final slope from EM Regression
        intercept: the final intercept from EM Regression
    :raises NameError: if init_step is not one of the supported names
    """
    ########################################
    # Select valid genes for regression
    ########################################
    iter_limit = 20
    np.seterr(all='raise')

    select_mask = _preprocess(source, target)
    n_select = np.sum(select_mask)
    x_select = source[select_mask].copy()  # Note that len(x_select) <= source
    y_select = target[select_mask].copy()
    # Number of genes kept in the biological feature set.
    n_bfs = int(n_select * p) + 1

    ############################################################
    # Init EM step: robust regression on all genes
    ############################################################
    if init_step == "ransac":
        ransac = linear_model.RANSACRegressor(random_state=42)
        ransac.fit(x_select.reshape(-1, 1), y_select.reshape(-1, 1))
        # coef_ / intercept_ are one-element arrays; read the element
        # explicitly because float() on a non-0-d array is deprecated.
        slope = float(np.ravel(ransac.estimator_.coef_)[0])
        intercept = float(np.ravel(ransac.estimator_.intercept_)[0])
    elif init_step == "siegel":
        slope, intercept = stats.siegelslopes(y_select, x_select)
    elif init_step == "theil":
        slope, intercept, _, _ = stats.theilslopes(y_select, x_select)
    else:
        raise NameError(
            "init_step must be chosen from list ['ransac', 'siegel', 'theil']")

    square_list = np.square(y_select - (slope * x_select + intercept))
    sub_index = np.argsort(square_list)[:n_bfs]

    # Set Biological Feature Set (BFS)
    x_bfs = x_select[sub_index]
    y_bfs = y_select[sub_index]

    if verbose:
        logger.info(f'[Init EM step] slope:{slope}, intercept:{intercept}')

    ############################################################
    # Resistant Fit Regression on BFS
    ############################################################
    loss_pre = np.inf  # np.Inf was removed in NumPy 2.0
    for _iteration in range(iter_limit):
        # E step: Linear regression on BFS
        slope, intercept, *_ = stats.linregress(x_bfs, y_bfs)
        square_list = np.square(y_select - (slope * x_select + intercept))
        # Keep the n_bfs smallest residuals, starting from the very best
        # gene (index 0), exactly like the initial selection above.
        sub_index = np.argsort(square_list)[:n_bfs]
        loss = np.sum(square_list[sub_index])
        delta_loss = abs(loss_pre - loss)
        if delta_loss < 0.00001:
            if verbose:
                logger.info('convergence')
            break
        loss_pre = loss
        # M step: Update x and y for next iteration of linear regression
        x_bfs = x_select[sub_index]
        y_bfs = y_select[sub_index]
    else:
        # Runs only when the loop finished without hitting ``break``.
        if verbose:
            logger.info('[Resist Fit] Reach iteration limit')

    ############################################################
    # Normalize cell y based on regression model
    ############################################################
    y_regression = slope * source + intercept

    # If x is zero then clip y to 0
    y_regression[source == 0] = 0

    if verbose:
        logger.info(f'[Resist Fit] slope: {slope}, intercept: {intercept}')
        logger.info(
            f'depth(y_select): {np.sum(y_select)}, \n'
            f'depth(x_select): {np.sum(x_select)}, \n'
            f'depth(x): {np.sum(source)},\n'
            f'depth(norm): {np.sum(y_regression)},\n'
            f'depth(y): {np.sum(target)}\n'
            f'y_select/x_select: {np.sum(y_select) / np.sum(x_select)}')
    return np.float32(y_regression), np.float32(slope), np.float32(intercept)
# One full-size axes; every measured (focus position, HFD) sample is drawn
# as a green scatter point.
ax_3 = fig.add_subplot(111)
ax_3.scatter(fpos_arr, hfd_arr, color='green')

print('fit')

# Siegel repeated-median line through each side of the focus curve.
# Index 0 of each result is the slope, index 1 the intercept.
siegel_left_fit = siegelslopes(hfd_arr_l, fpos_arr_l)
siegel_right_fit = siegelslopes(hfd_arr_r, fpos_arr_r)
left_slope, left_intercept = siegel_left_fit[0], siegel_left_fit[1]
right_slope, right_intercept = siegel_right_fit[0], siegel_right_fit[1]

# Positions where each fitted line crosses HFD = 0.
siegel_left_zero = -left_intercept / left_slope
siegel_right_zero = -right_intercept / right_slope

# Best focus estimate: the position where the two fitted lines intersect.
siegel_best_pos = (left_intercept - right_intercept) / (right_slope - left_slope)

logging.info(f'siegel left fit = {siegel_left_fit}')
logging.info(f'siegel right fit = {siegel_right_fit}')
logging.info(f'siegel best pos = {siegel_best_pos}')

# Draw each line over its own side, extended 5 samples past the midpoint
# index so that the crossing is visible.
left_span = fpos_arr[:midx + 5]
right_span = fpos_arr[midx - 5:]
ax_3.plot(left_span, left_slope * left_span + left_intercept)
ax_3.plot(right_span, right_slope * right_span + right_intercept)
ax_3.axvline(siegel_best_pos, color='red')
from scipy import stats
import matplotlib.pyplot as plt

# Noisy samples of the line y = x, with a few points pushed far off it.
x = np.linspace(-5, 5, num=150)
y = x + np.random.normal(size=x.size)
y[11:15] += 10  # outliers above the line
y[-5:] -= 7  # outliers below the line

# Robust Siegel fit and, for comparison, an ordinary least-squares fit.
# Both results expose the slope at index 0 and the intercept at index 1.
res = stats.siegelslopes(y, x)
lsq_res = stats.linregress(x, y)

# Data as blue dots, Siegel line in red, least-squares line in green.
fig, ax = plt.subplots()
ax.plot(x, y, 'b.')
for fit, line_style in ((res, 'r-'), (lsq_res, 'g-')):
    ax.plot(x, fit[1] + fit[0] * x, line_style)
plt.show()