def nofz_bins(pz0, pointz, pzdist, mask0, w, pointw, bins=3, pzmask=None, spec=False, point=False):
    # Assign each object to a tomographic bin, either from user-supplied bin
    # edges or from equal-weight edges computed on the masked sample.
    if hasattr(bins, '__len__'):
        xbins = np.digitize(pointz, bins) - 1
        nofz = np.zeros((len(bins), pz0.bins))
    else:
        edge = lin.linear_methods.find_bin_edges(pointz[mask0], bins, pointw[mask0])
        print edge
        xbins = np.digitize(pointz, edge) - 1
        nofz = np.zeros((bins + 1, pz0.bins))

    if point:
        # Build n(z) from point photo-z estimates via weighted histograms.
        nofz[0, :], b = np.histogram(pzdist[pzmask & mask0],
                                     bins=np.append(pz0.binlow, pz0.binhigh[-1]),
                                     weights=w[pzmask & mask0])
        nofz[0, :] /= np.sum(nofz[0, :]) * (pz0.bin[1] - pz0.bin[0])
        for i in xrange(pz0.tomo - 1):
            mask = (xbins == i)
            nofz[i + 1, :], b = np.histogram(pzdist[pzmask & mask0 & mask],
                                             bins=np.append(pz0.binlow, pz0.binhigh[-1]),
                                             weights=w[pzmask & mask & mask0])
            nofz[i + 1, :] /= np.sum(nofz[i + 1, :]) * (pz0.bin[1] - pz0.bin[0])
    else:
        # Build n(z) by stacking the full p(z) of each object, weighted by w.
        print np.sum(pzmask & mask0), len(pzmask), len(mask0)
        nofz[0, :] = np.sum((pz0.pz_full[pzmask & mask0].T * w[pzmask & mask0]).T, axis=0)
        nofz[0, :] /= np.sum(nofz[0, :]) * (pz0.bin[1] - pz0.bin[0])
        for i in xrange(pz0.tomo - 1):
            mask = (xbins == i)
            nofz[i + 1, :] = np.sum((pz0.pz_full[pzmask & mask0 & mask].T * w[pzmask & mask & mask0]).T, axis=0)
            nofz[i + 1, :] /= np.sum(nofz[i + 1, :]) * (pz0.bin[1] - pz0.bin[0])

    specnofz = np.zeros((pz0.tomo, pz0.bins))
    if spec:
        # Spectroscopic n(z) via a weighted Gaussian KDE evaluated on the pz0.bin grid.
        from weighted_kde import gaussian_kde
        tmp = gaussian_kde(pz0.spec_full[pzmask & mask0], weights=w[pzmask & mask0], bw_method='scott')
        specnofz[0, :] = tmp(pz0.bin)
        specnofz[0, :] /= np.sum(specnofz[0, :]) * (pz0.bin[1] - pz0.bin[0])
        for i in xrange(pz0.tomo - 1):
            mask = (xbins == i)
            tmp = gaussian_kde(pz0.spec_full[pzmask & mask0 & mask],
                               weights=w[pzmask & mask0 & mask], bw_method='scott')
            specnofz[i + 1, :] = tmp(pz0.bin)
            specnofz[i + 1, :] /= np.sum(specnofz[i + 1, :]) * (pz0.bin[1] - pz0.bin[0])

    return nofz, specnofz
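
# --- Illustrative sketch, not part of the original pipeline: the weighted-histogram
# normalization used in nofz_bins, shown on synthetic data so it runs standalone.
# The redshifts, weights, and bin edges below are made up for illustration only.
import numpy as np

np.random.seed(0)
z_samples = np.random.uniform(0.0, 2.0, 10000)
weights = np.random.uniform(0.5, 1.5, 10000)
edges = np.linspace(0.0, 2.0, 41)
centers = 0.5 * (edges[:-1] + edges[1:])
dz = centers[1] - centers[0]

nz, _ = np.histogram(z_samples, bins=edges, weights=weights)
nz = nz / (np.sum(nz) * dz)   # same convention as nofz above: n(z) integrates to ~1
print(np.sum(nz) * dz)        # ~1.0
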
def compute_kernels(slices, weights, pbar=False):
    if pbar:
        progress_bar = ProgressBar(slices.size, message="computing kernels")
    else:
        print "computing kernels"
    kernels = []
    for s in slices:
        kernels.append(gaussian_kde(s, weights=weights))
        if pbar:
            progress_bar()
    return np.array(kernels)
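
# --- Usage sketch for the pattern above on synthetic data. Assumption: SciPy >= 1.2,
# whose scipy.stats.gaussian_kde accepts weights=, stands in for the weighted_kde
# module imported elsewhere in these snippets; ProgressBar is omitted.
import numpy as np
from scipy.stats import gaussian_kde

np.random.seed(0)
slices = np.array([np.random.normal(mu, 1.0, 500) for mu in (0.0, 2.0, 4.0)])
weights = np.random.uniform(0.5, 1.5, 500)
weights = weights / weights.sum()

kernels = [gaussian_kde(s, weights=weights) for s in slices]
grid = np.linspace(-4, 8, 200)
densities = np.array([k(grid) for k in kernels])   # one density curve per slice
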
def plot_kde(dat, weights=None, rng=None, resolution=10, style='normal',
             bw_method='scott', plot=True, **kwargs):
    """Calculate and plot a (weighted) kernel density estimate.

    Keyword Arguments:
    dat        -- Data array
    weights    -- Weights of the data (default None)
    rng        -- Range to plot (default None = from min to max of dat)
    resolution -- Plot points per unit of rng (default 10)
    bw_method  -- Bandwidth selection method: 'scott' (default), 'silverman' or a constant
    style      -- 'normal' for normal lines or 'shady' for thicker, more transparent lines
    plot       -- If True (default) the KDE is plotted; otherwise [x, y] is returned
    **kwargs   -- Passed through to matplotlib.pyplot.plot
    """
    if weights is None:
        weights = np.ones(len(dat))
    if rng is None:
        rng = [np.min(dat), np.max(dat)]
    if style == 'shady':
        if 'alpha' not in kwargs:
            kwargs['alpha'] = 0.5
        if 'linewidth' not in kwargs:
            kwargs['linewidth'] = 1.8
    cov = Covariator([dat], weights)
    inv_cov, normf = cov(bw_method)
    kde = gaussian_kde([dat], weights, inv_cov, normf)
    x = np.linspace(rng[0], rng[1], int(np.round(resolution * (rng[1] - rng[0]))))
    y = kde.evaluate([x])
    if plot:
        plt.plot(x, y, **kwargs)
    else:
        return [x, y]
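
# --- Minimal standalone sketch of the same idea using SciPy instead of the
# Covariator/gaussian_kde pair from the weighted_kde module (assumption: SciPy >= 1.2
# for the weights= argument). The data here are synthetic.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

np.random.seed(0)
dat = np.random.normal(0.0, 1.0, 1000)
w = np.random.uniform(0.5, 1.5, 1000)

kde = gaussian_kde(dat, bw_method='scott', weights=w)
x = np.linspace(dat.min(), dat.max(), 200)
plt.plot(x, kde(x), alpha=0.5, linewidth=1.8)   # the 'shady' style above
plt.show()
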
def main(example, df, possible_subjects):
    """
    Find tutors similar to the example tutor and calculate a weighted kernel
    density estimate (KDE) of the pricing distribution of those similar tutors.

    Similarity conditions:
    (1) Jaccard index between subjects tutored >= 0.3
    (2) The radius a tutor is willing to travel encompasses the center of the
        zip code of the example tutor.
    (3) Cosine similarity between profile features >= 0.5 for the nearest
        neighbor, max-priced tutor, and min-priced tutor. Otherwise cosine
        similarity is used to weight the KDE.

    INPUTS
    example = (Series) with the same format as df, but containing only the
        input tutor from the website.
    df = (DataFrame) with all NYC tutors.
    possible_subjects = (List) of subjects that tutors tutor. Restricted to
        the top 100 most popular subjects as previously calculated in
        cleanup_features.py.

    OUTPUTS
    nearest_neighbor = (Series) with the same format as df, holding the tutor
        most similar to example.
    max_tut = (Series) of the tutor charging the highest hourly rate among
        tutors with cosine similarity > 0.5.
    min_tut = (Series) of the tutor charging the lowest hourly rate among
        tutors with cosine similarity > 0.5.
    img_io = KDE plot image. Kept in memory but behaves like a file
        (a StringIO object).
    """
    # Drop the example tutor if it is already in df
    try:
        df.drop(df[example['url_id'] == df['url_id']].index.values, inplace=True)
        df.reset_index(drop=True, inplace=True)
    except:
        pass  # Tutor is not in the database

    # Check for graduate degree
    df = graduate_degrees(example, df)

    # Filter by Jaccard index and location.
    sim_tuts = subject_similarity(example, df, possible_subjects)
    sim_tuts = location_overlap(example, sim_tuts)

    # Relevant features for computing similarity
    rel_feats = ['avg_review_length',
                 'badge_hours',
                 'days_since_last_review',
                 'has_rating',
                 'number_of_ratings',
                 'number_of_reviews',
                 'profile_picture',
                 'rating',
                 'has_ivy_degree',
                 'has_background_check',
                 'response_time',
                 'avg_review_sentiment']

    # Convert similar tutors to a matrix and normalize the features.
    # In machine-learning parlance, X are the features and y is the hourly rate.
    X = sim_tuts[rel_feats].as_matrix().astype(np.float)
    y = sim_tuts['hourly_rate'].as_matrix().astype(np.float)
    scaler = preprocessing.StandardScaler()
    X = scaler.fit_transform(X)
    X_example = example[rel_feats].as_matrix().astype(np.float)
    y_example = np.float(example['hourly_rate'])
    X_example = scaler.transform(X_example)

    # Get the cosine similarity between the example tutor and the tutor db.
    cos_tuts = np.empty(X.shape[0])
    for i in xrange(X.shape[0]):
        cos_tuts[i] = cosine_similarity(X[i, :], X_example)

    # Sort by similarity
    sorted_idx = np.argsort(cos_tuts)[::-1]
    cos_tuts = cos_tuts[sorted_idx]
    y = y[sorted_idx]
    sim_tuts.reset_index(drop=True, inplace=True)

    # Only keep tutors with similarity > 0.5
    sim_tuts = sim_tuts.iloc[sorted_idx][cos_tuts > .5]

    # Calculate the three outputted tutors.
    nearest_neighbor = sim_tuts.iloc[0]  # Highest similarity
    max_tut = sim_tuts[sim_tuts['hourly_rate'] == sim_tuts['hourly_rate'].max()].iloc[0]
    min_tut = sim_tuts[sim_tuts['hourly_rate'] == sim_tuts['hourly_rate'].min()].iloc[0]

    # KDE of hourly rates, weighted by cosine similarity to the example tutor.
    scaling = scale_kde(y, cos_tuts)
    kde = gaussian_kde(y[cos_tuts > 0], weights=cos_tuts[cos_tuts > 0])
    x = np.linspace(0, y.max() + 50, y.max() + 50 + 1)
    pdf = kde(x) * scaling  # Estimated probability density function
    img_io = make_kde_plot(x, pdf)

    return nearest_neighbor, max_tut, min_tut, img_io
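
# --- Standalone sketch of the similarity-weighted price KDE at the end of main(),
# on synthetic data. The rates and similarities are invented; scale_kde and
# make_kde_plot from the original code are not reproduced. Assumption: SciPy >= 1.2.
import numpy as np
from scipy.stats import gaussian_kde

np.random.seed(0)
rates = np.random.gamma(4.0, 15.0, 200)       # hourly rates of "similar" tutors
cos_sim = np.random.uniform(0.0, 1.0, 200)    # cosine similarity to the example tutor

keep = cos_sim > 0
kde = gaussian_kde(rates[keep], weights=cos_sim[keep])
x = np.linspace(0, rates.max() + 50, int(rates.max()) + 51)
pdf = kde(x)                                   # similarity-weighted price density
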
def getDensity(points, mask, bandwidth=25., scales=n.array([4., 4, 2])):
    bandwidth = float(bandwidth)
    mean = n.mean(points, 1)

    m = mask.ga()[mask.slices]
    print 'reducing mask!'
    mscaled = sitk.gafi(beads.scaleStack(scales[::-1] / mask.spacing[::-1], sitk.gifa(m)))
    amscaled = mscaled.swapaxes(0, 2)
    bounds = amscaled.shape
    x, y, z = n.mgrid[0:bounds[0], 0:bounds[1], 0:bounds[2]]
    x_grid = n.array([x.flatten(), y.flatten(), z.flatten()])

    # Unweighted KDE with a fixed, isotropic bandwidth.
    gkde = weighted_kde.gaussian_kde(points, bw_method=bandwidth)
    diag = n.array([bandwidth, bandwidth, bandwidth])
    gkde.covariance = n.diag(diag)
    gkde.inv_cov = n.diag(1 / diag)
    dens = gkde.evaluate(x_grid)
    dens = dens.reshape(x.shape) * amscaled
    dens = dens / n.sum(dens)

    # Build a weight map by convolving the mask with the KDE kernel, then
    # weight each point by the inverse of its local kernel coverage.
    kernel = multivariate_normal.pdf(x_grid.swapaxes(0, 1), n.array(x.shape) / 2., gkde.covariance).reshape(x.shape)
    kk = kernel > (kernel.max() / 10.)
    kbounds = n.array(kk.nonzero())
    kmin = n.min(kbounds, 1)
    kmax = n.max(kbounds, 1)
    kernel = kernel[kmin[0]:kmax[0], kmin[1]:kmax[1], kmin[2]:kmax[2]]
    print 'calculating weightmap'
    weightmap = sitk.Convolution(sitk.Cast(sitk.gifa(mscaled), 6), sitk.Cast(sitk.gifa(kernel), 6),
                                 boundaryCondition=sitk.ConvolutionImageFilter.ZERO_PAD)
    kernelsum = n.sum(kernel)
    nweightmap = sitk.gafi(weightmap)
    weights = kernelsum / ndimage.map_coordinates(nweightmap, points[::-1, :], mode='nearest')

    # Weighted KDE using the coverage-corrected weights.
    wgkde = weighted_kde.gaussian_kde(points, bw_method=bandwidth, weights=weights)
    wgkde.covariance = gkde.covariance
    wgkde.inv_cov = gkde.inv_cov
    wdens = wgkde.evaluate(x_grid)
    wdens = wdens.reshape(x.shape) * amscaled
    wdens = wdens / n.sum(wdens)

    xs = n.array(amscaled.nonzero())
    probs = wdens[amscaled.nonzero()]
    probs = probs / n.sum(probs)

    return wdens, dens, xs, probs
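
# --- 1D sketch of the coverage correction used in getDensity, on synthetic data.
# The SimpleITK convolution is replaced by scipy.ndimage for illustration; the
# grid, mask, and points below are invented.
import numpy as np
from scipy import ndimage
from scipy.stats import norm

np.random.seed(0)
grid = np.arange(200.0)
mask = (grid > 20) & (grid < 180)                   # observable region
points = np.random.uniform(25, 175, 300)

bandwidth = 10.0
kernel = norm.pdf(np.arange(-40, 41), scale=bandwidth)

# Kernel coverage of each grid position, restricted to the mask (zero-padded),
# then weight each point by the inverse of its local coverage.
coverage = ndimage.convolve1d(mask.astype(float), kernel, mode='constant')
weights = kernel.sum() / ndimage.map_coordinates(coverage, [points], mode='nearest')
# Points near the mask edge get up-weighted, mirroring getDensity's weight map.
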
def weighted_nz_distributions(df, binning, weights=None, tomo_bins=np.array([0, 5.0]),
                              z_phot=None, n_resample=50):
    """
    :param df: pandas DataFrame
    :param binning: centers of the redshift bins
    :param weights: optional name of a weight column in df (same length as df)
    :param tomo_bins: edges of the tomographic z-bins, e.g. [0.0, 0.2, 0.6, 1.8]
    :param z_phot: photometric redshifts used to assign objects to tomographic bins
    :param n_resample: number of resamples used to estimate the mean and the variance on the mean
    :return: dictionaries with estimates of the weighted n(z) and bootstrap estimates
    """
    assert isinstance(df, pd.DataFrame), 'df must be a pandas DataFrame'
    assert isinstance(binning, np.ndarray), 'binning must be a numpy array'

    if weights:
        assert weights in df.columns, str(weights) + ' not in df.columns'
        df[weights] = (df[weights] / df[weights].sum()).values  # normalize weights
    else:
        df[weights] = 1.0 / float(len(df))  # set uniform weights if none given

    assert isinstance(z_phot, np.ndarray), 'z_phot must be a numpy array'
    assert len(z_phot) == len(df), 'Length of z_phot must be equal to that of df'
    df['phot_sel'] = z_phot  # Make the selection photo-z a part of the DataFrame

    assert 'z_spec' in df.columns, 'The df needs a "z_spec" in df.columns'

    pdf_names = ['pdf_' + str(i) for i in range(500) if 'pdf_' + str(i) in df.columns]

    phot_iter = {}
    spec_iter = {}

    # In the following section the tomographic bins are treated
    for j in xrange(0, len(tomo_bins) - 1):
        sel = (df.phot_sel > tomo_bins[j]) & (df.phot_sel <= tomo_bins[j + 1])
        if sel.sum() > 0:
            df_sel = df[sel]
            phot_iter[j + 1] = {}
            spec_iter[j + 1] = {}
            for i in xrange(n_resample):
                df_sample = df_sel.sample(n=len(df_sel), replace=True, weights=df_sel[weights])
                kde_w_spec_pdf = gaussian_kde(df_sample.z_spec.values, bw_method='silverman')
                kde_w_spec_pdf = kde_w_spec_pdf(binning)
                phot_iter[j + 1][i + 1] = _normalize_pdf(df_sample[pdf_names].sum(),
                                                         binning[1] - binning[0]).values
                spec_iter[j + 1][i + 1] = kde_w_spec_pdf
            phot_iter[j + 1][0] = _normalize_pdf(df_sel[pdf_names].sum(),
                                                 binning[1] - binning[0]).values
            kde_w_spec_pdf = gaussian_kde(df_sel.z_spec.values, bw_method='silverman')
            spec_iter[j + 1][0] = kde_w_spec_pdf(binning)

    # In the following section the full n(z) is treated, i.e. not split into tomographic bins
    sel = (df.phot_sel > tomo_bins[0]) & (df.phot_sel <= tomo_bins[len(tomo_bins) - 1])
    df_sel = df[sel]
    phot_iter[0] = {}
    spec_iter[0] = {}
    for i in xrange(n_resample):
        df_sample = df_sel.sample(n=len(df_sel), replace=True, weights=df_sel[weights])
        kde_w_spec_pdf = gaussian_kde(df_sample.z_spec.values, bw_method='silverman')
        kde_w_spec_pdf = kde_w_spec_pdf(binning)
        phot_iter[0][i + 1] = _normalize_pdf(df_sample[pdf_names].sum(),
                                             binning[1] - binning[0]).values
        spec_iter[0][i + 1] = kde_w_spec_pdf
    phot_iter[0][0] = _normalize_pdf(df_sel[pdf_names].sum(), binning[1] - binning[0]).values
    kde_w_spec_pdf = gaussian_kde(df_sel.z_spec.values, bw_method='silverman')
    spec_iter[0][0] = kde_w_spec_pdf(binning)

    data_for_wl = {'binning': binning, 'phot': phot_iter, 'spec': spec_iter}

    return phot_iter, spec_iter, data_for_wl
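
# --- Standalone sketch of the resampling step above on synthetic data
# (pandas + SciPy only; the pdf_* columns and the _normalize_pdf helper are omitted).
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde

np.random.seed(0)
df = pd.DataFrame({'z_spec': np.random.gamma(2.0, 0.3, 2000),
                   'w': np.random.uniform(0.5, 1.5, 2000)})
df['w'] /= df['w'].sum()
binning = np.linspace(0.0, 2.0, 101)

boot = []
for _ in range(50):                                    # n_resample
    sample = df.sample(n=len(df), replace=True, weights=df['w'])
    boot.append(gaussian_kde(sample.z_spec.values, bw_method='silverman')(binning))
boot = np.array(boot)
nz_mean, nz_err = boot.mean(axis=0), boot.std(axis=0)  # mean n(z) and its scatter
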
# For each point-estimate photo-z code; the weights do not seem to take effect here... strange...
for res in point_results:
    data_1 = pf.open(res)[1].data
    phz = data_1[z_pht]
    if hasattr(weight_true, "__len__"):
        print 'entered weights branch -----------'
        weights = data_1[inArgs['weight_s']]
        weights_norm = np.sum(weights) / len(weights)
        weights = weights / weights_norm
    else:
        weights = False
        print

    # phz = phz*weights
    density = gaussian_kde(phz, weights=weights)
    # density = gaussian_kde(phz)
    density.covariance_factor = lambda: .05
    density._compute_covariance()
    ys = density(bincenters)
    llist.append(ys)

    lab = res.split('/')[-1].split('_')[0]
    print lab
    labe.append(lab)
    temp = plt.plot(bincenters, ys, antialiased=True, linewidth=2, label=lab)
    cc = temp[0].get_color()
    tz = data_1['Z_SPEC']
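
# --- Note on the fixed bandwidth above: with scipy.stats.gaussian_kde the same effect
# is obtained by passing the factor directly as bw_method, which avoids overriding
# covariance_factor and calling the private _compute_covariance. Sketch on synthetic
# data (assumption: SciPy >= 1.2 for the weights= argument).
import numpy as np
from scipy.stats import gaussian_kde

np.random.seed(0)
phz = np.random.normal(0.6, 0.1, 5000)
w = np.random.uniform(0.5, 1.5, 5000)
bincenters = np.linspace(0.0, 1.2, 121)

density = gaussian_kde(phz, weights=w, bw_method=0.05)  # scalar = fixed covariance factor
ys = density(bincenters)
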