Example #1
  def nofz_bins(pz0,pointz,pzdist,mask0,w,pointw,bins=3,pzmask=None,spec=False,point=False):

    if hasattr(bins,'__len__'):
      xbins=np.digitize(pointz,bins)-1
      nofz=np.zeros((len(bins),pz0.bins))
    else:
      edge=lin.linear_methods.find_bin_edges(pointz[mask0],bins,pointw[mask0])
      print edge
      xbins=np.digitize(pointz,edge)-1
      nofz=np.zeros((bins+1,pz0.bins))

    if point:
      nofz[0,:],b=np.histogram(pzdist[pzmask&mask0],bins=np.append(pz0.binlow,pz0.binhigh[-1]),weights=w[pzmask&mask0])
      nofz[0,:]/=np.sum(nofz[0,:])*(pz0.bin[1]-pz0.bin[0])

      for i in xrange(pz0.tomo-1):
        mask=(xbins==i)
        nofz[i+1,:],b=np.histogram(pzdist[pzmask&mask0&mask],bins=np.append(pz0.binlow,pz0.binhigh[-1]),weights=w[pzmask&mask&mask0])
        nofz[i+1,:]/=np.sum(nofz[i+1,:])*(pz0.bin[1]-pz0.bin[0])

    else:
      print np.sum(pzmask&mask0),len(pzmask),len(mask0)
      nofz[0,:]=np.sum((pz0.pz_full[pzmask&mask0].T*w[pzmask&mask0]).T,axis=0)
      nofz[0,:]/=np.sum(nofz[0,:])*(pz0.bin[1]-pz0.bin[0])

      for i in xrange(pz0.tomo-1):
        mask=(xbins==i)
        nofz[i+1,:]=np.sum((pz0.pz_full[pzmask&mask0&mask].T*w[pzmask&mask&mask0]).T,axis=0)
        nofz[i+1,:]/=np.sum(nofz[i+1,:])*(pz0.bin[1]-pz0.bin[0])

    specnofz=np.zeros((pz0.tomo,pz0.bins))

    if spec:
      from weighted_kde import gaussian_kde
      tmp=gaussian_kde(pz0.spec_full[pzmask&mask0],weights=w[pzmask&mask0],bw_method='scott')
      specnofz[0,:]=tmp(pz0.bin)
      specnofz[0,:]/=np.sum(specnofz[0,:])*(pz0.bin[1]-pz0.bin[0])

      for i in xrange(pz0.tomo-1):
        mask=(xbins==i)
        tmp=gaussian_kde(pz0.spec_full[pzmask&mask0&mask],weights=w[pzmask&mask0&mask],bw_method='scott')
        specnofz[i+1,:]=tmp(pz0.bin)
        specnofz[i+1,:]/=np.sum(specnofz[i+1,:])*(pz0.bin[1]-pz0.bin[0])

    return nofz,specnofz
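The stacking step above reduces to a weighted sum of per-object p(z) curves followed by normalization to unit integral. A minimal self-contained sketch of that step with toy arrays (the pz0 object, the masks, and the lin.linear_methods bin-edge helper are external to this snippet and are replaced by stand-ins here):

import numpy as np

rng = np.random.RandomState(0)
zgrid = np.linspace(0.0, 2.0, 150)          # redshift grid (stands in for pz0.bin)
pz_full = rng.random_sample((1000, 150))    # toy per-object p(z) stacks
w = rng.random_sample(1000)                 # per-object weights
mask = rng.random_sample(1000) > 0.2        # stands in for pzmask & mask0

# weighted stack of the individual p(z), as in nofz[0,:] above
nofz = np.sum((pz_full[mask].T * w[mask]).T, axis=0)

# normalize so the stacked n(z) integrates to 1 over the grid
nofz /= np.sum(nofz) * (zgrid[1] - zgrid[0])
print(np.trapz(nofz, zgrid))   # ~1.0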
Example #2
def compute_kernels(slices,weights,pbar=False):
    if pbar: progress_bar = ProgressBar(slices.size,message="computing kernels")
    else: print "computing kernels"
    kernels = []

    for s in slices:
        kernels.append(gaussian_kde(s,weights=weights))
        if pbar: progress_bar()
                     
    return np.array(kernels)
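compute_kernels depends on an external ProgressBar and a weights-aware gaussian_kde. A minimal equivalent sketch using scipy.stats.gaussian_kde (which accepts a weights argument from SciPy 1.2 onward):

import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.RandomState(1)
slices = rng.normal(size=(5, 200))    # 5 slices of 200 samples each
weights = rng.random_sample(200)
weights /= weights.sum()

# one weighted KDE per slice, as in compute_kernels above
kernels = np.array([gaussian_kde(s, weights=weights) for s in slices])
print(kernels[0](np.linspace(-3.0, 3.0, 5)))   # evaluate the first kernel on a coarse grid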
Example #3
def plot_kde(dat, weights = None, rng = None, resolution = 10, style = 'normal',
             bw_method = 'scott', plot = True, **kwargs):
    """
    Calculates & plots kernel density estimator
    
    Keyword Arguments:
    dat                 -- Data array
    weights             -- Weights of data (default None)
    rng                 -- Range to plot (default None = from min to max)
    resolution          -- Plot points per unit of rng (default 10)
    bw_method           -- Bandwidth selection method. Can be 'scott' (default),
                           'silverman' or a constant
    style               -- Can be 'normal' for normal lines or 'shady' for
                           thicker, more transparent lines
    plot                -- If true (default) kde will be plotted. Otherwise
                           [x,y] is returned.
    **kwargs            -- will be passed to matplotlib.plot
    """
    if(weights is None):
        weights = np.ones(len(dat))

    if(rng is None):
        rng = [np.min(dat), np.max(dat) ]
        
    if(style == 'shady'):
        if(not 'alpha' in kwargs):
            kwargs['alpha'] = 0.5
        if(not 'linewidth' in kwargs):
            kwargs['linewidth'] = 1.8

    cov = Covariator([dat], weights)
    inv_cov, normf = cov(bw_method)
    kde = gaussian_kde([dat], weights, inv_cov, normf)

    x = np.linspace(rng[0], rng[1], int(np.round(resolution * (rng[1] - rng[0]))))
    y = kde.evaluate([x])
    
    if(plot):
        plt.plot(x,y,**kwargs)
    else:
        return [x, y]
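Assuming the Covariator / gaussian_kde pair imported by the surrounding module is available, a typical call might look like the following sketch (the sample data and the 'label' keyword are purely illustrative):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(2)
samples = rng.normal(loc=5.0, scale=1.5, size=500)
weights = rng.random_sample(500)

# plot a weighted KDE on [0, 10] with 20 evaluation points per unit
plot_kde(samples, weights=weights, rng=[0.0, 10.0], resolution=20,
         style='shady', bw_method='silverman', label='weighted KDE')
plt.legend()
plt.show()

# or get the curve back without plotting
x, y = plot_kde(samples, weights=weights, plot=False)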
Example #4
def main(example, df, possible_subjects):
    """
    Find tutors similar to the example tutor and calculate a weighted kernel density estimate (KDE) of the pricing distribution of the similar tutors.

    Similarity conditions:
        (1) Jaccard index between subjects tutored >= 0.3
        (2) Radius that a tutor is willing to travel encompasses the center of the zip code of the example tutor.
        (3) Cosine similarity between profile features must be >= 0.5 for the nearest-neighbor, max-priced, and min-priced tutors; the cosine similarities are also used to weight the KDE.

    INPUTS
    example = (Series) with same format as df, but with only the input tutor from the website.
    df = (DataFrame) with all NYC tutors.
    possible_subjects = (List) of subjects that tutors tutor. This is restricted to the top 100 most popular subjects as previously calculated in cleanup_features.py

    OUTPUTS
    nearest_neighbor = (Series) with same format as df with most similar tutor to example.
    max_tut = (Series) of tutor that charges the highest hourly rate of tutors with cosine similarity > 0.5.
    min_tut = (Series) of tutor that charges the lowest hourly rate of tutors with cosine similarity > 0.5.
    img_io = KDE plot image. Held in memory but behaves like a file (a StringIO buffer rather than a file written to disk).

    """

    # Drop example tutor if in df
    try:
        df.drop(df[example['url_id'] == df['url_id']].index.values,
                inplace=True)
        df.reset_index(drop=True, inplace=True)
    except:
        pass  # Tutor is not in database

    # Check for graduate degree
    df = graduate_degrees(example, df)

    # Filter by Jaccard index and location.
    sim_tuts = subject_similarity(example, df, possible_subjects)
    sim_tuts = location_overlap(example, sim_tuts)

    # Relevant features for computing similarity
    rel_feats = ['avg_review_length',\
                 'badge_hours',\
                 'days_since_last_review',\
                 'has_rating',\
                 'number_of_ratings',\
                 'number_of_reviews',\
                 'profile_picture',\
                 'rating',\
                 'has_ivy_degree',\
                 'has_background_check',\
                 'response_time',\
                 'avg_review_sentiment']

    # Convert similar tutors to matrix. Normalize features.
    # In parlance of machine learning, X are features, y is hourly rate.
    X = sim_tuts[rel_feats].as_matrix().astype(np.float)
    y = sim_tuts['hourly_rate'].as_matrix().astype(np.float)
    scaler = preprocessing.StandardScaler()
    X = scaler.fit_transform(X)

    X_example = example[rel_feats].as_matrix().astype(np.float)
    y_example = np.float(example['hourly_rate'])
    X_example = scaler.transform(X_example)

    # Get cosine similarity between example tutor and tutor db.
    cos_tuts = np.empty(X.shape[0])
    for i in xrange(X.shape[0]):
        cos_tuts[i] = cosine_similarity(X[i, :], X_example)

    # Sort by similarity
    sorted_idx = np.argsort(cos_tuts)[::-1]
    cos_tuts = cos_tuts[sorted_idx]
    y = y[sorted_idx]
    sim_tuts.reset_index(drop=True, inplace=True)

    # Only keep tutors with similarity > 0.5
    sim_tuts = sim_tuts.iloc[sorted_idx][cos_tuts > .5]

    # Calculate three outputted tutors.
    nearest_neighbor = sim_tuts.iloc[0]  # Highest similarity
    max_tut = sim_tuts[sim_tuts['hourly_rate'] ==
                       sim_tuts['hourly_rate'].max()].iloc[0]
    min_tut = sim_tuts[sim_tuts['hourly_rate'] ==
                       sim_tuts['hourly_rate'].min()].iloc[0]

    scaling = scale_kde(y, cos_tuts)

    kde = gaussian_kde(y[cos_tuts > 0], weights=cos_tuts[cos_tuts > 0])
    x = np.linspace(0, y.max() + 50, y.max() + 50 + 1)

    pdf = kde(x) * scaling  # Probability density function (estimated)

    img_io = make_kde_plot(x, pdf)

    return nearest_neighbor, max_tut, min_tut, img_io
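The pricing step at the end boils down to a cosine-similarity-weighted KDE over hourly rates. A self-contained sketch of just that step with toy arrays (scale_kde, make_kde_plot, and the tutor DataFrame are omitted here):

import numpy as np
from scipy.stats import gaussian_kde   # weights supported since SciPy 1.2

rng = np.random.RandomState(4)
y = rng.gamma(shape=4.0, scale=15.0, size=300)   # toy hourly rates
cos_tuts = rng.uniform(0.0, 1.0, size=300)       # toy cosine similarities

kde = gaussian_kde(y[cos_tuts > 0], weights=cos_tuts[cos_tuts > 0])
x = np.linspace(0, y.max() + 50, int(y.max()) + 51)
pdf = kde(x)                                     # estimated pricing density
print(x[np.argmax(pdf)])                         # most probable hourly rate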
Example #5
def getDensity(points,mask,bandwidth=25.,scales=n.array([4.,4,2])):

    bandwidth = float(bandwidth)

    # pdb.set_trace()
    mean = n.mean(points,1)
    # meanx = mean[0]

    # points[0] = (-2)*(points[0]-meanx) + points[0]

    m=mask.ga()[mask.slices]
    print 'reducing mask!'
    # m[:100] = 0
    mscaled = sitk.gafi(beads.scaleStack(scales[::-1]/mask.spacing[::-1],sitk.gifa(m)))

    # mscaled =

    amscaled = mscaled.swapaxes(0,2)


    # bounds = n.array(m.shape)[::-1]*mask.spacing/scales
    bounds = amscaled.shape


    x,y,z = n.mgrid[0:bounds[0],0:bounds[1],0:bounds[2]]
    x_grid = n.array([x.flatten(),y.flatten(),z.flatten()])

    gkde = weighted_kde.gaussian_kde(points,bw_method=bandwidth)

    diag = n.array([bandwidth,bandwidth,bandwidth])
    gkde.covariance = n.diag(diag)
    gkde.inv_cov = n.diag(1/diag)

    dens = gkde.evaluate(x_grid)
    dens = dens.reshape(x.shape)*amscaled

    dens = dens/n.sum(dens)
    # pdb.set_trace()

    # pdb.set_trace()

    kernel = multivariate_normal.pdf(x_grid.swapaxes(0,1),n.array(x.shape)/2.,gkde.covariance).reshape(x.shape)
    kk = kernel>(kernel.max()/10.)
    kbounds = n.array(kk.nonzero())
    kmin = n.min(kbounds,1)
    kmax = n.max(kbounds,1)
    kernel = kernel[kmin[0]:kmax[0],kmin[1]:kmax[1],kmin[2]:kmax[2]]

    print 'calculating weightmap'
    weightmap = sitk.Convolution(sitk.Cast(sitk.gifa(mscaled),6),sitk.Cast(sitk.gifa(kernel),6),
                                 boundaryCondition=sitk.ConvolutionImageFilter.ZERO_PAD)

    kernelsum = n.sum(kernel)
    nweightmap = sitk.gafi(weightmap)
    weights = kernelsum/ndimage.map_coordinates(nweightmap,points[::-1,:],mode='nearest')


    wgkde = weighted_kde.gaussian_kde(points,bw_method=bandwidth,weights=weights)
    wgkde.covariance = gkde.covariance
    wgkde.inv_cov = gkde.inv_cov

    wdens = wgkde.evaluate(x_grid)
    wdens = wdens.reshape(x.shape)*amscaled

    wdens = wdens/n.sum(wdens)

    xs = n.array(amscaled.nonzero())
    probs = wdens[amscaled.nonzero()]
    probs = probs/n.sum(probs)
    # tifffile.imshow(n.array([dpens,wdens]),vmin=1,projfunc=n.max,projdim=2)
    return wdens,dens,xs,probs
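getDensity corrects for the mask boundary by convolving the scaled mask with the KDE kernel and up-weighting samples whose kernel mass leaks outside the mask. A 1-D analogy of that weighting idea, without the SimpleITK / beads pipeline (only a sketch; the KDE bandwidth here is left to the default rule, whereas getDensity forces the same fixed bandwidth into both steps):

import numpy as np
from scipy.ndimage import convolve1d, map_coordinates
from scipy.stats import norm, gaussian_kde

grid = np.arange(100, dtype=float)          # 1-D "mask" domain [0, 100)
mask = np.ones(100)
points = np.abs(np.random.RandomState(5).normal(10.0, 8.0, size=400))  # samples near the left edge

bw = 5.0
kernel = norm.pdf(np.arange(-25, 26), scale=bw)

# kernel mass that falls inside the mask at every grid position
coverage = convolve1d(mask, kernel, mode='constant', cval=0.0)

# up-weight samples near the boundary, analogous to kernelsum / weightmap above
weights = kernel.sum() / map_coordinates(coverage, points[None, :], mode='nearest')

wkde = gaussian_kde(points, weights=weights)
print(wkde(grid)[:5])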
Example #6
    def nofz_bins(pz0,
                  pointz,
                  pzdist,
                  mask0,
                  w,
                  pointw,
                  bins=3,
                  pzmask=None,
                  spec=False,
                  point=False):

        if hasattr(bins, '__len__'):
            xbins = np.digitize(pointz, bins) - 1
            nofz = np.zeros((len(bins), pz0.bins))
        else:
            edge = lin.linear_methods.find_bin_edges(pointz[mask0], bins,
                                                     pointw[mask0])
            print edge
            xbins = np.digitize(pointz, edge) - 1
            nofz = np.zeros((bins + 1, pz0.bins))

        if point:
            nofz[0, :], b = np.histogram(pzdist[pzmask & mask0],
                                         bins=np.append(
                                             pz0.binlow, pz0.binhigh[-1]),
                                         weights=w[pzmask & mask0])
            nofz[0, :] /= np.sum(nofz[0, :]) * (pz0.bin[1] - pz0.bin[0])

            for i in xrange(pz0.tomo - 1):
                mask = (xbins == i)
                nofz[i + 1, :], b = np.histogram(
                    pzdist[pzmask & mask0 & mask],
                    bins=np.append(pz0.binlow, pz0.binhigh[-1]),
                    weights=w[pzmask & mask & mask0])
                nofz[i + 1, :] /= np.sum(
                    nofz[i + 1, :]) * (pz0.bin[1] - pz0.bin[0])

        else:
            print np.sum(pzmask & mask0), len(pzmask), len(mask0)
            nofz[0, :] = np.sum(
                (pz0.pz_full[pzmask & mask0].T * w[pzmask & mask0]).T, axis=0)
            nofz[0, :] /= np.sum(nofz[0, :]) * (pz0.bin[1] - pz0.bin[0])

            for i in xrange(pz0.tomo - 1):
                mask = (xbins == i)
                nofz[i + 1, :] = np.sum((pz0.pz_full[pzmask & mask0 & mask].T *
                                         w[pzmask & mask & mask0]).T,
                                        axis=0)
                nofz[i + 1, :] /= np.sum(
                    nofz[i + 1, :]) * (pz0.bin[1] - pz0.bin[0])

        specnofz = np.zeros((pz0.tomo, pz0.bins))

        if spec:
            from weighted_kde import gaussian_kde
            tmp = gaussian_kde(pz0.spec_full[pzmask & mask0],
                               weights=w[pzmask & mask0],
                               bw_method='scott')
            specnofz[0, :] = tmp(pz0.bin)
            specnofz[0, :] /= np.sum(
                specnofz[0, :]) * (pz0.bin[1] - pz0.bin[0])

            for i in xrange(pz0.tomo - 1):
                mask = (xbins == i)
                tmp = gaussian_kde(pz0.spec_full[pzmask & mask0 & mask],
                                   weights=w[pzmask & mask0 & mask],
                                   bw_method='scott')
                specnofz[i + 1, :] = tmp(pz0.bin)
                specnofz[i + 1, :] /= np.sum(
                    specnofz[i + 1, :]) * (pz0.bin[1] - pz0.bin[0])

        return nofz, specnofz
Example #7
def weighted_nz_distributions(df,
                              binning,
                              weights=None,
                              tomo_bins=np.array([0, 5.0]),
                              z_phot=None,
                              n_resample=50):
    """
    :param df: pandas data-frame
    :param binning: center of redshift bins
    :param weights: optional name of a column in df holding the weights
    :param tomo_bins: edges of the tomographic z-bins, e.g. [0.0, 0.2, 0.6, 1.8]
    :param z_phot: photometric point-estimate redshifts used for the tomographic selection; numpy array with the same length as df
    :param n_resample: number of bootstrap resamples used to estimate the mean and the variance on the mean
    :return: dictionaries with estimates of the weighted n(z) and bootstrap estimates
    """

    assert isinstance(df, pd.DataFrame), 'df must be a pandas DataFrame'
    assert isinstance(binning, np.ndarray), 'binning must be a numpy array'
    if weights:
        assert weights in df.columns, str(weights) + ' not in df.columns'
        df[weights] = (df[weights] /
                       df[weights].sum()).values  # normalize weights
    else:
        df[weights] = 1.0 / float(len(df))  # set uniform weights if none given

    assert isinstance(z_phot, np.ndarray), 'z_phot must be a numpy array'
    assert len(z_phot) == len(
        df), 'Length of z_phot must be equal to that of df'
    df['phot_sel'] = z_phot  # Make the selection photo-z a part of the DataFrame
    assert 'z_spec' in df.columns, 'The df needs a "z_spec" in df.columns'
    pdf_names = [
        'pdf_' + str(i) for i in range(500) if 'pdf_' + str(i) in df.columns
    ]

    phot_iter = {}
    spec_iter = {}

    # In the following section the tomographic bins are treated

    for j in xrange(0, len(tomo_bins) - 1):
        sel = (df.phot_sel > tomo_bins[j]) & (df.phot_sel <= tomo_bins[j + 1])
        if sel.sum() > 0:
            df_sel = df[sel]

            phot_iter[j + 1] = {}
            spec_iter[j + 1] = {}

            for i in xrange(n_resample):
                df_sample = df_sel.sample(n=len(df_sel),
                                          replace=True,
                                          weights=df_sel[weights])
                kde_w_spec_pdf = gaussian_kde(df_sample.z_spec.values,
                                              bw_method='silverman')
                kde_w_spec_pdf = kde_w_spec_pdf(binning)

                phot_iter[j + 1][i + 1] = _normalize_pdf(
                    df_sample[pdf_names].sum(), binning[1] - binning[0]).values
                spec_iter[j + 1][i + 1] = kde_w_spec_pdf

            phot_iter[j + 1][0] = _normalize_pdf(
                df_sel[pdf_names].sum(), binning[1] - binning[0]).values
            kde_w_spec_pdf = gaussian_kde(df_sel.z_spec.values,
                                          bw_method='silverman')
            spec_iter[j + 1][0] = kde_w_spec_pdf(binning)

    # In the following section the full n(z) is treated, i.e. not split into tomographic bins

    sel = (df.phot_sel > tomo_bins[0]) & (df.phot_sel <=
                                          tomo_bins[len(tomo_bins) - 1])
    df_sel = df[sel]
    phot_iter[0] = {}
    spec_iter[0] = {}

    for i in xrange(n_resample):
        df_sample = df_sel.sample(n=len(df_sel),
                                  replace=True,
                                  weights=df_sel[weights])
        kde_w_spec_pdf = gaussian_kde(df_sample.z_spec.values,
                                      bw_method='silverman')
        kde_w_spec_pdf = kde_w_spec_pdf(binning)

        phot_iter[0][i + 1] = _normalize_pdf(df_sample[pdf_names].sum(),
                                             binning[1] - binning[0]).values
        spec_iter[0][i + 1] = kde_w_spec_pdf

    phot_iter[0][0] = _normalize_pdf(df_sel[pdf_names].sum(),
                                     binning[1] - binning[0]).values
    kde_w_spec_pdf = gaussian_kde(df_sel.z_spec.values, bw_method='silverman')
    spec_iter[0][0] = kde_w_spec_pdf(binning)

    data_for_wl = {'binning': binning, 'phot': phot_iter, 'spec': spec_iter}

    return phot_iter, spec_iter, data_for_wl
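Each tomographic bin above is handled as a weighted bootstrap: resample the selected galaxies with probability proportional to their weights, then KDE the spectroscopic redshifts of every resample. A compact self-contained sketch of that inner loop with a toy DataFrame (column names are illustrative):

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde

rng = np.random.RandomState(7)
df_sel = pd.DataFrame({'z_spec': rng.gamma(2.0, 0.3, size=2000),
                       'w': rng.random_sample(2000)})
df_sel['w'] /= df_sel['w'].sum()
binning = np.linspace(0.0, 2.0, 201)

spec_iter = {}
for i in range(50):   # n_resample
    df_sample = df_sel.sample(n=len(df_sel), replace=True, weights=df_sel['w'])
    spec_iter[i + 1] = gaussian_kde(df_sample.z_spec.values, bw_method='silverman')(binning)

mean_nz = np.mean(list(spec_iter.values()), axis=0)   # bootstrap mean of the spec-z n(z)
err_nz = np.std(list(spec_iter.values()), axis=0)     # bootstrap scatter on that mean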
Example #8
# For each point-like estimation code; weights not working... strange...
for res in point_results:

    data_1 = pf.open(res)[1].data
    phz = data_1[z_pht]
    if hasattr(weight_true, "__len__"):
        print 'entered -----------'
        weights = data_1[inArgs['weight_s']]
        weights_norm = np.sum(weights) / len(weights)
        weights = weights / weights_norm
    else:
        weights = False
        print
        #phz = phz*weights
    density = gaussian_kde(phz, weights=weights)
    #    density = gaussian_kde(phz)
    density.covariance_factor = lambda: .05
    density._compute_covariance()
    ys = density(bincenters)

    llist.append(ys)
    lab = res.split('/')[-1].split('_')[0]
    print lab
    labe.append(lab)

    temp = plt.plot(bincenters, ys, antialiased=True, linewidth=2, label=lab)

    cc = temp[0].get_color()

    tz = data_1['Z_SPEC']
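This loop depends on names defined earlier in the script (pf, z_pht, weight_true, inArgs, bincenters). The fixed-bandwidth trick it uses, overriding covariance_factor and recomputing the covariance, also works with a weights-aware scipy gaussian_kde; a minimal sketch with synthetic redshifts:

import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.RandomState(8)
phz = rng.gamma(2.0, 0.3, size=5000)        # toy photometric redshifts
weights = rng.random_sample(5000)
weights /= np.sum(weights) / len(weights)   # normalize so the mean weight is 1, as above

bincenters = np.linspace(0.0, 2.0, 200)

density = gaussian_kde(phz, weights=weights)
density.covariance_factor = lambda: .05     # force a fixed smoothing factor
density._compute_covariance()               # recompute the covariance with the new factor
ys = density(bincenters)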