def conditional_entropy(x, y):
    """Return H(Y|X).

    Parameters
    ----------
    x: numpy.ndarray of float values
    y: numpy.ndarray of integer values

    Returns
    -------
    float
        Conditional entropy value
    """
    # discretize X
    hx, bx = histogram(x, bins=x.size // 10, density=True)

    Py = compute_distribution(y)
    Px = compute_distribution(digitize(x, bx))

    res = 0
    for ey in set(y):
        # P(X | Y)
        x1 = x[y == ey]
        condPxy = compute_distribution(digitize(x1, bx))

        for k in condPxy:
            v = condPxy[k]
            res += (v * Py[ey] * (log2(Px[k]) - log2(v * Py[ey])))
    return res
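These conditional_entropy snippets rely on a compute_distribution helper that is not shown here. A minimal sketch of what it presumably does (an assumption, not the original implementation) is an empirical probability mass function over the observed values:

import numpy as np

def compute_distribution(values):
    # Hypothetical helper: map each observed value to its empirical probability.
    vals, counts = np.unique(values, return_counts=True)
    return dict(zip(vals, counts / counts.sum()))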
Example #2
def conditional_entropy(x, y):
    """
    x: vettore di numeri reali
    y: vettore di interi
    calcola H(Y|X)
    """
    # discretizzazione di X

    hx, bx = histogram(x, bins=x.size // 10, density=True)

    Py = compute_distribution(y)

    Px = compute_distribution(digitize(x, bx))

    res = 0
    for ey in set(y):
        # P(X | Y)
        x1 = x[y == ey]
        condPxy = compute_distribution(digitize(x1, bx))

        for k, v in condPxy.items():
            res += (v * Py[ey] * (log2(Px[k]) - log2(v * Py[ey])))

    en_x = entropy(digitize(x, bx))

    return res, en_x
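This variant also calls an entropy helper that is not included in the snippet. A plausible minimal version (again an assumption) is the empirical Shannon entropy of the binned values:

import numpy as np

def entropy(values):
    # Hypothetical helper: Shannon entropy (in bits) of the empirical distribution.
    _, counts = np.unique(values, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))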
Example #3
def getBeamFluxSpline(beam, plasma, t, lim1, lim2, points=1000):
    """ generates a spline off of the beampath.  Assumes
    that the change in flux is MONOTONIC"""

    lim = beam.norm.s
    beam.norm.s = scipy.linspace(0, lim[-1], points)
    h = time.time()
    psi = plasma.eq.rz2rmid(beam.r()[0],
                            beam.r()[2], t)  #evaluates all psi's at once
    print(time.time() - h)
    outspline = len(t) * [0]
    inspline = len(t) * [0]
    for i in range(t.size):
        temp = lim1
        mask = scipy.logical_and(scipy.isfinite(psi[i]), psi[i] < lim2 + .02)

        try:
            minpos = scipy.argmin(psi[i][mask])
            test = psi[i][mask][minpos]
        except ValueError:
            test = lim2 + .03

        #plt.plot(beam.x()[0][mask],psi[i][mask])
        #plt.show()
        sizer = psi[i][mask].size
        if not test > lim2:

            #plt.plot(beam.x()[0][mask][0:minpos],psi[i][mask][0:minpos],beam.x()[0][mask][minpos:],psi[i][mask][minpos:])
            #plt.show()
            #limout = scipy.insert(lim,(2,2),(beam.norm.s[mask][minpos],beam.norm.s[mask][minpos]))  # add minimum flux s for bound testing
            if lim1 < test:
                temp = test

            try:
                temp1 = scipy.clip(
                    scipy.digitize((lim1, lim2), psi[i][mask][minpos::-1]), 0,
                    minpos)
                outspline[i] = beam.norm.s[mask][minpos::-1][temp1]

            except ValueError:
                tempmask = (psi[i][mask] < lim2)[0]
                outspline[i] = scipy.array(
                    [beam.norm.s[mask][minpos], beam.norm.s[mask][tempmask]])

            try:
                temp2 = scipy.clip(
                    scipy.digitize((lim1, lim2), psi[i][mask][minpos:]), 0,
                    sizer - minpos - 1)
                inspline[i] = beam.norm.s[mask][minpos:][temp2]

            except ValueError:
                inspline[i] = scipy.array(
                    [beam.norm.s[mask][minpos], beam.norm.s[mask][-1]])

        else:
            outspline[i] = scipy.array([[], []])
            inspline[i] = scipy.array([[], []])

    return (outspline, inspline)
Example #4
def getBeamFluxSpline(beam,plasma,t,lim1,lim2,points = 1000):
    """ generates a spline off of the beampath.  Assumes
    that the change in flux is MONOTONIC"""

    lim = beam.norm.s
    beam.norm.s = scipy.linspace(0,lim[-1],points)
    h = time.time()
    psi = plasma.eq.rz2rmid(beam.r()[0],beam.r()[2],t) #evaluates all psi's at once
    print(time.time()-h)
    outspline = len(t)*[0]
    inspline = len(t)*[0]
    for i in range(t.size):
        temp = lim1
        mask = scipy.logical_and(scipy.isfinite(psi[i]),psi[i] < lim2+.02)

        try:
            minpos = scipy.argmin(psi[i][mask])
            test = psi[i][mask][minpos]
        except ValueError:
            test = lim2+.03
            
        #plt.plot(beam.x()[0][mask],psi[i][mask])
        #plt.show()
        sizer = psi[i][mask].size
        if not test > lim2:

            #plt.plot(beam.x()[0][mask][0:minpos],psi[i][mask][0:minpos],beam.x()[0][mask][minpos:],psi[i][mask][minpos:])
            #plt.show()
            #limout = scipy.insert(lim,(2,2),(beam.norm.s[mask][minpos],beam.norm.s[mask][minpos]))  # add minimum flux s for bound testing
            if lim1 < test:
                temp = test

            try:
                temp1 = scipy.clip(scipy.digitize((lim1,lim2),psi[i][mask][minpos::-1]),0,minpos)
                outspline[i] = beam.norm.s[mask][minpos::-1][temp1]
            
            except ValueError:
                tempmask = (psi[i][mask] < lim2)[0]
                outspline[i] = scipy.array([beam.norm.s[mask][minpos],beam.norm.s[mask][tempmask]])

            try:
                temp2 = scipy.clip(scipy.digitize((lim1,lim2),psi[i][mask][minpos:]),0,sizer-minpos-1)
                inspline[i] = beam.norm.s[mask][minpos:][temp2]
                
            except ValueError:
                inspline[i] = scipy.array([beam.norm.s[mask][minpos],beam.norm.s[mask][-1]])

        else:
            outspline[i] = scipy.array([[],[]])
            inspline[i] = scipy.array([[],[]])

    return (outspline,inspline)
Example #5
def trend(tree, signal, shot):
    temp = MDS.Tree(tree, signal)
    xt = temp.getNode(signal).dim_of().data()
    x = temp.getNode(signal).dim_of().data()
    yt, y = globalpowerCalc(shot)
    a = scipy.digitize(xt, yt)
    return x, y[a]
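As a standalone illustration of the lookup trend performs (with made-up data; the scipy.digitize used above is, in older SciPy releases, a re-export of NumPy's digitize), each time in xt is mapped to the index of the yt interval it falls in, and y is then indexed with that:

import numpy as np

yt = np.array([0.0, 1.0, 2.0, 3.0])   # time base of y
y = np.array([10.0, 11.0, 12.0, 13.0])
xt = np.array([0.4, 1.7, 2.2])        # times at which y is wanted
print(y[np.digitize(xt, yt)])         # -> [11. 12. 13.]
# Note: values of xt at or beyond yt[-1] would index past the end of y.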
Example #6
def trend(tree,signal,shot):
    temp = MDS.Tree(tree,signal)
    xt = temp.getNode(signal).dim_of().data() 
    x = temp.getNode(signal).dim_of().data() 
    yt,y = globalpowerCalc(shot)
    a = scipy.digitize(xt,yt)
    return x,y[a]
Example #7
def globalpowerCalc(shot):

    Tree = MDSplus.Tree('spectroscopy', shot)
    output = None
    temp2 = 2 * scipy.pi * (.68) * Tree.getNode(
        '\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.BRIGHT').data(
        )  #does factor have the 4pi?
    temp2t = Tree.getNode(
        '\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.BRIGHT').dim_of(
        ).data()
    for i in scipy.arange(20) + 2:
        string = str(i)

        if i < 10:
            string = '0' + string
    # try:
        temp = Tree.getNode(
            '\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.AREA:CHORD_' +
            string).data()

        tempt = Tree.getNode(
            '\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.AREA:CHORD_' +
            string).dim_of().data()

        a = scipy.digitize(tempt, temp2t)
        if output is None:
            output = temp * temp2[i][a]
        else:
            output = output + temp * temp2[i][a]
        #except ValueError:
        #    print('no')

    return (tempt, output)
Example #8
    def assess_calibration(self):
        """Assess if PredPol is calibrated by conditioning on predicted intensity
        and checking the correlation between number of crimes and demographics.

        Returns: a 2D array where the first dimension is the number of days in
        the test set and the second dimension is the number of bins for the
        range of predicted intensities, as computed by `sp.histogram_bin_edges`.
        The entry in the ith row and jth column is the Pearson correlation
        coefficient between race and actual number of crimes in the jth bin of
        predicted intensity for the ith day.
        """
        black = self.pred_obj.grid_cells.black
        not_nan = sp.logical_not(sp.isnan(black.values))

        bins = sp.histogram_bin_edges(self.get_predicted_intensities(), bins='auto')
        correlations = sp.empty((len(self.lambda_columns), len(bins)))
        correlations[:] = sp.nan
        for i, (lambda_col, actual_col) in self._iterator():
            idx_bins = sp.digitize(self.results[lambda_col], bins)
            for j in range(len(bins)):
                idx_selected = sp.logical_and(idx_bins == j, not_nan)
                if sp.sum(idx_selected) > 2:
                    actual = self.results.loc[idx_selected, actual_col]
                    demographics = black.loc[idx_selected]
                    correlations[i, j] = sp.stats.pearsonr(actual, demographics)[0]
        return correlations
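A standalone sketch (made-up data, assuming sp aliases the usual NumPy/SciPy functions) of the per-bin correlation pattern used above: bin one variable, then correlate two others within each bin:

import numpy as np
from scipy import stats

intensity = np.array([0.1, 0.2, 0.8, 0.9, 0.15, 0.85, 0.12, 0.88])
crimes    = np.array([1.0, 2.0, 5.0, 6.0, 1.0,  7.0,  2.0,  5.0])
race      = np.array([0.3, 0.4, 0.7, 0.8, 0.2,  0.9,  0.35, 0.75])
bins = np.histogram_bin_edges(intensity, bins=2)
idx = np.digitize(intensity, bins)
for j in np.unique(idx):
    sel = idx == j
    if sel.sum() > 2:
        # Pearson correlation within this intensity bin
        print(j, stats.pearsonr(crimes[sel], race[sel])[0])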
Example #9
def globalpowerCalc(shot):

    Tree = MDSplus.Tree('spectroscopy',shot)
    output = None
    temp2 = 2*scipy.pi*(.68)*Tree.getNode('\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.BRIGHT').data() #does factor have the 4pi?
    temp2t = Tree.getNode('\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.BRIGHT').dim_of().data()
    for i in scipy.arange(20)+2:
        string = str(i)

        if i < 10:
            string = '0'+string
       # try:
        temp = Tree.getNode('\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.AREA:CHORD_'+string).data()
        
        tempt = Tree.getNode('\SPECTROSCOPY::TOP.BOLOMETER.RESULTS.DIODE.BPLY.AREA:CHORD_'+string).dim_of().data()
        

        a = scipy.digitize(tempt,temp2t)
        if output is None:
            output = temp*temp2[i][a]
        else:
            output = output + temp*temp2[i][a]
        #except ValueError:
        #    print('no')

    return (tempt,output)
Example #10
def observe_mask(X, Y, m, n, fov=5, slit_size=None, ngals=40, width=1):
    """
    can either give the slit size, or the number of objects. If the number
    of objects is given, the slit size parameter is ignored, and instead the
    slit size is calculated to have that number of objects in one observation.
    """
    ngal = len(X)
    j = scipy.arange(ngal, dtype=int)
    dist = scipy.hypot(X, Y)
    # the BCG is always the first element (bcg = 0)
    bcg = j[dist == 0][0]
    # rotate the coordinates to the mask direction
    theta = scipy.arctan(m)
    Xrot = X * scipy.cos(theta) - Y * scipy.sin(theta)
    Yrot = X * scipy.sin(theta) + Y * scipy.cos(theta)
    pylab.plot(Xrot, Yrot, 'k.', mew=2)
    # bin the galaxies in Xrot
    in_fov = j[(abs(Xrot) < fov[0] / 2.) & (abs(Yrot) < fov[1] / 2.)]
    Xrot = Xrot[in_fov]
    Yrot = Yrot[in_fov]
    n_in_fov = len(in_fov)
    j = scipy.arange(n_in_fov, dtype=int)
    if ngals:
        slit_size = fov[0] / float(ngals)
        Xbins = scipy.linspace(-fov[0] / 2., fov[0] / 2., ngals + 1)
    else:
        Xbins = scipy.arange(-fov[0] / 2., fov[0] / 2., slit_size / 60.)
    Xbins += slit_size / 2.
    Xbinned = scipy.digitize(Xrot, Xbins)
    # observe one galaxy per Xbin with a Gaussian probability with width
    # defined below (in arcmin), and excluding the bin containing the
    # BCG, which is necessarily observed (we always do!).
    # This width is such that I preferentially observe galaxies near the
    # center of the image.
    observed = [j[Xbinned == i][scipy.argmin(abs(Yrot[Xbinned == i] -
                                                 random.normal(0, width)))] \
                for i in range(len(Xbins)-1) \
                if (len(Yrot[Xbinned == i]) > 0) & (i != Xbinned[0])]
    observed = scipy.append(bcg, observed)
    # plot (just once, to show -- and make sure!)
    #for x in Xbins:
    #pylab.axvline(x, ls='-', color='0.7')
    #pylab.plot(0, 0, 'o', ms=8, mfc='orange', mec='orange')
    #pylab.plot(Xrot[observed], Yrot[observed], 'rx', ms=6, mew=2)
    ## field of view
    #pylab.plot([-fov[0]/2., -fov[0]/2.], [-fov[1]/2., fov[1]/2.], 'b-', lw=2)
    #pylab.plot([fov[0]/2., fov[0]/2.], [-fov[1]/2., fov[1]/2.], 'b-', lw=2)
    #pylab.plot([-fov[0]/2., fov[0]/2.], [-fov[1]/2., -fov[1]/2.], 'b-', lw=2)
    #pylab.plot([-fov[0]/2., fov[0]/2.], [fov[1]/2., fov[1]/2.], 'b-', lw=2)
    #pylab.xlabel('rotated x (arcmin)')
    #pylab.ylabel('rotated y (arcmin)')
    #pylab.xlim(-6, 6)
    #pylab.ylim(-6, 6)
    #output = 'plots/mask1_sample.png'
    #pylab.savefig(output, format=output[-3:])
    #pylab.close()
    #print 'Saved to', output
    #exit()
    return in_fov[observed]
    def share_slices(counts):
        cumcounts = scipy.cumsum(counts)
        cedges = scipy.linspace(0, cumcounts[-1] + 1, ncuts + 1)
        cutnumber = scipy.digitize(cumcounts, cedges) - 1
        assert (cutnumber >= 0).all() and (cutnumber < ncuts).all()
        return [
            scipy.flatnonzero(cutnumber == icut) for icut in range(ncuts)
        ]
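For illustration, a hypothetical standalone version of share_slices (ncuts is a closure variable in the original, so it is passed explicitly here), splitting items into groups of roughly equal cumulative count:

import numpy as np

def share_slices(counts, ncuts):
    cumcounts = np.cumsum(counts)
    cedges = np.linspace(0, cumcounts[-1] + 1, ncuts + 1)
    cutnumber = np.digitize(cumcounts, cedges) - 1
    return [np.flatnonzero(cutnumber == icut) for icut in range(ncuts)]

print(share_slices([5, 1, 4, 2, 8], ncuts=2))   # -> [array([0, 1, 2]), array([3, 4])]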
Example #12
	def ROI_nominal(self):
		"""
		Set the ROIs nominally from file, for each slot separately.
		"""
		
		for iS in range(self.num_slots):
			self.raw_ROI[:, iS] = sp.digitize(self.data[:, iS], 
									self.ROI_bins[:, iS]) - 1
			self.corr_ROI[:, iS] = self.raw_ROI[:, iS]
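A minimal illustration (with assumed data) of the assignment above: digitize each slot's data against that slot's bin edges and subtract 1 so the first ROI is labelled 0:

import numpy as np

ROI_bins = np.array([0.0, 1.0, 2.0, 3.0])   # edges for one slot (assumed)
data = np.array([0.2, 1.5, 2.8])
print(np.digitize(data, ROI_bins) - 1)      # -> [0 1 2]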
Example #13
def estimate_position_from_quadratures(eta, phix, N_phi=30, N_x=101):
    phi = phix[:,0]
    x = phix[:,1]/sqrt(eta)
    phi_edges = scipy.linspace(0, 2.*scipy.pi, N_phi)
    phi_centers = (phi_edges[:-1]+phi_edges[1:])/2.
    phi_idx = scipy.digitize(phi, phi_edges)
    xs = [x[phi_idx==n+1] for n in range(len(phi_centers))]
    means = scipy.array([scipy.mean(x) for x in xs])
    stds = scipy.array([scipy.std(x) for x in xs])
    m = interp1d(phi_centers, means)
    return -m(pi), m(pi/2.), stds.max()
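A minimal illustration (assumed numbers, not from the source) of the phase-binning step above: digitize the measured phases into angular bins, then take per-bin statistics:

import numpy as np

phi = np.array([0.1, 0.2, 3.0, 3.1, 6.0])      # measured phases
x = np.array([1.0, 1.2, -0.8, -1.1, 0.9])      # quadrature values
phi_edges = np.linspace(0, 2*np.pi, 4)         # 3 angular bins
phi_idx = np.digitize(phi, phi_edges)
xs = [x[phi_idx == n+1] for n in range(len(phi_edges)-1)]
print([xi.mean() for xi in xs])                # per-bin means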
Example #14
def conditional_entropy(x, y):
    """
    x: vector de numeros reales
    y: vector de numeros enteros
    devuelve H(Y|X)
    """
    # discretizacion de X
    hx, bx = histogram(x, bins=x.size // 10, density=True)

    Py = compute_distribution(y)
    Px = compute_distribution(digitize(x, bx))

    res = 0
    for ey in set(y):
        # P(X | Y)
        x1 = x[y == ey]
        condPxy = compute_distribution(digitize(x1, bx))

        for k, v in condPxy.items():
            res += (v * Py[ey] * (log2(Px[k]) - log2(v * Py[ey])))
    return res
Example #15
def conditional_entropy(x, y):
    """
    x: vector de numeros reales
    y: vector de numeros enteros
    devuelve H(Y|X)
    """
    # discretizacion de X
    hx, bx = histogram(x, bins=x.size // 10, density=True)

    Py = compute_distribution(y)
    Px = compute_distribution(digitize(x, bx))

    res = 0
    for ey in set(y):
        # P(X | Y)
        x1 = x[y == ey]
        condPxy = compute_distribution(digitize(x1, bx))

        for k, v in condPxy.items():
            res += v * Py[ey] * (log2(Px[k]) - log2(v * Py[ey]))
    return res
Example #16
    def phase_of_times(self, times, sampling_rate=1000.):
        """
        Give the phases of the oscillation at the specific 'times'
        
        The underlying precision of the phase sampling is given by 'sampling_rate'
        
        Return 'nan' for timepoints outside the range where the oscillation phase is known (Oscillation.time_line)
        
        Note: an oscillation detected with a very small sampling rate compared to its frequency will have a drift in its reconstructed phase. 
        It is advised to have an original sampling rate of at least 4 times the oscillation frequency
        """
        if self.time_line.size > 1:
            old_dt = self.time_line[1] - self.time_line[0]
            x = numpy.arange(self.time_start, self.time_stop + old_dt,
                             1. / sampling_rate)
        else:
            x = self.time_line
        v = self.value_line

        # BAD
        #y = numpy.angle(v)
        #y = signal.resample( y, x.size)

        # bad 2
        #~ y = numpy.cos(numpy.angle(v))
        #~ y = signal.resample( y, x.size)
        #~ ind = numpy.diff(y)>0
        #~ ind = numpy.concatenate( (ind , [ind[-1]]))
        #~ y2 = numpy.arccos(y)
        #~ y2[ind] = -y2[ind]

        #ok
        # Before resampling, to avoid the slowdown due to the use of ifft in scipy.resample,
        # v is padded with zeros proportionally to the distance from x.size to the next 2**N
        # QUESTION: does it lead to some strange edge effects???
        N = int(numpy.ceil(numpy.log2(x.size)))
        vv = numpy.r_[
            v, numpy.zeros(int(numpy.floor(v.size * (2**N - x.size) / x.size)))]
        vv = signal.resample(vv, 2**N)
        v = vv[:x.size]

        #~ y = numpy.cos(numpy.angle(v))
        y2 = numpy.angle(v)

        d = digitize(times, x)
        d[d == len(v)] = 0  # points above the highest time value where the oscillation phase is known
        phases = y2[d]
        phases[d == 0] = nan  # all points outside the range where the oscillation is known
        return phases
Example #17
    def phase_of_times(self,  times , sampling_rate = 1000.):
        """
        Give the phases of the oscillation at the specific 'times'
        
        The underlying precision of the phase sampling is given by 'sampling_rate'
        
        Return 'nan' for timepoints outside the range where the oscillation phase is known (Oscillation.time_line)
        
        Note: an oscillation detected with a very small sampling rate compared to its frequency will have a drift in its reconstructed phase. 
        It is advised to have an original sampling rate of at least 4 times the oscillation frequency
        """
        if self.time_line.size>1:
            old_dt = self.time_line[1]-self.time_line[0]
            x = numpy.arange(self.time_start, self.time_stop+old_dt, 1./sampling_rate)
        else:
            x=self.time_line
        v = self.value_line
        
        # BAD
        #y = numpy.angle(v)
        #y = signal.resample( y, x.size)
        
        
        
        # bad 2
        #~ y = numpy.cos(numpy.angle(v))
        #~ y = signal.resample( y, x.size)
        #~ ind = numpy.diff(y)>0
        #~ ind = numpy.concatenate( (ind , [ind[-1]]))
        #~ y2 = numpy.arccos(y)
        #~ y2[ind] = -y2[ind]
        
        #ok
        # Before resampling, to avoid the slowdown due to the use of ifft in scipy.resample,
        # v is padded with zeros proportionally to the distance from x.size to the next 2**N
        # QUESTION: does it lead to some strange edge effects???
        N = int(numpy.ceil(numpy.log2(x.size)))
        vv = numpy.r_[v, numpy.zeros(int(numpy.floor(v.size*(2**N-x.size)/x.size)))]
        vv = signal.resample( vv, 2**N)
        v = vv[:x.size]

        #~ y = numpy.cos(numpy.angle(v))
        y2 = numpy.angle(v)



        d = digitize( times , x )
        d[d==len(v)] = 0 # points above the highest time value where the oscillation phase is known
        phases = y2[d]
        phases[ d==0 ] = nan # all points outside the range where the oscillation is known
        return phases
        def weight_radial(catalogue, rwidth=rwidth, redges=redges):

            self.logger.info('Radial integral constraint.')

            distance = catalogue.distance()
            dmin, dmax = distance.min(), distance.max()
            self.logger.info('Comoving distances: {:.1f} - {:.1f}.'.format(
                dmin, dmax))

            if redges is not None:
                radialedges = scipy.array(redges)
                rwidth = scipy.mean(scipy.diff(radialedges))
                rmin, rmax = radialedges.min(), radialedges.max()
                if (rmin > dmin) or (rmax < dmax):
                    raise ValueError(
                        'Provided radial-edges ({:.1f} - {:.1f}) do not encompass the full survey ({:.1f} - {:.1f}).'
                        .format(rmin, rmax, dmin, dmax))
                self.logger.info(
                    'Provided radial-edges of width: {:.1f} and range: {:.1f} - {:.1f}.'
                    .format(rwidth, rmin, rmax))
                nbins = len(radialedges) - 1
            else:
                self.logger.info(
                    'Provided radial-width: {:.1f}.'.format(rwidth))
                nbins = scipy.rint((dmax - dmin) / rwidth).astype(int)
                radialedges = scipy.linspace(dmin, dmax + 1e-9, nbins + 1)

            self.logger.info(
                'There are {:d} radial-bins with an average of {:.1f} objects.'
                .format(nbins,
                        len(catalogue) * 1. / nbins))
            ibin = scipy.digitize(distance, radialedges, right=False) - 1

            for iaddbin in range(catalogue.attrs['naddbins']):
                mask = catalogue['iaddbin'] == iaddbin
                wcounts = scipy.bincount(ibin[mask],
                                         weights=catalogue['Weight'][mask],
                                         minlength=nbins)
                catalogue['Weight'][mask] /= wcounts[ibin[mask]]

            attrs = {'radialedges': radialedges, 'nbins': nbins}

            def bin(catalogue):
                return scipy.digitize(
                    catalogue.distance(), radialedges, right=False) - 1

            return attrs, bin
def setUpNextState(initial_state, server_prob, n_process=n_process):
    sum_initial_state = sum(initial_state)
    arrival_times = getRandomArrivalServiceTimes(n_process, arrival_rate, None)[0]
    time_start = arrival_times[sum_initial_state]
    # the next two lines avoid processing the whole queue: only the
    # processes that are actually needed are processed
    arrival_times = arrival_times[arrival_times <= (time_start+time_interval)]
    n_process= arrival_times.size

    initial_states = zip(initial_state, [arrival_times[sum_initial_state]] * len(initial_state))
    server_address_table_forced = random.permutation(concatenate([ones(state) * i for i,state in enumerate(initial_state)]))
    server_address_table = concatenate([server_address_table_forced, digitize(uniform.rvs(size = n_process- sum_initial_state), cumsum(server_prob))])
    server_arrival_times = [arrival_times[server_address_table == i] for i in range(n_server)]
    server_service_times = [getRandomArrivalServiceTimes((server_address_table == i).sum(), None, service_rate[i])[1] for i in range(n_server)]
    results = map(mm1, server_arrival_times, server_service_times, initial_states)
    final_state =  [r['queue_size_by_time'](time_start+time_interval, max_no_people) if r else 0 for r in results] 
    return tuple(final_state)
Example #20
    def constant_interval(self, time_points):
        lvt = self.length_vs_time()
        max_time = lvt[0][lvt[0].size - 1]
        bins = sp.arange(0, max_time, max_time/time_points)
        time_spots = sp.digitize(lvt[0], bins)
        ts = bins
        ys = sp.zeros(bins.size)
        for i in range(0, lvt[0].size):
            spot = time_spots[i]
            ys[spot-1] = lvt[1][i]

        # Fill in zero values with previous length
        max_val = 0
        for i in range(0, ts.size-1):
            if ys[i] > max_val:
                max_val = ys[i]
            else:
                ys[i] = max_val
        return sp.array([ts, ys])
Example #21
    def rebin(self,xnew):
        """
        Rebin the spectrum on a new grid named xnew
        """

        #Does not need equal spaced bins, but why would you not?
        xnew.sort()

        fbin  = sp.zeros(xnew.size)
        efbin = sp.zeros(xnew.size)

        #up sampling is just interpolation
        m = (self.wv >= xnew[0])*(self.wv <= xnew[-1])
        if self.wv[m].size <= xnew.size - 1:
            fbin,efbin  = self.interp(xnew)
            
        else:
        #down sampling--
        #1) define bins so that xnew is at the center.
        #2) interpolate to account for fractional pixel weights
        #3) take the mean within each bin
            db  = 0.5*sp.diff(xnew)
            b2  = xnew[1::] - db
            b2  = sp.insert(b2,0,xnew[0])

            insert = sp.searchsorted(self.wv,b2)
            xinsert = sp.insert(self.wv,insert,xnew)
            xinsert = sp.unique(xinsert)
            yinsert,zinsert = self.interp(xinsert)

            i = sp.digitize(xinsert,b2)
            for j in range(b2.size):
                iuse = sp.where(i == j+1)[0]
                fbin[j]  = sp.mean(yinsert[iuse])
                efbin[j] = sp.mean(zinsert[iuse])

        self._wv = xnew
        if self.ef is not None:        
            self._ef = efbin            
        self.f = fbin
        assert self.wv.size == self.f.size
Example #22
    def rebin(self, xnew):
        """
        Rebin the spectrum on a new grid named xnew
        """

        #Does not need equal spaced bins, but why would you not?
        xnew.sort()

        fbin = sp.zeros(xnew.size)
        efbin = sp.zeros(xnew.size)

        #up sampling is just interpolation
        m = (self.wv >= xnew[0]) * (self.wv <= xnew[-1])
        if self.wv[m].size <= xnew.size - 1:
            fbin, efbin = self.interp(xnew)

        else:
            #down sampling--
            #1) define bins so that xnew is at the center.
            #2) interpolate to account for fractional pixel weights
            #3) take the mean within each bin
            db = 0.5 * sp.diff(xnew)
            b2 = xnew[1::] - db
            b2 = sp.insert(b2, 0, xnew[0])

            insert = sp.searchsorted(self.wv, b2)
            xinsert = sp.insert(self.wv, insert, xnew)
            xinsert = sp.unique(xinsert)
            yinsert, zinsert = self.interp(xinsert)

            i = sp.digitize(xinsert, b2)
            for j in range(b2.size):
                iuse = sp.where(i == j + 1)[0]
                fbin[j] = sp.mean(yinsert[iuse])
                efbin[j] = sp.mean(zinsert[iuse])

        self._wv = xnew
        if self.ef is not None:
            self._ef = efbin
        self.f = fbin
        assert self.wv.size == self.f.size
Example #23
    def run(self, nbins=25):
        r"""
        Computes the pore size function of the image.

        This method calculates the distance transform of the void space, then
        computes a histogram of the occurrences of each distance value.

        Parameters
        ----------
        nbins : int
            The number of bins into which the distance values should be sorted.
            The default is 25.

        """
        temp_img = spim.distance_transform_edt(self.image)
        dvals = temp_img[self.image].flatten()
        rmax = sp.amax(dvals)
        bins = sp.linspace(1, rmax, nbins)
        binned = sp.digitize(x=dvals, bins=bins)
        vals = namedtuple('PoreSizeFunction', ('distance', 'frequency'))
        vals.distance = bins
        vals.frequency = sp.bincount(binned, minlength=nbins)[1:]
        return vals
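The run method above is the digitize-plus-bincount idiom for histogramming; a standalone sketch with made-up distance values:

import numpy as np

dvals = np.array([1.2, 2.7, 2.9, 4.1, 4.5])
bins = np.linspace(1, 5, 5)                  # [1, 2, 3, 4, 5]
binned = np.digitize(dvals, bins)            # bin index for each value
print(np.bincount(binned, minlength=5))      # -> [0 1 2 0 2]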
Example #24
def get_freq_modes_over_f(power_mat, window_function, frequency, n_modes, plots=False):
    """Fines the most correlated frequency modes and fits thier noise."""

    n_f = len(frequency)
    d_f = sp.mean(sp.diff(frequency))
    dt = 1.0 / 2.0 / frequency[-1]
    n_chan = power_mat.shape[-1]
    n_time = window_function.shape[0]
    # The threshold for assuming there isn't enough data to measure anything.
    no_data_thres = 10.0 / n_time
    # Initialize the dictionary that will hold all the parameters.
    output_params = {}
    # First take the low frequency part of the spectrum matrix and average over
    # enough bins to get a well conditioned matrix.
    low_f_mat = sp.mean(power_mat[: 4 * n_chan, :, :].real, 0)
    # Factor the matrix to get the most correlated modes.
    e, v = linalg.eigh(low_f_mat)
    # Make sure they are sorted.
    if not sp.alltrue(sp.diff(e) >= 0):
        raise RuntimeError("Eigenvalues not sorted")
    # Power matrix stripped of the biggest modes.
    reduced_power = sp.copy(power_mat)
    mode_list = []
    # Solve for the spectra of these modes.
    for ii in range(n_modes):
        this_mode_params = {}
        # Get power spectrum and window function for this mode.
        mode = v[:, -1 - ii]
        mode_power = sp.sum(mode * power_mat.real, -1)
        mode_power = sp.sum(mode * mode_power, -1)
        mode_window = sp.sum(mode[:, None] ** 2 * window_function, 1)
        mode_window = sp.sum(mode_window * mode[None, :] ** 2, 1)
        # Protect against no data.
        if sp.mean(mode_window).real < no_data_thres:
            this_mode_params["amplitude"] = 0.0
            this_mode_params["index"] = 0.0
            this_mode_params["f_0"] = 1.0
            this_mode_params["thermal"] = T_infinity ** 2 * dt
        else:
            # Fit the spectrum.
            p = fit_overf_const(mode_power, mode_window, frequency)
            # Put all the parameters we measured into the output.
            this_mode_params["amplitude"] = p[0]
            this_mode_params["index"] = p[1]
            this_mode_params["f_0"] = p[2]
            this_mode_params["thermal"] = p[3]
        this_mode_params["mode"] = mode
        output_params["over_f_mode_" + str(ii)] = this_mode_params
        # Remove the mode from the power matrix.
        tmp_amp = sp.sum(reduced_power * mode, -1)
        tmp_amp2 = sp.sum(reduced_power * mode[:, None], -2)
        tmp_amp3 = sp.sum(tmp_amp2 * mode, -1)
        reduced_power -= tmp_amp[:, :, None] * mode
        reduced_power -= tmp_amp2[:, None, :] * mode[:, None]
        reduced_power += tmp_amp3[:, None, None] * mode[:, None] * mode
        mode_list.append(mode)
    # Initialize the compensation matrix, that will be used to restore thermal
    # noise that gets subtracted out.  See Jan 29, Feb 17th, 2012 of Kiyo's
    # notes.
    compensation = sp.eye(n_chan, dtype=float)
    for mode1 in mode_list:
        compensation.flat[:: n_chan + 1] -= 2 * mode1 ** 2
        for mode2 in mode_list:
            mode_prod = mode1 * mode2
            compensation += mode_prod[:, None] * mode_prod[None, :]
    # Now that we've stripped the noisiest modes, measure the auto power
    # spectrum, averaged over channels.
    auto_spec_mean = reduced_power.view()
    auto_spec_mean.shape = (n_f, n_chan ** 2)
    auto_spec_mean = auto_spec_mean[:, :: n_chan + 1].real
    auto_spec_mean = sp.mean(auto_spec_mean, -1)
    diag_window = window_function.view()
    diag_window.shape = (n_time, n_chan ** 2)
    diag_window = diag_window[:, :: n_chan + 1]
    auto_spec_window = sp.mean(diag_window, -1)
    if sp.mean(auto_spec_window).real < no_data_thres:
        auto_cross_over = 0.0
        auto_index = 0.0
        auto_thermal = 0
    else:
        auto_spec_params = fit_overf_const(auto_spec_mean, auto_spec_window, frequency)
        auto_thermal = auto_spec_params[3]
        if auto_spec_params[0] <= 0 or auto_spec_params[3] <= 0 or auto_spec_params[1] > -0.599:
            auto_cross_over = 0.0
            auto_index = 0.0
        else:
            auto_index = auto_spec_params[1]
            auto_cross_over = auto_spec_params[2] * (auto_spec_params[0] / auto_spec_params[3]) ** (-1.0 / auto_index)
            # if auto_cross_over < d_f:
            #    auto_index = 0.
            #    auto_cross_over = 0.
    # Plot the mean auto spectrum if desired.
    if plots:
        h = plt.gcf()
        a = h.add_subplot(*h.current_subplot)
        norm = sp.mean(auto_spec_window).real
        auto_plot = auto_spec_mean / norm
        plotable = auto_plot > 0
        lines = a.loglog(frequency[plotable], auto_plot[plotable])
        c = lines[-1].get_color()
        # And plot the fit in a light color.
        if auto_cross_over > d_f / 4.0:
            spec = npow.overf_power_spectrum(auto_thermal, auto_index, auto_cross_over, dt, n_time)
        else:
            spec = sp.zeros(n_time, dtype=float)
        spec += auto_thermal
        spec[0] = 0
        spec = npow.convolve_power(spec, auto_spec_window)
        spec = npow.prune_power(spec)
        spec = spec[1:].real
        if norm > no_data_thres:
            spec /= norm
        plotable = spec > 0
        a.loglog(frequency[plotable], spec[plotable], c=c, alpha=0.4, linestyle=":")
    output_params["all_channel_index"] = auto_index
    output_params["all_channel_corner_f"] = auto_cross_over
    # Finally measure the thermal part of the noise in each channel.
    cross_over_ind = sp.digitize([auto_cross_over * 4], frequency)[0]
    cross_over_ind = max(cross_over_ind, n_f // 2)
    cross_over_ind = min(cross_over_ind, int(9.0 * n_f / 10.0))
    thermal = reduced_power[cross_over_ind:, :, :].real
    n_high_f = thermal.shape[0]
    thermal.shape = (n_high_f, n_chan ** 2)
    thermal = sp.mean(thermal[:, :: n_chan + 1], 0)
    thermal_norms = sp.mean(diag_window, 0).real
    bad_inds = thermal_norms < no_data_thres
    thermal_norms[bad_inds] = 1.0
    # Compensate for power lost in mode subtraction.
    compensation[:, bad_inds] = 0
    compensation[bad_inds, :] = 0
    for ii in range(n_chan):
        if bad_inds[ii]:
            compensation[ii, ii] = 1.0
    thermal = linalg.solve(compensation, thermal)
    # Normalize
    thermal /= thermal_norms
    thermal[bad_inds] = T_infinity ** 2 * dt
    # Occasionally the compensation fails horribly on a few channels.
    # When this happens, zero out the offending indices.
    thermal[thermal < 0] = 0
    output_params["thermal"] = thermal
    # Now that we know what thermal is, we can subtract it out of the modes we
    # already measured.
    for ii in range(n_modes):
        mode_params = output_params["over_f_mode_" + str(ii)]
        thermal_contribution = sp.sum(mode_params["mode"] ** 2 * thermal)
        # Subtract a maximum of 90% of the white noise to keep things positive
    # definite.
        new_white = max(mode_params["thermal"] - thermal_contribution, 0.1 * mode_params["thermal"])
        if mode_params["thermal"] < 0.5 * T_infinity ** 2 * dt:
            mode_params["thermal"] = new_white
    return output_params
server_prob = array([0.25, 0.25, 0.5])
n_server = server_prob.size
n_process = 100
time_interval = 10

initial_state = (7,2,3)
arrival_times = getRandomArrivalServiceTimes(n_process, arrival_rate, None)[0]
sum_initial_state = sum(initial_state)
# preparing initial state for each mm1 simulation

initial_states = zip(initial_state, [arrival_times[sum_initial_state]] * len(initial_state))

# maps kth process to ith server
server_address_table_forced = random.permutation(concatenate([ones(state) * i for i,state in enumerate(initial_state)]))
print "forced server address table", server_address_table_forced
server_address_table = concatenate([server_address_table_forced, digitize(uniform.rvs(size = n_process-sum_initial_state), cumsum(server_prob))])
server_arrival_times = [arrival_times[server_address_table == i] for i in range(n_server)]
server_service_times = [
    getRandomArrivalServiceTimes((server_address_table == i).sum(), None, service_rate[i])[1]
    for i in range(n_server)
]

results = list(map(mm1, server_arrival_times, server_service_times, initial_states))
print("Mean QueueSize(1)", array([mean(result['queue_size']) for result in results]))
print("Results[0]['queue_size']", results[0]['queue_size'])
print("Results[1]['queue_size']", results[1]['queue_size'])
print("Results[2]['queue_size']", results[2]['queue_size'])
time_start = arrival_times[sum_initial_state] # I don't know why it shouldn't be sum_initial_state +1 instead
print("queue_size_by_time", time_start, [r['queue_size_by_time'](time_start) for r in results])
print("queue_size_by_time", time_start+time_interval, [r['queue_size_by_time'](time_start+time_interval) for r in results])
Example #26
def collapse_correlation_1d(corr, f_lags, a_lags, weights=None):
    r"""Takes a 2D correlation function and collapses to a 1D correlation
    function.

    Parameters
    ----------
    corr: 2D array
        Covariance matrix in terms of frequency lag and angular lag.
        The first output from `rebin_corr_freq_lag` right now.
    f_lags: 1D array
        The frequency lags in terms of Hz.
        The third output from `rebin_corr_freq_lag` right now.
    a_lags: 1D array
        The angular lags in terms of degrees.
    weights: 2D array
        The weights of `corr`.
        The second output from `rebin_corr_freq_lag` right now.

    Returns
    -------
    out_corr: 1D array
        The 1D autocorrelation.
    out_weights:
        The weights for `out_corr`.
    x_axis: tuple of 3 1D arrays
        `x_axis[1]` is the x - values that correspond to `out_corr`.
        `x_axis[0]` and `x_axis[2]` are the left and rightmost points
         covered by each lag bin.

    Notes
    -----
    `a_lags` are not the same as the lags from the .ini file.
    The lags from the .ini file are the right side of each lag bin,
    but you want the centre of the bin when you plot.
    To get the right values, you must do: (ask Eric or Liviu)
        lags = sp.array(F.params['lags'])
        a_lags = copy.deepcopy(lags)
        a_lags[0] = 0
        a_lags[1:] -= sp.diff(lags)/2.0
    """

    if corr.ndim != 2:
        msg = "Must start with a 2D correlation function."
        raise ValueError(msg)

    if len(f_lags) != corr.shape[0] or len(a_lags) != corr.shape[1]:
        msg = ("corr.shape must be (len(f_lags), len(a_lags)).  Passed: " +
               repr(corr.shape) + " vs (" + repr(len(f_lags)) + ", " +
               repr(len(a_lags)) + ").")

        raise ValueError(msg)

    if weights is None:
        weights = sp.ones_like(corr)

    corr = corr * weights
    # Hard code conversion factors to MPc/h for now.
    a_fact = 34.0  # Mpc/h per degree at 800MHz.
    f_fact = 4.5  # Mpc/h per MHz at 800MHz.
    # Hard code lags in MPc/h.
    #nbins = 10
    nbins = 15
    lags = sp.empty(nbins)
    lags[0] = 2.0
    lags[1] = 4.0

    for bin_index in range(2, nbins):
        lags[bin_index] = 1.5 * lags[bin_index - 1]

    # Calculate the total 1D lags.
    separation = a_lags
    separation = (a_fact * separation[sp.newaxis, :])**2
    separation = separation + (f_fact * f_lags[:, sp.newaxis] / 1.0e6)**2
    separation = sp.sqrt(separation)

    # Initialize memory for outputs.
    out_corr = sp.zeros(nbins)
    out_weights = sp.zeros(nbins)

    # Rebin.
    for lag_index in range(separation.shape[0]):
        bin_inds = sp.digitize(separation[lag_index, :], lags)
        for bin_index in range(nbins):
            out_corr[bin_index] += sp.sum(corr[lag_index,
                                               bin_inds == bin_index])
            out_weights[bin_index] += sp.sum(weights[lag_index,
                                                     bin_inds == bin_index])
    # Normalize.
    bad_inds = out_weights < 1.0e-20
    out_weights[bad_inds] = 1.0
    out_corr /= out_weights
    out_weights[bad_inds] = 0.0

    # Make real lags to be returned.
    x_left = sp.empty(nbins)
    x_left[0] = 0
    x_left[1:] = lags[:-1]
    x_right = lags
    x_centre = (x_right + x_left) / 2.0

    return out_corr, out_weights, (x_left, x_centre, x_right)
Example #27
def get_freq_modes_over_f(power_mat,
                          window_function,
                          frequency,
                          n_modes,
                          plots=False):
    """Fines the most correlated frequency modes and fits thier noise."""

    n_f = len(frequency)
    d_f = sp.mean(sp.diff(frequency))
    dt = 1. / 2. / frequency[-1]
    n_chan = power_mat.shape[-1]
    n_time = window_function.shape[0]
    # The threshold for assuming there isn't enough data to measure anything.
    no_data_thres = 10. / n_time
    # Initialize the dictionary that will hold all the parameters.
    output_params = {}
    # First take the low frequency part of the spectrum matrix and average over
    # enough bins to get a well conditioned matrix.
    low_f_mat = sp.mean(power_mat[:4 * n_chan, :, :].real, 0)
    # Factor the matrix to get the most correlated modes.
    e, v = linalg.eigh(low_f_mat)
    # Make sure they are sorted.
    if not sp.alltrue(sp.diff(e) >= 0):
        raise RuntimeError("Eigenvalues not sorted")
    # Power matrix stripped of the biggest modes.
    reduced_power = sp.copy(power_mat)
    mode_list = []
    # Solve for the spectra of these modes.
    for ii in range(n_modes):
        this_mode_params = {}
        # Get power spectrum and window function for this mode.
        mode = v[:, -1 - ii]
        mode_power = sp.sum(mode * power_mat.real, -1)
        mode_power = sp.sum(mode * mode_power, -1)
        mode_window = sp.sum(mode[:, None]**2 * window_function, 1)
        mode_window = sp.sum(mode_window * mode[None, :]**2, 1)
        # Protect against no data.
        if sp.mean(mode_window).real < no_data_thres:
            this_mode_params['amplitude'] = 0.
            this_mode_params['index'] = 0.
            this_mode_params['f_0'] = 1.
            this_mode_params['thermal'] = T_infinity**2 * dt
        else:
            # Fit the spectrum.
            p = fit_overf_const(mode_power, mode_window, frequency)
            # Put all the parameters we measured into the output.
            this_mode_params['amplitude'] = p[0]
            this_mode_params['index'] = p[1]
            this_mode_params['f_0'] = p[2]
            this_mode_params['thermal'] = p[3]
        this_mode_params['mode'] = mode
        output_params['over_f_mode_' + str(ii)] = this_mode_params
        # Remove the mode from the power matrix.
        tmp_amp = sp.sum(reduced_power * mode, -1)
        tmp_amp2 = sp.sum(reduced_power * mode[:, None], -2)
        tmp_amp3 = sp.sum(tmp_amp2 * mode, -1)
        reduced_power -= tmp_amp[:, :, None] * mode
        reduced_power -= tmp_amp2[:, None, :] * mode[:, None]
        reduced_power += tmp_amp3[:, None, None] * mode[:, None] * mode
        mode_list.append(mode)
    # Initialize the compensation matrix, that will be used to restore thermal
    # noise that gets subtracted out.  See Jan 29, Feb 17th, 2012 of Kiyo's
    # notes.
    compensation = sp.eye(n_chan, dtype=float)
    for mode1 in mode_list:
        compensation.flat[::n_chan + 1] -= 2 * mode1**2
        for mode2 in mode_list:
            mode_prod = mode1 * mode2
            compensation += mode_prod[:, None] * mode_prod[None, :]
    # Now that we've stripped the noisiest modes, measure the auto power
    # spectrum, averaged over channels.
    auto_spec_mean = reduced_power.view()
    auto_spec_mean.shape = (n_f, n_chan**2)
    auto_spec_mean = auto_spec_mean[:, ::n_chan + 1].real
    auto_spec_mean = sp.mean(auto_spec_mean, -1)
    diag_window = window_function.view()
    diag_window.shape = (n_time, n_chan**2)
    diag_window = diag_window[:, ::n_chan + 1]
    auto_spec_window = sp.mean(diag_window, -1)
    if sp.mean(auto_spec_window).real < no_data_thres:
        auto_cross_over = 0.
        auto_index = 0.
        auto_thermal = 0
    else:
        auto_spec_params = fit_overf_const(auto_spec_mean, auto_spec_window,
                                           frequency)
        auto_thermal = auto_spec_params[3]
        if (auto_spec_params[0] <= 0 or auto_spec_params[3] <= 0
                or auto_spec_params[1] > -0.599):
            auto_cross_over = 0.
            auto_index = 0.
        else:
            auto_index = auto_spec_params[1]
            auto_cross_over = auto_spec_params[2] * (
                auto_spec_params[0] / auto_spec_params[3])**(-1. / auto_index)
            #if auto_cross_over < d_f:
            #    auto_index = 0.
            #    auto_cross_over = 0.
    # Plot the mean auto spectrum if desired.
    if plots:
        h = plt.gcf()
        a = h.add_subplot(*h.current_subplot)
        norm = sp.mean(auto_spec_window).real
        auto_plot = auto_spec_mean / norm
        plotable = auto_plot > 0
        lines = a.loglog(frequency[plotable], auto_plot[plotable])
        c = lines[-1].get_color()
        # And plot the fit in a light color.
        if auto_cross_over > d_f / 4.:
            spec = npow.overf_power_spectrum(auto_thermal, auto_index,
                                             auto_cross_over, dt, n_time)
        else:
            spec = sp.zeros(n_time, dtype=float)
        spec += auto_thermal
        spec[0] = 0
        spec = npow.convolve_power(spec, auto_spec_window)
        spec = npow.prune_power(spec)
        spec = spec[1:].real
        if norm > no_data_thres:
            spec /= norm
        plotable = spec > 0
        a.loglog(frequency[plotable],
                 spec[plotable],
                 c=c,
                 alpha=0.4,
                 linestyle=':')
    output_params['all_channel_index'] = auto_index
    output_params['all_channel_corner_f'] = auto_cross_over
    # Finally measure the thermal part of the noise in each channel.
    cross_over_ind = sp.digitize([auto_cross_over * 4], frequency)[0]
    cross_over_ind = max(cross_over_ind, n_f // 2)
    cross_over_ind = min(cross_over_ind, int(9. * n_f / 10.))
    thermal = reduced_power[cross_over_ind:, :, :].real
    n_high_f = thermal.shape[0]
    thermal.shape = (n_high_f, n_chan**2)
    thermal = sp.mean(thermal[:, ::n_chan + 1], 0)
    thermal_norms = sp.mean(diag_window, 0).real
    bad_inds = thermal_norms < no_data_thres
    thermal_norms[bad_inds] = 1.
    # Compensate for power lost in mode subtraction.
    compensation[:, bad_inds] = 0
    compensation[bad_inds, :] = 0
    for ii in range(n_chan):
        if bad_inds[ii]:
            compensation[ii, ii] = 1.
    thermal = linalg.solve(compensation, thermal)
    # Normalize
    thermal /= thermal_norms
    thermal[bad_inds] = T_infinity**2 * dt
    # Occasionally the compensation fails horribly on a few channels.
    # When this happens, zero out the offending indices.
    thermal[thermal < 0] = 0
    output_params['thermal'] = thermal
    # Now that we know what thermal is, we can subtract it out of the modes we
    # already measured.
    for ii in range(n_modes):
        mode_params = output_params['over_f_mode_' + str(ii)]
        thermal_contribution = sp.sum(mode_params['mode']**2 * thermal)
        # Subtract a maximum of 90% of the white noise to keep things positive
    # definite.
        new_white = max(mode_params['thermal'] - thermal_contribution,
                        0.1 * mode_params['thermal'])
        if mode_params['thermal'] < 0.5 * T_infinity**2 * dt:
            mode_params['thermal'] = new_white
    return output_params
Example #28
    def getBins(self, x, grid, domain):
        edges = scipy.r_[domain[0], (grid[1:] + grid[:-1]) / 2., domain[-1]]
        bins = scipy.digitize(x, edges) - 1
        return bins
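A hypothetical usage sketch of getBins (not from the source): build mid-point edges around a grid and use digitize to map sample points to the nearest grid node:

import numpy as np

grid = np.array([0.0, 1.0, 2.0, 3.0])
domain = (-0.5, 3.5)
edges = np.r_[domain[0], (grid[1:] + grid[:-1]) / 2., domain[-1]]
x = np.array([0.2, 1.6, 2.9])
print(np.digitize(x, edges) - 1)   # -> [0 2 3], i.e. grid nodes 0.0, 2.0, 3.0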
Example #29
def rebin_corr_freq_lag(corr,
                        freq1,
                        freq2=None,
                        weights=None,
                        nfbins=20,
                        return_fbins=False):
    r"""Collapses frequency pair correlation function to frequency lag.

    Basically this constructs the 2D correlation function.

    Parameters
    ----------
    corr: 3D array
        Covariance matrix which is a function of frequency and frequency prime
        and angular lag.
    freq1, freq2: tuple of floats
        The REAL frequencies, i.e. 744000 Hz, not 0, 1, 2...
        freq2 is used if using a map at a different redshift, but we haven't
        looked at this yet.
    weights: 3D array
        The weights of the correlation. It is found in pair.counts right now.
    nfbins: int
        How many lag bins out you go in frequency. A higher number means a
        more accurate result at high lag.
    return_fbins: bool
        If `True`, `fbins` is returned.

    Returns
    -------
    out_corr: 2D array
        `corr` from before but now only in terms of frequency lag and
        angular lag.
    out_weights: 2D array
        `weights` from before but now in 2D. The weights for `out_corr`
    fbins: 1D array
        The frequency lags in terms of Hz. Much like how `lags` in the rest of
        this module is angular lag in degrees.

    """

    if freq2 is None:
        freq2 = freq1
    # Default is equal weights.
    if weights is None:
        weights = sp.ones_like(corr)
    corr = corr * weights

    nf1 = corr.shape[0]
    nf2 = corr.shape[1]
    nlags = corr.shape[2]
    # Frequency bin size.
    delta_freq = min(abs(sp.diff(freq1)))
    # Frequency bin upper edges.
    fbins = (sp.arange(nfbins) + 0.5) * delta_freq
    # Allocate memory for outputs.
    out_corr = sp.zeros((nfbins, nlags))
    out_weights = sp.zeros((nfbins, nlags))

    # Loop over all frequency pairs and bin by lag.
    for freq1_index in range(nf1):
        for freq2_index in range(nf2):
            f_lag = abs(freq1[freq1_index] - freq2[freq2_index])
            bin_ind = sp.digitize([f_lag], fbins)[0]
            if bin_ind < nfbins:
                out_corr[bin_ind, :] += corr[freq1_index, freq2_index, :]
                out_weights[bin_ind, :] += weights[freq1_index, freq2_index, :]

    # Normalize dealing with 0 weight points explicitly.
    bad_inds = out_weights < 1.0e-20
    out_weights[bad_inds] = 1.0
    out_corr /= out_weights
    out_weights[bad_inds] = 0.0
    out_corr[bad_inds] = 0.0

    if return_fbins:
        return out_corr, out_weights, fbins - delta_freq * 0.5
    else:
        return out_corr, out_weights
def rebin_corr_freq_lag(corr, freq1, freq2=None, weights=None, nfbins=20,
                        return_fbins=False):
    r"""Collapses frequency pair correlation function to frequency lag.

    Basically this constructs the 2D correlation function.

    Parameters
    ----------
    corr: 3D array
        Covariance matrix which is a function of frequency and frequency prime
        and angular lag.
    freq1, freq2: tuple of floats
        The REAL frequencies, i.e. 744000 Hz, not 0, 1, 2...
        freq2 is used if using a map at a different redshift, but we haven't
        looked at this yet.
    weights: 3D array
        The weights of the correlation. It is found in pair.counts right now.
    nfbins: int
        How many lag bins out you go in frequency. A higher number means a
        more accurate result at high lag.
    return_fbins: bool
        If `True`, `fbins` is returned.

    Returns
    -------
    out_corr: 2D array
        `corr` from before but now only in terms of frequency lag and
        angular lag.
    out_weights: 2D array
        `weights` from before but now in 2D. The weights for `out_corr`
    fbins: 1D array
        The frequency lags in terms of Hz. Much like how `lags` in the rest of
        this module is angular lag in degrees.

    """

    if freq2 is None:
        freq2 = freq1
    # Default is equal weights.
    if weights is None:
        weights = sp.ones_like(corr)
    corr = corr * weights

    nf1 = corr.shape[0]
    nf2 = corr.shape[1]
    nlags = corr.shape[2]
    # Frequency bin size.
    delta_freq = min(abs(sp.diff(freq1)))
    # Frequency bin upper edges.
    fbins = (sp.arange(nfbins) + 0.5) * delta_freq
    # Allocate memory for outputs.
    out_corr = sp.zeros((nfbins, nlags))
    out_weights = sp.zeros((nfbins, nlags))

    # Loop over all frequency pairs and bin by lag.
    for freq1_index in range(nf1):
        for freq2_index in range(nf2):
            f_lag = abs(freq1[freq1_index] - freq2[freq2_index])
            bin_ind = sp.digitize([f_lag], fbins)[0]
            if bin_ind < nfbins:
                out_corr[bin_ind, :] += corr[freq1_index, freq2_index, :]
                out_weights[bin_ind, :] += weights[freq1_index, freq2_index, :]

    # Normalize dealing with 0 weight points explicitly.
    bad_inds = out_weights < 1.0e-20
    out_weights[bad_inds] = 1.0
    out_corr /= out_weights
    out_weights[bad_inds] = 0.0
    out_corr[bad_inds] = 0.0

    if return_fbins:
        return out_corr, out_weights, fbins - delta_freq * 0.5
    else:
        return out_corr, out_weights
def gen_ld_plots(snps_hdf5_file=snp_file,
                 max_dist=2000,
                 min_maf=0,
                 bin_size=10,
                 fig_dir=results_dir,
                 filter_pop=None,
                 genes=filtered_genes):

    #Calculating LD just for chromosomal and chromids genes:
    gene_groups = pd.read_csv(genes)
    chrom_genes = gene_groups['Gene.group'].tolist()

    pop_map = parse_pop_map()

    xs = []
    ys = []

    #from itertools import izip
    h5f = h5py.File(snps_hdf5_file, mode="r")
    gene_groups = sorted(h5f.keys())
    ld_dist_dict = {'all': {}, 'nonsyn': {}, 'syn': {}}
    distances = range(0, max_dist)

    for dist in distances:
        ld_dist_dict['all'][dist] = {'r2_sum': 0.0, 'snp_count': 0.0}
        #ld_dist_dict['nonsyn'][dist]={'r2_sum':0.0, 'snp_count':0.0}
        #ld_dist_dict['syn'][dist]={'r2_sum':0.0, 'snp_count':0.0}

    for i, gg in enumerate(chrom_genes):
        #for i, gg in enumerate(gene_groups):
        gg = gg[5::]
        print(gg)
        if gg in chrom_genes:
            print(gg)
        #gg = str(gg.encode('utf-8'))
        #print(type(gg))
        if i % 100 == 0:
            print('%d: Gene %s' % (i, gg))

        g = h5f[str(gg)]
        print(g)

        # Look at genes that have at least 10 SNPS
        if g['codon_snp_freqs'].size > 10:

            if filter_pop is not None:
                strains = g['strains'][...]

                indiv_filter = sp.zeros((len(strains)), dtype='bool8')

                for s_i, s in enumerate(strains):
                    try:
                        s = str(s, 'utf-8')
                        if pop_map[s]['genospecies'] == filter_pop:
                            indiv_filter[s_i] = True
                    except:
                        continue
                if sp.sum(indiv_filter) < 2:
                    continue

                codon_snps = g['codon_snps'][...]
                print(codon_snps)

                codon_snps = codon_snps[:, indiv_filter]
                print(codon_snps.shape)
                norm_codon_snps = sp.transpose(codon_snps)
                freqs = sp.mean(norm_codon_snps, 0)
                norm_codon_snps = (norm_codon_snps - freqs) / sp.sqrt(
                    freqs * (1 - freqs))
                norm_codon_snps = sp.transpose(norm_codon_snps)
                mafs = sp.minimum(freqs, 1 - freqs)
                maf_filter = mafs > min_maf
                if sp.sum(maf_filter) > 1:

                    all_norm_snps = norm_codon_snps
                    all_positions = g['codon_snp_positions'][...]
                    norm_snps = all_norm_snps[maf_filter]
                    positions = all_positions[maf_filter]
                    M, N = norm_snps.shape
                    is_synonimous_snp = g['is_synonimous_snp'][...]
                    is_nonsynonimous_snp = ~is_synonimous_snp
                    syn_snp_filter = is_synonimous_snp * maf_filter
                    nonsyn_snp_filter = is_nonsynonimous_snp * maf_filter

                    if sp.sum(syn_snp_filter) > sp.sum(nonsyn_snp_filter):
                        all_norm_snps = norm_codon_snps
                        all_positions = g['codon_snp_positions'][...]
                        norm_snps = all_norm_snps[maf_filter]
                        positions = all_positions[maf_filter]
                        M, N = norm_snps.shape

                        ld_mat = sp.dot(norm_snps, norm_snps.T) / float(N)
                        assert M == len(positions), 'A bug detected.'
                        for i in range(M - 1):
                            for j in range(i + 1, M):
                                dist = positions[j] - positions[i]
                                if dist < max_dist:
                                    ld_dist_dict['all'][dist][
                                        'r2_sum'] += ld_mat[i, j]**2
                                    ld_dist_dict['all'][dist][
                                        'snp_count'] += 1.0

    print(ld_dist_dict)
    pairs = 0
    #for plot_type in ld_dist_dict.keys():
    avg_r2s = []
    plot_distances = []
    for dist in distances:
        if ld_dist_dict['all'][dist]['snp_count'] >= 1:
            avg_r2 = ld_dist_dict['all'][dist]['r2_sum'] / float(
                ld_dist_dict['all'][dist]['snp_count'])
            pairs += 1
            avg_r2s.append(avg_r2)
            plot_distances.append(dist)

    plot_distances = sp.array(plot_distances)
    avg_r2s = sp.array(avg_r2s)

    print(avg_r2s)
    bins = sp.arange(0, max(plot_distances), bin_size)
    digitize = sp.digitize(plot_distances, bins)
    for bin_i in range(len(bins)):
        bin_filter = digitize == (bin_i + 1)
        if len(plot_distances[bin_filter]) > 0:
            xs.append(sp.mean(plot_distances[bin_filter]))
            ys.append(sp.mean(avg_r2s[bin_filter]))

        # plt.plot(xs, ys, color='k', linestyle='None', marker='.', alpha=0.5)
        # plt.xlabel(r'Pairwise distance ($d$)')
        # plt.ylabel(r'Squared correlation ($r^2$)')
        # if filter_pop is not None:
        # 	plt.title('LD decay of 0.99 < ANI <= 01')
        # 	plt.savefig('%s/ld_%s_codons_nuc_0.99_1_gsA_chromosome_maf_01_core_%s.pdf'%(fig_dir,plot_type,filter_pop))

    plot_list = pd.DataFrame({
        'X': xs,
        'Y': ys,
    })
    plot_list.to_csv(
        "{dir_res}/plotting_intergenic_LD_{maf}_{bin_size}_{geno}.csv".format(
            dir_res=fig_dir, maf=min_maf, bin_size=bin_size, geno=filter_pop))
    return (plot_list)
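A side note on the binning loop above: sp.digitize returns 1-based indices for edges generated with sp.arange, which is why the loop compares against bin_i + 1. Below is a minimal, self-contained sketch of the same per-bin averaging with made-up distances and r2 values; it uses NumPy directly, since the surrounding code relies on SciPy's legacy re-exports of the same NumPy functions.

import numpy as np

# Hypothetical pairwise distances (bp) and their average r2 values.
plot_distances = np.array([120, 340, 560, 780, 910, 1500, 2200])
avg_r2s = np.array([0.90, 0.70, 0.60, 0.50, 0.45, 0.30, 0.20])

bin_size = 500
bins = np.arange(0, plot_distances.max(), bin_size)  # left edges: 0, 500, 1000, ...
bin_ids = np.digitize(plot_distances, bins)          # 1-based: distances in [0, 500) map to 1

xs, ys = [], []
for bin_i in range(len(bins)):
    bin_filter = bin_ids == (bin_i + 1)
    if np.any(bin_filter):
        xs.append(np.mean(plot_distances[bin_filter]))  # mean distance within the bin
        ys.append(np.mean(avg_r2s[bin_filter]))         # mean r2 within the bin
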
def collapse_correlation_1d(corr, f_lags, a_lags, weights=None):
    r"""Takes a 2D correlation function and collapses to a 1D correlation
    function.

    Parameters
    ----------
    corr: 2D array
        Covariance matrix in terms of frequency lag and angular lag.
        The first output from `rebin_corr_freq_lag` right now.
    f_lags: 1D array
        The frequency lags in terms of Hz.
        The third output from `rebin_corr_freq_lag` right now.
    a_lags: 1D array
        The angular lags in terms of degrees.
    weights: 2D array
        The weights of `corr`.
        The second output from `rebin_corr_freq_lag` right now.

    Returns
    -------
    out_corr: 1D array
        The 1D autocorrelation.
    out_weights: 1D array
        The weights for `out_corr`.
    x_axis: tuple of 3 1D arrays
        `x_axis[1]` is the x-values that correspond to `out_corr`.
        `x_axis[0]` and `x_axis[2]` are the left- and right-most points
        covered by each lag bin.

    Notes
    -----
    `a_lags` are not the same as the lags from the .ini file.
    The lags from the .ini file are the right side of each lag bin,
    but you want the centre of the bin when you plot.
    To get the right values, you must do the following (ask Eric or Liviu):
        lags = sp.array(F.params['lags'])
        a_lags = copy.deepcopy(lags)
        a_lags[0] = 0
        a_lags[1:] -= sp.diff(lags)/2.0
    """

    if corr.ndim != 2:
        msg = "Must start with a 2D correlation function."
        raise ValueError(msg)

    if len(f_lags) != corr.shape[0] or len(a_lags) != corr.shape[1]:
        msg = ("corr.shape must be (len(f_lags), len(a_lags)).  Passed: "
               + repr(corr.shape) + " vs (" + repr(len(f_lags)) + ", "
               + repr(len(a_lags)) + ").")

        raise ValueError(msg)

    if weights is None:
        weights = sp.ones_like(corr)

    corr = corr * weights
    # Hard code conversion factors to Mpc/h for now.
    a_fact = 34.0  # Mpc/h per degree at 800MHz.
    f_fact = 4.5   # Mpc/h per MHz at 800MHz.
    # Hard code lags in Mpc/h.
    #nbins = 10
    nbins = 15
    lags = sp.empty(nbins)
    lags[0] = 2.0
    lags[1] = 4.0

    for bin_index in range(2, nbins):
        lags[bin_index] = 1.5 * lags[bin_index - 1]

    # Calculate the total 1D lags.
    separation = a_lags
    separation = (a_fact * separation[sp.newaxis, :]) ** 2
    separation = separation + (f_fact * f_lags[:, sp.newaxis] / 1.0e6) ** 2
    separation = sp.sqrt(separation)

    # Initialize memory for outputs.
    out_corr = sp.zeros(nbins)
    out_weights = sp.zeros(nbins)

    # Rebin.
    for lag_index in range(separation.shape[0]):
        bin_inds = sp.digitize(separation[lag_index, :], lags)
        for bin_index in range(nbins):
            out_corr[bin_index] += sp.sum(corr[lag_index,
                                               bin_inds == bin_index])
            out_weights[bin_index] += sp.sum(weights[lag_index,
                                                     bin_inds == bin_index])
    # Normalize.
    bad_inds = out_weights < 1.0e-20
    out_weights[bad_inds] = 1.0
    out_corr /= out_weights
    out_weights[bad_inds] = 0.0

    # Make real lags to be returned.
    x_left = sp.empty(nbins)
    x_left[0] = 0
    x_left[1:] = lags[:-1]
    x_right = lags
    x_centre = (x_right + x_left) / 2.0

    return out_corr, out_weights, (x_left, x_centre, x_right)
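A minimal usage sketch for collapse_correlation_1d with made-up lag grids and a toy separable correlation; it assumes the function and the sp (SciPy) namespace it relies on are importable as above, and that a_lags already holds bin centres as described in the Notes.

import scipy as sp

f_lags = sp.arange(10) * 1.0e6              # frequency lags in Hz (1 MHz steps)
a_lags = sp.arange(8) * 0.25                # angular lags in degrees (bin centres)
corr = sp.exp(-f_lags[:, sp.newaxis] / 5.0e6 - a_lags[sp.newaxis, :])  # toy 2D correlation
weights = sp.ones_like(corr)

out_corr, out_weights, (x_left, x_centre, x_right) = collapse_correlation_1d(
    corr, f_lags, a_lags, weights=weights)
# x_centre holds the Mpc/h separations at which each out_corr value is reported.
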
def corr_est(map1, map2, noise1, noise2, freq1, freq2,
             lags=(), speedup=False, verbose=False):
    r"""Calculate the cross correlation function of the maps.

    The cross-correlation function is a function of f1, f2, and angular lag.
    The angular lag bins are passed in; all pairs of frequencies are
    calculated.

    Parameters
    ----------
    lags: array like
        Angular lag bins (upper-side bin edges).
    speedup: boolean
        Speeds up the correlation. This works fine, yes? Should be the
        normal way if so.

    Returns
    -------
    corr: array
        The correlation between 2 maps.
    counts: array
        The weighting of the correlation based on the maps' weights.

    """
    map1_ra = map1.get_axis('ra')
    map2_ra = map2.get_axis('ra')
    map1_dec = map1.get_axis('dec')
    map2_dec = map2.get_axis('dec')

    input_map1 = map1[freq1, :, :]
    input_map2 = map2[freq2, :, :]
    input_noise1 = noise1[freq1, :, :]
    input_noise2 = noise2[freq2, :, :]

    # Noise weight
    input_map1 *= input_noise1
    input_map2 *= input_noise2

    nlags = len(lags)
    nfreq = len(freq1)
    corr = sp.zeros((nfreq, nfreq, nlags), dtype=float)
    counts = sp.zeros(corr.shape, dtype=float)
    # Note that if DEC != 0, a degree of RA spans less than a degree on the sky.
    ra_fact = sp.cos(sp.pi * map1.info['dec_centre'] / 180.0)

    # Calculate the pairwise lags.
    dra = (map1_ra[:, None] - map2_ra[None, :]) * ra_fact
    ddec = map1_dec[:, None] - map2_dec[None, :]
    lag = dra[:, None, :, None] ** 2 + ddec[None, :, None, :] ** 2
    lag = sp.sqrt(lag)
    # Bin this up.
    lag_inds = sp.digitize(lag.flatten(), lags)

    if speedup:
        print "Starting Correlation (sparse version)"

        (nr1, nd1) = (len(map1_ra), len(map1_dec))
        (nr2, nd2) = (len(map2_ra), len(map2_dec))
        (r1ind, d1ind) = (sp.arange(nr1), sp.arange(nd1))
        (r2ind, d2ind) = (sp.arange(nr2), sp.arange(nd2))
        ra1_pairind = r1ind.repeat(nr2 * nd1 * nd2)
        ra2_pairind = sp.tile(r2ind.repeat(nd2), (1, nr1 * nd1)).flatten()
        dec1_pairind = sp.tile(d1ind.repeat(nr2 * nd2), (1, nr1)).flatten()
        dec2_pairind = sp.tile(d2ind, (1, nr1 * nr2 * nd1)).flatten()

        # precalculate the pair indices for a given lag
        # could also imagine calculating the map slices here
        posmaskdict = {}
        for klag in range(nlags):
            mask = (lag_inds == klag)
            posmaskdict[repr(klag)] = (ra1_pairind[mask],
                                       ra2_pairind[mask],
                                       dec1_pairind[mask],
                                       dec2_pairind[mask])

        for if1 in range(len(freq1)):
            for jf2 in range(len(freq2)):
                start = time.time()

                data1 = input_map1[if1, :, :]
                data2 = input_map2[jf2, :, :]
                weights1 = input_noise1[if1, :, :]
                weights2 = input_noise2[jf2, :, :]

                for klag in range(nlags):
                    (r1m, r2m, d1m, d2m) = posmaskdict[repr(klag)]
                    dprod = data1[r1m, d1m] * data2[r2m, d2m]
                    wprod = weights1[r1m, d1m] * weights2[r2m, d2m]
                    corr[if1, jf2, klag] += sp.sum(dprod)
                    counts[if1, jf2, klag] += sp.sum(wprod)

                if verbose:
                    print if1, jf2, (time.time() - start)
                    print counts[if1, jf2, :]
    else:
        print "Starting Correlation (full version)"
        for if1 in range(len(freq1)):
            for jf2 in range(len(freq2)):
                start = time.time()
                # Calculate the pairwise products.
                data1 = input_map1[if1, :, :]
                data2 = input_map2[jf2, :, :]
                weights1 = input_noise1[if1, :, :]
                weights2 = input_noise2[jf2, :, :]
                dprod = data1[..., None, None] * data2[None, None, ...]
                wprod = weights1[..., None, None] * \
                        weights2[None, None, ...]
                for klag in range(nlags):
                    mask = (lag_inds == klag)
                    corr[if1, jf2, klag] += sp.sum(dprod.flatten()[mask])
                    counts[if1, jf2, klag] += sp.sum(wprod.flatten()[mask])

                if verbose:
                    print if1, jf2, (time.time() - start)
                    print counts[if1, jf2, :]

    mask = (counts < 1e-20)
    counts[mask] = 1
    corr /= counts
    corr[mask] = 0
    counts[mask] = 0

    return corr, counts
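The heart of corr_est is the pairwise angular-lag binning; here is a small, self-contained sketch of just that step, with hypothetical RA/Dec grids and NumPy standing in for the legacy sp re-exports used above.

import numpy as np

ra = np.linspace(0.0, 2.0, 5)            # degrees
dec = np.linspace(-1.0, 1.0, 4)          # degrees
ra_fact = np.cos(np.pi * 0.0 / 180.0)    # dec_centre taken to be 0 here

dra = (ra[:, None] - ra[None, :]) * ra_fact
ddec = dec[:, None] - dec[None, :]
lag = np.sqrt(dra[:, None, :, None] ** 2 + ddec[None, :, None, :] ** 2)

lags = (0.5, 1.0, 2.0, 4.0)              # upper bin edges in degrees
lag_inds = np.digitize(lag.flatten(), lags)
# lag_inds == 0 selects pairs separated by less than lags[0]; a value of len(lags)
# marks pairs beyond the last edge, which the accumulation loops never touch.
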
Beispiel #34
0
def getBins(self, x, grid, domain):
    edges = scipy.r_[domain[0], (grid[1:] + grid[:-1]) / 2., domain[-1]]
    bins = scipy.digitize(x, edges) - 1
    return bins
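To see what getBins computes: the edges sit at the domain limits and at the midpoints between grid nodes, so each value in x is assigned the 0-based index of its nearest grid node. A quick sketch with a hypothetical grid, using NumPy in place of the scipy re-exports:

import numpy as np

grid = np.array([1.0, 2.0, 3.0, 4.0])
domain = (0.0, 5.0)
x = np.array([0.2, 1.6, 2.4, 3.9, 4.8])

edges = np.r_[domain[0], (grid[1:] + grid[:-1]) / 2.0, domain[-1]]  # [0.0, 1.5, 2.5, 3.5, 5.0]
bins = np.digitize(x, edges) - 1
# bins -> [0, 1, 1, 3, 3]: each value lands on the index of the closest grid node.
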
Beispiel #35
0
def count_ld_indep_regions(res_file, num_traits=None, ss_file=None, ld_reg_map='/project/PCMA/faststorage/1_DATA/fourier_ls.hdf5'):
    # parse results..
    print 'Parsing PCMA results'
    if ss_file is not None:
        chrom_res_dict = parse_PCMA_results(ss_file, res_file)
    else:
        chrom_res_dict = parse_PCMA_comb_results(res_file, num_traits)
    
    # Filter for good SNPs?
    
    # parse ldetect map
    print 'Loading ldetect map'
    ldr = h5py.File(ld_reg_map, 'r')
    
    num_new_hits = 0
    num_comb_hits = 0
    num_marg_hits = 0
    num_shared_hits = 0
    num_missed_hits = 0
        
    chrom_bin_dict = {} 
    
    res_summary_dict = {}
    for chrom in range(1, 23):
        print 'Working on chromosome %d' % chrom
        chrom_str = 'chr%d' % chrom
        res_dict = chrom_res_dict[chrom_str]
        chrom_bins = ldr[chrom_str]
        bin_indices = sp.digitize(res_dict['positions'], chrom_bins)
        chrom_bin_dict[chrom_str] = {'bin_indices':bin_indices, 'chrom_bins':chrom_bins, 'num_bins':len(chrom_bins) - 1}
        
        # Count things..
        print 'Counting hits'
        # assert len(chrom_bins)-1==bin_indices.max()+1, 'WTF?'
        for bin_i in range(bin_indices.max() + 1):
            bin_filter = bin_indices == bin_i
            if sp.any(bin_filter):
                min_marg_pv = (res_dict['min_marg_ps'][bin_filter]).min()
                marg_hit = min_marg_pv < 5E-8
                comb_ps = res_dict['comb_ps'][bin_filter]
                min_i = comb_ps.idxmin()  
                min_comb_pv = comb_ps[min_i]
                min_sid = res_dict['sids'][min_i]
                
                comb_hit = min_comb_pv < 5E-8
    
                if marg_hit:
                    num_marg_hits += 1
                    if comb_hit:
                        num_shared_hits += 1
                        num_comb_hits += 1
                    else:
                        num_missed_hits += 1
                elif comb_hit:
                    num_new_hits += 1
                    num_comb_hits += 1
                    
                    start_pos = chrom_bins[bin_i]
                    if bin_i < len(chrom_bins) - 1:
                        end_pos = chrom_bins[bin_i + 1]
                    else:
                        end_pos = -1
                    res_summary_dict[bin_i] = {'min_marg_pv':min_marg_pv, 'min_comb_pv':min_comb_pv,
                                             'min_PC_pv': res_dict['pc_ps'].loc[min_i], 'min_sid':min_sid,
                                             'chromsome':chrom, 'positions_bin':(start_pos, end_pos)}
                    # More information on new hits somewhere
        
    print '\nResults summary: \n# new hits: %d \n# missed hits: %d \n# of shared hits: %d \n# multivar. hits: %d \n# marg. hits: %d \n' % (num_new_hits, num_missed_hits, num_shared_hits, num_comb_hits, num_marg_hits)
    print res_summary_dict
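The region assignment above relies on sp.digitize mapping each SNP position to the index of the ldetect boundary it falls after. A small sketch of that mapping with hypothetical boundaries and positions, again using NumPy for the array calls:

import numpy as np

# Hypothetical ldetect-style region boundaries (bp) for one chromosome.
chrom_bins = np.array([0, 2000000, 5500000, 9000000])
positions = np.array([150000, 2300000, 5499999, 8999000])

bin_indices = np.digitize(positions, chrom_bins)
# bin_indices -> [1, 2, 2, 3]: index i means the position lies in
# [chrom_bins[i - 1], chrom_bins[i]), i.e. in region i - 1 of the
# len(chrom_bins) - 1 regions defined by the boundaries.
for bin_i in range(bin_indices.max() + 1):
    bin_filter = bin_indices == bin_i
    if np.any(bin_filter):
        print('boundary index %d: %d SNPs' % (bin_i, bin_filter.sum()))
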
Beispiel #36
0
def corr_est(map1,
             map2,
             noise1,
             noise2,
             freq1,
             freq2,
             lags=(),
             speedup=False,
             verbose=False):
    r"""Calculate the cross correlation function of the maps.

    The cross-correlation function is a function of f1, f2, and angular lag.
    The angular lag bins are passed in; all pairs of frequencies are
    calculated.

    Parameters
    ----------
    lags: array like
        Angular lag bins (upper-side bin edges).
    speedup: boolean
        Speeds up the correlation. This works fine, yes? Should be the
        normal way if so.

    Returns
    -------
    corr: array
        The correlation between 2 maps.
    counts: array
        The weighting of the correlation based on the maps' weights.

    """
    map1_ra = map1.get_axis('ra')
    map2_ra = map2.get_axis('ra')
    map1_dec = map1.get_axis('dec')
    map2_dec = map2.get_axis('dec')

    input_map1 = map1[freq1, :, :]
    input_map2 = map2[freq2, :, :]
    input_noise1 = noise1[freq1, :, :]
    input_noise2 = noise2[freq2, :, :]

    # Noise weight
    input_map1 *= input_noise1
    input_map2 *= input_noise2

    nlags = len(lags)
    nfreq = len(freq1)
    corr = sp.zeros((nfreq, nfreq, nlags), dtype=float)
    counts = sp.zeros(corr.shape, dtype=float)
    # Note that if DEC != 0, a degree of RA spans less than a degree on the sky.
    ra_fact = sp.cos(sp.pi * map1.info['dec_centre'] / 180.0)

    # Calculate the pairwise lags.
    dra = (map1_ra[:, None] - map2_ra[None, :]) * ra_fact
    ddec = map1_dec[:, None] - map2_dec[None, :]
    lag = dra[:, None, :, None]**2 + ddec[None, :, None, :]**2
    lag = sp.sqrt(lag)
    # Bin this up.
    lag_inds = sp.digitize(lag.flatten(), lags)

    if speedup:
        print "Starting Correlation (sparse version)"

        (nr1, nd1) = (len(map1_ra), len(map1_dec))
        (nr2, nd2) = (len(map2_ra), len(map2_dec))
        (r1ind, d1ind) = (sp.arange(nr1), sp.arange(nd1))
        (r2ind, d2ind) = (sp.arange(nr2), sp.arange(nd2))
        ra1_pairind = r1ind.repeat(nr2 * nd1 * nd2)
        ra2_pairind = sp.tile(r2ind.repeat(nd2), (1, nr1 * nd1)).flatten()
        dec1_pairind = sp.tile(d1ind.repeat(nr2 * nd2), (1, nr1)).flatten()
        dec2_pairind = sp.tile(d2ind, (1, nr1 * nr2 * nd1)).flatten()

        # precalculate the pair indices for a given lag
        # could also imagine calculating the map slices here
        posmaskdict = {}
        for klag in range(nlags):
            mask = (lag_inds == klag)
            posmaskdict[repr(klag)] = (ra1_pairind[mask], ra2_pairind[mask],
                                       dec1_pairind[mask], dec2_pairind[mask])

        for if1 in range(len(freq1)):
            for jf2 in range(len(freq2)):
                start = time.time()

                data1 = input_map1[if1, :, :]
                data2 = input_map2[jf2, :, :]
                weights1 = input_noise1[if1, :, :]
                weights2 = input_noise2[jf2, :, :]

                for klag in range(nlags):
                    (r1m, r2m, d1m, d2m) = posmaskdict[repr(klag)]
                    dprod = data1[r1m, d1m] * data2[r2m, d2m]
                    wprod = weights1[r1m, d1m] * weights2[r2m, d2m]
                    corr[if1, jf2, klag] += sp.sum(dprod)
                    counts[if1, jf2, klag] += sp.sum(wprod)

                if verbose:
                    print if1, jf2, (time.time() - start)
                    print counts[if1, jf2, :]
    else:
        print "Starting Correlation (full version)"
        for if1 in range(len(freq1)):
            for jf2 in range(len(freq2)):
                start = time.time()
                # Calculate the pairwise products.
                data1 = input_map1[if1, :, :]
                data2 = input_map2[jf2, :, :]
                weights1 = input_noise1[if1, :, :]
                weights2 = input_noise2[jf2, :, :]
                dprod = data1[..., None, None] * data2[None, None, ...]
                wprod = weights1[..., None, None] * \
                        weights2[None, None, ...]
                for klag in range(nlags):
                    mask = (lag_inds == klag)
                    corr[if1, jf2, klag] += sp.sum(dprod.flatten()[mask])
                    counts[if1, jf2, klag] += sp.sum(wprod.flatten()[mask])

                if verbose:
                    print if1, jf2, (time.time() - start)
                    print counts[if1, jf2, :]

    mask = (counts < 1e-20)
    counts[mask] = 1
    corr /= counts
    corr[mask] = 0
    counts[mask] = 0

    return corr, counts
def bin(catalogue):
    return scipy.digitize(
        catalogue.distance(), radialedges, right=False) - 1
Beispiel #38
0
def count_ld_indep_regions(
        res_file,
        num_traits=None,
        ss_file=None,
        ld_reg_map='/project/PCMA/faststorage/1_DATA/fourier_ls.hdf5'):
    # parse results..
    print 'Parsing PCMA results'
    if ss_file is not None:
        chrom_res_dict = parse_PCMA_results(ss_file, res_file)
    else:
        chrom_res_dict = parse_PCMA_comb_results(res_file, num_traits)

    # Filter for good SNPs?

    # parse ldetect map
    print 'Loading ldetect map'
    ldr = h5py.File(ld_reg_map, 'r')

    num_new_hits = 0
    num_comb_hits = 0
    num_marg_hits = 0
    num_shared_hits = 0
    num_missed_hits = 0

    chrom_bin_dict = {}

    res_summary_dict = {}
    for chrom in range(1, 23):
        print 'Working on chromosome %d' % chrom
        chrom_str = 'chr%d' % chrom
        res_dict = chrom_res_dict[chrom_str]
        chrom_bins = ldr[chrom_str]
        bin_indices = sp.digitize(res_dict['positions'], chrom_bins)
        chrom_bin_dict[chrom_str] = {
            'bin_indices': bin_indices,
            'chrom_bins': chrom_bins,
            'num_bins': len(chrom_bins) - 1
        }

        # Count things..
        print 'Counting hits'
        # assert len(chrom_bins)-1==bin_indices.max()+1, 'WTF?'
        for bin_i in range(bin_indices.max() + 1):
            bin_filter = bin_indices == bin_i
            if sp.any(bin_filter):
                min_marg_pv = (res_dict['min_marg_ps'][bin_filter]).min()
                marg_hit = min_marg_pv < 5E-8
                comb_ps = res_dict['comb_ps'][bin_filter]
                min_i = comb_ps.idxmin()
                min_comb_pv = comb_ps[min_i]
                min_sid = res_dict['sids'][min_i]

                comb_hit = min_comb_pv < 5E-8

                if marg_hit:
                    num_marg_hits += 1
                    if comb_hit:
                        num_shared_hits += 1
                        num_comb_hits += 1
                    else:
                        num_missed_hits += 1
                elif comb_hit:
                    num_new_hits += 1
                    num_comb_hits += 1

                    start_pos = chrom_bins[bin_i]
                    if bin_i < len(chrom_bins) - 1:
                        end_pos = chrom_bins[bin_i + 1]
                    else:
                        end_pos = -1
                    res_summary_dict[bin_i] = {
                        'min_marg_pv': min_marg_pv,
                        'min_comb_pv': min_comb_pv,
                        'min_PC_pv': res_dict['pc_ps'].loc[min_i],
                        'min_sid': min_sid,
                        'chromsome': chrom,
                        'positions_bin': (start_pos, end_pos)
                    }
                    # More information on new hits somewhere

    print '\nResults summary: \n# new hits: %d \n# missed hits: %d \n# of shared hits: %d \n# multivar. hits: %d \n# marg. hits: %d \n' % (
        num_new_hits, num_missed_hits, num_shared_hits, num_comb_hits,
        num_marg_hits)
    print res_summary_dict
from scipy import mean, digitize, cumsum, array, concatenate, sort, split, set_printoptions
from scipy.stats import uniform
from QueueingTheory import mm1, getRandomArrivalServiceTimes
# QueueingTheory Module available on https://gist.github.com/siddhant3s/5665696
set_printoptions(precision = 3)
arrival_rate = 1
service_rate = 4/3.0
n_process = 10000
arrival_times, service_times = getRandomArrivalServiceTimes(n_process, arrival_rate, service_rate)
server_prob = array([0.2, 0.2, 0.2, 0.2, 0.2])
n_server = server_prob.size
# maps kth process to ith server
server_address_table = digitize(uniform.rvs(size = n_process), cumsum(server_prob))
server_arrival_times = [arrival_times[server_address_table == i] for i in range(n_server)]
server_service_times = [service_times[server_address_table == i] for i in range(n_server)]
results = map(mm1, server_arrival_times, server_service_times)
print "Mean Wait(1)", array([mean(result['wait_times']) for result in results])
print "Mean QueueSize(1)", array([mean(result['queue_size']) for result in results])
server_prob_matrix = array([[ 0.2,  0.2,  0.2,  0.2,  0.2],
                            [ 0.2,  0.2,  0.2,  0.2,  0.2],
                            [ 0.2,  0.2,  0.2,  0.2,  0.2],
                            [ 0.2,  0.2,  0.2,  0.2,  0.2],
                            [ 0.2,  0.2,  0.2,  0.2,  0.2]])
server_prob_matrix_cumsumed = cumsum(server_prob_matrix, axis = 1)
server_address_tables = [
    digitize(uniform.rvs(size = len(server_arrival_times[i])), server_prob_matrix_cumsumed[i])
    for i in range(n_server)
]
server_arrival_times = [
    sort(concatenate([results[i]['completion_times'][server_address_tables[i] == k]
                      for i in range(n_server)]))
    for k in range(n_server)]  # assumed continuation: group completion times by destination server k
Beispiel #40
0
def plot_gw_r2_decay(file_prefix,
                     num_random_xs=200,
                     max_dist=1000000,
                     call_method_id=78,
                     mac_filter=15,
                     debug_filter=1):
    """
    Plots r2 decay on the genome-wide scale
    """
    dtype = 'single'  #To increase matrix multiplication speed... using 32 bits.
    sd = dp.load_snps_call_method(call_method_id=call_method_id,
                                  debug_filter=debug_filter,
                                  min_mac=mac_filter)
    #sd.filter_mac_snps(mac_filter)
    h_inverse_matrix_file = env[
        'data_dir'] + 'snp_cov_mat_h_inv_cm%d.pickled' % (call_method_id)
    if not os.path.isfile(h_inverse_matrix_file):
        K = sd.get_snp_cov_matrix()
        H_sqrt = lm.cholesky(K)
        H_sqrt_inv = (H_sqrt).I
        with file(h_inverse_matrix_file, 'wb') as f:
            cPickle.dump(H_sqrt_inv, f, protocol=2)
    else:
        with file(h_inverse_matrix_file) as f:
            H_sqrt_inv = cPickle.load(f)

    cps_list = sd.getChrPosSNPList()
    x_cps = random.sample(cps_list, num_random_xs)
    y_cps = cps_list
    result_dict = {}
    n = len(sd.accessions)
    print 'Starting calculation'
    sys.stdout.flush()
    dists = []
    r2s = []
    t_r2s = []
    x_macs = []
    y_macs = []
    n_saved = 0
    s1 = time.time()
    for i, (x_c, x_p, x_snp) in enumerate(x_cps):
        print '%d: chromosome=%d, position=%d' % (i, x_c, x_p)
        #Normalize SNP..
        xs = sp.array(x_snp)
        x_mac = sum(xs)
        t_x_snp = sp.dot(((xs - sp.mean(xs)) / sp.std(xs)), H_sqrt_inv).T
        for (y_c, y_p, y_snp) in reversed(y_cps):
            if x_c != y_c:
                continue
            if abs(x_p - y_p) > max_dist:
                continue
            ys = sp.array(y_snp)
            x_macs.append(x_mac)
            y_macs.append(sum(ys))
            (r, pearson_pval) = st.pearsonr(xs, ys)
            r2 = r * r
            t_y_snp = sp.dot(((ys - sp.mean(ys)) / sp.std(ys)), H_sqrt_inv).T
            (t_r, t_pearson_pval) = st.pearsonr(
                t_x_snp, t_y_snp)  #Done twice, but this is fast..
            t_r, t_pearson_pval = float(t_r), float(t_pearson_pval)
            t_r2 = t_r * t_r
            dists.append(abs(x_p - y_p))
            r2s.append(r2)
            t_r2s.append(t_r2)
            n_saved += 1

    time_secs = time.time() - s1
    print 'It took %d minutes and %d seconds to finish.' % (time_secs / 60,
                                                            time_secs % 60)
    print '%d values were saved.' % n_saved
    sys.stdout.flush()

    #Now plotting and binning..
    for m_dist in [50000, 100000, 200000, 500000, 1000000]:
        kbs = m_dist / 1000
        bin_ids = sp.digitize(dists, sp.arange(0, m_dist, m_dist / 100)) - 1
        bin_dict = {}
        for bid in range(100):
            bin_dict[bid] = {'r2s': [], 't_r2s': []}
        filtered_r2s = []
        filtered_t_r2s = []
        filtered_dists = []
        for bid, r2, t_r2, dist in izip(bin_ids, r2s, t_r2s, dists):
            if dist > m_dist:
                continue
            bin_dict[bid]['r2s'].append(r2)
            filtered_r2s.append(r2)
            bin_dict[bid]['t_r2s'].append(t_r2)
            filtered_t_r2s.append(t_r2)
            filtered_dists.append(dist)

        pylab.figure()
        pylab.plot(filtered_dists,
                   filtered_r2s,
                   alpha=0.3,
                   color='k',
                   marker='.',
                   ls='None')
        pylab.xlabel('Distance (bases)')
        pylab.ylabel(r'$r^2$')
        pylab.savefig(file_prefix + '_%dkb_r2s.png' % (kbs))
        pylab.figure()
        pylab.plot(filtered_dists,
                   filtered_t_r2s,
                   alpha=0.3,
                   color='k',
                   marker='.',
                   ls='None')
        pylab.xlabel('Distance (bases)')
        pylab.ylabel(r'$r^2$')
        pylab.savefig(file_prefix + '_%dkb_t_r2s.png' % (kbs))

        r2_avgs = []
        t_r2_avgs = []
        xs = []
        l = sp.arange(0, m_dist, m_dist / 100) + (m_dist / 200)
        for bid in range(100):
            n = len(bin_dict[bid]['r2s'])
            if n > 0:
                r2_avgs.append(sp.sum(bin_dict[bid]['r2s']) / n)
                t_r2_avgs.append(sp.sum(bin_dict[bid]['t_r2s']) / n)
                xs.append(l[bid])

        pylab.figure()
        pylab.plot(xs,
                   r2_avgs,
                   alpha=0.7,
                   color='b',
                   lw=1.8,
                   label=r'standard $r^2$')
        pylab.plot(xs,
                   t_r2_avgs,
                   alpha=0.7,
                   color='m',
                   lw=1.8,
                   label=r'transformed $r^2$')
        pylab.legend(loc=1)
        pylab.xlabel('Distance (bases)')
        pylab.ylabel(r'$r^2$')
        pylab.savefig(file_prefix + '_%dkb_r2s_avgs.png' % (kbs))