Example #1
    def test_msort_1(self):

        a = np.array([[1, 4], [3, 1]])
        b = np.msort(a)
        print(b)
        print("********")

        a = np.arange(32.2, 0.2, -1.0)
        b = np.msort(a)
        print(b)
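For reference, NumPy documents np.msort(a) as equivalent to np.sort(a, axis=0), and msort has been deprecated in recent NumPy releases (1.24+), so every call in the examples below can be rewritten with a plain np.sort along the first axis. A minimal sketch of the equivalence:

import numpy as np

a = np.array([[1, 4], [3, 1]])
# Sorting along axis 0 orders each column independently,
# which is exactly what np.msort does.
print(np.sort(a, axis=0))   # [[1 1]
                            #  [3 4]]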
Example #2
def calculate_eCDF(data, extend=False):
    """Calculate the x- and y-coordinates of an empirical CDF curve.

    This function finds the unique values within a dataset, `data`, and
    calculates the likelihood that a random data point within the set is
    less than or equal to each of those unique values.  The `extend` option
    creates extra values outside the range of `data` corresponding to
    P(X <= x) = 0 and 1, which are useful for plotting eCDFs."""

    ## Get the unique values in `data` and their counts (the histogram).
    counts = Counter(data.ravel())
    ## Sort the unique values
    vals = np.msort(list(counts.keys()))
    ## Calculate the cumulative number of counts, then divide by the total.
    CDF = np.cumsum([counts[val] for val in vals])
    CDF = CDF / CDF[-1]

    ## If `extend`, add points to `vals` and `CDF`
    if extend:
        data_range = vals[-1] - vals[0]
        vals = [vals[0] - (0.01 * data_range)] + list(vals)
        vals = np.asarray(vals + [vals[-1] + (0.01 * data_range)])
        CDF = np.asarray([0] + list(CDF) + [1])

    return vals, CDF
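A short usage sketch for the helper above, assuming numpy is imported as np, Counter comes from collections (as the function body expects), and a NumPy version that still provides np.msort:

import numpy as np
from collections import Counter

sample = np.array([3, 1, 2, 2, 5])
vals, CDF = calculate_eCDF(sample, extend=True)
# vals: unique values, padded 1% past the data range because extend=True
# CDF:  P(X <= x) for each of those values, running from 0 up to 1
print(vals)   # [0.96 1.   2.   3.   5.   5.04]
print(CDF)    # [0.   0.2  0.6  0.8  1.   1.  ]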
Example #3
    def plot(self, sub, ints, plot_style="seaborn"):
        """ Plot complete data, highlight subset currently being searched,
            and add vertical lines for discovered intervals. (only intervals of
            the current recursion level appear.)
        """
        import matplotlib.pyplot as plt
        plt.style.use(plot_style)

        if self.is_hist:
            plt.step(list(range(len(self.dat))), self.dat)
            plt.fill_between(list(range(len(self.dat))),
                             self.dat,
                             step="pre",
                             alpha=.4)
            plt.axvspan(sub[0], sub[1] - 1, color="orange", alpha=.3)
            for i in ints:
                plt.axvspan(i[0], i[1], color="green", alpha=.1)
            for i in ints:
                plt.axvline(i[0], color="black")
                plt.axvline(i[1], color="black")
        else:
            dat = np.msort(self.dat)
            plt.hist(dat, bins=30)
            plt.axvspan(dat[sub[0]], dat[sub[1] - 1], color="orange", alpha=.3)
            for i in ints:
                plt.axvspan(dat[i[0]], dat[i[1]], color="green", alpha=.1)
            for i in ints:
                plt.axvline(dat[i[0]], color="black")
                plt.axvline(dat[i[1]], color="black")
        plt.show()
Example #4
        def test(shape):
            tensor = make_tensor(shape, device, dtype, low=-9, high=9)
            if tensor.size() != torch.Size([]):
                if dtype is torch.bfloat16:
                    expected = torch.from_numpy(np.msort(tensor.float().cpu().numpy())).bfloat16()
                else:
                    expected = torch.from_numpy(np.msort(tensor.cpu().numpy()))
            else:
                expected = tensor  # numpy.msort() does not support 0-d (scalar) arrays

            result = torch.msort(tensor)
            self.assertEqual(result, expected)

            out = torch.empty_like(result)
            torch.msort(tensor, out=out)
            self.assertEqual(out, expected)
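The test above checks torch.msort against numpy.msort; per the PyTorch documentation, torch.msort(t) is equivalent to torch.sort(t, dim=0).values, i.e. a sort along the first dimension. A minimal check:

import torch

t = torch.tensor([[1.0, 4.0], [3.0, 1.0]])
# Both calls sort each column independently along dim 0.
assert torch.equal(torch.msort(t), torch.sort(t, dim=0).values)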
Example #5
def paDifCone( thetaMin, thetaMax, outfile, ntrials=100000, nbins=1000 ) :
  nsamps = 0
  pdif = numpy.zeros( ntrials, dtype=float ) 
  costhetaMin = math.cos( math.pi*thetaMin/180. )
  costhetaMax = math.cos( math.pi*thetaMax/180. )
  for n in range( 0, ntrials ) :
    [pa1,x1,y1,z1] = paRandom()
    cosphi = -1.01	# impossible value
    while (cosphi > costhetaMin) or (cosphi < costhetaMax) :
      [pa2,x2,y2,z2] = paRandom()
      cosphi = x1*x2 + y1*y2 + z1*z2	# yeah, this should be dot(a,b)
      nsamps = nsamps + 1
    pdif[n] = abs(pa1 - pa2)
    if pdif[n] > 90. :
      pdif[n] = 180. - pdif[n]          # complement of angle to keep angle acute
  print "ntrials = %d, nsamps = %d" % (ntrials,nsamps)
  pdifSorted = numpy.msort(pdif)
  nskip = ntrials/nbins
  if nskip == 0 :
    nskip = ntrials
  fout = open( outfile, "w" )
  for n in range( 0, ntrials, nskip ) :
    fout.write("%10.3f  %10.3f\n" % (pdifSorted[n], float(n)/float(ntrials) ) )
  fout.write("%10.3f  %10.3f\n" % (pdifSorted[ntrials-1], 1.0 ) )
  return pdifSorted
Example #6
def mode(x,j):
  """
  calculate the mode for continuous data in array x
  see Numerical Recipes, Chapter 13

  usage:  index_list, probability_list = mode(array_of_data,window)

  returns two lists: 
    1) the index {i.e. the value from the data calculated as (x[i]+x[i+window])/2}
    2) the probability of finding that value

  """
# make sure data is in an array and make sure it is sorted 
# (will not maintain synchronicity between columns though, but that shouldn't matter
# for the mode calculation!)
  x = N.asarray(x)
  x = N.msort(x)

# create the index array
  ind = N.zeros((len(x)-j,x.shape[1]),float)
# create the probability array

  p = N.zeros((len(x)-j,x.shape[1]),float)
  n=len(x)
  for i in range(n-j):
    ind[i] = N.multiply(0.5, N.add(x[i], x[i+j]))
    p[i] = N.divide(j,N.multiply(n,N.subtract(x[i+j],x[i])))
  return ind, p
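A hedged usage sketch for mode() above, assuming the snippet's N alias is NumPy (import numpy as N) and that the input is a 2-D array with one column per variable; the mode estimate is the value whose window carries the highest probability:

import numpy as N

data = N.random.normal(loc=5.0, scale=1.0, size=(1000, 1))
values, probs = mode(data, 50)        # window of 50 sorted points
best = values[N.argmax(probs), 0]     # value whose window is densest
print("estimated mode ~", best)       # should land near 5.0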
def unimodality_dip_test(path: str, plot_show=False) -> bool:
    '''
    #http://www.nicprice.net/diptest/Hartigan_1985_AnnalStat.pdf
    #https://github.com/BenjaminDoran/unidip
    Given the image and conduct dip test to see whether it's unimodal or not.
    @path: image path
    @plot_show: see whether plot the histogram or not
    '''
    img = cv2.imread(path, 0)
    img_array = img.ravel()
    #input an array
    #return True if its unimodal distributed
    data = np.msort(img_array)
    #the probability of unimodal
    uni_prob = dip.diptst(data)[1]
    if uni_prob > 0.5:
        #print(f'This image is unimodel distributed with probability of {uni_prob*100:.2f} %')
        unimodality = True
    else:
        #print(f'This image is at least bimodel distributed with probability of {(1-uni_prob)*100:.2f} %')
        unimodality = False
    if plot_show:
        plt.figure()
        sns.distplot(img.ravel(), bins=256, kde=True, hist=True)
        plt.title('Histogram of the image')
        plt.show()
    return unimodality
def paDifUniform( outfile ) :
  # choose 2 points randomly on surface of sphere; compute PA projected on the plane of
  #  the sky for each of them; then compute angular difference of the two directions;
  #  reorder and compute cumulative distribution function

  ntrials = 100000 #100000
  nsamples = 1000 #1000
  zratio = numpy.zeros( ntrials, dtype=float )
  incldif = 0.
  padif = 0.
  count = 0.
  for n in range( 0, ntrials ) :
    [pa1,incl1,x1,y1,z1] = paRandom() #core field
    [pa2,incl2,x2,y2,z2] = paRandom() #envelope field
    zratio[n] = z1/z2
    if (zratio[n] > ratio1) or (0. < zratio[n] < ratio2):
        padif_tmp = abs(pa1 - pa2)
        padif += numpy.where(padif_tmp > 90., 180.-padif_tmp, padif_tmp)
        incldif_tmp = abs(incl1 - incl2)
        incldif += numpy.where(incldif_tmp > 180., 360.-incldif_tmp, incldif_tmp)
        count += 1.

  zratioSorted = numpy.msort(zratio)
  nskip = ntrials/nsamples
  print "the mean position angle difference in degrees =", padif/count
  print "the mean inclination angle difference in degrees =", incldif/count
  print "probability =", count/ntrials
  fout = open( outfile, "w" )
  for n in range( 0, ntrials, nskip ) :
    fout.write("%10.3f  %10.3f\n" % (zratioSorted[n], float(n)/float(ntrials) ) )
  fout.write("%10.3f  %10.3f\n" % (zratioSorted[ntrials-1], 1.0 ) )
Example #9
    def find_closest_k(self, point):
        distances = np.ones(self.x.shape[0])
        for i, xm in enumerate(self.norm_x):
            distances[i] = np.linalg.norm(xm - point)
        sorted_dist = np.msort(distances)
        closest_k = (distances <= sorted_dist[self.k-1])

        return self.y[closest_k]
 def get_sample_list(self, sample_dir):
     list_sample = [
         os.listdir(sample_dir)[i].split(".")[0]
         for i in range(len(os.listdir(sample_dir)))
     ]
     list_sample = np.uint32(list_sample)
     list_sample = np.msort(list_sample)
     return list_sample
Example #11
def sort_func():
    x = np.array([3, 1, 2])
    print("x ", x)
    # sort the array
    print("np.sort(a) ", np.sort(x))
    # sort a multi-dimensional array
    a = np.array([[1, 5, 4], [3, 2, 1]])
    print("x ", a)
    # sort the array along the last axis (the default)
    print("np.sort(a) ", np.sort(a))
    # sort the flattened array
    print(" np.sort(a, axis=None) ", np.sort(a, axis=None))
    #  sort along the first axis
    print(" np.sort(a, axis=0) ", np.sort(a, axis=0))
    # perform an indirect sort using a sequence of keys (np.lexsort)
    surnames = ('Hertz', 'Galilei', 'Hertz')
    first_names = ('Heinrich', 'Galileo', 'Gustav')
    ind = np.lexsort((first_names, surnames))
    print("ind", ind)
    rs = [surnames[i] + ", " + first_names[i] for i in ind]
    print("rs", rs)

    # np.argsort returns the indices that would sort the array
    x = np.array([3, 1, 2])
    print("np.argsort(x)", np.argsort(x))
    # rebuild the array in sorted order from those indices
    rs = [x[i] for i in np.argsort(x)]
    print("rs", rs)

    # np.msort returns a copy of the array sorted along the first axis
    x = np.array([3, 1, 2])
    print("np.msort(x)", np.msort(x))
    y = np.sort_complex([5, 3, 6, 2, 1])
    print("y", y)

    # numpy.argmax returns the indices of the maximum values along an axis
    a = np.arange(6).reshape(2, 3)
    print("a", a)
    print("np.argmax(a)", np.argmax(a))

    # numpy.nanargmax returns the index of the maximum, ignoring NaNs
    a = np.array([[np.nan, 4], [2, 3]])
    print("np.nanargmax(a)", np.nanargmax(a))

    a = np.arange(6).reshape(2, 3)
    print("a", a)
    print("np.argmin(a)", np.argmin(a))

    # find the indices of elements meeting a condition, grouped by element (np.argwhere)
    x = np.argwhere(x > 1)
    print("x", x)
    x = np.where(x > 1)
    print("x", x)

    # np.searchsorted finds the index where an element should be inserted to keep the order
    x = np.searchsorted([1, 2, 3, 4, 5], 3)
    print("x", x)
Example #12
 def find_docs_by(self, q: Query) -> np.ndarray:
     binmat = self.matrix
     docs = binmat.columns.to_numpy()
     rows = binmat.index
     ones = np.ones(len(docs), dtype=np.bool_)
     zeroes = np.zeros(len(docs), dtype=np.bool_)
     vectors = [
         binmat.loc[term] if term in rows else zeroes
         for term in get_terms(q)
     ]
     intersection = reduce(np.logical_and, vectors, ones)
     return np.msort(docs[intersection == True])
Example #13
def AALERGIA(dffa, alpha, dpfa_orig):
    dffa_merged = copy.deepcopy(dffa)
    initial_state = dffa.initial_state

    dffa_merged.RED = np.append(dffa_merged.RED, initial_state)

    initial_blue_states = dffa.frequency_transition_matrix[0][initial_state][:]
    initial_blue_states = initial_blue_states[np.nonzero(initial_blue_states)]

    dffa_merged.BLUE = np.append(dffa_merged.BLUE, initial_blue_states)

    while len(dffa_merged.BLUE) > 0:
        dffa_merged.BLUE = np.msort(dffa_merged.BLUE)
        q_b = dffa_merged.BLUE[0]
        dffa_merged.BLUE = dffa_merged.BLUE[1:len(dffa_merged.BLUE)]
        promote = 1
        labels = dffa.state_labels[dffa_merged.RED]
        state_labels_q_b = dffa_merged.state_labels[q_b]

        label_index = findall(labels, state_labels_q_b)

        for i in range(0, len(label_index)):
            q_r = dffa_merged.RED[label_index[i]]

            thresh = calculate_compatible_parameter(dffa, q_r, q_b, alpha)

            if (AAlergia_compatible(dffa, dpfa_orig, q_r, q_b, 1, 1, alpha,
                                    thresh)):
                dffa_merged = AAlergia_merge(dffa_merged, q_r, q_b)
                promote = 0
                break

        if promote == 1:
            dffa_merged.RED = np.append(dffa_merged.RED, q_b)

        #build new blue set
        qr_succ = dffa_merged.frequency_transition_matrix[0][
            dffa_merged.RED][:]
        qr_succ = qr_succ[np.nonzero(qr_succ)]

        #gets the new blue data (returns data that is not in intersection)
        difference = np.setxor1d(qr_succ,
                                 np.append(dffa_merged.RED, dffa_merged.BLUE))
        ia = np.empty(0, dtype=int)

        for i in range(0, len(qr_succ)):
            for j in range(0, len(difference)):
                if (difference[j] == qr_succ[i]):
                    ia = np.append(ia, qr_succ[i])

        dffa_merged.BLUE = np.append(dffa_merged.BLUE, ia)

    return dffa_merged
Example #14
def test_dip(data, alpha = 0.05, verbose = True)->bool:
    import unidip.dip as dip
    # sort data
    data = np.msort(data)
    # test
    stat, p, _ = dip.diptst(data)
    # display
    if verbose:
        print('stat=%.3f, p=%.3f' % (stat, p))
    if p > alpha:
        print('Probably unimodal')
        return True
    else:
        print('Probably not unimodal.')
        return False
Example #15
 def __init__(self,
              dat,
              is_hist=False,
              alpha=0.05,
              ntrials=100,
              mrg_dst=1,
              debug=False):
     self.dat = np.msort(np.array(dat)) if not is_hist else np.array(dat)
     self.is_hist = is_hist
     self.alpha = alpha
     self.ntrials = ntrials
     self.mrg_dst = mrg_dst
     self.debug = debug
Example #16
 def isbimodal(data, method):
     if method == 'hdt':
         # Use Hartigan's dip statistic to decide if distribution deviates from unimodality.
         _, pval, _ = diptst(np.msort(data))
         return (pval is not None) and (pval <= 0.05)
     else:
         # Compare Bayesian Information Content of two Gaussian Mixture Models.
         X = data.reshape(-1, 1)
         gmm2 = mixture.GaussianMixture(n_components=2,
                                        covariance_type='full').fit(X)
         gmm1 = mixture.GaussianMixture(n_components=1,
                                        covariance_type='full').fit(X)
         return gmm2.bic(X) <= gmm1.bic(X)
Example #17
def lizFindCirclesGrid(circles):
    """
    Passed in array of circles with no false positives, return array of most likely 6x7 grid centers
    in raster order.
    """
    # generate row and column delimiting values
    cxs = []
    cys = []
    for i in circles:
        cxs.append(i[0])
        cys.append(i[1])

    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    retval_x, bestlabels_x, centers_kmeans_x = cv2.kmeans(
        np.array(np.float32(cxs)), 7, criteria, 10, cv2.KMEANS_PP_CENTERS
    )
    retval_y, bestlabels_y, centers_kmeans_y = cv2.kmeans(
        np.array(np.float32(cys)), 6, criteria, 10, cv2.KMEANS_PP_CENTERS
    )

    # we should see 7 groups in x, and 6 groups in y
    # delimited by a jump of one piece size in width (~30-50 pixels)
    centers_kmeans_x = np.msort(centers_kmeans_x)
    centers_kmeans_y = np.msort(centers_kmeans_y)

    fullrow = []
    fullcol = []
    for i in centers_kmeans_x:
        fullcol.append(int(i))
    for j in centers_kmeans_y:
        fullrow.append(int(j))

    finalout = []
    # finalout is 42 possible pairs
    for i in range(5, -1, -1):
        for j in range(7):
            finalout.append([fullrow[i], fullcol[j]])

    return finalout, centers_kmeans_x, centers_kmeans_y
def eg3_10():
    """
    3.10 Hands-on: simple statistical analysis
        We could use thresholds to strip out outliers, but there is a better way: the median.
    Arrange the values in order of size to form a sequence; the number sitting in the middle
    of that sequence is the median. For example, for the five values 1, 2, 3, 4 and 5 the
    median is the middle number, 3. The steps for computing the median are as follows.
    """
    # (1) Compute the median of the close prices. Create a new Python script file named
    # simplestats.py. You already know how to read data from a CSV file into an array, so
    # just copy that one line of code and make sure it only grabs the close price column:
    c = np.loadtxt('output/data.csv',
                   delimiter=',',
                   usecols=(6, ),
                   unpack=True)
    print c
    # (2) A function called median will find the median for us. Call it and print the result
    # right away:
    print "median =", np.median(c)
    # (3) Since this is our first time using the median function, let's check whether the
    # result is correct. Not because we're suspicious! We could scan the whole data file and
    # find the right answer by hand, but that would be no fun. Instead we sort the price
    # array and print the value in the middle position, which mimics the algorithm for
    # finding the median. The msort function takes care of the first step: call it, get the
    # sorted array, and print the result.
    sorted_close = np.msort(c)
    print "sorted =", sorted_close
    # Great, the code works! Now let's grab the number in the middle:
    N = len(c)
    print "middle =", sorted_close[(N - 1) / 2]
    """
    # The book has a misprint here, hence this test of the current implementation.
    Wrong version:   print "middle =", sorted[(N - 1)/2]
    Correct version: print "middle =", sorted_close[(N - 1)/2]
    tmp = [1, 2, 3, 4, 5, 6]
    print np.msort(tmp)
    print "median = ", np.median(tmp)
    print "middle = ", tmp[(len(tmp) - 1) / 2]
    """
    # (4) Hmm, this value differs from the one the median function gave us. How come? On
    # closer inspection, the value returned by median does not even appear in our data file.
    # Stranger still! Before filing a bug report with the NumPy team, let's look at the
    # documentation. The mystery is easy to solve: our simple simulation only works for
    # arrays of odd length. For arrays of even length, the median is the average of the two
    # middle values. So type the following:
    print "average middle =", (sorted_close[N / 2] +
                               sorted_close[(N - 1) / 2]) / 2
    # (5) Another statistic we care about is the variance, which measures how much a
    # variable fluctuates. In our case it also tells us how risky an investment is: stocks
    # whose prices swing too wildly are bound to give their holders trouble.
    # In NumPy, computing the variance is a single line of code:
    print "variance =", np.var(c)
    # (6) Since we don't trust NumPy's functions, let's double-check the result against the
    # definition of variance given in the documentation. Note that this definition may
    # differ from the one in your statistics textbook, but it is the more general one in
    # statistics.
    print "variance from definition =", np.mean((c - c.mean())**2)
Example #19
def calMedian(filename) :
    c = np.loadtxt(filename, delimiter=',',
            skiprows=1,
            usecols=(title['TCLOSE'],),
            unpack=True)
    #print c
    print "median =", np.median(c)

    sorted_close=np.msort(c)
    print "sorted_close =", sorted_close
    N=len(c)
    print "middle =",sorted_close[(N-1)/2]
    print "variance =", np.var(c)
    print "variance from definition =", np.mean((c-c.mean())**2)
Example #20
def lizFindCirclesGrid(circles):
    """
    Passed in array of circles with no false positives, return array of most likely 6x7 grid centers
    in raster order.
    """
    #generate row and column delimiting values
    cxs = []
    cys = []
    for i in circles:
        cxs.append(i[0])
        cys.append(i[1])

    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    retval_x, bestlabels_x, centers_kmeans_x = cv2.kmeans(
        np.array(np.float32(cxs)), 7, criteria, 10, cv2.KMEANS_PP_CENTERS)
    retval_y, bestlabels_y, centers_kmeans_y = cv2.kmeans(
        np.array(np.float32(cys)), 6, criteria, 10, cv2.KMEANS_PP_CENTERS)

    #we should see 7 groups in x, and 6 groups in y
    #delimited by a jump of one piece size in width (~30-50 pixels)
    centers_kmeans_x = np.msort(centers_kmeans_x)
    centers_kmeans_y = np.msort(centers_kmeans_y)

    fullrow = []
    fullcol = []
    for i in centers_kmeans_x:
        fullcol.append(int(i))
    for j in centers_kmeans_y:
        fullrow.append(int(j))

    finalout = []
    #finalout is 42 possible pairs
    for i in range(5, -1, -1):
        for j in range(7):
            finalout.append([fullrow[i], fullcol[j]])

    return finalout, centers_kmeans_x, centers_kmeans_y
Example #21
def paDifUniform( outfile ) :
  ntrials = 100000
  nsamples = 1000
  pdif = numpy.zeros( ntrials, dtype=float ) 
  for n in range( 0, ntrials ) :
    [pa1,x,y,z] = paRandom()
    [pa2,x,y,z] = paRandom()
    pdif[n] = abs(pa1 - pa2)
    if pdif[n] > 90. :
      pdif[n] = 180. - pdif[n]
  pdifSorted = numpy.msort(pdif)
  nskip = ntrials/nsamples
  fout = open( outfile, "w" )
  for n in range( 0, ntrials, nskip ) :
    fout.write("%10.3f  %10.3f\n" % (pdifSorted[n], float(n)/float(ntrials) ) )
  fout.write("%10.3f  %10.3f\n" % (pdifSorted[ntrials-1], 1.0 ) )
Example #22
def dip_test(properties, cluster_members, feature=3):
    data = []
    for ii in range(cluster_members):
        data.append(float(properties[ii][feature]))
    
    data = np.array(data)
    data = np.msort(data)
    intervals = dip.diptst(data)

    t_range = np.linspace(0, 0.15, 200)
    kde = gaussian_kde(data)
    plt.plot(t_range, kde(t_range)/100)
    plt.grid()
    plt.title('Pick PDF' + '\n p_value = ' + str(intervals[1]))

    plt.show()
def close_peaks(ind, right):
	# tests whether two peaks are too close to have empty space in between. If yes it removes the second peak as this peak doesn't indicate another storey or wall.
	ind = np.msort(ind) # necessary to get real distances
	close_peaks = False
	distance = [] # list with bin distance of peaks
	for i in range(len(ind)-1):
	 	distance.append(ind[i+1]-ind[i])
	temp = [] # list with index of items which have ot be deleted, because distance to low
	for i in distance:
		if i <= right: # right parameter identifies it as one wall
			temp.append(distance.index(i)+1)
			close_peaks = True
	if close_peaks is True:
		print 'two close peaks --> one floor'
		return np.delete(ind,temp) # second peak deleted, continue with newind, should right be made bigger???
	else:
		return None
def dip_test(properties, cluster_members, feature=3):
    data = []
    outliers = [u'121010', u'091110', u'091109', u'091106']

    for ii in range(cluster_members):
        if not properties[ii][0] in outliers:
            data.append(float(properties[ii][feature]))

    data = np.array(data)
    data = np.msort(data)
    intervals = dip.diptst(data)

    t_range = np.linspace(0, 0.2, 200)
    kde = gaussian_kde(data)
    plt.plot(t_range, kde(t_range) / 100)
    plt.grid()
    plt.title('Distance PDF' + '\n p_value = ' + str(intervals[1]))

    plt.show()
Example #25
def historical_VaR_ES_Calculation(df,window_years,output_years,var_per,es_per,day_num):
    outputdf = pd.DataFrame()  
    outputdf['DATE'] = df['DATE'][:252*output_years]
    npts=252*window_years
    npaths = npts- day_num
    ntrails=252*output_years
    port_n_rtn=np.log(df['port_cur_value']/(df['port_cur_value'].shift(-5)))
    port_value = 10000 * np.exp(port_n_rtn)
    
    scenario=[]
    for i in range(ntrails):
        scenario.append(port_value[i:i+npaths-1])
    
    # sort the matrix by column
    scenario = np.msort(scenario)
    
    for i in range(252*output_years):
        # VaR
        outputdf.ix[i,'Historical_VaR_'] = df.ix[i,'port_cur_value']-scenario[int(np.ceil((1-var_per)*npaths)),:]
        # ES
        outputdf.ix[i,'Historical_ES_'] = df.ix[i,'PORT_CurrentValue']-np.mean(scenario[1:int(np.ceil((1-es_per)*npaths)),:])
Example #26
def sort_string_wfs(prefixes):
    length_array = np.zeros(len(prefixes))
    for i in range(0, len(prefixes)):
        length_array[i] = prefixes[i].count(',')

    length_array_u = np.unique(length_array)
    string_set_new = np.empty(0, dtype=str)
    cursor = 0
    IX = np.empty(0, dtype=int)
    for i in range(0, len(length_array_u)):
        len_s = length_array_u[i]
        indices = np.where(length_array == len_s)
        prefixes_to_sort = prefixes[indices]
        s_temp = np.msort(prefixes_to_sort)
        IXargsort = prefixes_to_sort.argsort()
        for j in range(0, len(s_temp)):
            string_set_new = np.append(string_set_new, s_temp[j])
            IX = np.append(IX, indices[0][IXargsort[j]])

        cursor = cursor + len(indices)
    return string_set_new, IX
Example #27
    def find_peaks_and_shoulders(self,
                                 x_window: tuple = None,
                                 slope_threshold: float = -0.00001,
                                 amp_threshold: float = 0.1,
                                 _window: int = None,
                                 _conv_threshold: int = None):

        # default convergence threshold, used unless _conv_threshold is given
        conv_threshold = 3

        if _conv_threshold is not None:
            conv_threshold = _conv_threshold

        _peaks = self.find_peaks(x_window = x_window,
                                 slope_threshold = slope_threshold,
                                 amp_threshold = amp_threshold,
                                 _window = _window)

        _shoulders = self.find_shoulders(x_window = x_window,
                                         slope_threshold = slope_threshold,
                                         amp_threshold = amp_threshold,
                                         _window = _window)

        result = list(_peaks)


        for shoulder in _shoulders:
            acceptable = []
            for peak in _peaks:
                if abs(peak - shoulder) > conv_threshold:
                    acceptable.append(True)
                else:
                    acceptable.append(False)
                    break
            if np.product(acceptable):
                result.append(shoulder)

        return np.msort(np.array(result))
Example #28
def get_QQ_vals(data1, data2):
    """Align 2 datasets' eCDFs for plotting on QQ-plot."""

    vals1, CDF1 = get_eCDF(data1, extend=True)
    vals2, CDF2 = get_eCDF(data2, extend=True)

    joint_vals = np.msort(np.unique(np.hstack((vals1, vals2))))

    joint_CDF1 = np.zeros_like(joint_vals)
    joint_CDF2 = np.zeros_like(joint_vals)

    id1, id2 = 0, 0
    for ii, val in enumerate(joint_vals):

        joint_CDF1[ii] = CDF1[id1]
        if (val in vals1) and (id1 + 1 < len(vals1)):
            id1 += 1

        joint_CDF2[ii] = CDF2[id2]
        if (val in vals2) and (id2 + 1 < len(vals2)):
            id2 += 1

    return joint_vals, joint_CDF1, joint_CDF2
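A usage sketch for get_QQ_vals, assuming get_eCDF is the eCDF helper shown in Example #2 (there named calculate_eCDF) and that numpy and matplotlib are available:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
sample1 = rng.normal(size=200)
sample2 = rng.normal(loc=0.5, size=200)

vals, cdf1, cdf2 = get_QQ_vals(sample1, sample2)
plt.plot(cdf1, cdf2)            # eCDF of sample1 against eCDF of sample2
plt.plot([0, 1], [0, 1], '--')  # reference line for identical distributions
plt.show()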
Example #29
def common_fun():
    # identity matrix
    i2 = np.eye(4)
    print(i2)
    # save the data to a file
    np.savetxt('eye.txt', i2)
    # read the file
    c, v = np.loadtxt('000032.csv',
                      skiprows=1,
                      delimiter=',',
                      usecols=(4, 5),
                      unpack=True)
    # print(c)
    # compute the volume-weighted average price (VWAP)
    vwap = np.average(c, weights=v)
    print('vwap = ', vwap)
    # compute the arithmetic mean
    wap = np.mean(c)
    print('arithmetic mean:', wap)
    # compute the time-weighted average price (TWAP)
    print(c.size)
    t = np.arange(c.size)
    twap = np.average(c, weights=t)
    print('twap = ', twap)
    # compute the spread of the values (max - min, same as np.ptp)
    print('middle is ', np.max(c) - np.min(c))
    print('middle is ', np.ptp(c))
    # get the median
    median = np.median(c)
    print('median:', median)
    # sort the array
    sorted_c = np.msort(c)
    print('sorted_c:', sorted_c[c.size // 2])
    print('sorted_c:', sorted_c[(c.size - 1) // 2])
    # compute the variance
    variance = np.var(c)
    print('variance is ', variance)
def init(mode='r'):
    """
    Generating the starting positions of the vehicles. Initializing the
    starting velocities to 0.

    mode: 'r', 'e', or 'fk' (random, equidistant, first k positions)
    """
    starting_vel = np.zeros(vehicles, int)

    if mode == 'r':
        starting_pos = np.msort(
            np.random.choice(range(road_zones), vehicles, replace=0))
    elif mode == 'e':
        step = road_zones // vehicles
        # print(step)
        starting_pos = np.arange(0, road_zones, step)[:vehicles]
    elif mode == 'fk':
        starting_pos = np.arange(vehicles)

    # print(starting_pos)

    if verbosity:
        print("Initialization complete...")
    return (starting_pos, starting_vel)
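A usage sketch for init(), assuming road_zones, vehicles and verbosity are module-level settings as the function body implies (the names come from the snippet, the values here are made up):

import numpy as np

road_zones = 30   # length of the road, in zones
vehicles = 5      # number of vehicles to place
verbosity = False

positions, velocities = init(mode='r')   # sorted random starting positions
print(positions, velocities)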
# 3.7 Compute the median and variance of an array
import numpy as np
'''
Computing the median and variance of an array
1. Median: median(a)
    returns the median of array a
2. Sorting: msort(a)
    sorts array a in ascending order
    and returns the sorted array
3. Variance: var(a)
    returns the variance of array a
'''
a = np.array([1, 3, 6, 2, 12, 43, 23, 12, 13, 20, 90, 78, 54])
b = np.median(a)
c = np.msort(a)
d = np.var(a)
print(a)
print(b)
print(c)
print(d)
plt.plot(a[mask], b[mask], 'bo')
mask = (b >= 0) & (a <= np.pi / 2)
plt.plot(a[mask], b[mask], 'go')
plt.show()'''

"where函数"
b = np.where(a < 50)
print(b)

"full函数,2×2的全3数组"
np.full((2, 2), 3)

"np.ptp(x)计算x最大差值"
x = 25, 56
a = np.array([1, 23, 45, 56])
list = [1, 60, 56]
print(np.ptp(x))  #also works on arrays
print(np.min(a))  #also works on arrays
print(a.min())  #the .min() method only works on arrays, not on lists
#print(list.min()) would be an error!!!
'''To iterate over a tuple or list,
instead of for index in range(len(x)): print(x[index]),
use for val in enumerate(x): print(val)'''

c, v = np.loadtxt('data.csv', delimiter=',', usecols=(4, 5), unpack=True)
print(np.ptp(c))
print(np.max(c))
print(np.median(c))  #median
print(np.msort(c))
print(np.var(c))  #variance = np.mean((c-c.mean())**2)
Example #33
def dip_test(median_FL_data, total_data, alpha=0.05, save_figure=False):
    """
    Perform a Hartigan's dip test to check for unimodality
    in clusters and splits clusters if bimodality is found.
    This function will take the highest intensity channel
    for each cluster and check for bimodality to correct for
    errors in clustering similar fluorescence profiles.

    Changing alpha will alter how stringent the dip test is.
    A higher alpha will result in higher detection of bimodality,
    but runs a greater risk of false identification. It is
    important to note that this dip test is relatively coarse
    grained and will not identify very slight populations of mixed
    cells (e.g. 10 orange cells clustered with 1000 red cells).

    Returns an updated clustering of the primary clustering
    after performing a dip test.

    Parameters
    ----------
    median_FL_data : dict, clustering data generated by
                    'flowsym.cluster' function
    total_data : other fluorescence profiles for which errors
                 will be corrected
    alpha : how stringent the dip test is
    save_figure : Save generated bar chart showing the number of
                  cells in each cluster and a heat map of the median
                  fluorescence intensity in each channel for each
                  cluster. Figure is saved using 'matplotlib' module.

    Returns
    -------
    change_dict : a dictory containing the corection that must be
                  applied to similar fluorescence profiles if
                  bimodality is found.

    See Also
    --------
    cluster, dip_test

    Examples
    --------
    dip_test(median_FL_data, total_data, alpha=0.08, save_figure=False)


    """

    # Create a copy of the dictionary so we can retain the original clustering data
    change_dict = deepcopy(total_data)

    # Make kde plots
    if 'Cluster 0' in median_FL_data.keys():
        fig, ax = plt.subplots(1,
                               len(median_FL_data.keys()) - 1,
                               figsize=(12, 3))

    else:
        fig, ax = plt.subplots(1, len(median_FL_data.keys()), figsize=(12, 3))

    # Keep track of what plot we're on
    i = 0

    # Get the index of the max fluorescence for each cluster
    for key, value in median_FL_data.items():
        cluster_max_FL_index = np.argmax(value)

        # As long as we aren't cluster one, do our dip test and plot
        if int(key[-1]) - 1 != -1:
            search_key = int(key[-1]) - 1

            # Intensity in each cluster where the intensity is max
            dat = [row[cluster_max_FL_index] for row in total_data[search_key]]

            # Do the dip test
            data = np.msort(dat)
            intervals = UniDip(data, alpha=alpha).run()
            print("Performing dip test on cluster " + str(search_key + 1) +
                  " ... ")

            # Show on the graph where the intervals are
            for j in intervals:
                ax[i].axvspan(data[j[0]],
                              data[j[1]],
                              color='lightblue',
                              alpha=0.4)
                for q in j:
                    ax[i].axvline(data[q], color='red')

            # Split the clusters that failed the dip test into separate clusters
            if len(intervals) > 1:
                split_point = int(np.mean([intervals[0][1], intervals[1][0]]))
                clust1 = data[:split_point]
                clust2 = data[split_point:]

                # Reset current cluster number to cluster 1 and make a new cluster to the dictionary
                print("Identified bimodality in cluster " +
                      str(search_key + 1) + ", reclustering data ... ")
                change_dict[max(total_data.keys()) + 1] = [
                    row for row in total_data[search_key]
                    if row[cluster_max_FL_index] in clust2
                ]
                change_dict[search_key] = [
                    row for row in total_data[search_key]
                    if row[cluster_max_FL_index] in clust1
                ]

            # Plot data
            sns.kdeplot(data, ax=ax[i], color='black')

            ax[i].set(title='Cluster ' + str(search_key + 1),
                      xlabel='FL ' + str(cluster_max_FL_index + 1),
                      yticks=[])

            # Move to the next plot
            i += 1

        plt.tight_layout()

        # save first figure of the dip test
        if save_figure:
            plt.savefig("Dip_test_example")

        final_reclustered = {}

    # Make a new dictionary which will have the median value for each channel in the vector for a heatmap downstream
    for key, value in change_dict.items():
        med_values = []
        for i in range(len(value[0])):
            med_values.append(np.median([row[i] for row in value]))
            final_reclustered["Cluster " + str(key + 1)] = med_values

    search = np.random.choice(list(median_FL_data.keys()))

    cols = ['FL' + str(i + 1) for i in range(len(median_FL_data[search]))]

    # Dataframe to create heatmap
    reclustered_df = pd.DataFrame(final_reclustered, index=cols)

    # Counts dictionary for barchart
    reclustered_counts = {}

    for key, value in change_dict.items():
        reclustered_counts[key] = len(value)

        # Replot the new clusters
    print("Plotting reclustered data ...")

    fig2, ax = plt.subplots(1, 2, figsize=(10, 4))
    sns.heatmap(reclustered_df.transpose(), cmap='copper')

    reclust = []
    recount = []

    for key, value in reclustered_counts.items():
        reclust.append(int(key) + 1)
        recount.append(value)

    rey_pos = np.arange(len(reclust))

    ax[0].bar(rey_pos, recount, color='black')
    ax[0].set_xticks(rey_pos)
    ax[0].set_xticklabels(reclust)
    ax[0].set_xlabel('Cluster')
    ax[0].set_ylabel('Counts')
    ax[0].set_title('Cells per cluster')

    ax[1].set_title('Fluorescence profile of clusters')
    ax[1].set_xlabel('Fluorescence channel')
    plt.yticks(rotation=0)
    plt.tight_layout()

    if save_figure:
        plt.savefig("reclustered_after_dip_test")

    return change_dict
#!/usr/bin/python

import numpy

c=numpy.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
print "median =", numpy.median(c)
sorted = numpy.msort(c)
print "sorted =", sorted

N = len(c)
print "middle =", sorted[(N - 1)/2]
print "average middle =", (sorted[N /2] + sorted[(N - 1) / 2]) / 2

print "variance =", numpy.var(c)
print "variance from definition =", numpy.mean((c - c.mean())**2)
Example #35
 def predict_label(self, X):
     score = self.predict(X)
     score_temp = np.msort(score)
     threshold = score_temp[score.size - int(score.size * self.nu)]
     return np.where(score > threshold, -1, 1)
step = (r_max - r_min) / max_step
const_2 = -1.0 / step**2
const_1 = -2.0 * const_2
orb_factor = orb_l * (orb_l + 1)

#Calculate array of potential values
V = numpy.zeros(max_step+1,numpy.double)
for i in xrange(max_step+1):
    r = r_min + i*step
    V[i] = potential(r) + orb_factor/r**2 #(include centrifugal term)

#Calculate elements of the matrix to be diagonalized
d = numpy.zeros(max_step,numpy.double) #diagonal elements
e = numpy.zeros(max_step,numpy.double) #off-diagonal elements
z = numpy.zeros((max_step,max_step),numpy.double) #matrix for eigenvectors (only used as a dummy-argument to
                                                  #tqli() in the current version of this program)
for i in xrange(max_step):
    d[i] = const_1 + V[i+1]
    e[i] = const_2

    z[i,i] = 1 #(identity matrix)

#Diagonalize and obtain eigenvalues. The eigenvalues are stored in d.
computationalLib.pylib(cpp=False).tqli(d,e,z)

#Sort the eigenvalues (smallest to largest)
d = numpy.msort(d)

#Output to file
output()
            RDs[index]=[all_list.count('A'),all_list.count('C'),all_list.count('G'),all_list.count('T')]
        
        ###work out MQ0 and mean mapping quality for the snp (not RMSE)
        MQ0[index]=map_list.count(0)
        try:
            MQm[index]=sum(map_list)/float(len(map_list))
        except:
            MQm[index]=0.0

        ###if minimum read depth is met, try calling the genotype 
        if len(var_list)>=min_RD:
            
            GLs[index]=geno_caller_10GT_aDNA(var_list)  ##ancient DNA aware calculation of genotype likelihoods
            GTs[index]=np.argmax(GLs[index])    ##record best genotype
            PLs[index]=(GLs[index]-np.max(GLs[index]))*-10   ###calculte Phred-scale values
            GQs[index]=np.msort(PLs[index])[1]  ###record genotype quality

            ###if GQ is less than threshold and site is heterozygous, switch to best homozygous genotype
            if (alt_dic[GTs[index]][0]!=alt_dic[GTs[index]][1]) and (GQs[index]<GQ):
                GTs[index]=homs[np.argmax(GLs[index][homs])]
                SWGQ[index]=1

        ###Work out basesian QUALity score
##            LL_0=np.sum(10**GLs[index][LL0_map[REFs[index]]])
##            LL_1=np.sum(10**GLs[index][LL1_map[REFs[index]]])
##            LL_2=np.sum(10**GLs[index][LL2_map[REFs[index]]])
##            norconst1=sum([LL_0,LL_1,LL_2])  ###Depristo says this, but it really should be multiplied by the prior
##            norconst2=sum([LL_0*theta_prior[0],LL_1*theta_prior[1],LL_2*theta_prior[2]])  ###Depristo says this, but it really should be multiplied by the prior
##
##            Pr0=(theta_prior[0]*LL_0)/norconst2
##            Pr1=(theta_prior[1]*LL_1)/norconst2
import numpy as np

a = np.array([1, 2])
np.msort(a)
def FFT_Stepcount(Quality_Acc,T_convert,window,Cutoff):
    Len = len(Quality_Acc)
    Firstwindow_stepdetection = 0
    count = []
    count2 = []
    count3 = []
    frequency = numpy.zeros((1,5))
    portion = 0.0
    Box = []
    Box2 = []
    power = numpy.array([])
    Time = []
    for i in range(0,Len-window,window):

        if i==0:
            Time = T_convert[i]
        else:
            Time = numpy.vstack([Time,T_convert[i]])

        for n in range(0,window):

            if Quality_Acc[i+n] < 66 and Quality_Acc[i+n+1]>66:
                Firstwindow_stepdetection += 1

        if Firstwindow_stepdetection < 8:
            count.append(0)
        else:
            for j in range(i,i+window-38,38):
                A = numpy.msort(numpy.array(Quality_Acc[j:j+39]))
                #print A
                if A[-6]<66:
                    pass
                else:
                    portion = float(portion)+float(38.0/window)

            T = T_convert[i+window]-T_convert[i]

            p = numpy.abs(numpy.fft.fft(Quality_Acc[i:i+window+1])/(window/2.0))
            p = p[0:window/2]**2

            freq = numpy.arange(0,window/2)/T
            freq = numpy.transpose(freq)

            Power_and_Freq = numpy.c_[p,freq]
            Power_and_Freq = Power_and_Freq[Power_and_Freq[:,0].argsort()]

            if Power_and_Freq[0,1]>Cutoff:
                Power_and_Freq[0,1]=0
            count.append(Power_and_Freq[-2,1]*T*portion)
            Box2.append(portion)
            portion=0.0
            Box.append(Firstwindow_stepdetection)
            Firstwindow_stepdetection=0

        Power_and_Freq = []
        freq = []
        p = []

    count = numpy.transpose(count)
    count = numpy.sum(count)
    return count
Example #40
def dip(histogram=None, idxs=None):
    """
        Compute the Hartigans' dip statistic either for a histogram of
        samples (with equidistant bins) or for a set of samples.
    """
    if idxs is None:
        idxs = np.arange(len(histogram))
    elif histogram is None:
        h = collections.Counter(idxs)
        idxs = np.msort(list(h.keys()))
        histogram = np.array([h[i] for i in idxs])
    else:
        if len(histogram) != len(idxs):
            raise ValueError("Need exactly as many indices as histogram bins.")
        if len(idxs) != len(set(idxs)):
            raise ValueError("idxs must be unique if histogram is given.")
        if not np.array_equal(np.msort(idxs), idxs):
            idxs_s = np.argsort(idxs)
            idxs = np.asarray(idxs)[idxs_s]
            histogram = np.asarray(histogram)[idxs_s]

    cdf = np.cumsum(histogram, dtype=float)
    cdf /= cdf[-1]

    work_idxs = idxs
    work_histogram = np.asarray(histogram, dtype=float) / np.sum(histogram)
    work_cdf = cdf

    D = 0
    left = [0]
    right = [1]

    while True:
        left_part, left_touchpoints   = _gcm_(work_cdf - work_histogram, work_idxs)
        right_part, right_touchpoints = _lcm_(work_cdf, work_idxs)

        d_left, left_diffs   = _touch_diffs_(left_part, right_part, left_touchpoints)
        d_right, right_diffs = _touch_diffs_(left_part, right_part, right_touchpoints)

        if d_right > d_left:
            xr = right_touchpoints[d_right == right_diffs][-1]
            xl = left_touchpoints[left_touchpoints <= xr][-1]
            d  = d_right
        else:
            xl = left_touchpoints[d_left == left_diffs][0]
            xr = right_touchpoints[right_touchpoints >= xl][0]
            d  = d_left

        left_diff  = np.abs(left_part[:xl+1] - work_cdf[:xl+1]).max()
        right_diff = np.abs(right_part[xr:]  - work_cdf[xr:] + work_histogram[xr:]).max()

        if d <= D or xr == 0 or xl == len(work_cdf):
            the_dip = max(np.abs(cdf[:len(left)] - left).max(), np.abs(cdf[-len(right)-1:-1] - right).max())
            return the_dip/2, (cdf, idxs, left, left_part, right, right_part)
        else:
            D = max(D, left_diff, right_diff)

        work_cdf = work_cdf[xl:xr+1]
        work_idxs = work_idxs[xl:xr+1]
        work_histogram = work_histogram[xl:xr+1]

        left[len(left):] = left_part[1:xl+1]
        right[:0] = right_part[xr:-1]
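A hedged usage sketch for dip() above; the function relies on module-level helpers (_gcm_, _lcm_, _touch_diffs_) and a collections import that are not shown here, so this assumes the full module is available:

import numpy as np

# A clearly bimodal histogram over equidistant bins.
hist = np.array([8, 30, 10, 2, 1, 2, 12, 28, 7])
the_dip, plot_data = dip(histogram=hist)
print("dip statistic:", the_dip)

# Raw samples can be passed instead of a histogram:
# samples = np.concatenate([np.random.normal(0, 1, 500),
#                           np.random.normal(5, 1, 500)])
# the_dip, _ = dip(idxs=samples)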
Example #41
vwap = np.average(close_price,weights=volume)
#time-weighted average
twap = np.average(close_price,weights=(np.arange(len(close_price))))
#arithmetic mean
mean = np.mean(close_price)

#3. get the highest and lowest prices
h ,l = np.loadtxt('data.csv',delimiter=',',usecols=(4,5),unpack=True)
highest = np.max(h)
lowest = np.min(l)

#4. compute the median of the closing prices
median = np.median(close_price)

#sort
sorted_closing = np.msort(close_price)
#check whether the count is odd or even
N = len(close_price)
median_ind = (N - 1) // 2
if(N&0x1):
    median_sorted_closing = sorted_closing[median_ind]
else:
    median_sorted_closing = (sorted_closing[median_ind]+sorted_closing[median_ind+1])/2

#5. compute the variance
variance = np.var(close_price)

#compute the variance manually from the definition
variance_from_definition = np.mean((close_price-close_price.mean())**2)

print "twap =", np.average(c, weights=t)

#find the maximum and minimum
h,l=np.loadtxt('data.csv', delimiter=',', usecols=(4,5), unpack=True)
print "highest =", np.max(h)
print "lowest =", np.min(l)
print (np.max(h) + np.min(l)) /2

print "Spread high price", np.ptp(h)
print "Spread low price", np.ptp(l)

#statistical analysis

c=np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
print "median =", np.median(c)
sorted = np.msort(c)
print "sorted =", sorted

N = len(c)
print "middle =", sorted[(N - 1)/2]
print "average middle =", (sorted[N /2] + sorted[(N - 1) / 2]) / 2

print "variance =", np.var(c)
print "variance from definition =", np.mean((c - c.mean())**2)

#stock returns
c=np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)

returns = np.diff( c ) / c[ : -1]
print "Standard deviation =", np.std(returns)
import numpy as np

c=np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
print "median =", np.median(c)
sorted = np.msort(c)
print "sorted =", sorted

N = len(c)
print "middle =", sorted[(N - 1)/2]
print "average middle =", (sorted[N /2] + sorted[(N - 1) / 2]) / 2

print "variance =", np.var(c)
print "variance from definition =", np.mean((c - c.mean())**2)
#
#    basic functions
#
############################################################

import numpy as np
np.set_printoptions(precision=3)


a = np.array( np.random.random(12) * 100, dtype="int" ).reshape(3,4)
print(a)

# perform operations along specified axis
print(np.average(a, axis=0))
print(np.average(a, axis=1))

# sort data
print(np.msort(a))
print(np.sort(a, axis=0))
print(np.sort(a, axis=1))

# insert elements:  insert(array, positions, items)
print(a)
b = np.insert(a, (1,5,9), (-1,-2,-3) ); print(b)
b = np.insert(a, (1,5,9), (-1,-2,-3) ).reshape(3,5); print(b)
b = np.append(a, (-1,-2,-3,-4) ).reshape(4,4); print(b)

b = np.round_(a ** 1.1, 2); print(b)

Example #45
    def find_periodicity(self, func, par,
                 fitmethod='bfgs',
                 nchain = 10,
                 niter = 5000,
                 nsim = 1000,
                 covfactor = 1.0,
                 parname = None,
                 noise = -1,
                 use_emcee = True,
                 searchfreq = None):


        """
        Find periodicities in observed data and compute significance via MCMCs.

        First, fit the periodogram with func and compute the
        maximum-a-posteriori (MAP) estimate.
        Divide the data by the MAP model; for a perfect data-model fit,
        the resulting residuals should follow a chi-square distribution
        with two degrees of freedom.
        Find the highest power in the residuals and its frequency.

        Sample the posterior distribution of parameters for func using MCMC,
        and create fake periodograms from samples of the posterior.
        For each fake periodogram, find the MAP estimate, divide out the
        MAP model and find the highest power in that periodogram.

        Create a posterior distribution of maximum powers and compute
        a posterior predictive p-value of seeing the maximum power
        in the data under the null hypothesis (no QPO).


        Parameters
        ----------

        func : function
            Parametric model for the periodogram.
            Needs to be a function that takes an array of frequencies and
            k parameters, and returns an array of model powers.
            The function should include a parameter setting a constant background
            level, and this parameter should be last!

        par : {list, array-like}
            Input guesses for the parameters taken by func.
            The number of elements in this list or array must match the
            number of parameters k taken by func.

        fitmethod : string, optional, default "bfgs"
            Choose the optimization algorithm used when minimizing the
            -log-likelihood. Choices are listed in mle.py, but the default
            (bfgs) should be sufficient for most applications.

        nchain : int, optional, default 10
            The number of chains or walkers to use in MCMC.
            For Metropolis-Hastings, use ~10-20 and many samples
            For emcee, use as many as you can afford (~500) and fewer samples

        niter : int, optional, default 5000
            Sets the length of the Markov chains.
            For Metropolis-Hastings, this needs to be large (>10000)
            For emcee, this can be smaller, but it's a good idea to
            verify that the chains have mixed.

        nsim : int, optional, default 1000
            The number of simulations to use when computing the
            posterior distribution of the likelihood ratio.
            Note that this also sets the maximum precision of the
            posterior predictive p-value (for 1000 simulations, the
            p-value can be constrained only to 0.001).

        covfactor : float, optional, default 1.0
            A tuning parameter for the MCMC step. Used only in
            Metropolis-Hastings.


        parname : list, optional, default None
            Include a list of strings here to set parameter names for
            plotting

        noise: int, optional, default -1
            The index for the noise parameter in func.
            In the pre-defined models, this index is *always* -1.

        use_emcee : boolean, optional, default True
            If True (STRONGLY RECOMMENDED), use the emcee package
            for running MCMC. If False, use Metropolis-Hastings.


        """


        ## the file name where the output will be stored
        resfilename = self.namestr + "_findperiodicity_results.dat"

        ## open the output log file
        resfile = utils.TwoPrint(resfilename)


        ### step 1: fit model to observation
        psfit = mle.PerMaxLike(self.ps, fitmethod=fitmethod, obs=True)
        fitpars = psfit.mlest(func, par, obs=True, noise=noise, m=self.m)
        bindict = fitpars['bindict']
        #print('popt: ' + str(fitpars['popt']))

        ## which posterior do I need to use?
        if self.m == 1:
            lpost = posterior.PerPosterior(self.ps, func)
        else:
            lpost = posterior.StackPerPosterior(self.ps, func, self.m)



        ### Step 2: Set up Markov Chain Monte Carlo Simulations
        ### of model 1:
        mcobs = mcmc.MarkovChainMonteCarlo(self.ps.freq, self.ps.ps, lpost,
                                      topt = fitpars['popt'],
                                      tcov = fitpars['cov'],
                                      covfactor = covfactor,
                                      niter=niter,
                                      nchain=nchain,
                                      parname= parname,
                                      check_conv = True,
                                      namestr = self.namestr,
                                      use_emcee = True,
                                      plot=self.plot, 
                                      printobj = resfile,
                                      m = self.m)


        ### Step 3: create fake periodograms out of MCMCs
        fakeper = mcobs.simulate_periodogram(nsim = nsim)

        sim_pars_all, sim_deviance, sim_ksp, sim_fpeak, sim_srat, \
        sim_maxpow, sim_merit, sim_y0, sim_s3max, sim_s5max, sim_s11max =[], [], [], [], [], [], [], [], [], [], []

        bmax = int(self.ps.freq[-1]/(2.0*(self.ps.freq[1]-self.ps.freq[0])))
        bins = [1,3,5,7,10,15,20,30,50,70,100,200,300,500,700,1000]


        binlist = [r for r in fitpars["bindict"].keys()]
        nbins = len(binlist)//4
        sain = copy.copy(fitpars['popt'])

#        print('popt2: ' + str(fitpars['popt']))
        ### Step 4: Fit fake periodograms:
        for i,x in enumerate(fakeper):
            try:
#            print('popt' + str(i) + 'a : ' + str(fitpars['popt']))
      
                fitfake = mle.PerMaxLike(x, fitmethod=fitmethod, obs=False)
#            print('popt' + str(i) + 'b : ' + str(fitpars['popt']))

                sim_pars = fitfake.mlest(func, sain,obs=False, noise=noise, m=self.m)
#            print('popt' + str(i) + 'c : ' + str(fitpars['popt']))
         
                sim_pars_all.append(sim_pars)
 
                sim_deviance.append(sim_pars['deviance'])
                sim_ksp.append(sim_pars['ksp'])
                sim_maxpow.append(sim_pars['maxpow'])
                sim_merit.append(sim_pars['merit'])
                sim_fpeak.append(sim_pars['maxfreq'])
                sim_y0.append(sim_pars['mfit'][sim_pars['maxind']])
                sim_srat.append(sim_pars['sobs'])
                sim_s3max.append(sim_pars['s3max'])
                sim_s5max.append(sim_pars['s5max'])
                sim_s11max.append(sim_pars['s11max'])

            except KeyboardInterrupt:
                break

            #except:
            #    print("Simulation failed! Continuing ...")
            #    continue 
#               print('popt' + str(i) + 'd : ' + str(fitpars['popt']))

#             print('popt3: ' + str(fitpars['popt']))

        ### upper limit is the power in the sorted array where p_maxpow would be 0.05
        ### i.e. when only 0.05*nsim simulations are higher than this
        ### note: sometimes simulations fail, therefore the 5% limit should be 0.05*len(sims)
        fiveperlim = int(0.05*len(sim_maxpow))
        if fiveperlim == 0: 
            resfile('Warning! Too few simulations to compute five percent limit reliably!')
            fiveperlim = 1
        ninetyfiveperlim = len(sim_maxpow) - fiveperlim


        #print('popt4: ' + str(fitpars['popt']))
        bindicts = [x["bindict"] for x in sim_pars_all] 
        ### get out binned powers:

        maxpows_all = {}

        binprob = {}
        for b in bins[:nbins]:
            binps = fitpars['bindict']['bin'+str(b)]
            bmaxpow = np.array([x["bmax" + str(b)] for x in bindicts])

            maxpows_all["bin"+str(b)] = bmaxpow

            bindict['sim_bmaxpow' + str(b)] = bmaxpow
            p_bmaxpow = float(len([x for x in bmaxpow if x > fitpars['bindict']["bmax" + str(b)]]))/float(len(bmaxpow))
            bindict["p_maxpow" + str(b)] = p_bmaxpow
            
            bmaxpow_err = np.sqrt(p_bmaxpow*(1.0-p_bmaxpow)/float(len(bmaxpow)))
            bindict['p_maxpow' + str(b) + 'err'] = bmaxpow_err            
        
            sim_bmaxpow_sort = np.msort(bmaxpow)

            ### note: this is the limit for 2*I/S --> multiply by S to get powers for each frequency 
            ### Like everything else, this is n-trial corrected!
            #print('len(bmaxpow_sort) : ' + str(len(sim_bmaxpow_sort)))
            resfile('ninetyfiveperlim: ' + str(ninetyfiveperlim))
            bmaxpow_ul = sim_bmaxpow_sort[ninetyfiveperlim]
            bindict['bmax' + str(b) + '_ul'] = bmaxpow_ul 
            resfile('The posterior p-value for the maximum residual power for a binning of ' + str(self.ps.df*b) + 'Hz is p = ' + str(p_bmaxpow) + ' +/- ' +  str(bmaxpow_err))
            resfile('The corresponding value of the T_R statistic at frequency f = ' + str(fitpars["bindict"]["bmaxfreq" + str(b)]) + ' is 2I/S = ' + str(fitpars['bindict']["bmax" + str(b)]))

            resfile('The upper limit on the T_R statistic is 2I/S = ' + str(bmaxpow_ul))

            ### now turn upper limit into an rms amplitude:
            ## first compute broadband noise model for binned frequencies
            bintemplate = func(fitpars['bindict']['bin'+str(b)].freq, *fitpars['popt'])
            resfile("bintemplate[0]: " + str(bintemplate[0]))
            ## then compute upper limits for powers I_j depending on frequency
            binpowers = bmaxpow_ul*bintemplate/2.0 - bintemplate
            ## now compute rms amplitude at 40, 70, 100 and 300 Hz

            ## first, convert powers into rms normalization, if they're not already
            if self.ps.norm == 'leahy':
                binpowers = binpowers/(self.ps.df*b * self.ps.nphots)
            elif self.ps.norm == 'variance':
                binpowers = binpowers*self.ps.n**2.0 / (self.ps.df*b*self.ps.nphots**2.0)

            #print('len(binps.freq): ' + str(len(binps.freq)))
            #print('len(binpowers): ' + str(len(binpowers)))


            if searchfreq is None:
                  searchfreq = [40.0, 70.0, 100.0, 300.0, 500.0, 1000.0]
            ## for 40 Hz: 
            print(searchfreq)
            for bc in searchfreq:
                if bc > (binps.freq[1] - binps.freq[0]):
                    bind = np.searchsorted(binps.freq, bc) - 1
                    bpow = binpowers[bind]
                    brms = np.sqrt(bpow*b*self.ps.df)

                    resfile('The upper limit on the power at ' + str(bc) +
                            'Hz for a binning of ' + str(b) + ' is P = ' +
                            str(bpow*(self.ps.df*b*self.ps.nphots)))

                    resfile('The upper limit on the rms amplitude at ' + str(bc) +
                            'Hz for a binning of ' + str(b) + ' is rms = ' + str(brms))

                    bindict['bin' + str(b) + '_ul_%.4fHz'%bc] = brms 
                else:
                    continue


        ### Step 5: Compute Bayesian posterior probabilities of individual quantities
        p_maxpow = float(len([x for x in sim_maxpow if x > fitpars['maxpow']]))/float(len(sim_maxpow))
        p_deviance = float(len([x for x in sim_deviance if x > fitpars['deviance']]))/float(len(sim_deviance))
        p_ksp = float(len([x for x in sim_ksp if x > fitpars['ksp']]))/float(len(sim_ksp))
        p_merit = float(len([x for x in sim_merit if x > fitpars['merit']]))/float(len(sim_merit))
        p_srat = float(len([x for x in sim_srat if x > fitpars['sobs']]))/float(len(sim_srat))
 
        p_s3max = float(len([x for x in sim_s3max if x > fitpars['s3max']]))/float(len(sim_s3max))
        p_s5max = float(len([x for x in sim_s5max if x > fitpars['s5max']]))/float(len(sim_s5max))
        p_s11max = float(len([x for x in sim_s11max if x > fitpars['s11max']]))/float(len(sim_s11max))


        ### sort maximum powers from lowest to highest
        sim_maxpow_sort = np.msort(sim_maxpow)
        sim_s3max_sort = np.msort(sim_s3max)
        sim_s5max_sort = np.msort(sim_s5max)
        sim_s11max_sort = np.msort(sim_s11max)

        ### note: this is the limit for 2*I/S --> multiply by S to get powers for each frequency 
        ### Like everything else, this is n-trial corrected!
        maxpow_ul = sim_maxpow_sort[ninetyfiveperlim]


        ### Step 6: Compute errors of Bayesian posterior probabilities
        pmaxpow_err = np.sqrt(p_maxpow*(1.0-p_maxpow)/float(len(sim_ksp)))
        pdeviance_err = np.sqrt(p_deviance*(1.0-p_deviance)/float(len(sim_ksp)))
        pksp_err = np.sqrt(p_ksp*(1.0-p_ksp)/float(len(sim_ksp)))
        pmerit_err = np.sqrt(p_merit*(1.0-p_merit)/float(len(sim_ksp)))
        psrat_err = np.sqrt(p_srat*(1.0-p_srat)/float(len(sim_ksp)))

        ps3max_err = np.sqrt(p_s3max*(1.0-p_s3max)/float(len(sim_ksp)))
        ps5max_err = np.sqrt(p_s5max*(1.0-p_s5max)/float(len(sim_ksp)))
        ps11max_err = np.sqrt(p_s11max*(1.0-p_s11max)/float(len(sim_ksp)))


        ### Display results on screen and make funky plots
        resfile("Bayesian p-value for maximum power P_max =  " + str(p_maxpow) + " +/- " + str(pmaxpow_err))
        #resfile('Upper limit on maximum signal power P_max_ul = ' + str(maxpow_ul))

        resfile("Bayesian p-value for maximum power P_max =  " + str(p_s3max) + " +/- " + str(ps3max_err))
        #resfile('Upper limit on maximum signal power P_max_ul = ' + str(s3max_ul))

        resfile("Bayesian p-value for maximum power P_max =  " + str(p_s5max) + " +/- " + str(ps5max_err))
        #resfile('Upper limit on maximum signal power P_max_ul = ' + str(s5max_ul))

        resfile("Bayesian p-value for maximum power P_max =  " + str(p_s11max) + " +/- " + str(ps11max_err))
        #resfile('Upper limit on maximum signal power P_max_ul = ' + str(s11max_ul))


        resfile("Bayesian p-value for deviance D =  " + str(p_deviance) + " +/- " + str(pdeviance_err))
        resfile("Bayesian p-value for KS test: " + str(p_ksp) + " +/- " + str(pksp_err))
        resfile("Bayesian p-value for Merit function: " + str(p_merit) + " +/- " + str(pmerit_err))
        resfile("Bayesian p-value for the np.sum of residuals: " + str(p_srat) + " +/- " + str(psrat_err))

        if self.plot:
            plt.subplot(2,2,1)
            n, bins, patches = plt.hist(sim_maxpow, bins=100, density=True, color="cyan", histtype='stepfilled')
            xmin, xmax = min(min(bins), fitpars['maxpow'])/1.2, max(25, fitpars['maxpow']*1.2)
            plt.axis([xmin, xmax, 0.0, max(n)])
            plt.vlines(fitpars['maxpow'], 0.0, max(n), lw=2, color='navy')
            plt.title('unsmoothed data', fontsize=12)
 
            plt.subplot(2,2,2)
            n, bins, patches = plt.hist(sim_s3max, bins=100, density=True, color="cyan", histtype='stepfilled')
            xmin, xmax = min(min(bins), fitpars['s3max'])/1.2, max(25, fitpars['s3max']*1.2)
            plt.axis([xmin, xmax, 0.0, max(n)])
            plt.vlines(fitpars['s3max'], 0.0, max(n), lw=2, color='navy')
            plt.title('smoothed (3) data', fontsize=12)

            plt.subplot(2,2,3)
            n, bins, patches = plt.hist(sim_s5max, bins=100, density=True, color="cyan", histtype='stepfilled')
            xmin, xmax = min(min(bins), fitpars['s5max'])/1.2, max(25, fitpars['s5max']*1.2)
            plt.axis([xmin, xmax, 0.0, max(n)])

            plt.vlines(fitpars['s5max'], 0.0, max(n), lw=2, color='navy')
            plt.title('smoothed (5) data/model outlier', fontsize=12)

            plt.subplot(2,2,4)
            n, bins, patches = plt.hist(sim_s11max, bins=100, density=True, color="cyan", histtype='stepfilled')
            xmin, xmax = min(min(bins), fitpars['s11max'])/1.2, max(25, fitpars['s11max']*1.2)
            plt.axis([xmin, xmax, 0.0, max(n)])
 
            plt.vlines(fitpars['s11max'], 0.0, max(n), lw=2, color='navy')
            plt.title('smoothed (11) data', fontsize=12)

            plt.savefig(self.namestr + '_maxpow.png', format='png')
            plt.close()


        results = {"fitpars":fitpars, 'bindict':bindict, 'maxpows_all':maxpows_all, 'mcobs':mcobs, 'p_maxpow':[sim_maxpow, p_maxpow, pmaxpow_err], 'maxpow_ul':maxpow_ul, 'p_s3max':[sim_s3max, p_s3max, ps3max_err], 'p_s5max':[sim_s5max, p_s5max, ps5max_err], 'p_s11max':[sim_s11max, p_s11max, ps11max_err], 'p_merit':[p_merit, pmerit_err], 'p_srat':[p_srat, psrat_err], 'p_deviance':[p_deviance, pdeviance_err], 'fitpars':fitpars,  "postmean":mcobs.mean, "posterr":mcobs.std, "postquantiles":mcobs.ci, "rhat":mcobs.rhat, "acor":mcobs.acor, "acceptance":mcobs.acceptance}

        return results
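
A minimal, self-contained sketch of the posterior predictive p-value and upper-limit logic used in Steps 5 and 6 above. The function name posterior_predictive_pvalue and the fake chi-squared simulated powers are illustrative only, not part of the original module:

import numpy as np

def posterior_predictive_pvalue(sim_stats, obs_stat):
    """Fraction of simulated statistics exceeding the observed one,
    plus the binomial standard error of that fraction."""
    sim_stats = np.asarray(sim_stats, dtype=float)
    n = len(sim_stats)
    p = np.sum(sim_stats > obs_stat) / float(n)
    perr = np.sqrt(p * (1.0 - p) / float(n))
    return p, perr

## fake simulated maximum Leahy powers (chi-squared with 2 degrees of freedom)
sim_maxpow = np.random.chisquare(2, size=10000)
p, perr = posterior_predictive_pvalue(sim_maxpow, obs_stat=15.0)
print("p = %.4f +/- %.4f" % (p, perr))

## 95% upper limit: sort the simulated powers and take the 95th percentile
sim_sorted = np.msort(sim_maxpow)
maxpow_ul = sim_sorted[int(0.95 * len(sim_sorted))]
print("95%% upper limit on the maximum power: %.3f" % maxpow_ul)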
Exemple #46
0
import collections

import numpy as np


def dip_fn(dat, is_hist=False, just_dip=False):
    """
        Compute the Hartigans' dip statistic either for a histogram of
        samples (with equidistant bins) or for a set of samples.
    """
    if is_hist:
        histogram = dat
        idxs = np.arange(len(histogram))
    else:
        counts = collections.Counter(dat)
        idxs = np.msort(list(counts.keys()))
        histogram = np.array([counts[i] for i in idxs])

    # check for case 1<N<4 or all identical values
    if len(idxs) <= 4 or idxs[0] == idxs[-1]:
        left = []
        right = [1]
        d = 0.0
        return d if just_dip else (d, (None, idxs, left, None, right, None))

    cdf = np.cumsum(histogram, dtype=float)
    cdf /= cdf[-1]

    work_idxs = idxs
    work_histogram = np.asarray(histogram, dtype=float) / np.sum(histogram)
    work_cdf = cdf

    D = 0
    left = [0]
    right = [1]

    while True:
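        # One pass of the dip search: fit the greatest convex minorant (GCM) of the
        # eCDF from the left and the least concave majorant (LCM) from the right,
        # then shrink the working interval to where the two fits touch.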
        left_part, left_touchpoints = _gcm_(work_cdf - work_histogram,
                                            work_idxs)
        right_part, right_touchpoints = _lcm_(work_cdf, work_idxs)

        d_left, left_diffs = _touch_diffs_(left_part, right_part,
                                           left_touchpoints)
        d_right, right_diffs = _touch_diffs_(left_part, right_part,
                                             right_touchpoints)

        if d_right > d_left:
            xr = right_touchpoints[d_right == right_diffs][-1]
            xl = left_touchpoints[left_touchpoints <= xr][-1]
            d = d_right
        else:
            xl = left_touchpoints[d_left == left_diffs][0]
            xr = right_touchpoints[right_touchpoints >= xl][0]
            d = d_left

        left_diff = np.abs(left_part[:xl + 1] - work_cdf[:xl + 1]).max()
        right_diff = np.abs(right_part[xr:] - work_cdf[xr:] +
                            work_histogram[xr:]).max()

        if d <= D or xr == 0 or xl == len(work_cdf):
            the_dip = max(
                np.abs(cdf[:len(left)] - left).max(),
                np.abs(cdf[-len(right) - 1:-1] - right).max())
            if just_dip:
                return the_dip / 2
            else:
                return the_dip / 2, (cdf, idxs, left, left_part, right,
                                     right_part)
        else:
            D = max(D, left_diff, right_diff)

        work_cdf = work_cdf[xl:xr + 1]
        work_idxs = work_idxs[xl:xr + 1]
        work_histogram = work_histogram[xl:xr + 1]

        left[len(left):] = left_part[1:xl + 1]
        right[:0] = right_part[xr:-1]
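
A hypothetical usage sketch for dip_fn. It assumes the _gcm_, _lcm_ and _touch_diffs_ helpers referenced above are defined elsewhere in the same module; the sample sizes, seed and rounding are arbitrary choices for illustration:

import numpy as np

rng = np.random.default_rng(42)
## rounding keeps the number of unique values (and hence the eCDF) modest
unimodal = np.round(rng.normal(0.0, 1.0, size=2000), 2)
bimodal = np.round(np.concatenate([rng.normal(-3.0, 1.0, size=1000),
                                   rng.normal(3.0, 1.0, size=1000)]), 2)

## the bimodal sample should yield a noticeably larger dip statistic
print("dip (unimodal):", dip_fn(unimodal, just_dip=True))
print("dip (bimodal): ", dip_fn(bimodal, just_dip=True))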
Exemple #47
0

import datetime

import numpy as np


def datestr2num(s):
    """Convert a "YYYY-MM-DD" date string to a weekday number.

    0 - Monday
    6 - Sunday
    """
    return datetime.datetime.strptime(s, "%Y-%m-%d").date().weekday()


c = np.loadtxt("GOOG.csv", delimiter=",", usecols=(4,), unpack=True)

# median
print("median = {0}".format(np.median(c)))

sorted_close = np.msort(c)
# print("sorted = {0}".format(sorted_close))
N = len(c)
print("middle = {0}".format(sorted_close[N / 2]))
print("averaged middle = {0}".format((sorted_close[N / 2] + sorted_close[(N - 1) / 2]) / 2))

# variance
print("variance = {0}".format(np.var(c)))

print("variance from definition = {0}".format(np.mean((c - c.mean()) ** 2)))

# simple returns

returns = np.diff(c) / c[:-1]
print("Returns")
print("Standard deviation = {0}".format(np.std(returns)))