def test_msort_1(self):
    a = np.array([[1, 4], [3, 1]])
    b = np.msort(a)
    print(b)
    print("********")
    a = np.arange(32.2, 0.2, -1.0)
    b = np.msort(a)
    print(b)
def calculate_eCDF(data, extend=False):
    """Calculate the x- and y-coordinates of an empirical CDF curve.

    This function finds the unique values within a dataset, `data`, and
    calculates the likelihood that a random data point within the set is
    less than or equal to each of those unique values. The `extend` option
    creates extra values outside the range of `data` corresponding to
    P(X <= x) = 0 and 1, which are useful for plotting eCDFs.
    """
    ## Get the unique values in `data` and their counts (the histogram).
    counts = Counter(data.ravel())
    ## Sort the unique values.
    vals = np.msort(list(counts.keys()))
    ## Calculate the cumulative number of counts, then divide by the total.
    CDF = np.cumsum([counts[val] for val in vals])
    CDF = CDF / CDF[-1]
    ## If `extend`, add points to `vals` and `CDF`.
    if extend:
        data_range = vals[-1] - vals[0]
        vals = [vals[0] - (0.01 * data_range)] + list(vals)
        vals = np.asarray(vals + [vals[-1] + (0.01 * data_range)])
        CDF = np.asarray([0] + list(CDF) + [1])
    return vals, CDF
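# --- Added usage sketch (not from the original source) ---------------------
# A minimal, hypothetical example of calling calculate_eCDF on synthetic data.
# It assumes the function above is in scope, that `from collections import
# Counter` and `import numpy as np` have been done, and a NumPy version that
# still provides np.msort (it was removed in NumPy 2.0).
import numpy as np

rng = np.random.default_rng(0)
sample = rng.normal(size=500)
vals, CDF = calculate_eCDF(sample, extend=True)
print(vals[0], CDF[0])     # extend=True prepends a point with P(X <= x) = 0
print(vals[-1], CDF[-1])   # ...and appends a point with P(X <= x) = 1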
def plot(self, sub, ints, plot_style="seaborn"):
    """Plot the complete data, highlight the subset currently being searched,
    and add vertical lines for discovered intervals (only intervals of the
    current recursion level appear)."""
    import matplotlib.pyplot as plt
    plt.style.use(plot_style)
    if self.is_hist:
        plt.step(list(range(len(self.dat))), self.dat)
        plt.fill_between(list(range(len(self.dat))), self.dat, step="pre", alpha=.4)
        plt.axvspan(sub[0], sub[1] - 1, color="orange", alpha=.3)
        for i in ints:
            plt.axvspan(i[0], i[1], color="green", alpha=.1)
        for i in ints:
            plt.axvline(i[0], color="black")
            plt.axvline(i[1], color="black")
    else:
        dat = np.msort(self.dat)
        plt.hist(dat, bins=30)
        plt.axvspan(dat[sub[0]], dat[sub[1] - 1], color="orange", alpha=.3)
        for i in ints:
            plt.axvspan(dat[i[0]], dat[i[1]], color="green", alpha=.1)
        for i in ints:
            plt.axvline(dat[i[0]], color="black")
            plt.axvline(dat[i[1]], color="black")
    plt.show()
def test(shape):
    tensor = make_tensor(shape, device, dtype, low=-9, high=9)
    if tensor.size() != torch.Size([]):
        if dtype is torch.bfloat16:
            expected = torch.from_numpy(np.msort(tensor.float().cpu().numpy())).bfloat16()
        else:
            expected = torch.from_numpy(np.msort(tensor.cpu().numpy()))
    else:
        expected = tensor  # numpy.msort() does not support empty-shape tensors

    result = torch.msort(tensor)
    self.assertEqual(result, expected)

    out = torch.empty_like(result)
    torch.msort(tensor, out=out)
    self.assertEqual(out, expected)
def paDifCone( thetaMin, thetaMax, outfile, ntrials=100000, nbins=1000 ) :
    nsamps = 0
    pdif = numpy.zeros( ntrials, dtype=float )
    costhetaMin = math.cos( math.pi*thetaMin/180. )
    costhetaMax = math.cos( math.pi*thetaMax/180. )
    for n in range( 0, ntrials ) :
        [pa1,x1,y1,z1] = paRandom()
        cosphi = -1.01    # impossible value
        while (cosphi > costhetaMin) or (cosphi < costhetaMax) :
            [pa2,x2,y2,z2] = paRandom()
            cosphi = x1*x2 + y1*y2 + z1*z2   # yeah, this should be dot(a,b)
            nsamps = nsamps + 1
        pdif[n] = abs(pa1 - pa2)
        if pdif[n] > 90. :
            pdif[n] = 180. - pdif[n]   # complement of angle to keep angle acute
    print "ntrials = %d, nsamps = %d" % (ntrials,nsamps)
    pdifSorted = numpy.msort(pdif)
    nskip = ntrials/nbins
    if nskip == 0 :
        nskip = ntrials
    fout = open( outfile, "w" )
    for n in range( 0, ntrials, nskip ) :
        fout.write("%10.3f %10.3f\n" % (pdifSorted[n], float(n)/float(ntrials) ) )
    fout.write("%10.3f %10.3f\n" % (pdifSorted[ntrials-1], 1.0 ) )
    return pdifSorted
def mode(x,j):
    """
    calculate the mode for continuous data in array x
    see Numerical Recipes, Chapter 13

    usage: index_list, probability_list = mode(array_of_data, window)

    returns two lists:
      1) the index {i.e. the value from the data calculated as (x[i]+x[i+window])/2}
      2) the probability of finding that value
    """
    # make sure data is in an array and make sure it is sorted
    # (will not maintain synchronicity between columns, but that shouldn't matter
    # for the mode calculation!)
    x = N.asarray(x)
    x = N.msort(x)
    # create the index array
    ind = N.zeros((len(x)-j, x.shape[1]), float)
    # create the probability array
    p = N.zeros((len(x)-j, x.shape[1]), float)
    n = len(x)
    for i in range(n-j):
        ind[i] = N.multiply(0.5, N.add(x[i], x[i+j]))
        p[i] = N.divide(j, N.multiply(n, N.subtract(x[i+j], x[i])))
    return ind, p
def unimodality_dip_test(path: str, plot_show=False) -> bool:
    '''
    # http://www.nicprice.net/diptest/Hartigan_1985_AnnalStat.pdf
    # https://github.com/BenjaminDoran/unidip
    Given an image, run the dip test to decide whether its intensity
    distribution is unimodal.
    @path: image path
    @plot_show: whether to plot the histogram
    '''
    img = cv2.imread(path, 0)
    img_array = img.ravel()
    # diptst expects a sorted array; return True if the data are unimodally distributed
    data = np.msort(img_array)
    # the probability of unimodality
    uni_prob = dip.diptst(data)[1]
    if uni_prob > 0.5:
        #print(f'This image is unimodel distributed with probability of {uni_prob*100:.2f} %')
        unimodality = True
    else:
        #print(f'This image is at least bimodel distributed with probability of {(1-uni_prob)*100:.2f} %')
        unimodality = False
    if plot_show:
        plt.figure()
        sns.distplot(img.ravel(), bins=256, kde=True, hist=True)
        plt.title('Histogram of the image')
        plt.show()
    return unimodality
def paDifUniform( outfile ) :
    # choose 2 points randomly on surface of sphere; compute PA projected on the plane of
    # the sky for each of them; then compute angular difference of the two directions;
    # reorder and compute cumulative distribution function
    ntrials = 100000    #100000
    nsamples = 1000     #1000
    zratio = numpy.zeros( ntrials, dtype=float )
    incldif = 0.
    padif = 0.
    count = 0.
    for n in range( 0, ntrials ) :
        [pa1,incl1,x1,y1,z1] = paRandom()   # core field
        [pa2,incl2,x2,y2,z2] = paRandom()   # envelope field
        zratio[n] = z1/z2
        if (zratio[n] > ratio1) or (0. < zratio[n] < ratio2):
            padif_tmp = abs(pa1 - pa2)
            padif += numpy.where(padif_tmp > 90., 180.-padif_tmp, padif_tmp)
            incldif_tmp = abs(incl1 - incl2)
            incldif += numpy.where(incldif_tmp > 180., 360.-incldif_tmp, incldif_tmp)
            count += 1.
    zratioSorted = numpy.msort(zratio)
    nskip = ntrials/nsamples
    print "the mean position angle difference in degrees =", padif/count
    print "the mean inclination angle difference in degrees =", incldif/count
    print "probability =", count/ntrials
    fout = open( outfile, "w" )
    for n in range( 0, ntrials, nskip ) :
        fout.write("%10.3f %10.3f\n" % (zratioSorted[n], float(n)/float(ntrials) ) )
    fout.write("%10.3f %10.3f\n" % (zratioSorted[ntrials-1], 1.0 ) )
def find_closest_k(self, point):
    distances = np.ones(self.x.shape[0])
    for i, xm in enumerate(self.norm_x):
        distances[i] = np.linalg.norm(xm - point)
    sorted_dist = np.msort(distances)
    closest_k = (distances <= sorted_dist[self.k-1])
    return self.y[closest_k]
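# --- Added note (not from the original source) ------------------------------
# The same k-nearest selection can be done without a full sort via
# np.argpartition; shown here as a standalone alternative sketch, not as part
# of the class above. The sample numbers are made up.
import numpy as np

distances = np.array([4.2, 1.0, 3.3, 0.5, 2.8])
k = 3
nearest_idx = np.argpartition(distances, k - 1)[:k]   # indices of the k smallest distances
print(np.sort(distances[nearest_idx]))                # [0.5 1.  2.8]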
def get_sample_list(self, sample_dir):
    list_sample = [
        os.listdir(sample_dir)[i].split(".")[0]
        for i in range(len(os.listdir(sample_dir)))
    ]
    list_sample = np.uint32(list_sample)
    list_sample = np.msort(list_sample)
    return list_sample
def sort_func():
    x = np.array([3, 1, 2])
    print("x ", x)
    # sort the array
    print("np.sort(a) ", np.sort(x))

    # sorting a multi-dimensional array
    a = np.array([[1, 5, 4], [3, 2, 1]])
    print("x ", a)
    # sort along the last axis
    print("np.sort(a) ", np.sort(a))
    # sort the flattened array
    print(" np.sort(a, axis=None) ", np.sort(a, axis=None))
    # sort along the first axis
    print(" np.sort(a, axis=0) ", np.sort(a, axis=0))

    # perform an indirect sort using a sequence of keys
    surnames = ('Hertz', 'Galilei', 'Hertz')
    first_names = ('Heinrich', 'Galileo', 'Gustav')
    ind = np.lexsort((first_names, surnames))
    print("ind", ind)
    rs = [surnames[i] + ", " + first_names[i] for i in ind]
    print("rs", rs)

    # return the indices that would sort the array
    x = np.array([3, 1, 2])
    idx = np.argsort(x)
    print("np.argsort(x)", idx)
    # use those indices to build the sorted array
    rs = [x[i] for i in idx]
    print("rs", rs)

    # return a copy of the array sorted along the first axis
    x = np.array([3, 1, 2])
    print("np.msort(x)", np.msort(x))

    y = np.sort_complex([5, 3, 6, 2, 1])
    print("y", y)

    # numpy.argmax returns the indices of the maximum values along an axis
    a = np.arange(6).reshape(2, 3)
    print("a", a)
    print("np.argmax(a)", np.argmax(a))

    # numpy.nanargmax ignores NaNs
    a = np.array([[np.nan, 4], [2, 3]])
    print("np.nanargmax(a)", np.nanargmax(a))

    a = np.arange(6).reshape(2, 3)
    print("a", a)
    print("np.argmin(a)", np.argmin(a))

    # find the indices of array elements that satisfy a condition, grouped by element
    x = np.argwhere(x > 1)
    print("x", x)
    x = np.where(x > 1)
    print("x", x)

    # find the indices where elements should be inserted to maintain order
    x = np.searchsorted([1, 2, 3, 4, 5], 3)
    print("x", x)
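# --- Added check (not from the original source) -----------------------------
# The argsort/fancy-indexing relationship used above, as a plain NumPy
# self-check with no extra assumptions:
import numpy as np

x = np.array([3, 1, 2])
idx = np.argsort(x)
assert np.array_equal(x[idx], np.sort(x))   # indexing by argsort reproduces np.sort
print(x[idx])                               # [1 2 3]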
def find_docs_by(self, q: Query) -> np.ndarray:
    binmat = self.matrix
    docs = binmat.columns.to_numpy()
    rows = binmat.index
    ones = np.ones(len(docs), dtype=np.bool_)
    zeroes = np.zeros(len(docs), dtype=np.bool_)
    vectors = [
        binmat.loc[term] if term in rows else zeroes
        for term in get_terms(q)
    ]
    intersection = reduce(np.logical_and, vectors, ones)
    return np.msort(docs[intersection == True])
def AALERGIA(dffa, alpha, dpfa_orig):
    dffa_merged = copy.deepcopy(dffa)
    initial_state = dffa.initial_state
    dffa_merged.RED = np.append(dffa_merged.RED, initial_state)
    initial_blue_states = dffa.frequency_transition_matrix[0][initial_state][:]
    initial_blue_states = initial_blue_states[np.nonzero(initial_blue_states)]
    dffa_merged.BLUE = np.append(dffa_merged.BLUE, initial_blue_states)
    while len(dffa_merged.BLUE) > 0:
        dffa_merged.BLUE = np.msort(dffa_merged.BLUE)
        q_b = dffa_merged.BLUE[0]
        dffa_merged.BLUE = dffa_merged.BLUE[1:len(dffa_merged.BLUE)]
        promote = 1
        labels = dffa.state_labels[dffa_merged.RED]
        state_labels_q_b = dffa_merged.state_labels[q_b]
        label_index = findall(labels, state_labels_q_b)
        for i in range(0, len(label_index)):
            q_r = dffa_merged.RED[label_index[i]]
            thresh = calculate_compatible_parameter(dffa, q_r, q_b, alpha)
            if (AAlergia_compatible(dffa, dpfa_orig, q_r, q_b, 1, 1, alpha, thresh)):
                dffa_merged = AAlergia_merge(dffa_merged, q_r, q_b)
                promote = 0
                break
        if promote == 1:
            dffa_merged.RED = np.append(dffa_merged.RED, q_b)
            # build new blue set
            qr_succ = dffa_merged.frequency_transition_matrix[0][dffa_merged.RED][:]
            qr_succ = qr_succ[np.nonzero(qr_succ)]
            # gets the new blue data (returns data that is not in the intersection)
            difference = np.setxor1d(qr_succ, np.append(dffa_merged.RED, dffa_merged.BLUE))
            ia = np.empty(0, dtype=int)
            for i in range(0, len(qr_succ)):
                for j in range(0, len(difference)):
                    if (difference[j] == qr_succ[i]):
                        ia = np.append(ia, qr_succ[i])
            dffa_merged.BLUE = np.append(dffa_merged.BLUE, ia)
    return dffa_merged
def test_dip(data, alpha=0.05, verbose=True) -> bool:
    import unidip.dip as dip
    # sort data
    data = np.msort(data)
    # test
    stat, p, _ = dip.diptst(data)
    # display
    if verbose:
        print('stat=%.3f, p=%.3f' % (stat, p))
    if p > alpha:
        print('Probably unimodal')
    else:
        print('Probably not unimodal.')
    return p > alpha
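# --- Added usage sketch (not from the original source) ----------------------
# A hypothetical call of test_dip above on a synthetic two-peak sample; it
# assumes the unidip package is installed and a NumPy version that still
# provides np.msort. The expected verdict is "Probably not unimodal."
import numpy as np

rng = np.random.default_rng(1)
sample = np.concatenate([rng.normal(-3, 1, 500), rng.normal(3, 1, 500)])
test_dip(sample, alpha=0.05, verbose=True)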
def __init__(self, dat, is_hist=False, alpha=0.05, ntrials=100, mrg_dst=1, debug=False):
    self.dat = np.msort(np.array(dat)) if not is_hist else np.array(dat)
    self.is_hist = is_hist
    self.alpha = alpha
    self.ntrials = ntrials
    self.mrg_dst = mrg_dst
    self.debug = debug
def isbimodal(data, method):
    if method == 'hdt':
        # Use Hartigan's dip statistic to decide if the distribution deviates from unimodality.
        _, pval, _ = diptst(np.msort(data))
        return (pval is not None) and (pval <= 0.05)
    else:
        # Compare the Bayesian Information Criterion of two Gaussian mixture models.
        X = data.reshape(-1, 1)
        gmm2 = mixture.GaussianMixture(n_components=2, covariance_type='full').fit(X)
        gmm1 = mixture.GaussianMixture(n_components=1, covariance_type='full').fit(X)
        return gmm2.bic(X) <= gmm1.bic(X)
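# --- Added usage sketch (not from the original source) ----------------------
# A hypothetical call of isbimodal above on synthetic data; it assumes that
# `diptst` and scikit-learn's `mixture` are importable exactly as in the
# snippet, and a NumPy version that still provides np.msort.
import numpy as np

rng = np.random.default_rng(2)
unimodal = rng.normal(size=400)
bimodal = np.concatenate([rng.normal(-4, 1, 200), rng.normal(4, 1, 200)])
print(isbimodal(unimodal, method='hdt'))   # expected: False
print(isbimodal(bimodal, method='hdt'))    # expected: True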
def lizFindCirclesGrid(circles):
    """
    Passed an array of circles with no false positives, return an array of
    the most likely 6x7 grid centers in raster order.
    """
    # generate row and column delimiting values
    cxs = []
    cys = []
    for i in circles:
        cxs.append(i[0])
        cys.append(i[1])
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    retval_x, bestlabels_x, centers_kmeans_x = cv2.kmeans(
        np.array(np.float32(cxs)), 7, criteria, 10, cv2.KMEANS_PP_CENTERS
    )
    retval_y, bestlabels_y, centers_kmeans_y = cv2.kmeans(
        np.array(np.float32(cys)), 6, criteria, 10, cv2.KMEANS_PP_CENTERS
    )
    # we should see 7 groups in x and 6 groups in y,
    # delimited by a jump of one piece size in width (~30-50 pixels)
    centers_kmeans_x = np.msort(centers_kmeans_x)
    centers_kmeans_y = np.msort(centers_kmeans_y)
    fullrow = []
    fullcol = []
    for i in centers_kmeans_x:
        fullcol.append(int(i))
    for j in centers_kmeans_y:
        fullrow.append(int(j))
    finalout = []
    # finalout is 42 possible pairs
    for i in range(5, -1, -1):
        for j in range(7):
            finalout.append([fullrow[i], fullcol[j]])
    return finalout, centers_kmeans_x, centers_kmeans_y
def eg3_10():
    """
    3.10 Hands-on practice: simple statistical analysis

    We could remove outliers with some threshold, but there is a better tool: the
    median. Arrange the values in order; the number sitting in the middle of that
    sequence is the median. For example, for the five values 1, 2, 3, 4, 5 the
    median is the middle number, 3. The steps for computing the median follow.
    """
    # (1) Compute the median of the closing prices. Create a new Python script named
    # simplestats.py. You already know how to read data from a CSV file into an array,
    # so just copy that one line of code and make sure it loads only the closing prices:
    c = np.loadtxt('output/data.csv', delimiter=',', usecols=(6, ), unpack=True)
    print c

    # (2) A function called median will find the median for us. Call it and print the
    # result right away:
    print "median =", np.median(c)

    # (3) Since this is our first use of the median function, let's check whether the
    # result is correct -- not because we are paranoid! Of course we could scan the whole
    # data file and find the answer by hand, but that would be no fun. Instead, we sort
    # the price array and print the value in the middle, simulating the algorithm for
    # finding the median. The msort function does the first step: call it, get the sorted
    # array, and print the result.
    sorted_close = np.msort(c)
    print "sorted =", sorted_close

    # Great, it works! Now grab the number in the middle:
    N = len(c)
    print "middle =", sorted_close[(N - 1) / 2]

    """
    # The book has a misprint here, hence this test of the current implementation.
    Wrong example:   print "middle =", sorted[(N - 1)/2]
    Correct version: print "middle =", sorted_close[(N - 1)/2]

    tmp = [1, 2, 3, 4, 5, 6]
    print np.msort(tmp)
    print "median = ", np.median(tmp)
    print "middle = ", tmp[(len(tmp) - 1) / 2]
    """

    # (4) Hmm, this value differs from what the median function returned -- how come?
    # On closer inspection, the value returned by median does not even appear in our data
    # file. Even stranger! Before filing a bug report with the NumPy team, let's read the
    # documentation. The mystery is easily solved: our simple simulation only works for
    # arrays of odd length. For arrays of even length, the median is the average of the
    # two middle values. So type the following:
    print "average middle =", (sorted_close[N / 2] + sorted_close[(N - 1) / 2]) / 2

    # (5) Another statistic we care about is the variance, which measures how much a
    # variable fluctuates. In our case it also tells us how risky an investment is --
    # stocks whose prices swing too wildly are bound to cause trouble for their holders.
    # In NumPy, computing the variance takes a single line:
    print "variance =", np.var(c)

    # (6) Since we don't just take NumPy's functions on faith, let's verify the result
    # against the definition of variance from the documentation. Note that this
    # definition may differ from the one in your statistics textbook, but it is the more
    # common one in statistics.
    print "variance from definition =", np.mean((c - c.mean())**2)
def calMedian(filename) :
    c = np.loadtxt(filename, delimiter=',', skiprows=1, usecols=(title['TCLOSE'],), unpack=True)
    #print c
    print "median =", np.median(c)
    sorted_close = np.msort(c)
    print "sorted_close =", sorted_close
    N = len(c)
    print "middle =", sorted_close[(N-1)/2]
    print "variance =", np.var(c)
    print "variance from definition =", np.mean((c-c.mean())**2)
def paDifUniform( outfile ) :
    ntrials = 100000
    nsamples = 1000
    pdif = numpy.zeros( ntrials, dtype=float )
    for n in range( 0, ntrials ) :
        [pa1,x,y,z] = paRandom()
        [pa2,x,y,z] = paRandom()
        pdif[n] = abs(pa1 - pa2)
        if pdif[n] > 90. :
            pdif[n] = 180. - pdif[n]
    pdifSorted = numpy.msort(pdif)
    nskip = ntrials/nsamples
    fout = open( outfile, "w" )
    for n in range( 0, ntrials, nskip ) :
        fout.write("%10.3f %10.3f\n" % (pdifSorted[n], float(n)/float(ntrials) ) )
    fout.write("%10.3f %10.3f\n" % (pdifSorted[ntrials-1], 1.0 ) )
def dip_test(properties, cluster_members, feature=3):
    data = []
    for ii in range(cluster_members):
        data.append(float(properties[ii][feature]))
    data = np.array(data)
    data = np.msort(data)
    intervals = dip.diptst(data)
    t_range = np.linspace(0, 0.15, 200)
    kde = gaussian_kde(data)
    plt.plot(t_range, kde(t_range)/100)
    plt.grid()
    plt.title('Pick PDF' + '\n p_value = ' + str(intervals[1]))
    plt.show()
def close_peaks(ind, right):
    # Tests whether two peaks are too close to have empty space in between. If so,
    # the second peak is removed, since it does not indicate another storey or wall.
    ind = np.msort(ind)      # necessary to get real distances
    close_peaks = False
    distance = []            # list of bin distances between consecutive peaks
    for i in range(len(ind)-1):
        distance.append(ind[i+1]-ind[i])
    temp = []                # indices of items to delete because their distance is too low
    for i in distance:
        if i <= right:       # the right parameter identifies it as one wall
            temp.append(distance.index(i)+1)
            close_peaks = True
    if close_peaks is True:
        print 'two close peaks --> one floor'
        return np.delete(ind, temp)   # second peak deleted, continue with new ind; should right be made bigger???
    else:
        return None
def dip_test(properties, cluster_members, feature=3):
    data = []
    outliers = [u'121010', u'091110', u'091109', u'091106']
    for ii in range(cluster_members):
        if not properties[ii][0] in outliers:
            data.append(float(properties[ii][feature]))
    data = np.array(data)
    data = np.msort(data)
    intervals = dip.diptst(data)
    t_range = np.linspace(0, 0.2, 200)
    kde = gaussian_kde(data)
    plt.plot(t_range, kde(t_range) / 100)
    plt.grid()
    plt.title('Distance PDF' + '\n p_value = ' + str(intervals[1]))
    plt.show()
def historical_VaR_ES_Calculation(df, window_years, output_years, var_per, es_per, day_num):
    outputdf = pd.DataFrame()
    outputdf['DATE'] = df['DATE'][:252*output_years]
    npts = 252*window_years
    npaths = npts - day_num
    ntrails = 252*output_years
    port_n_rtn = np.log(df['port_cur_value']/(df['port_cur_value'].shift(-5)))
    port_value = 10000 * np.exp(port_n_rtn)
    scenario = []
    for i in range(ntrails):
        scenario.append(port_value[i:i+npaths-1])
    # sort the matrix by column
    scenario = np.msort(scenario)
    for i in range(252*output_years):
        # VaR
        outputdf.ix[i, 'Historical_VaR_'] = df.ix[i, 'port_cur_value'] - scenario[int(np.ceil((1-var_per)*npaths)), :]
        # ES
        outputdf.ix[i, 'Historical_ES_'] = df.ix[i, 'PORT_CurrentValue'] - np.mean(scenario[1:int(np.ceil((1-es_per)*npaths)), :])
def sort_string_wfs(prefixes):
    length_array = np.zeros(len(prefixes))
    for i in range(0, len(prefixes)):
        length_array[i] = prefixes[i].count(',')
    length_array_u = np.unique(length_array)
    string_set_new = np.empty(0, dtype=str)
    cursor = 0
    IX = np.empty(0, dtype=int)
    for i in range(0, len(length_array_u)):
        len_s = length_array_u[i]
        indices = np.where(length_array == len_s)
        prefixes_to_sort = prefixes[indices]
        s_temp = np.msort(prefixes_to_sort)
        IXargsort = prefixes_to_sort.argsort()
        for j in range(0, len(s_temp)):
            string_set_new = np.append(string_set_new, s_temp[j])
            IX = np.append(IX, indices[0][IXargsort[j]])
        cursor = cursor + len(indices)
    return string_set_new, IX
def find_peaks_and_shoulders(self, x_window: tuple = None, slope_threshold: float = -0.00001,
                             amp_threshold: float = 0.1, _window: int = None,
                             _conv_threshold: int = None):
    conv_threshold = 3
    if _conv_threshold is not None:
        conv_threshold = _conv_threshold

    _peaks = self.find_peaks(x_window=x_window, slope_threshold=slope_threshold,
                             amp_threshold=amp_threshold, _window=_window)
    _shoulders = self.find_shoulders(x_window=x_window, slope_threshold=slope_threshold,
                                     amp_threshold=amp_threshold, _window=_window)

    result = list(_peaks)
    for shoulder in _shoulders:
        acceptable = []
        for peak in _peaks:
            if abs(peak - shoulder) > conv_threshold:
                acceptable.append(True)
            else:
                acceptable.append(False)
                break
        if np.product(acceptable):
            result.append(shoulder)
    return np.msort(np.array(result))
def get_QQ_vals(data1, data2):
    """Align 2 datasets' eCDFs for plotting on a QQ-plot."""
    vals1, CDF1 = get_eCDF(data1, extend=True)
    vals2, CDF2 = get_eCDF(data2, extend=True)

    joint_vals = np.msort(np.unique(np.hstack((vals1, vals2))))
    joint_CDF1 = np.zeros_like(joint_vals)
    joint_CDF2 = np.zeros_like(joint_vals)

    id1, id2 = 0, 0
    for ii, val in enumerate(joint_vals):
        joint_CDF1[ii] = CDF1[id1]
        if (val in vals1) and (id1 + 1 < len(vals1)):
            id1 += 1
        joint_CDF2[ii] = CDF2[id2]
        if (val in vals2) and (id2 + 1 < len(vals2)):
            id2 += 1

    return joint_vals, joint_CDF1, joint_CDF2
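# --- Added usage sketch (not from the original source) ----------------------
# How the aligned eCDFs might be fed to a QQ-style plot. This assumes that
# get_eCDF is the eCDF helper shown earlier (calculate_eCDF, under that name),
# that matplotlib is available, and a NumPy version that still provides
# np.msort. The data are synthetic.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(3)
d1 = rng.normal(size=300)
d2 = rng.normal(0.5, 1.2, size=300)
joint_vals, cdf1, cdf2 = get_QQ_vals(d1, d2)
plt.plot(cdf1, cdf2)              # curve of one eCDF against the other
plt.plot([0, 1], [0, 1], "k--")   # identity line for reference
plt.show()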
def common_fun():
    # identity matrix
    i2 = np.eye(4)
    print(i2)
    # save the data to a file
    np.savetxt('eye.txt', i2)

    # read the file
    c, v = np.loadtxt('000032.csv', skiprows=1, delimiter=',', usecols=(4, 5), unpack=True)
    # print(c)

    # volume-weighted average price (VWAP)
    vwap = np.average(c, weights=v)
    print('vwap = ', vwap)

    # arithmetic mean
    wap = np.mean(c)
    print('arithmetic mean: ', wap)

    # time-weighted average price
    print(c.size)
    t = np.arange(c.size)
    twap = np.average(c, weights=t)
    print('twap = ', twap)

    # price range (max - min), i.e. the peak-to-peak spread
    print('middle is ', np.max(c) - np.min(c))
    print('middle is ', np.ptp(c))

    # median
    median = np.median(c)
    print('median: ', median)

    # sorted copy of the array
    sorted_c = np.msort(c)
    print('sorted_c:', sorted_c[c.size // 2])
    print('sorted_c:', sorted_c[(c.size - 1) // 2])

    # variance
    variance = np.var(c)
    print('variance is ', variance)
def init(mode='r'):
    """
    Generating the starting positions of the vehicles.
    Initializing the starting velocities to 0.
    mode: 'r', 'e', or 'fk' (random, equidistant, first k positions)
    """
    starting_vel = np.zeros(vehicles, int)
    if mode == 'r':
        starting_pos = np.msort(
            np.random.choice(range(road_zones), vehicles, replace=0))
    elif mode == 'e':
        step = road_zones // vehicles
        # print(step)
        starting_pos = np.arange(0, road_zones, step)[:vehicles]
    elif mode == 'fk':
        starting_pos = np.arange(vehicles)
    # print(starting_pos)
    if verbosity:
        print("Initialization complete...")
    return (starting_pos, starting_vel)
# 3.7 Computing the median and variance of an array
import numpy as np
'''
Computing the median and variance of an array
1. Median:   median(a)  returns the median of array a
2. Sorting:  msort(a)   sorts array a in ascending order and returns the sorted array
3. Variance: var(a)     returns the variance of array a
'''
a = np.array([1, 3, 6, 2, 12, 43, 23, 12, 13, 20, 90, 78, 54])
b = np.median(a)
c = np.msort(a)
d = np.var(a)
print(a)
print(b)
print(c)
print(d)
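# --- Added check (not from the original source), continuing the snippet above:
# for an odd-length array, the median equals the middle element of the sorted copy.
middle = c[(len(a) - 1) // 2]
print(middle)        # 13
assert middle == b   # agrees with np.median(a)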
plt.plot(a[mask], b[mask], 'bo')
mask = (b >= 0) & (a <= np.pi / 2)
plt.plot(a[mask], b[mask], 'go')
plt.show()'''

"the where function"
b = np.where(a < 50)
print(b)

"the full function: a 2x2 array filled entirely with 3"
np.full((2, 2), 3)

"np.ptp(x) computes the peak-to-peak range (max - min) of x"
x = 25, 56
a = np.array([1, 23, 45, 56])
list = [1, 60, 56]
print(np.ptp(x))    # works on tuples and arrays
print(np.min(a))    # works on arrays
print(a.min())      # the x.min() method only exists on arrays, not on lists
# print(list.min())  error!!!

'''To iterate over a tuple or list, instead of
for index in range(len(x)): print(x[index]),
use: for val in enumerate(x): print(val)'''
c, v = np.loadtxt('data.csv', delimiter=',', usecols=(4, 5), unpack=True)
print(np.ptp(c))
print(np.max(c))
print(np.median(c))  # median
print(np.msort(c))
print(np.var(c))     # variance = np.mean((c - c.mean())**2)
def dip_test(median_FL_data, total_data, alpha=0.05, save_figure=False): """ Perform a Hartigan's dip test to check for unimodality in clusters and splits clusters if bimodality is found. This function will take the highest intensity channel for each cluster and check for bimodality to correct for errors in clustering similar fluorescence profiles. Changing alpha will alter how stringent the dip test is. A higher alpha will result in higher detection of bimodality, but runs a greater risk of false identification. It is important to note that this dip test is relatively coarse grained and will not identify very slight populations of mixed cells (e.g. 10 orange cells clustered with 1000 red cells). Returns an updated clustering of the primary clustering after performing a dip test. Parameters ---------- median_FL_data : dict, clustering data generated by 'flowsym.cluster' function total_data : other fluorescence profiles for which errors will be corrected alpha : how stringent the dip test is save_figure : Save generated bar chart showing the number of cells in each cluster and a heat map of the median fluorescence intensity in each channel for each cluster. Figure is saved using 'matplotlib' module. Returns ------- change_dict : a dictory containing the corection that must be applied to similar fluorescence profiles if bimodality is found. See Also -------- cluster, dip_test Examples -------- dip_test(median_FL_data, total_data, alpha=0.08, save_figure=False) """ # Create a copy of the dictionary so we can retain the original clustering data change_dict = deepcopy(total_data) # Make kde plots if 'Cluster 0' in median_FL_data.keys(): fig, ax = plt.subplots(1, len(median_FL_data.keys()) - 1, figsize=(12, 3)) else: fig, ax = plt.subplots(1, len(median_FL_data.keys()), figsize=(12, 3)) # Keep track of what plot we're on i = 0 # Get the index of the max fluorescence for each cluster for key, value in median_FL_data.items(): cluster_max_FL_index = np.argmax(value) # As long as we aren't cluster one, do our dip test and plot if int(key[-1]) - 1 != -1: search_key = int(key[-1]) - 1 # Intensity in each cluster where the intensity is max dat = [row[cluster_max_FL_index] for row in total_data[search_key]] # Do the dip test data = np.msort(dat) intervals = UniDip(data, alpha=alpha).run() print("Performing dip test on cluster " + str(search_key + 1) + " ... ") # Show on the graph where the intervals are for j in intervals: ax[i].axvspan(data[j[0]], data[j[1]], color='lightblue', alpha=0.4) for q in j: ax[i].axvline(data[q], color='red') # Split the clusters that failed the dip test into separate clusters if len(intervals) > 1: split_point = int(np.mean([intervals[0][1], intervals[1][0]])) clust1 = data[:split_point] clust2 = data[split_point:] # Reset current cluster number to cluster 1 and make a new cluster to the dictionary print("Identified bimodality in cluster " + str(search_key + 1) + ", reclustering data ... 
") change_dict[max(total_data.keys()) + 1] = [ row for row in total_data[search_key] if row[cluster_max_FL_index] in clust2 ] change_dict[search_key] = [ row for row in total_data[search_key] if row[cluster_max_FL_index] in clust1 ] # Plot data sns.kdeplot(data, ax=ax[i], color='black') ax[i].set(title='Cluster ' + str(search_key + 1), xlabel='FL ' + str(cluster_max_FL_index + 1), yticks=[]) # Move to the next plot i += 1 plt.tight_layout() # save first figure of the dip test if save_figure: plt.savefig("Dip_test_example") final_reclustered = {} # Make a new dictionary which will have the median value for each channel in the vector for a heatmap downstream for key, value in change_dict.items(): med_values = [] for i in range(len(value[0])): med_values.append(np.median([row[i] for row in value])) final_reclustered["Cluster " + str(key + 1)] = med_values search = np.random.choice(list(median_FL_data.keys())) cols = ['FL' + str(i + 1) for i in range(len(median_FL_data[search]))] # Dataframe to create heatmap reclustered_df = pd.DataFrame(final_reclustered, index=cols) # Counts dictionary for barchart reclustered_counts = {} for key, value in change_dict.items(): reclustered_counts[key] = len(value) # Replot the new clusters print("Plotting reclustered data ...") fig2, ax = plt.subplots(1, 2, figsize=(10, 4)) sns.heatmap(reclustered_df.transpose(), cmap='copper') reclust = [] recount = [] for key, value in reclustered_counts.items(): reclust.append(int(key) + 1) recount.append(value) rey_pos = np.arange(len(reclust)) ax[0].bar(rey_pos, recount, color='black') ax[0].set_xticks(rey_pos) ax[0].set_xticklabels(reclust) ax[0].set_xlabel('Cluster') ax[0].set_ylabel('Counts') ax[0].set_title('Cells per cluster') ax[1].set_title('Fluorescence profile of clusters') ax[1].set_xlabel('Fluorescence channel') plt.yticks(rotation=0) plt.tight_layout() if save_figure: plt.savefig("reclustered_after_dip_test") return change_dict
#!/usr/bin/python
import numpy

c = numpy.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)

print "median =", numpy.median(c)

sorted = numpy.msort(c)
print "sorted =", sorted

N = len(c)
print "middle =", sorted[(N - 1)/2]
print "average middle =", (sorted[N /2] + sorted[(N - 1) / 2]) / 2

print "variance =", numpy.var(c)
print "variance from definition =", numpy.mean((c - c.mean())**2)
def predict_label(self, X):
    score = self.predict(X)
    score_temp = np.msort(score)
    threshold = score_temp[score.size - int(score.size * self.nu)]
    return np.where(score > threshold, -1, 1)
step = (r_max - r_min) / max_step
const_2 = -1.0 / step**2
const_1 = -2.0 * const_2
orb_factor = orb_l * (orb_l + 1)

# Calculate array of potential values
V = numpy.zeros(max_step+1, numpy.double)
for i in xrange(max_step+1):
    r = r_min + i*step
    V[i] = potential(r) + orb_factor/r**2   # (include centrifugal term)

# Calculate elements of the matrix to be diagonalized
d = numpy.zeros(max_step, numpy.double)              # diagonal elements
e = numpy.zeros(max_step, numpy.double)              # off-diagonal elements
z = numpy.zeros((max_step, max_step), numpy.double)  # matrix for eigenvectors
                                                     # (only used as a dummy argument to
                                                     # tqli() in the current version of this program)
for i in xrange(max_step):
    d[i] = const_1 + V[i+1]
    e[i] = const_2
    z[i,i] = 1   # (identity matrix)

# Diagonalize and obtain eigenvalues. The eigenvalues are stored in d.
computationalLib.pylib(cpp=False).tqli(d, e, z)

# Sort the eigenvalues (smallest to largest)
d = numpy.msort(d)

# Output to file
output()
RDs[index] = [all_list.count('A'), all_list.count('C'), all_list.count('G'), all_list.count('T')]

### work out MQ0 and mean mapping quality for the snp (not RMSE)
MQ0[index] = map_list.count(0)
try:
    MQm[index] = sum(map_list)/float(len(map_list))
except:
    MQm[index] = 0.0

### if minimum read depth is met, try calling the genotype
if len(var_list) >= min_RD:
    GLs[index] = geno_caller_10GT_aDNA(var_list)        ## ancient-DNA-aware calculation of genotype likelihoods
    GTs[index] = np.argmax(GLs[index])                  ## record best genotype
    PLs[index] = (GLs[index]-np.max(GLs[index]))*-10    ### calculate Phred-scale values
    GQs[index] = np.msort(PLs[index])[1]                ### record genotype quality

    ### if GQ is less than threshold and site is heterozygous, switch to best homozygous genotype
    if (alt_dic[GTs[index]][0] != alt_dic[GTs[index]][1]) and (GQs[index] < GQ):
        GTs[index] = homs[np.argmax(GLs[index][homs])]
        SWGQ[index] = 1

    ### Work out Bayesian QUALity score
##    LL_0=np.sum(10**GLs[index][LL0_map[REFs[index]]])
##    LL_1=np.sum(10**GLs[index][LL1_map[REFs[index]]])
##    LL_2=np.sum(10**GLs[index][LL2_map[REFs[index]]])
##    norconst1=sum([LL_0,LL_1,LL_2]) ### Depristo says this, but it really should be multiplied by the prior
##    norconst2=sum([LL_0*theta_prior[0],LL_1*theta_prior[1],LL_2*theta_prior[2]]) ### Depristo says this, but it really should be multiplied by the prior
##
##    Pr0=(theta_prior[0]*LL_0)/norconst2
##    Pr1=(theta_prior[1]*LL_1)/norconst2
import numpy as np

a = np.array([1, 2])
np.msort(a)
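# --- Added note (not from the original source) ------------------------------
# np.msort(a) is documented as equivalent to np.sort(a, axis=0). msort was
# deprecated in NumPy 1.24 and removed in NumPy 2.0, so on current NumPy the
# portable spelling of the call above is np.sort(a, axis=0):
import numpy as np

a = np.array([[3, 1, 2], [1, 7, 0]])
print(np.sort(a, axis=0))   # sorts each column, matching what msort returned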
def FFT_Stepcount(Quality_Acc,T_convert,window,Cutoff): Len = len(Quality_Acc) Firstwindow_stepdetection = 0 count = [] count2 = [] count3 = [] frequency = numpy.zeros((1,5)) portion = 0.0 Box = [] Box2 = [] power = numpy.array([]) Time = [] for i in range(0,Len-window,window): if i==0: Time = T_convert[i] else: Time = numpy.vstack([Time,T_convert[i]]) for n in range(0,window): if Quality_Acc[i+n] < 66 and Quality_Acc[i+n+1]>66: Firstwindow_stepdetection += 1 if Firstwindow_stepdetection < 8: count.append(0) else: for j in range(i,i+window-38,38): A = numpy.msort(numpy.array(Quality_Acc[j:j+39])) #print A if A[-6]<66: pass else: portion = float(portion)+float(38.0/window) T = T_convert[i+window]-T_convert[i] p = numpy.abs(numpy.fft.fft(Quality_Acc[i:i+window+1])/(window/2.0)) p = p[0:window/2]**2 freq = numpy.arange(0,window/2)/T freq = numpy.transpose(freq) Power_and_Freq = numpy.c_[p,freq] Power_and_Freq = Power_and_Freq[Power_and_Freq[:,0].argsort()] if Power_and_Freq[0,1]>Cutoff: Power_and_Freq[0,1]=0 count.append(Power_and_Freq[-2,1]*T*portion) Box2.append(portion) portion=0.0 Box.append(Firstwindow_stepdetection) Firstwindow_stepdetection=0 Power_and_Freq = [] freq = [] p = [] count = numpy.transpose(count) count = numpy.sum(count) return count
def dip(histogram=None, idxs=None): """ Compute the Hartigans' dip statistic either for a histogram of samples (with equidistant bins) or for a set of samples. """ if idxs is None: idxs = np.arange(len(histogram)) elif histogram is None: h = collections.Counter(idxs) idxs = np.msort(h.keys()) histogram = np.array([h[i] for i in idxs]) else: if len(histogram) != len(idxs): raise ValueError("Need exactly as many indices as histogram bins.") if len(idxs) != len(set(idxs)): raise ValueError("idxs must be unique if histogram is given.") if not np.array_equal(np.msort(idxs), idxs): idxs_s = np.argsort(idxs) idx = np.asarray(idxs)[idxs_s] histogram = np.asarray(histogram)[idxs_s] cdf = np.cumsum(histogram, dtype=float) cdf /= cdf[-1] work_idxs = idxs work_histogram = np.asarray(histogram, dtype=float) / np.sum(histogram) work_cdf = cdf D = 0 left = [0] right = [1] while True: left_part, left_touchpoints = _gcm_(work_cdf - work_histogram, work_idxs) right_part, right_touchpoints = _lcm_(work_cdf, work_idxs) d_left, left_diffs = _touch_diffs_(left_part, right_part, left_touchpoints) d_right, right_diffs = _touch_diffs_(left_part, right_part, right_touchpoints) if d_right > d_left: xr = right_touchpoints[d_right == right_diffs][-1] xl = left_touchpoints[left_touchpoints <= xr][-1] d = d_right else: xl = left_touchpoints[d_left == left_diffs][0] xr = right_touchpoints[right_touchpoints >= xl][0] d = d_left left_diff = np.abs(left_part[:xl+1] - work_cdf[:xl+1]).max() right_diff = np.abs(right_part[xr:] - work_cdf[xr:] + work_histogram[xr:]).max() if d <= D or xr == 0 or xl == len(work_cdf): the_dip = max(np.abs(cdf[:len(left)] - left).max(), np.abs(cdf[-len(right)-1:-1] - right).max()) return the_dip/2, (cdf, idxs, left, left_part, right, right_part) else: D = max(D, left_diff, right_diff) work_cdf = work_cdf[xl:xr+1] work_idxs = work_idxs[xl:xr+1] work_histogram = work_histogram[xl:xr+1] left[len(left):] = left_part[1:xl+1] right[:0] = right_part[xr:-1]
vwap = np.average(close_price, weights=volume)
# time-weighted average price
twap = np.average(close_price, weights=(np.arange(len(close_price))))
# arithmetic mean
mean = np.mean(close_price)

# 3. get the highest and lowest prices
h, l = np.loadtxt('data.csv', delimiter=',', usecols=(4, 5), unpack=True)
hightest = np.max(h)
lowest = np.min(l)

# 4. compute the median of the closing prices
median = np.median(close_price)
# sort
sorted_closing = np.msort(close_price)
# check whether the number of elements is odd or even
N = len(close_price)
median_ind = (N-1)//2
if (N & 0x1):
    median_sorted_closing = sorted_closing[median_ind]
else:
    median_sorted_closing = (sorted_closing[median_ind] + sorted_closing[median_ind+1])/2

# 5. compute the variance
variance = np.var(close_price)
# variance from the definition
variance_from_definition = np.mean((close_price - close_price.mean())**2)
print "twap =", np.average(c, weights=t) #寻找最大值和最小值 h,l=np.loadtxt('data.csv', delimiter=',', usecols=(4,5), unpack=True) print "highest =", np.max(h) print "lowest =", np.min(l) print (np.max(h) + np.min(l)) /2 print "Spread high price", np.ptp(h) print "Spread low price", np.ptp(l) #统计分析 c=np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True) print "median =", np.median(c) sorted = np.msort(c) print "sorted =", sorted N = len(c) print "middle =", sorted[(N - 1)/2] print "average middle =", (sorted[N /2] + sorted[(N - 1) / 2]) / 2 print "variance =", np.var(c) print "variance from definition =", np.mean((c - c.mean())**2) #股票收益率 c=np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True) returns = np.diff( c ) / c[ : -1] print "Standard deviation =", np.std(returns)
import numpy as np

c = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
print "median =", np.median(c)

sorted = np.msort(c)
print "sorted =", sorted

N = len(c)
print "middle =", sorted[(N - 1)/2]
print "average middle =", (sorted[N /2] + sorted[(N - 1) / 2]) / 2

print "variance =", np.var(c)
print "variance from definition =", np.mean((c - c.mean())**2)
#
# basic functions
#
############################################################
import numpy as np

np.set_printoptions(precision=3)
a = np.array( np.random.random(12) * 100, dtype="int" ).reshape(3,4)
print(a)

# perform operations along a specified axis
print(np.average(a, axis=0))
print(np.average(a, axis=1))

# sort data
print(np.msort(a))
print(np.sort(a, axis=0))
print(np.sort(a, axis=1))

# insert elements: insert(array, positions, items)
print(a)
b = np.insert(a, (1,5,9), (-1,-2,-3) ); print(b)
b = np.insert(a, (1,5,9), (-1,-2,-3) ).reshape(3,5); print(b)
b = np.append(a, (-1,-2,-3,-4) ).reshape(4,4); print(b)
b = np.round_(a ** 1.1, 2); print(b)
def find_periodicity(self, func, par, fitmethod='bfgs', nchain = 10, niter = 5000, nsim = 1000, covfactor = 1.0, parname = None, noise = -1, use_emcee = True, searchfreq = None): """ Find periodicities in observed data and compute significance via MCMCs. First, fit the periodogram with func and compute the maximum-a-posteriori (MAP) estimate. Divide the data by the MAP model; for a perfect data-model fit, the resulting residuals should follow a chi-square distribution with two degrees of freedom. Find the highest power in the residuals and its frequency. Sample the posterior distribution of parameters for func using MCMC, and create fake periodograms from samples of the posterior. For each fake periodogram, find the MAP estimate, divide out the MAP model and find the highest power in that periodogram. Create a posterior distribution of maximum powers and compute a posterior predictive p-value of seeing the maximum power in the data under the null hypothesis (no QPO). Parameters ---------- func : function Parametric model for the periodogram. Needs to be a function that takes an array of frequencies and k parameters, and returns an array of model powers. The function should include a parameter setting a constant background level, and this parameter should be last! par : {list, array-like} Input guesses for the parameters taken by func. The number of elements in this list or array must match the number of parameters k taken by func. fitmethod : string, optional, default "bfgs" Choose the optimization algorithm used when minimizing the -log-likelihood. Choices are listed in mle.py, but the default (bfgs) should be sufficient for most applications. nchain : int, optional, default 10 The number of chains or walkers to use in MCMC. For Metropolis-Hastings, use ~10-20 and many samples For emcee, use as many as you can afford (~500) and fewer samples niter : int, optional, default 5000 Sets the length of the Markov chains. For Metropolis-Hastings, this needs to be large (>10000) For emcee, this can be smaller, but it's a good idea to verify that the chains have mixed. nsim : int, optional, default 1000 The number of simulations to use when computing the posterior distribution of the likelihood ratio. Note that this also sets the maximum precision of the posterior predictive p-value (for 1000 simulations, the p-value can be constrained only to 0.001). covfactor : float, optional, default 1.0 A tuning parameter for the MCMC step. Used only in Metropolis-Hastings. parname : list, optional, default None Include a list of strings here to set parameter names for plotting noise: int, optional, default -1 The index for the noise parameter in func. In the pre-defined models, this index is *always* -1. use_emcee : boolean, optional, default True If True (STRONGLY RECOMMENDED), use the emcee package for running MCMC. If False, use Metropolis-Hastings. """ ## the file name where the output will be stored resfilename = self.namestr + "_findperiodicity_results.dat" ## open the output log file resfile = utils.TwoPrint(resfilename) ### step 1: fit model to observation psfit = mle.PerMaxLike(self.ps, fitmethod=fitmethod, obs=True) fitpars = psfit.mlest(func, par, obs=True, noise=noise, m=self.m) bindict = fitpars['bindict'] #print('popt: ' + str(fitpars['popt'])) ## which posterior do I need to use? 
if self.m == 1: lpost = posterior.PerPosterior(self.ps, func) else: lpost = posterior.StackPerPosterior(self.ps, func, self.m) ### Step 2: Set up Markov Chain Monte Carlo Simulations ### of model 1: mcobs = mcmc.MarkovChainMonteCarlo(self.ps.freq, self.ps.ps, lpost, topt = fitpars['popt'], tcov = fitpars['cov'], covfactor = covfactor, niter=niter, nchain=nchain, parname= parname, check_conv = True, namestr = self.namestr, use_emcee = True, plot=self.plot, printobj = resfile, m = self.m) ### Step 3: create fake periodograms out of MCMCs fakeper = mcobs.simulate_periodogram(nsim = nsim) sim_pars_all, sim_deviance, sim_ksp, sim_fpeak, sim_srat, \ sim_maxpow, sim_merit, sim_y0, sim_s3max, sim_s5max, sim_s11max =[], [], [], [], [], [], [], [], [], [], [] bmax = int(self.ps.freq[-1]/(2.0*(self.ps.freq[1]-self.ps.freq[0]))) bins = [1,3,5,7,10,15,20,30,50,70,100,200,300,500,700,1000] binlist = [r for r in fitpars["bindict"].keys()] nbins = len(binlist)/4 sain = copy.copy(fitpars['popt']) # print('popt2: ' + str(fitpars['popt'])) ### Step 4: Fit fake periodograms: for i,x in enumerate(fakeper): try: # print('popt' + str(i) + 'a : ' + str(fitpars['popt'])) fitfake = mle.PerMaxLike(x, fitmethod=fitmethod, obs=False) # print('popt' + str(i) + 'b : ' + str(fitpars['popt'])) sim_pars = fitfake.mlest(func, sain,obs=False, noise=noise, m=self.m) # print('popt' + str(i) + 'c : ' + str(fitpars['popt'])) sim_pars_all.append(sim_pars) sim_deviance.append(sim_pars['deviance']) sim_ksp.append(sim_pars['ksp']) sim_maxpow.append(sim_pars['maxpow']) sim_merit.append(sim_pars['merit']) sim_fpeak.append(sim_pars['maxfreq']) sim_y0.append(sim_pars['mfit'][sim_pars['maxind']]) sim_srat.append(sim_pars['sobs']) sim_s3max.append(sim_pars['s3max']) sim_s5max.append(sim_pars['s5max']) sim_s11max.append(sim_pars['s11max']) except KeyboardInterrupt: break #except: # print("Simulation failed! Continuing ...") # continue # print('popt' + str(i) + 'd : ' + str(fitpars['popt'])) # print('popt3: ' + str(fitpars['popt'])) ### upper limit is the power in the sorted array where p_maxpow would be 0.05 ### i.e. when only 0.05*nsim simulations are higher than this ### note: sometimes simulations fail, therefore the 5% limit should be 0.05*len(sims) fiveperlim = int(0.05*len(sim_maxpow)) if fiveperlim == 0: resfile('Warning! Too few simulations to compute five percent limit reliably!') fiveperlim = 1 ninetyfiveperlim = len(sim_maxpow) - fiveperlim #print('popt4: ' + str(fitpars['popt'])) bindicts = [x["bindict"] for x in sim_pars_all] ### get out binned powers: maxpows_all = {} binprob = {} for b in bins[:nbins]: binps = fitpars['bindict']['bin'+str(b)] bmaxpow = np.array([x["bmax" + str(b)] for x in bindicts]) maxpows_all["bin"+str(b)] = bmaxpow bindict['sim_bmaxpow' + str(b)] = bmaxpow p_bmaxpow = float(len([x for x in bmaxpow if x > fitpars['bindict']["bmax" + str(b)]]))/float(len(bmaxpow)) bindict["p_maxpow" + str(b)] = p_bmaxpow bmaxpow_err = np.sqrt(p_bmaxpow*(1.0-p_bmaxpow)/float(len(bmaxpow))) bindict['p_maxpow' + str(b) + 'err'] = bmaxpow_err sim_bmaxpow_sort = np.msort(bmaxpow) ### note: this is the limit for 2*I/S --> multiply by S to get powers for each frequency ### Like everything else, this is n-trial corrected! 
#print('len(bmaxpow_sort) : ' + str(len(sim_bmaxpow_sort))) resfile('ninetyfiveperlim: ' + str(ninetyfiveperlim)) bmaxpow_ul = sim_bmaxpow_sort[ninetyfiveperlim] bindict['bmax' + str(b) + '_ul'] = bmaxpow_ul resfile('The posterior p-value for the maximum residual power for a binning of ' + str(self.ps.df*b) + 'Hz is p = ' + str(p_bmaxpow) + ' +/- ' + str(bmaxpow_err)) resfile('The corresponding value of the T_R statistic at frequency f = ' + str(fitpars["bindict"]["bmaxfreq" + str(b)]) + ' is 2I/S = ' + str(fitpars['bindict']["bmax" + str(b)])) resfile('The upper limit on the T_R statistic is 2I/S = ' + str(bmaxpow_ul)) ### now turn upper limit into an rms amplitude: ## first compute broadband noise model for binned frequencies bintemplate = func(fitpars['bindict']['bin'+str(b)].freq, *fitpars['popt']) resfile("bintemplate[0]: " + str(bintemplate[0])) ## then compute upper limits for powers I_j depending on frequency binpowers = bmaxpow_ul*bintemplate/2.0 - bintemplate ## now compute rms amplitude at 40, 70, 100 and 300 Hz ## first, convert powers into rms normalization, if they're not already if self.ps.norm == 'leahy': binpowers = binpowers/(self.ps.df*b * self.ps.nphots) elif self.ps.norm == 'variance': binpowers = binpowers*self.ps.n**2.0 / (self.ps.df*b*self.ps.nphots**2.0) #print('len(binps.freq): ' + str(len(binps.freq))) #print('len(binpowers): ' + str(len(binpowers))) if searchfreq is None: searchfreq = [40.0, 70.0, 100.0, 300.0, 500.0, 1000.0] ## for 40 Hz: print(searchfreq) for bc in searchfreq: if bc > (binps.freq[1] - binps.freq[0]): bind = np.searchsorted(binps.freq, bc) - 1 bpow = binpowers[bind] brms = np.sqrt(bpow*b*self.ps.df) resfile('The upper limit on the power at ' + str(bc) + 'Hz for a binning of ' + str(b) + ' is P = ' + str(bpow*(self.ps.df*b*self.ps.nphots))) resfile('The upper limit on the rms amplitude at ' + str(bc) + 'Hz for a binning of ' + str(b) + ' is rms = ' + str(brms)) bindict['bin' + str(b) + '_ul_%.4fHz'%bc] = brms else: continue ### Step 5: Compute Bayesian posterior probabilities of individual quantities p_maxpow = float(len([x for x in sim_maxpow if x > fitpars['maxpow']]))/float(len(sim_maxpow)) p_deviance = float(len([x for x in sim_deviance if x > fitpars['deviance']]))/float(len(sim_deviance)) p_ksp = float(len([x for x in sim_ksp if x > fitpars['ksp']]))/float(len(sim_ksp)) p_merit = float(len([x for x in sim_merit if x > fitpars['merit']]))/float(len(sim_merit)) p_srat = float(len([x for x in sim_srat if x > fitpars['sobs']]))/float(len(sim_srat)) p_s3max = float(len([x for x in sim_s3max if x > fitpars['s3max']]))/float(len(sim_s3max)) p_s5max = float(len([x for x in sim_s5max if x > fitpars['s5max']]))/float(len(sim_s5max)) p_s11max = float(len([x for x in sim_s11max if x > fitpars['s11max']]))/float(len(sim_s11max)) ### sort maximum powers from lowest to highest sim_maxpow_sort = np.msort(sim_maxpow) sim_s3max_sort = np.msort(sim_s3max) sim_s5max_sort = np.msort(sim_s5max) sim_s11max_sort = np.msort(sim_s11max) ### note: this is the limit for 2*I/S --> multiply by S to get powers for each frequency ### Like everything else, this is n-trial corrected! 
maxpow_ul = sim_maxpow_sort[ninetyfiveperlim] ### Step 6: Compute errors of Bayesian posterior probabilities pmaxpow_err = np.sqrt(p_maxpow*(1.0-p_maxpow)/float(len(sim_ksp))) pdeviance_err = np.sqrt(p_deviance*(1.0-p_deviance)/float(len(sim_ksp))) pksp_err = np.sqrt(p_ksp*(1.0-p_ksp)/float(len(sim_ksp))) pmerit_err = np.sqrt(p_merit*(1.0-p_merit)/float(len(sim_ksp))) psrat_err = np.sqrt(p_srat*(1.0-p_srat)/float(len(sim_ksp))) ps3max_err = np.sqrt(p_s3max*(1.0-p_s3max)/float(len(sim_ksp))) ps5max_err = np.sqrt(p_s5max*(1.0-p_s5max)/float(len(sim_ksp))) ps11max_err = np.sqrt(p_s11max*(1.0-p_s11max)/float(len(sim_ksp))) ### Display results on screen and make funky plots resfile("Bayesian p-value for maximum power P_max = " + str(p_maxpow) + " +/- " + str(pmaxpow_err)) #resfile('Upper limit on maximum signal power P_max_ul = ' + str(maxpow_ul)) resfile("Bayesian p-value for maximum power P_max = " + str(p_s3max) + " +/- " + str(ps3max_err)) #resfile('Upper limit on maximum signal power P_max_ul = ' + str(s3max_ul)) resfile("Bayesian p-value for maximum power P_max = " + str(p_s5max) + " +/- " + str(ps5max_err)) #resfile('Upper limit on maximum signal power P_max_ul = ' + str(s5max_ul)) resfile("Bayesian p-value for maximum power P_max = " + str(p_s11max) + " +/- " + str(ps11max_err)) #resfile('Upper limit on maximum signal power P_max_ul = ' + str(s11max_ul)) resfile("Bayesian p-value for deviance D = " + str(p_deviance) + " +/- " + str(pdeviance_err)) resfile("Bayesian p-value for KS test: " + str(p_ksp) + " +/- " + str(pksp_err)) resfile("Bayesian p-value for Merit function: " + str(p_merit) + " +/- " + str(pmerit_err)) resfile("Bayesian p-value for the np.sum of residuals: " + str(p_srat) + " +/- " + str(psrat_err)) if self.plot: plt.subplot(2,2,1) n, bins, patches = plt.hist(sim_maxpow, bins=100, normed = True, color="cyan", histtype='stepfilled') xmin, xmax = min(min(bins), fitpars['maxpow'])/1.2, max(25, fitpars['maxpow']*1.2) plt.axis([xmin, xmax, 0.0, max(n)]) plt.vlines(fitpars['maxpow'], 0.0, max(n), lw=2, color='navy') plt.title('unsmoothed data', fontsize=12) plt.subplot(2,2,2) n, bins, patches = plt.hist(sim_s3max, bins=100, normed = True, color="cyan", histtype='stepfilled') xmin, xmax = min(min(bins), fitpars['s3max'])/1.2, max(25, fitpars['s3max']*1.2) plt.axis([xmin, xmax, 0.0, max(n)]) plt.vlines(fitpars['s3max'], 0.0, max(n), lw=2, color='navy') plt.title('smoothed (3) data', fontsize=12) plt.subplot(2,2,3) n, bins, patches = plt.hist(sim_s3max, bins=100, normed = True, color="cyan", histtype='stepfilled') xmin, xmax = min(min(bins), fitpars['s5max'])/1.2, max(25, fitpars['s5max']*1.2) plt.axis([xmin, xmax, 0.0, max(n)]) plt.vlines(fitpars['s5max'], 0.0, max(n), lw=2, color='navy') plt.title('smoothed (5) data/model outlier', fontsize=12) plt.subplot(2,2,4) n, bins, patches = plt.hist(sim_s3max, bins=100, normed = True, color="cyan", histtype='stepfilled') xmin, xmax = min(min(bins), fitpars['s11max'])/1.2, max(25, fitpars['s3max']*1.2) plt.axis([xmin, xmax, 0.0, max(n)]) plt.vlines(fitpars['s11max'], 0.0, max(n), lw=2, color='navy') plt.title('smoothed (11) data', fontsize=12) plt.savefig(self.namestr + '_maxpow.png', format='png') plt.close() results = {"fitpars":fitpars, 'bindict':bindict, 'maxpows_all':maxpows_all, 'mcobs':mcobs, 'p_maxpow':[sim_maxpow, p_maxpow, pmaxpow_err], 'maxpow_ul':maxpow_ul, 'p_s3max':[sim_s3max, p_s3max, ps3max_err], 'p_s5max':[sim_s5max, p_s5max, ps5max_err], 'p_s11max':[sim_s11max, p_s11max, ps11max_err], 'p_merit':[p_merit, pmerit_err], 
'p_srat':[p_srat, psrat_err], 'p_deviance':[p_deviance, pdeviance_err], 'fitpars':fitpars, "postmean":mcobs.mean, "posterr":mcobs.std, "postquantiles":mcobs.ci, "rhat":mcobs.rhat, "acor":mcobs.acor, "acceptance":mcobs.acceptance} return results
def dip_fn(dat, is_hist=False, just_dip=False): """ Compute the Hartigans' dip statistic either for a histogram of samples (with equidistant bins) or for a set of samples. """ if is_hist: histogram = dat idxs = np.arange(len(histogram)) else: counts = collections.Counter(dat) idxs = np.msort(list(counts.keys())) histogram = np.array([counts[i] for i in idxs]) # check for case 1<N<4 or all identical values if len(idxs) <= 4 or idxs[0] == idxs[-1]: left = [] right = [1] d = 0.0 return d if just_dip else (d, (None, idxs, left, None, right, None)) cdf = np.cumsum(histogram, dtype=float) cdf /= cdf[-1] work_idxs = idxs work_histogram = np.asarray(histogram, dtype=float) / np.sum(histogram) work_cdf = cdf D = 0 left = [0] right = [1] while True: left_part, left_touchpoints = _gcm_(work_cdf - work_histogram, work_idxs) right_part, right_touchpoints = _lcm_(work_cdf, work_idxs) d_left, left_diffs = _touch_diffs_(left_part, right_part, left_touchpoints) d_right, right_diffs = _touch_diffs_(left_part, right_part, right_touchpoints) if d_right > d_left: xr = right_touchpoints[d_right == right_diffs][-1] xl = left_touchpoints[left_touchpoints <= xr][-1] d = d_right else: xl = left_touchpoints[d_left == left_diffs][0] xr = right_touchpoints[right_touchpoints >= xl][0] d = d_left left_diff = np.abs(left_part[:xl + 1] - work_cdf[:xl + 1]).max() right_diff = np.abs(right_part[xr:] - work_cdf[xr:] + work_histogram[xr:]).max() if d <= D or xr == 0 or xl == len(work_cdf): the_dip = max( np.abs(cdf[:len(left)] - left).max(), np.abs(cdf[-len(right) - 1:-1] - right).max()) if just_dip: return the_dip / 2 else: return the_dip / 2, (cdf, idxs, left, left_part, right, right_part) else: D = max(D, left_diff, right_diff) work_cdf = work_cdf[xl:xr + 1] work_idxs = work_idxs[xl:xr + 1] work_histogram = work_histogram[xl:xr + 1] left[len(left):] = left_part[1:xl + 1] right[:0] = right_part[xr:-1]
def datestr2num(s):
    """
    0 - Monday
    6 - Sunday
    """
    return datetime.datetime.strptime(s, "%Y-%m-%d").date().weekday()


c = np.loadtxt("GOOG.csv", delimiter=",", usecols=(4,), unpack=True)

# median
print("median = {0}".format(np.median(c)))
sorted_close = np.msort(c)
# print("sorted = {0}".format(sorted_close))
N = len(c)
print("middle = {0}".format(sorted_close[N // 2]))
print("averaged middle = {0}".format((sorted_close[N // 2] + sorted_close[(N - 1) // 2]) / 2))

# variance
print("variance = {0}".format(np.var(c)))
print("variance from definition = {0}".format(np.mean((c - c.mean()) ** 2)))

# returns
returns = np.diff(c) / c[:-1]
print("Returns")
print("Standard deviation = {0}".format(np.std(returns)))