def testScipyExponential():
    """Plot exponential fits of a synthetic sample whose head was removed.

    Draws 1000 values from ``expon`` with scale 10, keeps only the values
    greater than ``removedHeadLength`` and fits that truncated sample twice
    with ``expon.fit`` (free location, then location fixed at the cut).

    Two figures are drawn: a density histogram of the full sample with the
    true pdf (green) and both fitted pdfs (blue, red), and a count histogram
    of the truncated sample with the fitted pdf rescaled by ``s``.  The
    sample sizes and ``s`` are written to stderr.
    """
    data0 = expon.rvs(scale=10, size=1000)
    data = data0

    plt.figure()
    x = np.linspace(0, 100, 100)
    # `normed` is no longer accepted by Matplotlib's hist(); `density` is
    # the replacement and is what the other plotting code here uses.
    plt.hist(data, bins=x, density=True)
    plt.plot(x, expon.pdf(x, loc=0, scale=10), color='g')

    removedHeadLength = 1.0
    dataNoHead = [v for v in data if v > removedHeadLength]

    # Fit with a free location; only the scale is used for the blue curve.
    loc1, scale1 = expon.fit(dataNoHead)
    plt.plot(x, expon.pdf(x, loc=0, scale=scale1), color='b')

    # Fit with the location pinned at the cut; again plotted with loc=0.
    loc, scale = expon.fit(dataNoHead, floc=removedHeadLength)
    plt.plot(x, expon.pdf(x, loc=0, scale=scale), color='r')

    # Non-normalised histogram of the truncated sample.
    plt.figure()
    plt.hist(dataNoHead, bins=x, density=False)
    # exp(-cut/scale) is the integral of the fitted pdf from the cut to
    # infinity, so dividing by it rescales the truncated count.
    int_0_removedHeadLength_expon = np.exp(-float(removedHeadLength) / scale)
    s = len(dataNoHead) / int_0_removedHeadLength_expon
    s0 = len(data0) / 1.0
    # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
    sys.stderr.write("%s %s %s\n" % (s0, len(dataNoHead), s))
    plt.plot(x, expon.pdf(x, loc=0, scale=scale) * s, color='r')
def daywise_training_data(self, d, combine, fac1, fac2, f1, days,
                          orignal_start_slot):
    """Fit one exponential scale per (day, time slot) for the rows of ``f1``.

    Rows are selected with ``fac1 == f1``.  The slot of each row is taken
    from ``fac2`` when ``self.combined_slots`` is truthy and from
    ``orignal_start_slot`` otherwise.  For every unique day and slot the
    matching values of ``d`` are fitted with ``expon.fit`` and the scale
    is kept.

    With combined slots, the scale fitted for ``combine[0]`` is copied to
    every remaining slot in ``combine`` for that day.

    Args:
        d: values to fit; indexed with a boolean mask as long as the
            selected rows.
        combine: slots that were merged; ``combine[0]`` carries the fit.
        fac1: array compared with ``f1`` to select rows.
        fac2: internal (combined) slot of each row.
        f1: level of ``fac1`` to keep.
        days: day of each row.
        orignal_start_slot: original slot of each row.

    Returns:
        2-D array with one ``[slot, scale, day]`` row per fitted slot.

    Raises:
        ValueError: from ``np.vstack`` when no slot of a day could be fitted.
    """
    slot_source = fac2 if self.combined_slots else orignal_start_slot
    selected = fac1 == f1
    x = slot_source[selected]
    day = days[selected]

    model_d = []
    for day_i in np.unique(day):
        rows = []
        for t_i in np.unique(x):
            try:
                scale = expon.fit(
                    pd.to_numeric(d[(x == t_i) & (day == day_i)]))[1]
            except Exception:
                # Best effort, as in the original: a slot that cannot be
                # fitted (e.g. no samples on this day) is skipped.  Unlike
                # a bare `except:`, KeyboardInterrupt is not swallowed.
                continue
            rows.append([[t_i, scale, day_i]])
        rows = np.vstack(rows)

        if self.combined_slots:
            scale_val = rows[rows[:, 0] == combine[0]].flatten()[1]
            add = [[i, scale_val, day_i] for i in combine[1:]]
            if add:  # concatenating an empty list would fail on shape
                rows = np.concatenate((rows, add))
        model_d.append(rows)

    return np.vstack(model_d)
def testExponOneEvent(self):
    """
    Generate 100 exponential time stamps with lifetime 25, bin them with
    tsBinner, fit them with expon.fit and plot data and fit on a log scale.
    The figure is saved as "<name of this function>.png".
    """
    true_tau = 25.0
    n_bins = 400
    n_events = 100
    bin_axis = range(n_bins)

    counts = np.zeros(n_bins, dtype=np.int64)
    stamps = expon.rvs(loc=0, scale=true_tau, size=n_events)
    tsBinner.tsBinner(stamps.astype(np.uint64), counts)

    fitted = expon.fit(stamps)
    # pdf scaled by the number of events so it is comparable to the counts
    model = expon.pdf(bin_axis, loc=fitted[0], scale=fitted[1])
    model *= n_events

    # the plot looks nicer if zero values are replaced
    shown = counts.astype(np.double)
    shown[shown < 1] = 1e-3

    plt.plot(bin_axis, shown, label="data")
    plt.plot(bin_axis, model, label="fit")
    plt.yscale('log')
    plt.xlim(xmax=100)
    plt.ylim(ymin=0.09)
    plt.legend()
    plt.title("true tau=%.1f fit tau=%.1f" % (true_tau, fitted[1]))
    # inspect.stack()[0][3] is the name of the running function
    plt.savefig(inspect.stack()[0][3] + ".png")
def returnDistData(cls, self):
    """Fit seven scipy.stats distributions and store their pdfs.

    Each distribution is fitted with its ``fit`` method and its pdf is
    evaluated at ``self.data``.  The pdfs become the columns of a DataFrame
    stored in ``self.distDF``.  Nothing is returned and ``cls`` is not used.
    """
    samples = self.data

    # gamma is the only distribution fitted on 10**(data / 10); its pdf is
    # nevertheless evaluated on the untransformed data, like the others.
    fitted_pdfs = [
        ('gammaDist', gamma.pdf(samples, *gamma.fit(10**(samples / 10)))),
    ]
    remaining = (
        ('rayleighDist', rayleigh),
        ('normDist', norm),
        ('lognormDist', lognorm),
        ('nakagamiDist', nakagami),
        ('exponDist', expon),
        ('weibDist', exponweib),
    )
    for column, dist in remaining:
        params = dist.fit(samples)
        fitted_pdfs.append((column, dist.pdf(samples, *params)))

    self.distDF = pd.DataFrame(
        np.column_stack([pdf for _, pdf in fitted_pdfs]),
        columns=[column for column, _ in fitted_pdfs])
def main():
    """Fetch BTCUSDT trades and compare inter-arrival times with an exponential.

    The trades returned by ``get_trades`` are dumped to ``BTCUSDT.json``.
    The differences between consecutive ``trade['time']`` values are then
    histogrammed and overlaid with the pdf of an exponential fitted to them.
    """
    symbol = 'BTCUSDT'
    trades = get_trades(
        symbol,
        datetime.datetime.timestamp(datetime.datetime(2019, 6, 1)) * 1000,
        1200)
    with open(symbol + '.json', 'w') as f:
        json.dump(trades, f)

    stamps = [trade['time'] for trade in trades]
    interarrival_times = [
        later - earlier for earlier, later in zip(stamps, stamps[1:])
    ]

    plt.hist(interarrival_times, 100, density=True)
    # `loc=0` is only a starting guess for the fit; `floc=0` is the keyword
    # that actually holds the location at zero.
    loc, scale = expon.fit(interarrival_times, floc=0)
    x = np.linspace(0, 2000, 100)
    plt.plot(x, expon.pdf(x, loc=loc, scale=scale))
    plt.show()
def distribution_check(self, dist):
    """Plot ``self.series`` against a fitted normal or exponential.

    Args:
        dist: ``'norm'`` or ``'expon'``; any other value does nothing.

    For ``'norm'`` the fitted mean and standard deviation are printed and
    shown in the legend.  For ``'expon'`` the current figure is cleared
    first and the fitted scale is reported under the label "mu".
    """
    # histogram and fitted density
    if dist == 'norm':
        sns.distplot(self.series, fit=norm)
        (mu, sigma) = norm.fit(self.series)
        print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
        # Raw string: `\m` and `\s` are invalid escape sequences in a
        # normal literal.  The resulting text is unchanged.
        plt.legend([
            r'normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(
                mu, sigma)
        ], loc='best')
        plt.ylabel('Frequency')
        plt.show()

    if dist == 'expon':
        plt.clf()
        sns.distplot(self.series, fit=expon)
        # expon.fit returns (loc, scale); only the scale is reported.
        (loc, scale) = expon.fit(self.series)
        print('\n mu = {:.2f} '.format(scale))
        plt.legend([r'expon dist. ($\mu=$ {:.2f} )'.format(scale)],
                   loc='best')
        plt.ylabel('Frequency')
        plt.show()
def computeMeanSbLength(lSbsLengths, binwidth=None, minSbLength=None):
    """Estimate the mean length after refilling the truncated short lengths.

    An exponential with location 0 is fitted to ``lSbsLengths``.  The
    observed count is rescaled to ``s`` by dividing it by
    ``exp(-minSbLength / scale)``.  For every bin of width ``binwidth``
    below ``int(minSbLength)``, ``int(s * Exp(...))`` synthetic lengths are
    drawn uniformly inside the bin and added to the sample.

    Args:
        lSbsLengths: list of observed lengths.
        binwidth: bin width; must be given despite the ``None`` default.
        minSbLength: truncation length; must be given despite the default.

    Returns:
        ``(meanSbLength, lSbsLengthsc, missingNbSbsPerBin)``: the mean of the
        completed sample, the completed sample (synthetic values first) and
        a list of ``(bin start, expected count)`` pairs.
    """
    # fit an exponential of the form s * 1/theta * exp(-x/theta)
    loc, scale = expon.fit(lSbsLengths, floc=0.0)
    assert loc == 0.0

    int_0_removedHeadLength_expon = np.exp(-float(minSbLength) / scale)
    s = len(lSbsLengths) / float(int_0_removedHeadLength_expon)
    theta = scale

    missingBins = list(np.arange(0, int(minSbLength), binwidth))
    # NOTE(review): Exp is defined elsewhere; presumably the exponential pdf.
    # It is evaluated half a bin *below* each bin start - confirm intended.
    nbMissingSbs = s * Exp(
        np.asarray([b - binwidth / 2.0 for b in missingBins]), theta)
    # Materialised as a list: it is iterated below and also returned, which
    # a Python 3 zip iterator cannot do.
    missingNbSbsPerBin = list(zip(missingBins, nbMissingSbs))
    lSbsLengthsMissing = [(b + random.random() * binwidth)
                          for (b, nbSbs) in missingNbSbsPerBin
                          for _ in range(int(nbSbs))]
    lSbsLengthsc = list(lSbsLengthsMissing) + list(lSbsLengths)

    # Convert before dividing so integer lengths are not floor-divided on
    # Python 2.
    meanSbLength = float(sum(lSbsLengthsc)) / len(lSbsLengthsc)
    # TODO compare s and len(lSbsLengthsc)
    sys.stderr.write("meanSbLength=%s, scale=%s\n" % (meanSbLength, theta))
    sys.stderr.write("s=%s, len(lSbsLengthsc)=%s\n" % (s, len(lSbsLengthsc)))
    return (meanSbLength, lSbsLengthsc, missingNbSbsPerBin)
def testExpon(self):
    """
    Generate 100 exponential time stamps with lifetime 25, bin them with
    tsBinner, fit them with expon.fit (warnings silenced) and plot data and
    fit on a symlog scale.  The figure is saved as
    "<name of this function>.png".
    """
    true_tau = 25.0
    n_bins = 400
    n_events = 100
    bin_axis = range(n_bins)

    counts = np.zeros(n_bins, dtype=np.int64)
    stamps = expon.rvs(loc=0, scale=true_tau, size=n_events)
    tsBinner.tsBinner(stamps.astype(np.uint64), counts)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Note: this line causes a RuntimeWarning in optimize.py:301
        fitted = expon.fit(stamps)

    model = expon.pdf(bin_axis, loc=fitted[0], scale=fitted[1])
    model *= n_events
    shown = counts.astype(np.double)

    plt.plot(bin_axis, shown, label="data")
    plt.plot(bin_axis, model, label="fit")
    # NOTE(review): newer Matplotlib releases spell this keyword `linthresh`.
    plt.yscale('symlog', linthreshy=0.9)
    plt.xlim(xmax=100)
    plt.ylim(ymin=-0.1)
    plt.legend()
    plt.title("true tau=%.1f fit tau=%.1f" % (true_tau, fitted[1]))
    # inspect.stack()[0][3] is the name of the running function
    plt.savefig(inspect.stack()[0][3] + ".png")
def MBdist(n, e_photon, thick):
    """Sample Maxwell-distributed energies and exponential depths.

    Args:
        n: particle number.
        e_photon: photon energy; must exceed the module-level ``bandgap``.
        thick: thickness of the cathode; deeper samples are discarded.

    Returns:
        ``(params, p2D, penss, params_exp)``: the ``maxwell.fit`` result for
        the sampled energies (location fixed at 0), an array with one
        ``(z, y, vz, vy, v, ene)`` row per kept depth, the list of kept
        depths, and the ``expon.fit`` result for all sampled depths
        (location fixed at 0).
    """
    assert e_photon > bandgap
    # scale is the Maxwell sigma (capped at 0.8); loct is the start point.
    if e_photon - bandgap - 0.8 <= 0:
        scale = e_photon - bandgap
        loct = 0
    else:
        scale = 0.8
        loct = e_photon - bandgap - scale

    data = maxwell.rvs(loc=loct, scale=scale, size=n)
    data_ene = np.array(data)
    params = maxwell.fit(data, floc=0)
    data_v = np.sqrt(2 * data_ene * ec / me) * 10**9

    # wl is the scale of the exponential depth distribution
    wl = ((19.82 - 27.95 * e_photon + 11.15 * e_photon**2) * 10**-3)**-1
    pens = expon.rvs(loc=0, scale=wl, size=n)
    # A real list: filter() returns an iterator on Python 3, which supports
    # neither len() nor indexing.
    penss = [depth for depth in pens if depth <= thick]
    params_exp = expon.fit(pens, floc=0)

    p2D = []
    # The i-th kept depth is paired with the i-th sampled energy.
    for i, depth in enumerate(penss):
        phi = random.uniform(0, 2 * math.pi)  # initial angle
        poy = random.uniform(-1 * 10**6, 1 * 10**6)  # initial y position
        p2D.append([
            depth, poy, data_v[i] * math.cos(phi), data_v[i] * math.sin(phi),
            data_v[i], data[i]
        ])  # p2D: (z, y, vz, vy, v, ene)
    p2D = np.array(p2D)
    return params, p2D, penss, params_exp
def testExponManyEvents(self):
    """
    Repeat 1000 times: generate 100 exponential time stamps with lifetime
    25, bin them with tsBinner and fit them with expon.fit.  The fitted
    scales are histogrammed and the plot is saved as
    "<name of this function>.png".
    """
    tau = 25.0
    nBins = 400
    size = 100
    taulist = []
    for i in range(1000):
        timeHgValues = np.zeros(nBins, dtype=np.int64)
        timeStamps = expon.rvs(loc=0, scale=tau, size=size)
        ts64 = timeStamps.astype(np.uint64)
        tsBinner.tsBinner(ts64, timeHgValues)
        param = expon.fit(timeStamps)
        # Single-argument print() gives the same text on Python 2 and 3
        # as the former `print "i=",i," param[1]=",param[1]` statement.
        print("i= %s  param[1]= %s" % (i, param[1]))
        taulist.append(param[1])

    # NOTE(review): the range stops at the true tau (25), so larger
    # estimates are not counted - confirm this is intended.
    hist, bins = np.histogram(taulist, bins=20, range=(15, 25))
    center = (bins[:-1] + bins[1:]) / 2
    plt.step(center, hist, where='post')
    # inspect.stack()[0][3] is the name of the running function
    plt.savefig(inspect.stack()[0][3] + ".png")
def mean_model(self, data, x, data_save, x_save):
    """Fit an exponential per time slot and collect goodness-of-fit values.

    For every unique value ``f2`` of ``x`` the matching ``data`` are fitted
    with ``expon.fit``, tested with ``kstest`` and compared by ``ttest_ind``
    with a random exponential sample of the fitted scale.

    When ``self.combined_slots`` is truthy and ``f2 == self.combine[0]``,
    the same results are recorded once for each slot in ``self.combine``.

    Returns:
        ``(fit, data_save, x_save)``.  ``fit`` is a DataFrame with the slot
        column ``self.x_names[1]`` and the columns ``Exp_loc``,
        ``Exp_scale``, ``KS_D``, ``KS_PVal`` and ``Ttest_PVal`` (half the
        two-sided p-value).  ``data_save`` and ``x_save`` are returned
        unchanged.
    """
    # Plain lists instead of repeated DataFrame.append, which pandas 2.0
    # removed.
    slots = []
    locs = []
    scales = []
    ks_stats = []
    ks_pvals = []
    t_pvals = []

    for f2 in np.unique(x):
        d = pd.to_numeric(np.array(data[(x == f2)]))
        loc, scale = expon.fit(d)
        # ks test
        # NOTE(review): the fitted loc/scale are not passed to kstest -
        # confirm the comparison distribution is the intended one.
        D, kspval = kstest(d, 'expon')
        # ttest - one sided
        sample2 = np.random.exponential(scale, size=d.shape[0])
        val, pval = ttest_ind(d, sample2)

        # if we have combined data then add same model to all combined
        # timeslots
        if self.combined_slots and f2 == self.combine[0]:
            targets = list(self.combine)
        else:
            targets = [f2]
        for slot in targets:
            slots.append(slot)
            locs.append(loc)
            scales.append(scale)
            ks_stats.append(D)
            ks_pvals.append(kspval)
            t_pvals.append(pval / 2)

    # Same layout the chained appends produced: one unnamed column and the
    # index label 0 on every row.
    time_slot = pd.DataFrame(slots, index=[0] * len(slots))

    # this is the final fit
    fit = pd.DataFrame()
    fit[[self.x_names[1]]] = time_slot
    fit['Exp_loc'] = np.array(locs)
    fit['Exp_scale'] = np.array(scales)
    fit['KS_D'] = np.array(ks_stats)
    fit['KS_PVal'] = np.array(ks_pvals)
    fit['Ttest_PVal'] = np.array(t_pvals)
    return fit, data_save, x_save
def main():
    """Plot the order arrival rate per time window next to the order price.

    Orders are grouped into windows of ``time_step`` milliseconds by their
    cumulative inter-arrival time.  For every non-empty window an
    exponential is fitted to the inter-arrival times above 1000 and
    ``1 / scale`` is plotted as the rate.  Price is drawn on a twin axis and
    the figure is saved as 'Order rate through time'.
    """
    print('Loading and Processing Orders')
    transactions = load_transactions()
    orders = classify_trades(transactions)
    interarrivals = calculate_interarrival_times(orders)
    buy_orders = [order for order in orders if order.buyer]
    sell_orders = [order for order in orders if not order.buyer]
    arrivals = np.cumsum(interarrivals)
    time_step = 60 * 60 * 1000  # calculating hourly rates

    windows = []
    current = []
    window_start = 0
    print('Processing...')
    for idx, arrival in enumerate(arrivals):
        if arrival > window_start + time_step:
            jump = int(np.floor((arrival - window_start) / time_step))
            window_start += jump * time_step
            # NOTE(review): the original `bin + [] * (jump - 1)` always
            # equals the window itself, so skipped windows add nothing and
            # the last partial window is never stored.
            windows.append(current)
            current = []
        current.append(orders[idx])

    times = []
    rates = []
    for window in windows:
        if len(window) == 0:
            continue
        times.append(window[0].start_time)
        gaps = calculate_interarrival_times(window)
        loc, scale = expon.fit([gap for gap in gaps if gap > 1000])
        rates.append(1 / scale)

    fig, ax1 = plt.subplots()
    elapsed_windows = (np.array(times) - windows[0][0].start_time) / time_step
    ax1.plot(elapsed_windows, rates, label='Genesis Order Rate')
    ax2 = ax1.twinx()
    order_times = np.array([order.start_time for order in orders])
    ax2.plot((order_times - orders[0].start_time) / time_step,
             [order.price for order in orders],
             label='Price',
             color='orange')
    fig.legend()
    fig.savefig('Order rate through time')
    plt.show()
def fit(data: FloatIterable) -> 'Exponential':
    """
    Build an Exponential from data, fitted with the location fixed at 0.

    :param data: Iterable of data to fit to.
    :return: Exponential whose ``lambda_`` is one over the fitted scale.
    """
    fitted_scale = expon.fit(data=data, floc=0)[1]
    rate = 1 / fitted_scale
    return Exponential(lambda_=rate)
def fit_exp(start_ts):
    """Return one over the exponential scale fitted to the gaps in start_ts.

    The time stamps are sorted, consecutive differences are taken and fitted
    with ``expon.fit`` (location left free).
    """
    ordered = np.sort(np.array(start_ts))
    gaps = np.diff(ordered)
    fitted = expon.fit(gaps)
    return 1. / fitted[1]
def DosDist(n, e_photon, thick, data, absorb_data):
    """Sample electron energies from interpolated data and exponential depths.

    Args:
        n: the photon number.
        e_photon: photon energy; upper end of the energy grid starting at 1.
        thick: thickness in nm; deeper samples are discarded.
        data: 2-D array; column 1 is interpolated over column 0 to weight
            the energies.
        absorb_data: 2-D array; column 1 interpolated over column 0 at
            ``e_photon`` sets the depth scale.

    Returns:
        ``(params_exp, p2D, penss, params_exp)``: the ``expon.fit`` result
        for all sampled depths (location fixed at 0, returned twice), an
        array with one ``(z, y, vz, vy, v, ene)`` row per kept depth and
        the list of kept depths.
    """
    f = interp1d(data[:, 0], data[:, 1])
    f2 = interp1d(absorb_data[:, 0], absorb_data[:, 1])
    n1 = int((e_photon - 1) / 0.01)
    energy = np.linspace(1., e_photon, n1)
    norm, _ = integrate.quad(lambda e: f(e - e_photon) * f(e),
                             1,
                             e_photon,
                             limit=10000)

    data_ene = []
    for level in energy:
        # Explicit int: round() of a NumPy value is not guaranteed to be a
        # Python int, which np.full needs as a length.
        n3 = int(
            round(
                float(1.5 * n * f(level - e_photon) * f(level) * 0.01 /
                      norm)))
        data_ene.extend(np.full(n3, level))
    np.random.shuffle(data_ene)

    wl = (f2(e_photon) * 10**-3)**-1
    pens = expon.rvs(loc=0, scale=wl, size=n)
    penss = [depth for depth in pens if depth <= thick]
    params_exp = expon.fit(pens, floc=0)

    p2D = []
    # NOTE(review): data_ene[i] raises IndexError if fewer energies than
    # kept depths were generated.
    for i, depth in enumerate(penss):
        phi = random.uniform(0, 2 * math.pi)  # initial angle on 2D surface
        php = random.uniform(0, 2 * math.pi)  # angle on perpendicular face
        poy = random.uniform(-1 * 10**6, 1 * 10**6)  # initial y position, nm
        v = np.sqrt(2 * np.abs((data_ene[i] - bandgap)) * ec / me) * 10**9
        p2D.append([
            depth, poy, v * math.cos(phi) * math.cos(php),
            v * math.sin(phi) * math.cos(php), v,
            np.abs(data_ene[i] - bandgap)
        ])  # p2D: (z, y, vz, vy, v, ene)
    p2D = np.array(p2D)
    return params_exp, p2D, penss, params_exp
def test_sigdiff(data, var, stat, bmi1, bmi2):
    """Print KS and Welch t-test results for two groups of users.

    Rows of ``data`` are split by whether ``common_user_id`` is in ``bmi1``
    or ``bmi2``.  The compared column is ``data[var]`` when ``stat == 'n'``
    and ``data[var][stat]`` otherwise.  An exponential is fitted to each
    group and 300 random values are drawn from each fit; the tests compare
    those random draws, not the data themselves.
    """
    group_x = data[data.common_user_id.isin(bmi1)]
    group_y = data[data.common_user_id.isin(bmi2)]

    if stat == 'n':
        values_x, values_y = group_x[var], group_y[var]
        print("Significance Tests for " + var)
    else:
        values_x, values_y = group_x[var][stat], group_y[var][stat]
        print("Significance Tests for " + stat)

    loc_x, scale_x = expon.fit(values_x)
    loc_y, scale_y = expon.fit(values_y)
    draws_x = expon.rvs(loc_x, scale_x, size=300)
    draws_y = expon.rvs(loc_y, scale_y, size=300)

    ks_result = stats.ks_2samp(draws_x, draws_y)
    print(str(ks_result))
    welch_result = stats.ttest_ind(draws_x, draws_y, equal_var=False)
    print(str(welch_result))
def fit(self, X):
    """
    Store one exponential scale per column of X in ``self.scale``.

    Each column is fitted with the location fixed at 0 and the result is
    kept as float32.

    Args:
        X (array): the data to be used to set the scale
    """
    n_columns = X.shape[-1]
    self.scale = np.zeros(n_columns, dtype=np.float32)
    for col in range(n_columns):
        fitted = expon.fit(X[:, col], floc=0)
        self.scale[col] = fitted[1]
def fit_exponential_sp(trace, plot=False):
    """Fit an exponential (location fixed at 0) to column 4 of ``trace``.

    With ``plot == True`` the fitted pdf and a 50-bin density histogram of
    the column are drawn on the current figure.

    Returns:
        ``(loc, scale)`` as returned by ``expon.fit``.
    """
    values = trace[:, 4]
    loc, scale = expon.fit(values, floc=0)
    if plot == True:
        upper = max(values)
        lower = min(values)
        grid = np.linspace(lower, upper, num=500)
        plt.plot(grid, expon.pdf(grid, loc, scale))
        plt.hist(values, bins=50, density=True)
    return loc, scale
def fit_gamma_expon_Q(trace, gammafactor=20, exponfactor=2, plot=False):
    """Fit gamma distributions to columns 1-3 and an exponential to column 4.

    All fits fix the location at 0.  Gamma results are stored as
    ``[a / gammafactor, loc, theta * gammafactor]`` and the exponential
    result as ``[loc, scale * exponfactor]``.  Column 0 and columns beyond 4
    are ignored; ``plot`` is not used.

    Returns:
        List of the parameter lists in column order.
    """
    params = []
    n_columns = len(trace[0])
    for col in range(1, n_columns):
        if col < 4:
            shape, loc, theta = gamma.fit(trace[:, col], floc=0)
            entry = [shape / gammafactor, loc, theta * gammafactor]
            params.append(entry)
        if col == 4:
            loc, scale = expon.fit(trace[:, col], floc=0)
            entry = [loc, scale * exponfactor]
            params.append(entry)
    return params
def exponential_fit(data, col):
    """Histogram ``data[col]`` and fit an exponential to it.

    A 10000-bin density histogram is drawn on the current figure.

    Returns:
        ``(x, loc, scale)``: 10000 evenly spaced points across the current
        x-axis limits and the fitted location and scale.
    """
    plt.hist(data[col], bins=10000, density=True, alpha=0.6, color='g')
    fitted = expon.fit(data[col])
    axis_limits = plt.xlim()
    x = np.linspace(axis_limits[0], axis_limits[1], 10000)
    return x, fitted[0], fitted[1]
def MLE_plt(categories, inter_arrivals, inter_arrival_means):
    """Plot a fitted exponential over a histogram for every category.

    For category ``i`` the sample holds ``inter_arrival_means[k]`` for each
    ``k`` in ``categories[i][0]``.  An exponential is fitted to it and its
    pdf is drawn from 0 to twice ``cat_mean(...)[i]`` over a 40-bin density
    histogram.  Each figure is saved as
    ``./app/static/img/cat_result/mle/mleplt_cat<i>.png`` and closed.
    """
    cat_means = cat_mean(inter_arrivals, categories)
    for i in range(0, len(categories)):
        # The original started from `[0] * len(...)` and then appended,
        # which padded the sample with as many zeros as real values and
        # biased the fit.
        X = np.asarray(
            [inter_arrival_means[member] for member in categories[i][0]])
        param = expon.fit(X)  # distribution fitting
        sample_mean = cat_means[i]
        # Result unused; call kept as in the original.
        max_sample = max_interarrival_mean(categories, inter_arrivals, i)

        X_plot = np.linspace(0, 2 * sample_mean, 2000)[:, np.newaxis]
        # pdf of the fitted distribution
        fitted_pdf = expon.pdf(X_plot, loc=param[0], scale=param[1])

        fig = plt.figure()
        plt.plot(X_plot[:, 0],
                 fitted_pdf,
                 "red",
                 label="Estimated Exponential Dist",
                 linestyle="dashed",
                 linewidth=1.5)
        plt.title(
            "Parametric MLE (exponential distribution) for category=%s Visitors"
            % (i))
        # `normed` is no longer accepted by Matplotlib's hist().
        # alpha runs from 0 (transparent) to 1 (opaque)
        plt.hist(X,
                 bins=40,
                 density=True,
                 color="cyan",
                 alpha=.3,
                 label="histogram")
        plt.xlabel("inter-arrival time (days)")
        plt.ylabel("PDF")
        plt.legend()
        # dump results into mle folder
        save_as = './app/static/img/cat_result/mle/mleplt_cat' + str(
            i) + '.png'
        plt.savefig(save_as)
        plt.show(block=False)
        plt.close(fig)
def calculate_parameters(self, values):
    """
    Fit an exponential with location fixed at 0 and store loc and scale

    Nothing is changed when fewer than two values are given.

    Parameters
    -----------
    values
        Empirical values to work on
    """
    if len(values) <= 1:
        return
    fitted = expon.fit(values, floc=0)
    self.loc, self.scale = fitted
def get_dist_matrix_exp(pred_D):
    """Return the pairwise ``JS_divergence_exp`` matrix of fitted rates.

    For every index ``i`` along the first axis, an exponential with location
    fixed at 0 is fitted to ``pred_D[i, :, 0]`` and its rate (one over the
    scale) is kept.  Entry ``[i, j]`` of the result is
    ``JS_divergence_exp(rate_i, rate_j)``.
    """
    n_items = np.shape(pred_D)[0]
    rates = np.zeros(n_items)
    dist_matrix_exp = np.zeros([n_items, n_items])

    for i in range(len(rates)):
        fitted_scale = expon.fit(pred_D[i, :, 0], floc=0)[1]
        rates[i] = 1. / fitted_scale

    for row in tqdm(range(n_items)):
        for col in range(n_items):
            dist_matrix_exp[row, col] = JS_divergence_exp(
                rates[row], rates[col])
    return dist_matrix_exp
def rate_from_exp_fit(self):
    '''
    The interarrival time of a poisson process has an exponential
    distribution, so the rate is one over the fitted scale.
    The interarrival times are loaded with trigger_intervals_all_files()
    if they are not set yet. The fit fixes the location at 0, see
    https://stackoverflow.com/questions/25085200/scipy-stats-expon-fit-with-no-location-parameter
    '''
    if self.interarrival_times is None:
        self.interarrival_times = self.trigger_intervals_all_files()
    fitted_scale = expon.fit(self.interarrival_times, floc=0)[1]
    return 1 / fitted_scale
def approximating_dists(data, bins):
    """Fit an exponential to ``data`` and evaluate its pdf at ``bins``.

    Returns:
        ``[exp_param, pdf_exp_fitted]``: the ``expon.fit`` result and the
        fitted pdf at ``bins``.  A step that fails prints a message and
        leaves its entry (and any later one) as ``None``; the original
        raised an unbound-name error at ``return`` in that case.
    """
    exp_param = None
    pdf_exp_fitted = None
    try:
        exp_param = expon.fit(data)
    except Exception:
        print("screwed expon fit ")

    if exp_param is None:
        print(" returning as nothing to plot ")
        return [exp_param, pdf_exp_fitted]

    try:
        # The former `*exp_param[:-2]` was always empty for the 2-tuple
        # returned by expon.fit, so it is dropped.
        pdf_exp_fitted = expon.pdf(bins, loc=exp_param[0],
                                   scale=exp_param[1])  # fitted distribution
    except Exception:
        print(" returning as nothing to plot ")
    return [exp_param, pdf_exp_fitted]
def fit(self, data):
    """
    Detect spikes in data and model the time between them.

    A sample is a spike when it exceeds mean + self.spike_std_factor * std.
    self.last is set to the final sample and self.params to None.  When
    at least one spike exists, self.spike_max, self.spike_avg,
    self.time_since_last_spike and self.spike_width_avg are set, and
    self.params is filled from the recorded inter-arrival times according
    to self.fit_model ("Weibull", "Expon", anything else keeps the raw list).

    NOTE(review): the source lost its indentation; the nesting below is
    reconstructed and should be checked against the original file.

    :param data: sequence of samples, converted with np.array
    :return: self
    """
    data = np.array(data)
    nPoints = len(data)
    avg = np.mean(data)
    std = np.std(data)
    # boolean mask of samples above the spike threshold
    spikes = data > ([avg + self.spike_std_factor * std] * nPoints)
    self.last = data[-1]
    self.params = None
    if any(spikes):
        self.spike_max = max(data[spikes])
        self.spike_avg = np.mean(data[spikes])
        last_nonzero_idx = np.max(np.nonzero(data))  # not used afterwards
        # samples elapsed since the last spike sample
        self.time_since_last_spike = len(data) - 1 - np.max(
            np.nonzero(spikes))
        interarrivaltime = 0
        spikewidth = 0
        inter_arrival_times = []
        in_spike = False
        has_spiked = False
        spikewidths = []
        for isspike in spikes:
            if not isspike:
                if in_spike:
                    # was in spike, now not spike
                    spikewidths.append(spikewidth)
                    spikewidth = 0
                interarrivaltime = interarrivaltime + 1
                in_spike = False
            else:
                # a gap is recorded only when a spike starts after an
                # earlier spike has been seen
                if not in_spike and has_spiked:
                    inter_arrival_times.append(interarrivaltime)
                    # NOTE(review): this reset may belong one level out
                    # (after every spike sample) - confirm
                    interarrivaltime = 0
                spikewidth = spikewidth + 1
                in_spike = True
                has_spiked = True
        # NOTE(review): a spike still open at the end of data never adds
        # its width to spikewidths - confirm intended
        if len(inter_arrival_times) > 0:
            if self.fit_model == "Weibull":
                self.params = exponweib.fit(inter_arrival_times,
                                            floc=0,
                                            f0=1)  # a, c, loc, scale
            elif self.fit_model == "Expon":
                self.params = expon.fit(inter_arrival_times,
                                        floc=0)  # returns loc, scale
            else:  # self.fit_model == "Sampling":
                self.params = inter_arrival_times
        self.spike_width_avg = int(
            np.mean(spikewidths)) if len(spikewidths) > 0 else 1
    return self
def distest_loose(x):
    """Return the p-values of five distribution tests on ``x``.

    ``x`` is first passed through ``_series``.  The Kolmogorov-Smirnov tests
    compare against a normal, power-law and exponential cdf, each with the
    parameters fitted to ``x`` itself.

    Returns:
        DataFrame indexed by test name with one column ``'p-value'``,
        rounded to ``DEC`` decimals.
    """
    x = _series(x)
    normal_params = norm.fit(x)
    powerlaw_params = powerlaw.fit(x)
    expon_params = expon.fit(x)
    outcomes = {
        'Shapiro-Wilk (normal)': shapiro(x),
        'D\'Agostino-Pearson (normal)': normaltest(x),
        'Kolmogorov-Smirnov (normal)': kstest(x, norm.cdf, normal_params),
        'Kolmogorov-Smirnov (powerlaw)': kstest(x, powerlaw.cdf,
                                                powerlaw_params),
        'Kolmogorov-Smirnov (exponential)': kstest(x, expon.cdf,
                                                   expon_params),
    }
    test_names = outcomes.keys()
    # every result unpacks as (statistic, p-value)
    p_values = (p_value for _, p_value in outcomes.values())
    table = pd.DataFrame(p_values, test_names, ['p-value'])
    return table.round(DEC)
def q_calibrate(G=20000, S=100000, p0=0.001):
    """Build a ``Q_Cal`` from ``qscore`` values of uniform random samples.

    ``qscore`` is evaluated on ``S`` samples of ``G`` uniform random numbers
    each.  An exponential with location fixed at 0 is fitted to the scores
    and ``lam`` is one over its scale.

    Returns:
        ``Q_Cal`` with ``qmax`` set to the score at position ``int(p0 * S)``
        of the descending scores, together with ``lam``, ``p0`` and the
        ``ECDF`` of the descending scores.
    """
    null_scores = np.array([qscore(np.random.rand(G)) for _ in range(S)])
    descending = null_scores[np.argsort(-null_scores)]
    cutoff_index = int(p0 * S)
    # maximum likelihood estimate of exponential distribution of Q
    fitted_scale = expon.fit(null_scores, floc=0)[1]
    lam = 1 / fitted_scale
    return Q_Cal(qmax=descending[cutoff_index],
                 lam=lam,
                 p0=p0,
                 ecdf=ECDF(descending))
def exp_statistics(data, bins):
    """Fit an exponential to ``data`` and evaluate its pdf at ``bins``.

    The original body subscripted the undefined name ``exp_param`` where
    assignments were evidently meant, so it always raised.  The sentinel
    values below follow those lines.
    NOTE(review): the sentinel meaning is inferred - confirm with callers.

    Returns:
        ``[exp_param, pdf_exp_fitted]``.  ``exp_param`` is the ``expon.fit``
        result, or ``[-2, -2]`` when the fit fails.  ``pdf_exp_fitted`` is
        the fitted pdf at ``bins``, or ``[-1]`` when it could not be computed.
    """
    exp_param = [-1, -1]
    pdf_exp_fitted = [-1]
    try:
        exp_param = expon.fit(data)
    except Exception:
        exp_param = [-2, -2]
        return [exp_param, pdf_exp_fitted]

    try:
        # `*exp_param[:-2]` was always empty for the 2-tuple from expon.fit.
        pdf_exp_fitted = expon.pdf(bins, loc=exp_param[0],
                                   scale=exp_param[1])  # fitted distribution
    except Exception:
        pdf_exp_fitted = [-1]
    return [exp_param, pdf_exp_fitted]
def approximating_dists(data, bins):
    """Fit an exponential to ``data`` and evaluate its pdf at ``bins``.

    Returns:
        ``[exp_param, pdf_exp_fitted]``: the ``expon.fit`` result and the
        fitted pdf at ``bins``.  A step that fails prints a message and
        leaves its entry (and any later one) as ``None``; the original
        raised an unbound-name error at ``return`` in that case.
    """
    exp_param = None
    pdf_exp_fitted = None
    try:
        exp_param = expon.fit(data)
    except Exception:
        print("screwed expon fit ")

    if exp_param is None:
        print(" returning as nothing to plot ")
        return [exp_param, pdf_exp_fitted]

    try:
        # The former `*exp_param[:-2]` was always empty for the 2-tuple
        # returned by expon.fit, so it is dropped.
        pdf_exp_fitted = expon.pdf(bins, loc=exp_param[0],
                                   scale=exp_param[1])  # fitted distribution
    except Exception:
        print(" returning as nothing to plot ")
    return [exp_param, pdf_exp_fitted]
def displayFits(self):
    """
    Draw two histograms of lifetime estimates on the same plot.  Each uses
    1000 independent samples of 100 exponential time stamps (lifetime 25):
    one estimates the lifetime as the average time, the other takes the
    scale from expon.fit.  The plot is saved as
    "<name of this function>.png".
    """
    tau = 25.0
    nBins = 400
    size = 100
    nTrials = 1000
    x = range(nBins)

    def draw_time_stamps():
        # one sample of time stamps, also pushed through tsBinner
        binned = np.zeros(nBins, dtype=np.int64)
        stamps = expon.rvs(loc=0, scale=tau, size=size)
        tsBinner.tsBinner(stamps.astype(np.uint64), binned)
        return stamps

    taulistavg = []
    for _ in range(nTrials):
        stamps = draw_time_stamps()
        average = sum(stamps) / len(stamps)
        fit = expon.pdf(x, average)
        fit *= size
        taulistavg.append(average)

    taulist = []
    for _ in range(nTrials):
        stamps = draw_time_stamps()
        fitted = expon.fit(stamps)
        fit = expon.pdf(x, loc=fitted[0], scale=fitted[1])
        fit *= size
        taulist.append(fitted[1])

    curves = (
        (taulistavg, {"label": "averagetime", "color": 'g'}),
        (taulist, {"label": "maxlikelihood"}),
    )
    for estimates, step_options in curves:
        hist, bins = np.histogram(estimates, bins=20, range=(15, 35))
        center = (bins[:-1] + bins[1:]) / 2
        plt.step(center, hist, where='post', **step_options)

    plt.legend()
    # inspect.stack()[0][3] is the name of the running function
    plt.savefig(inspect.stack()[0][3] + ".png")
def other_simulator(data, n_precip, n_heat, interval, p_rain, strength,
                    samples):
    """Simulate rain days, temperatures and rainfall for one week number.

    Rows of ``data`` with ``WEEK == interval`` give the history.  A gamma
    posterior for the rain rate is built from a prior weighted by
    ``strength`` and the observed ``RAIN`` total; ``TMAX`` is fitted with a
    normal and ``PRCP`` with an exponential.

    For each of ``samples`` Poisson draws of rain days, daily maximum
    temperatures and rainfall are drawn.  A draw is flagged as a storm when
    at least three days have rainfall >= ``n_precip.value`` and temperature
    < ``n_heat.value``.

    Returns:
        ``(gam, storm, raindays, t_vec, rf_vec)``: gamma draws, storm flags,
        rain-day counts, the highest temperature and the total rainfall of
        each draw.
    """
    # weight of the prior according to the stated belief
    if strength == 'Meh, sorta feel ok about it':
        weight = 5.0
    else:
        weight = 25.0 if strength == 'It will probably happen' else 50.0

    prior_a = 1.0 * weight
    prior_b = (1.0 / p_rain.value) * weight

    history = data[data.WEEK == interval]
    years = np.max(history.YEAR) - np.min(history.YEAR)
    a = prior_a + np.sum(history.RAIN)
    b = prior_b + years
    if np.isnan(a):
        a = 0
    if np.isnan(b):
        b = 1

    gam = gamma.rvs(a=a, scale=1 / b, size=samples)
    rain_mu = a / b
    mu, sigma = norm.fit(history.TMAX)
    rain_loc, rain_scale = expon.fit(history.PRCP)
    raindays = poisson.rvs(rain_mu, size=samples)

    storm = np.zeros(samples)
    t_vec = np.zeros(samples)
    rf_vec = np.zeros(samples)
    for i, wet_days in enumerate(raindays):
        if not wet_days > 0:
            continue
        # more than a week of rain days is replaced by a random 3-6 days
        days = np.random.randint(3, 7) if wet_days > 7 else wet_days
        t_max = norm.rvs(mu, sigma, size=days)
        rainfall = expon.rvs(rain_loc, rain_scale, days)
        stormy = (rainfall >= n_precip.value) & (t_max < n_heat.value)
        t_vec[i] = np.max(t_max)
        rf_vec[i] = np.sum(rainfall)
        if np.count_nonzero(stormy) >= 3:
            storm[i] = 1
    return gam, storm, raindays, t_vec, rf_vec
def fit_condition_distributions(train_cond_data):
    """
    Fit an exponential scale to the absolute difference between the first
    column and each other column, with the location fixed at 0.

    Args:
        train_cond_data: array of conditioning values where the first column
            is the current X, and each other column is a lagged X value

    Returns:
        array of scale values, one per lagged column
    """
    current = train_cond_data[:, 0]
    n_lags = train_cond_data.shape[1] - 1
    train_cond_exp_scale = np.zeros(n_lags)
    for lag in range(n_lags):
        gaps = np.abs(current - train_cond_data[:, lag + 1])
        train_cond_exp_scale[lag] = expon.fit(gaps, floc=0)[1]
    return train_cond_exp_scale
def delay_times(ax, dts, bins=500, bounds=None, fit=False, alpha=0.75):
    """Histogram delay times on log-log axes, optionally with a fit.

    Args:
        ax: axes to draw on.
        dts: delay times; must support boolean-mask indexing when ``fit``.
        bins: bin specification passed to ``plots_base.plot_hist``.
        bounds: ``[low, high]`` histogram range; defaults to ``[0, 1e5]``.
        fit: if truthy, fit an exponential to the values strictly inside
            ``bounds`` and draw its pdf in red with a rate legend.
        alpha: accepted but not used.
    """
    if bounds is None:
        bounds = [0, 1e5]
    [n, bins, _patches] = plots_base.plot_hist(ax, [dts],
                                               bins=bins,
                                               x_range=bounds,
                                               density=True)
    if fit:
        inside = dts[(dts > bounds[0]) & (dts < bounds[1])]
        loc, scale = expon.fit(inside)
        left_edges = bins[0][:-1]
        red_line = ax.plot(left_edges,
                           expon.pdf(left_edges, loc=loc, scale=scale),
                           color="r")
        # Labels must be a sequence: a bare string is iterated character
        # by character, so only its first character was shown.
        ax.legend(red_line,
                  ["1/tau = " + str(round(1 / (scale * 1e-9))) + " Hz"])
    ax.set_xlabel("Delay Times (ns)")
    ax.set_ylabel("Normalized Counts")
    ax.set_xscale("log")
    ax.set_yscale("log")
def expon_fit(data, var, stat, bmicat, start, end, space):
    """Plot a fitted exponential pdf per BMI category and summarise the data.

    For each category index the rows whose ``common_user_id`` is in
    ``bmicat[i]`` are selected.  The analysed column is ``data[var]`` when
    ``stat == 'n'`` and ``data[var][stat]`` otherwise.  Its fitted pdf is
    drawn on ``np.linspace(start, end, space)``.

    Returns:
        4 x 3 array with mean, median and standard deviation per category.
    """
    bmimean = np.zeros((4, 3))
    grid = np.linspace(start, end, space)
    # NOTE(review): `cat` is not defined in this function; presumably a
    # module-level sequence as long as bmicat - confirm.
    for i in range(len(cat)):
        members = data[data.common_user_id.isin(bmicat[i])]
        values = members[var] if stat == 'n' else members[var][stat]
        loc, scale = expon.fit(values)
        plt.plot(grid, expon.pdf(grid, loc, scale))
        plt.ylim(0, .14)
        bmimean[i, :] = (values.mean(), values.median(), values.std())
    plt.show()
    return (bmimean)
def calculate_global_time(path):
    """Collect the reaction times of a tab-separated cascade file.

    On every line the first two fields and the last field are dropped.  Each
    remaining field is split on ':' and its second part is read as an
    integer time.  The reaction time is the difference to the previous time
    on that line, starting from 0.

    Returns:
        ``(mean reaction time, number of lines, expon.fit result)``.
    """
    reaction_times = []
    n_lines = 0
    with open(path, 'r') as handle:
        for n_lines, line in enumerate(handle, start=1):
            previous = 0
            # remove cascade id and label
            hops = line.strip().split('\t')[2:-1]
            for hop in hops:
                stamp = int(hop.split(':')[1])
                reaction_times.append(stamp - previous)
                previous = stamp
    return np.mean(reaction_times), n_lines, expon.fit(reaction_times)
def main():
    """Plot the mean order rate for each 10-minute slot of the day.

    Orders are grouped into windows of ``time_step`` milliseconds by their
    cumulative inter-arrival time.  For every non-empty window an
    exponential is fitted to the inter-arrival times above 1000.  One over
    each fitted scale is assigned to one of 144 slots and the mean per slot
    is plotted.
    """
    transactions = classification.load_transactions()
    orders = classification.classify_trades(transactions)
    interarrivals = classification.calculate_interarrival_times(orders)
    arrivals = np.cumsum(interarrivals)
    time_step = 10 * 60 * 1000  # ten-minute windows
    slots_per_day = 24 * 6

    windows = []
    current = []
    window_start = 0
    print('Processing...')
    for idx, arrival in enumerate(arrivals):
        if arrival > window_start + time_step:
            jump = int(np.floor((arrival - window_start) / time_step))
            window_start += jump * time_step
            # The original `bin + [] * (jump - 1)` always equals the window
            # itself, so skipped windows were never represented.
            windows.append(current)
            current = []
        current.append(orders[idx])

    times = []
    rates = []
    for window in windows:
        if len(window) != 0:
            times.append(window[0].start_time)
            gaps = classification.calculate_interarrival_times(window)
            loc, scale = expon.fit([gap for gap in gaps if gap > 1000])
            rates.append(scale)

    hours = [[] for _ in range(slots_per_day)]
    for i, rate in enumerate(1 / np.array(rates)):
        # `i % 24 * 6` parsed as `(i % 24) * 6` and filled only every sixth
        # slot; the modulus has to cover the whole day.
        # NOTE(review): empty windows are dropped above, so index i is not
        # guaranteed to line up with clock time - confirm.
        hours[i % slots_per_day].append(rate)
    plt.plot(list(range(slots_per_day)), [np.mean(hour) for hour in hours])
    plt.show()
def MLE_plt(categories, inter_arrivals, inter_arrival_means):
    """Plot a fitted exponential over a histogram for every category.

    For category ``i`` the sample holds ``inter_arrival_means[k]`` for each
    ``k`` in ``categories[i][0]``.  An exponential is fitted to it and its
    pdf is drawn from 0 to twice ``cat_mean(...)[i]`` over a 40-bin density
    histogram.  Each figure is saved as
    ``./app/static/img/cat_result/mle/mleplt_cat<i>.png`` and closed.
    """
    cat_means = cat_mean(inter_arrivals, categories)
    for i in range(0, len(categories)):
        # The original started from `[0] * len(...)` and then appended,
        # which padded the sample with as many zeros as real values and
        # biased the fit.
        X = np.asarray(
            [inter_arrival_means[member] for member in categories[i][0]])
        param = expon.fit(X)  # distribution fitting
        sample_mean = cat_means[i]
        # Result unused; call kept as in the original.
        max_sample = max_interarrival_mean(categories, inter_arrivals, i)

        X_plot = np.linspace(0, 2 * sample_mean, 2000)[:, np.newaxis]
        # pdf of the fitted distribution
        fitted_pdf = expon.pdf(X_plot, loc=param[0], scale=param[1])

        fig = plt.figure()
        plt.plot(X_plot[:, 0],
                 fitted_pdf,
                 "red",
                 label="Estimated Exponential Dist",
                 linestyle="dashed",
                 linewidth=1.5)
        plt.title(
            "Parametric MLE (exponential distribution) for category=%s Visitors"
            % (i))
        # `normed` is no longer accepted by Matplotlib's hist().
        # alpha runs from 0 (transparent) to 1 (opaque)
        plt.hist(X,
                 bins=40,
                 density=True,
                 color="cyan",
                 alpha=.3,
                 label="histogram")
        plt.xlabel("inter-arrival time (days)")
        plt.ylabel("PDF")
        plt.legend()
        # dump results into mle folder
        save_as = './app/static/img/cat_result/mle/mleplt_cat' + str(
            i) + '.png'
        plt.savefig(save_as)
        plt.show(block=False)
        plt.close(fig)
def mass_selection_ormel():
    """Draw one value from an exponential fitted to a power-law mass grid.

    The numbers per bin are ``m**-1.3 * dm`` on 1000 masses between 0.08
    and 0.2 (exponent from Chandler 2003).  An exponential is fitted to
    them and one value is drawn from it.

    Returns:
        ``0.04 * (draw / dm) ** (-1 / 1.3)`` when the draw lies strictly
        between the last and the first number, otherwise ``None``.
    """
    masses = np.linspace(0.08, 0.2, 1000)
    dm = masses[1] - masses[0]
    number = np.power(masses, -1.3) * dm
    fitted_loc, fitted_scale = expon.fit(number.tolist())
    draw = expon.rvs(fitted_loc, fitted_scale, size=1)[0]
    if number[len(number) - 1] < draw < number[0]:
        return 0.04 * np.power(np.divide(draw, dm), -1. / 1.3)
    return None
import sys
from scipy.stats import expon
import numpy as np

# Reads one number per line from stdin, fits an exponential with the
# location fixed at 0 and prints the rate (1 / scale).
if sys.stdin.isatty():
    sys.stderr.write("usage: cat data.log | python %s\n" % __file__)
    # sys.exit rather than the `exit` helper that only the site module adds
    sys.exit(1)

# Blank lines (e.g. a trailing newline) are skipped instead of crashing
# float().
x = np.array([float(line) for line in sys.stdin if line.strip()])
# expon.fit returns (loc, scale)
loc, scale = expon.fit(x, floc=0)
print("lambda")
print(1. / scale)
# 0.278487310799 3.59082788057
def main():
    """Set up database handles and generate one or more playlists.

    Prints a millisecond timestamp, runs ``startup_tests`` and ``eL.main``,
    loads the configuration and stores several prepared statements and
    helpers in module-level globals.  It then dispatches on ``sys.argv``:

    * ``<prog> subgenre <id>`` -- generate one playlist starting from the
      album returned by ``getStartingAlbum(<id>)``.
    * ``<prog> album <id>`` -- generate one playlist starting from that album.
    * ``<prog>`` or ``<prog> <count>`` -- schedule mode: requires
      ``config/schedule.tsv``; picks a random starting day/hour and generates
      ``<count>`` (or ``conf['playlistGenerations']``) playlists, advancing
      one hour per playlist.

    Any other argument combination prints an error and exits with status 1.
    """
    print(floor(time.time()*1000))
    db = startup_tests()
    eL.main(False)
    conf = getConfig()
    global albumsBest, current_playlist, con, getSongData, repeatsList, nonExplicitList
    albumsBest = db.prepare(
        "SELECT album_genres.album_id, album_genres.similarity from album_genres INNER JOIN albums on albums.album_id=album_genres.album_id WHERE "
        +("SUBSTRING(albums.folder_path,1,1) = '/' and albums.album_id in (select songs.album_id from songs where SUBSTRING(songs.filename,1,1) = '/') AND " if conf['production'] else "")
        +("albums.playcount>0 AND " if conf['playlistRepeats'] else "")
        +"album_genres.genre_id=$1")
    getSongData = db.prepare("SELECT songs.song, songs.length FROM songs WHERE songs.album_id=$1")
    if conf['playlistRepeats']:
        repeatsList = [x[0] for lst in db.prepare("select distinct song_id from playlist_song") for x in lst]
    nonExplicitList = conf['nonExplicitList']
    current_playlist = playlistBuilder(db)
    con = databaseCon(db)

    #Doing subgenre/album for "python3 genplaylist type id"
    if len(sys.argv) == 3:
        if sys.argv[1] == 'subgenre':
            genPlaylist(getStartingAlbum(int(sys.argv[2])), production = conf['production'], playlistRepeats = conf['playlistRepeats'],subgenre=int(sys.argv[2]))
        elif sys.argv[1] == 'album':
            current_playlist.fillAlbumsArtistsCache(int(sys.argv[2]))
            current_playlist.album_history.extend([int(sys.argv[2]) for i in range(5)])
            genPlaylist(int(sys.argv[2]), production = conf['production'], playlistRepeats = conf['playlistRepeats'])
        else:
            print("Error with arg1: not matching to album or subgenre:"+sys.argv[1])
            sys.exit(1)
    elif len(sys.argv) != 1 and not (len(sys.argv) == 2 and sys.argv[1].strip().isdigit()):
        print("Error with args; needs some or none!")
        sys.exit(1)
    else:
        if not os.path.isfile("config/schedule.tsv"):
            print("Error: no schedule file found. Write one and save it to config/schedule.tsv")
            sys.exit(1)
        schedule, supergenres = processSchedule()

        # getGenre(d, h) returns the name of the genre to use for day d and
        # hour h; which strategy is used depends on the configuration.
        if "correctGenreProportions" in conf and conf["correctGenreProportions"]:
            def getGenre(d,h):
                # Weight the per-genre playlist counts by the schedule
                # frequencies and pick the genre with the lowest result.
                real_genre_vals = dict([x for lst in con.db.prepare("SELECT genre, COUNT(*) FROM playlists GROUP BY genre").chunks() for x in lst])
                for genre, val in supergenres.items():
                    real_genre_vals[genre] = real_genre_vals[genre]*val
                mostDiff = min(list(real_genre_vals.items()), key=(lambda x: x[1]))
                print("Lowest corrected proportional genre is "+mostDiff[0]+" at "+str(mostDiff[1])+" playlistcount")
                return mostDiff[0]
        else:
            def getGenre(d,h):
                # Normalise the weights of the genres scheduled for (d, h).
                supergenresSum = sum([supergenres[y] for y in schedule[d][h]])
                print("Generating "+str(ceil(playlistLength/120))+"+ albums out of one of the following genres with the following weights (of which gets picked:")
                genres = [(x, supergenres[x]/supergenresSum) for x in schedule[d][h]]
                print('\t'+(',\t'.join([' : '.join(map(str,x)) for x in genres])))
                real_genre_vals = dict([x for lst in con.db.prepare("SELECT genre, COUNT(*) FROM playlists GROUP BY genre").chunks() for x in lst])
                # Once any genre has more than 30 recorded playlists, pick
                # the genre whose real share lags its target share the most.
                if len(real_genre_vals) > 0 and any([x > 30 for x in real_genre_vals.values()]):
                    print("Since real genre data in playlists present, here are the real proportions:")
                    supergenresRealSum = sum([real_genre_vals[y[0]] for y in genres])
                    print('\t'+(',\t'.join([' : '.join(map(str,x)) for x in [(y[0], real_genre_vals[y[0]]/supergenresRealSum) for y in genres]])))
                    mostDiff = max([(x[0], ((x[1] - real_genre_vals[x[0]]/supergenresRealSum)/x[1])) for x in genres], key=(lambda x: x[1]))
                    print("Biggest difference is in "+mostDiff[0]+" by "+str(mostDiff[1])+"%")
                    return mostDiff[0]
                # Otherwise defer to getitem (presumably a weighted random
                # pick over (name, weight) pairs -- confirm).
                return getitem(genres)[0]

        print("Processed supergenres from schedule with frequencies")
        day = list(schedule.keys())[randint(0,6)]
        hour = randint(0,23)
        print("Starting on "+day+" at "+str(hour)+":00:00, and doing a 1-hour playlist for each hour henceforth")
        playlistLength = int(conf['playlistLength'])
        # Keep only the liners that end within the playlist length.
        linerTimes = dict([ (t,l) for t, l in conf['liners'].items() if (float(t)*60)+float(l) <= playlistLength])
        print("Doing liners during the following times:")
        for t, duration in sorted(list(linerTimes.items())):
            print('\t'+str(t)+':00 - '+str(t)+':'+str(duration))

        # subgenres maps genre_id -> [popularity, supergenre]; a NULL
        # popularity is replaced by 0.  A third element (play count) is
        # appended lazily inside the generation loop below.
        subgenres = dict([(x[0], list(x[1:]) if x[1] is not None else [0,x[2]]) for lst in db.prepare("SELECT genre_id, popularity, supergenre FROM genres").chunks() for x in lst])
        # One normal distribution per supergenre, fitted to the popularity
        # values of its subgenres.
        subgenres_rvars = {}
        for key in supergenres.keys():
            subgenres_rvars[key] = norm(*norm.fit([x[0] for x in subgenres.values() if x[1]==key]))
        genresUsed = db.prepare("SELECT subgenre, COUNT(subgenre) FROM playlists WHERE playlists.genre = $1 GROUP BY playlists.subgenre")
        getSubgenreName = db.prepare("SELECT genres.genre FROM genres WHERE genres.genre_id = $1")
        playlistGenerations = int(sys.argv[1]) if len(sys.argv) == 2 else int(conf['playlistGenerations'])

        #done with setup; real work now
        for g in range(playlistGenerations):
            genre = getGenre(day, hour)
            print("Picked "+genre)
            for lst in genresUsed.chunks(genre):
                for subgenre,plays in lst:
                    if len(subgenres[subgenre]) == 2:
                        subgenres[subgenre].append(plays)
            # Exponential distribution fitted to the play counts of this
            # genre's subgenres (0 where no count has been recorded).
            genresUsed_rvar = expon(*expon.fit([x[2] if len(x)>2 else 0 for x in subgenres.values() if x[1]==genre]))
            # Score = (1 - play-count CDF) + popularity CDF, sorted ascending.
            possible_subgenres = sorted([
                (key, ((1-genresUsed_rvar.cdf(val[2] if len(val)>2 else 0))+subgenres_rvars[genre].cdf(val[0])) )
                for key,val in subgenres.items() if val[1]==genre], key=lambda x: x[1])
            albums = []
            # Keep drawing candidates until one offers at least two albums.
            while len(albums) < 2 and len(possible_subgenres)>0:
                subgenre, temp = getitem(possible_subgenres)
                possible_subgenres.remove((subgenre,temp))
                albums = sorted([[x[0],percentValidation(x[1])] for x in albumsBest(subgenre)], reverse=True)
            # Failure means no candidate produced enough albums.  Testing
            # for an empty candidate list (as before) wrongly reported an
            # error when the last remaining candidate was the suitable one.
            if len(albums) < 2:
                print("Error: couldn't find a suitable subgenre for genre")
            else:
                subgenreName = list(getSubgenreName(subgenre))[0][0]
                print("Picked "+subgenreName+" as a starting subgenre")
                startingAlbum = getStartingAlbum(subgenre, albums)
                current_playlist.fillAlbumsArtistsCache(startingAlbum, genre)
                current_playlist.album_history.extend([a[0] for a in albums[:ceil(len(albums)/20.0)+1] if a[0] in current_playlist.albums])
                try:
                    genPlaylist(startingAlbum, linerTimes, playlistLength, production = conf['production'], playlistRepeats = conf['playlistRepeats'], genre=genre)
                except Exception as e:
                    handleError(e,"Error with generating this playlist; going to keep making new ones")
                # NOTE(review): start the next playlist from a fresh builder.
                # The original nesting of this statement (inside the except
                # handler vs. after it) should be confirmed.
                current_playlist = playlistBuilder(db)
            # Hours run 0..23, so wrap at 24.  The previous modulus of 23
            # skipped hour 23 and rolled the day over one hour early.
            hour = (hour + 1) % 24
            if hour==0:
                weekdays = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
                day = weekdays[(weekdays.index(day)+1 ) % 7]
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import expon
from scipy.stats import norm

# Part 2. Generate 1000 random numbers with an exponential distribution, plot the histogram and compare it with the known PDF of that distribution.
# Then compute 1000 sums of 1000 exponentially distributed random numbers and compare (fit) them to a normal distribution, verifying the central limit theorem.
n=[]
for i in range(1000):
    n.append(np.random.exponential(10)) # Fill the list "n" with random numbers drawn from an exponential distribution with mean 10.
loc1,scale1 = expon.fit(n) # Obtain the "loc" and "scale" parameters from a fit over the data in "n". The mean of the fitted distribution is loc + scale, so with loc close to 0 it is approximately "scale".
print(scale1,loc1) # Print these parameters.
x = np.linspace(0,50, 100)
y=expon.pdf(x,scale=scale1, loc=loc1) # Evaluate the fitted exponential PDF (expected mean about 10) on the grid x.
f, fig1 = plt.subplots(1,1)
fig1.plot(x, y,'r-', lw=5, alpha=0.6, label='expon pdf') # Plot x vs y (the distribution).
# NOTE(review): `normed` was removed from Axes.hist in Matplotlib 3.1; `density=True` is the replacement.
fig1.hist(n,bins=50,normed=True) # Draw the histogram of n. It is important that it is normalised.
f.savefig('graficas.png') # Save the plots to a file.
# Up to here we have verified that the data do follow the given distribution. Now the process is repeated for a variable that is the sum of the generated variables.
sumas=[] # Each element of the list "sumas" stores the sum of 1000 exponentially distributed random variables.
# NOTE(review): the body of the following loop is not visible in this excerpt.
for i in range(1000):
def signal_variability(data, subplots=False, title=None, density_limits=(-20,0), threshold_level=10):
    """Plot the log-density of the signal against a fitted reference PDF.

    A new figure is created.  With ``subplots`` given, every channel
    (row of ``data``) is drawn in its own subplot; otherwise all rows are
    pooled into a single axes.  For each plot a 100-bin density histogram
    is computed.  If every sample is strictly positive an exponential
    distribution is fitted with ``scipy.stats.expon.fit``; otherwise a
    normal distribution with the sample mean and standard deviation is
    used and thresholds are obtained from ``likelihood_threshold``.  The
    log of the empirical density (blue) and of the reference PDF (red) are
    plotted against the bin edges expressed in standard deviations from
    the mean, and vertical lines mark the thresholds.

    Args:
        data: 2-D array indexed as ``data[channel, :]``, or an h5py dataset,
            in which case it is read fully into memory and ``title`` is
            replaced by the dataset's file name plus its name.
        subplots: ``False`` for a single pooled plot, or a ``(rows,
            columns)`` pair giving the subplot grid.
        title: Optional figure title passed to ``plt.suptitle``.
        density_limits: y-axis limits passed to ``ax.set_ylim``; a falsy
            value leaves the limits untouched.
        threshold_level: Forwarded to ``likelihood_threshold``.

    Returns:
        None.
    """
    from numpy import histogram, log, arange, sign
    import matplotlib.pyplot as plt

    # h5py is only needed to recognise h5py datasets, so plain arrays keep
    # working when it is not installed.
    try:
        import h5py
    except ImportError:
        h5py = None
    if h5py is not None and isinstance(data, h5py.Dataset):
        title = data.file.filename+data.name
        data = data[:,:]

    plt.figure()
    if subplots:
        rows = subplots[0]
        columns = subplots[1]
        channelNum = 0
    else:
        rows = 1
        columns = 1
        # An index array selects every channel at once (pooled plot).
        channelNum = arange(data.shape[0])

    for row in range(rows):
        for column in range(columns):
            per_channel = isinstance(channelNum, int)
            # The grid may have more cells than there are channels.
            if per_channel and channelNum>=data.shape[0]:
                continue
            print("Calculating Channel "+str(channelNum))
            if per_channel:
                ax = plt.subplot(rows, columns, channelNum+1)
            else:
                ax = plt.subplot(rows, columns, 1)

            d = data[channelNum,:]
            dmean = d.mean()
            dstd = d.std()
            # `density=True` replaces the `normed` keyword, which NumPy no
            # longer accepts.
            ye, xe = histogram(d, bins=100, density=True)

            all_positive = (sign(d)>0).all()
            if all_positive:
                from scipy.stats import expon
                expon_parameters = expon.fit(d)
                yf = expon.pdf(xe[1:], *expon_parameters)
                # No likelihood thresholds are computed for the exponential
                # case; both are fixed at 0.
                left_threshold = 0
                right_threshold = 0
            else:
                from scipy.stats import norm
                yf = norm.pdf(xe[1:],dmean, dstd)
                left_threshold, right_threshold = likelihood_threshold(d, threshold_level, comparison_distribution='norm', comparison_parameters=(dmean, dstd))

            # Right bin edges expressed in standard deviations from the mean.
            x = (xe[1:]-dmean)/dstd
            ax.plot(x, log(ye), 'b-', x ,log(yf), 'r-')
            if rows!=1 or columns!=1:
                ax.set_title(str(channelNum))
                ax.set_yticklabels([])
                ax.set_xticklabels([])
            if density_limits:
                ax.set_ylim(density_limits)

            # Vertical threshold markers spanning the current y-range.
            if all_positive:
                ax.plot(((right_threshold-dmean)/dstd, (right_threshold-dmean)/dstd), plt.ylim())
            else:
                ax.plot(((left_threshold-dmean)/dstd, (left_threshold-dmean)/dstd), plt.ylim())
                ax.plot(((right_threshold-dmean)/dstd, (right_threshold-dmean)/dstd), plt.ylim())
            channelNum += 1

    if title:
        plt.suptitle(title)
# http://docs.scipy.org/doc/numpy/user/basics.creation.html data = np.array(crashIntervals) #print(sorted(data, reverse=True)) # We now try to fit an exponential distribution to the data. # This will print detailed information of this function! # print(expon.fit.__doc__) # http://stackoverflow.com/questions/21610034/fitting-distribution-with-fixed-parameters-in-scipy/ # http://stackoverflow.com/questions/25085200/scipy-stats-expon-fit-with-no-location-parameter loc, scale = expon.fit(data, floc=0) print(loc) print(scale) # Now, we want to test how well the exponential distribution # fits to the data. # TODO: study more on the kstest # try examples in the doc # read Wiki page # http://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test # also some related posts