def calculate_percentiles(ids):
    global duration
    global pers_duration
    global perc_duration
    global perc_pers_duration
    list_durations = []
    list_pers_durations = []
    for i in ids:
        if i not in duration:
            # No recorded duration for this id: treat it as zero.
            list_durations.append(0)
            list_pers_durations.append(0)
            duration[i] = 0
            pers_duration[i] = 0
        else:
            list_durations.append(duration[i])
            list_pers_durations.append(pers_duration[i])
    list_durations.sort()
    list_pers_durations.sort()
    print list_durations
    print list_pers_durations
    for i in ids:
        perc_duration[i] = stats.percentileofscore(list_durations, duration[i])
        perc_pers_duration[i] = stats.percentileofscore(list_pers_durations,
                                                        pers_duration[i])
def write_to_file(p, filename_av, filename_md, ids):
    av_list = []
    md_list = []
    for i in ids:
        (n, av, md, av_n, md_n) = p.report_stats(i)
        if n is None:
            continue
        av_list.append(av_n)
        md_list.append(md_n)
    av_list.sort()
    md_list.sort()
    print av_list
    print md_list
    o_file_av = csv.writer(open(filename_av, 'wb'))
    o_file_av.writerow(['person_id', '', '', 'value', 'percentile'])
    o_file_md = csv.writer(open(filename_md, 'wb'))
    o_file_md.writerow(['person_id', '', '', 'value', 'percentile'])
    for i in ids:
        print i
        (n, av, md, av_n, md_n) = p.report_stats(i)
        if n is None:
            continue
        print av_n
        av_perc = stats.percentileofscore(av_list, av_n)
        md_perc = stats.percentileofscore(md_list, md_n)
        o_file_av.writerow([i, '', '', av_n, av_perc])
        o_file_md.writerow([i, '', '', md_n, md_perc])
def checkGold(users, karma, gold):
    for index, karmaNum in enumerate(karma):
        if stats.percentileofscore(karma, karmaNum) < 90.0:
            # Drop users below the 90th percentile; ignore if not in the list.
            try:
                gold.remove(users[index])
            except ValueError:
                pass
        elif stats.percentileofscore(karma, karmaNum) > 90.0:
            # Note: repeated calls can append the same user more than once.
            gold.append(users[index])
def _update_ratings_time_spent(self):
    eye_hists = EyeHistory.objects.all().select_related("user", "page")
    total_updates = eye_hists.count()
    ratings = {}
    filled_ratings = set()
    for i, eye_hist in enumerate(queryset_iterator(eye_hists)):
        user = eye_hist.user
        domain, _ = Domain.objects.get_or_create(url=eye_hist.domain)
        page, _ = Page.objects.get_or_create(url=eye_hist.url, domain=domain)
        key = (user.id, page.id)
        if key in filled_ratings or \
           Ratings.objects.filter(user=user, page=page,
                                  from_time_distribution=False).exists():
            filled_ratings.add(key)
            continue
        if key not in ratings:
            ratings[key] = (0, 0)
        # Accumulate (total seconds, number of visits) per (user, page).
        # Storing the global loop index here, as before, made the later
        # average divide by the wrong count.
        ratings[key] = (ratings[key][0] + 1.0 * eye_hist.total_time / 1000,
                        ratings[key][1] + 1)
        if i != 0 and i % CHUNK_SIZE == 0:
            self._log_updates(i, total_updates, 'avg_time_spent_for_pages')
    total_updates = len(ratings)
    i = 0
    users = {}
    for key, time_spent in ratings.items():
        user_id = key[0]
        avg_time_spent = 1.0 * time_spent[0] / time_spent[1]
        if user_id not in users:
            users[user_id] = []
        users[user_id].append(avg_time_spent)
        ratings[key] = avg_time_spent
        if i != 0 and i % CHUNK_SIZE == 0:
            self._log_updates(i, total_updates,
                              'forming_time_spent_distributions_for_users')
        i += 1
    i = 0
    for key, avg_time_spent in ratings.items():
        # Map the user's percentile rank onto a 1-5 score.
        score = round(stats.percentileofscore(users[key[0]],
                                              avg_time_spent)) * 4.0 / 100 + 1
        try:
            rating = Ratings.objects.get(user=User.objects.get(id=key[0]),
                                         page=Page.objects.get(id=key[1]))
            rating.score = score
            rating.save()
        except Ratings.DoesNotExist:
            Ratings.objects.create(user=User.objects.get(id=key[0]),
                                   page=Page.objects.get(id=key[1]),
                                   score=score)
        if i != 0 and i % CHUNK_SIZE == 0:
            self._log_updates(i, total_updates, 'calculating_left_over_ratings')
        i += 1
def cal_percentile_from2lists(list1, list2, distr1, distr2):
    distr1 = list(distr1.flat)
    distr2 = list(distr2.flat)
    list_out = []
    for v1, v2 in zip(list1, list2):
        # Keep the larger of the two percentile ranks for each pair.
        list_out.append(max(percentileofscore(distr1, v1),
                            percentileofscore(distr2, v2)))
    return list_out
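# A minimal usage sketch of cal_percentile_from2lists; the inputs below are
# illustrative placeholders, not data from the original project.
import numpy as np
from scipy.stats import percentileofscore

rng = np.random.default_rng(0)
distr1 = rng.normal(0.0, 1.0, size=(10, 10))  # 2-D, matching the .flat usage
distr2 = rng.normal(1.0, 2.0, size=(10, 10))
print(cal_percentile_from2lists([0.5, 1.5], [0.0, 3.0], distr1, distr2))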
def irr_calc(filled, etype, components=flows.keys()):
    # prepare matrix of flows
    total = irrflows(filled, components=components)
    # Rate of Return
    print 'Calculating IRR...'
    irr = total.apply(robust_irr, axis=1)
    point_f = irr.loc['f', 0, 0]
    point_m = irr.loc['m', 0, 0]
    point_p = irr.loc['p', 0, 0]
    irrf = irr.loc['f'].dropna()
    irrm = irr.loc['m'].dropna()
    irrp = irr.loc['p'].dropna()
    # Keep only positive IRR draws (quantile trimming left disabled).
    irrf = irrf.ix[irrf > 0]
    irrm = irrm.ix[irrm > 0]
    irrp = irrp.ix[irrp > 0]
    # Conduct inference: one-sided bootstrap p-value against a null of 3%.
    null_center = 0.03
    irr_fp = 1 - percentileofscore(irrf - irrf.mean() + null_center,
                                   irrf.mean()) / 100
    irr_mp = 1 - percentileofscore(irrm - irrm.mean() + null_center,
                                   irrm.mean()) / 100
    irr_pp = 1 - percentileofscore(irrp - irrp.mean() + null_center,
                                   irrp.mean()) / 100
    # Save results
    irr_pnt = pd.DataFrame([point_f, point_m, point_p], index=['f', 'm', 'p'])
    irr_mean = pd.DataFrame([irrf.mean(), irrm.mean(), irrp.mean()],
                            index=['f', 'm', 'p'])
    irr_p = pd.DataFrame([irr_fp, irr_mp, irr_pp], index=['f', 'm', 'p'])
    irr_se = pd.DataFrame([irrf.std(), irrm.std(), irrp.std()],
                          index=['f', 'm', 'p'])
    try:
        irr_quant = pd.DataFrame(
            np.array([[irrf.quantile(0.10), irrf.quantile(0.90)],
                      [irrm.quantile(0.10), irrm.quantile(0.90)],
                      [irrp.quantile(0.10), irrp.quantile(0.90)]]),
            index=['f', 'm', 'p'])
    except Exception:
        irr_quant = pd.DataFrame(np.full((3, 2), np.nan),
                                 index=['f', 'm', 'p'])
    irr_quant.index.name = 'sex'
    # Output the results
    table = pd.concat([irr_pnt, irr_mean, irr_p, irr_se, irr_quant], axis=1)
    table.columns = ['point', 'mean', 'pval', 'se', 'lb', 'ub']
    return table
def Permutation_test(data1, data2, n1=100,n2=100): p_values = [] for simulation_time in range(n1): shuffle_difference =[] experiment_difference = np.mean(data1) - np.mean(data2) vector_concat = np.concatenate([data1,data2]) for shuffle_time in range(n2): shuffle(vector_concat) new_data1 = vector_concat[:len(data1)] new_data2 = vector_concat[len(data1):] shuffle_difference.append(np.mean(new_data1) - np.mean(new_data2)) p_values.append(min(percentileofscore(shuffle_difference,experiment_difference)/100., (100.-percentileofscore(shuffle_difference,experiment_difference))/100.)) return p_values,np.mean(p_values),np.std(p_values)
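# Usage sketch for Permutation_test with synthetic data; the samples below
# are illustrative only. The function body assumes numpy, random.shuffle and
# scipy's percentileofscore are in scope, so they are imported here.
import numpy as np
from random import shuffle
from scipy.stats import percentileofscore

rng = np.random.RandomState(42)
data1 = rng.normal(0.0, 1.0, 50)
data2 = rng.normal(0.5, 1.0, 50)
p_values, p_mean, p_std = Permutation_test(data1, data2, n1=20, n2=200)
print(p_mean)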
def bc_calc(filled, etype, components=flows.keys(), rate=0.03):
    # prepare matrix of flows
    benefits, costs = bcflows(filled, components=components)
    # Cost-benefit Ratio
    print 'Calculating B/C ratio...'
    costs = costs.apply(robust_npv, rate=rate, axis=1)
    benefits = benefits.apply(robust_npv, rate=rate, axis=1)
    ratio = -benefits / costs
    point_f = ratio.loc['f', 0, 0]
    point_m = ratio.loc['m', 0, 0]
    point_p = ratio.loc['p', 0, 0]
    qtrim = 0.05
    ratiof = ratio.loc['f'].dropna()
    ratiom = ratio.loc['m'].dropna()
    ratiop = ratio.loc['p'].dropna()
    # Trim the top and bottom 5% of draws before inference.
    ratiof = ratiof.ix[(ratiof > ratiof.quantile(q=qtrim)) &
                       (ratiof < ratiof.quantile(q=1 - qtrim))]
    ratiom = ratiom.ix[(ratiom > ratiom.quantile(q=qtrim)) &
                       (ratiom < ratiom.quantile(q=1 - qtrim))]
    ratiop = ratiop.ix[(ratiop > ratiop.quantile(q=qtrim)) &
                       (ratiop < ratiop.quantile(q=1 - qtrim))]
    # Conduct inference: one-sided bootstrap p-value against a null ratio of 1.
    null_center = 1
    ratio_fp = 1 - percentileofscore(ratiof - ratiof.mean() + null_center,
                                     ratiof.mean()) / 100
    ratio_mp = 1 - percentileofscore(ratiom - ratiom.mean() + null_center,
                                     ratiom.mean()) / 100
    ratio_pp = 1 - percentileofscore(ratiop - ratiop.mean() + null_center,
                                     ratiop.mean()) / 100
    # Save results
    ratio_pnt = pd.DataFrame([point_f, point_m, point_p],
                             index=['f', 'm', 'p'])
    ratio_mean = pd.DataFrame([ratiof.mean(), ratiom.mean(), ratiop.mean()],
                              index=['f', 'm', 'p'])
    ratio_p = pd.DataFrame([ratio_fp, ratio_mp, ratio_pp],
                           index=['f', 'm', 'p'])
    ratio_se = pd.DataFrame([ratiof.std(), ratiom.std(), ratiop.std()],
                            index=['f', 'm', 'p'])
    try:
        ratio_quant = pd.DataFrame(
            np.array([[ratiof.quantile(0.10), ratiof.quantile(0.90)],
                      [ratiom.quantile(0.10), ratiom.quantile(0.90)],
                      [ratiop.quantile(0.10), ratiop.quantile(0.90)]]),
            index=['f', 'm', 'p'])
    except Exception:
        ratio_quant = pd.DataFrame(np.full((3, 2), np.nan),
                                   index=['f', 'm', 'p'])
    ratio_quant.index.name = 'sex'
    # Output results
    table = pd.concat([ratio_pnt, ratio_mean, ratio_p, ratio_se, ratio_quant],
                      axis=1)
    table.columns = ['point', 'mean', 'pval', 'se', 'lb', 'ub']
    return table
def eval(self, row, dataset):
    # select the target column from the dataset, then rank this row's value
    # against the full column
    col = self.value.value
    query_args = QueryArgs(select={col: 1})
    column = dataset.dframe(query_args=query_args)[col]
    field = self.value.field(row)
    return percentileofscore(column, field)
def Asynchronous_regression(self): """Remove the biases by fitting a linear regression model with ordered observational and model datasets Stoner et al (2013) An asynchronous regional regression model for statistical downscaling of daily climate variables :returns: downscaled model_present and model_future """ ref_original = self.ref_dataset model_present = self.model_present model_present_sorted = np.sort(model_present) model_future = self.model_future # For linear regression, the size of reference data must be same as # model data. ref = np.zeros(model_present.size) for ival, model_value in enumerate(model_present_sorted): percentile = percentileofscore(model_present_sorted, model_value) ref[ival] = np.percentile(ref_original, percentile) slope, intercept = linregress(model_present_sorted, ref)[0:2] return model_present * slope + intercept, model_future * slope + intercept
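# Standalone sketch of the quantile-mapping step used above, outside the
# class; the gamma-distributed inputs are placeholders, not project data.
import numpy as np
from scipy.stats import percentileofscore, linregress

ref_original = np.random.RandomState(0).gamma(2.0, 1.0, 500)   # "observations"
model_present = np.random.RandomState(1).gamma(2.5, 0.8, 500)  # "model"
model_sorted = np.sort(model_present)
# For each sorted model value, take the observation at the same percentile.
ref = np.array([np.percentile(ref_original, percentileofscore(model_sorted, v))
                for v in model_sorted])
slope, intercept = linregress(model_sorted, ref)[0:2]
corrected_present = model_present * slope + intercept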
def calcSeverity(model, cid, varname="soil_moist"): """Calculate drought severity from *climatology* table stored in database.""" log = logging.getLogger(__name__) outvars = model.getOutputStruct(model.model_path + "/global.txt") col = outvars[varname][1] if varname in ["soil_moist"]: p = np.loadtxt("{0}/{1}_{2:.{4}f}_{3:.{4}f}".format(model.model_path, outvars['runoff'][0], model.gid[cid][0], model.gid[cid][1], model.grid_decimal))[:, col:col+model.nlayers] p = pandas.Series(np.sum(p, axis=1), [datetime(model.startyear, model.startmonth, model.startday) + timedelta(t) for t in range(len(p))]) else: p = np.loadtxt("{0}/{1}_{2:.{4}f}_{3:.{4}f}".format(model.model_path, outvars['runoff'][0], model.gid[cid][0], model.gid[cid][1], model.grid_decimal))[:, col] p = pandas.Series(p, [datetime(model.startyear, model.startmonth, model.startday) + timedelta(t) for t in range(len(p))]) db = dbio.connect(model.dbname) cur = db.cursor() if dbio.tableExists(model.dbname, model.name, varname): if varname in ["soil_moist"]: lvar = ",layer" else: lvar = "" if dbio.columnExists(model.dbname, model.name, varname, "ensemble"): fsql = "with f as (select fdate{3},avg(st_value(rast,st_geomfromtext('POINT({0} {1})',4326))) as vals from {2}.{4} where st_intersects(rast,st_geomfromtext('POINT({0} {1})',4326)) group by fdate{3})".format(model.gid[cid][1], model.gid[cid][0], model.name, lvar, varname) else: fsql = "with f as (select fdate{3},st_value(rast,st_geomfromtext('POINT({0} {1})',4326)) as vals from {2}.{4} where st_intersects(rast,st_geomfromtext('POINT({0} {1})',4326)))".format(model.gid[cid][1], model.gid[cid][0], model.name, lvar, varname) sql = "{0} select fdate,sum(vals) from f group by fdate".format(fsql) cur.execute(sql) if bool(cur.rowcount): results = cur.fetchall() clim = pandas.Series([r[1] for r in results], [r[0] for r in results]) else: clim = p else: log.warning("Climatology table does not exist. Severity calculation will be inaccurate!") clim = p s = 100.0 - np.array(map(lambda v: stats.percentileofscore(clim, v), p)) return s
def get_rank_by_college(self): departments = Department.objects.filter(college = self.college) all_medians = [] for d in departments: all_medians.append(d.median_salary) rank = stats.percentileofscore(all_medians, self.median_salary, kind='strict') return rank
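# The kind argument above changes how ties are counted; a quick check of the
# four options on an arbitrary list with a tied score.
from scipy import stats

scores = [1, 2, 3, 3, 4]
for kind in ('rank', 'weak', 'strict', 'mean'):
    print(kind, stats.percentileofscore(scores, 3, kind=kind))
# rank 70.0, weak 80.0, strict 40.0, mean 60.0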
def get_rank_according_to_things(self, year = None, college = None, department = None, position = None, institution = None, campus=None): kwargs = {} if institution: kwargs['college__campus__institution'] = institution if campus: kwargs['college__campus'] = campus if college: kwargs['college'] = college if department: kwargs['department'] = department if position: kwargs['position'] = position if year: kwargs['identity__year'] = year members = EmployeeDetail.objects.filter(**kwargs) all_salaries = [] for member in members: if member.identity.proposed_total_salary > 0: all_salaries.append(int(member.identity.proposed_total_salary)) self_salary = self.proposed_total_salary rank = stats.percentileofscore(all_salaries, self_salary, kind='strict') print self.identity print self.id print rank return rank
def main():
    connection = mdb.connect(user="******", passwd="", port=3316, db="moocdb")
    cursor = connection.cursor()
    connection2 = mdb.connect(user="******", passwd="", port=3316, db="moocdb")
    cursor2 = connection2.cursor()
    sql = '''SELECT user_id, dropout_feature_value_week, dropout_feature_value
             FROM moocdb.dropout_feature_values
             WHERE dropout_feature_id = 9;'''
    cursor.execute(sql)
    # Materialize the result set: a DB cursor is exhausted after one pass,
    # and we need to iterate over the rows twice.
    rows = cursor.fetchall()
    week_values = {}
    for user_id, week, value in rows:
        if week in week_values:
            week_values[week].append(value)
        else:
            week_values[week] = [value]
    for user_id, week, value in rows:
        insert_percentile(percentileofscore(week_values[week], value),
                          user_id, week, cursor2, connection2)
    connection.close()
    connection2.close()
def stochastic_paired_permutation_test(group_a, group_b, confidence=5, repetitions=10000): """ Computes a stochastic version of the paired permutation test (for more speed). It will return nothing and just print out if there is any significant difference or not (and the percentile). """ paired_differences_sum = sum([a - b for a, b in zip(group_a, group_b)]) sampled_paired_differences = [] for i in range(repetitions): sampled_paired_differences.append( sum([(a - b) * choice([-1, 1]) for a, b in zip(group_a, group_b)])) lower_perc, upper_perc = np.percentile(sampled_paired_differences, [confidence/2., 100-confidence/2.]) print lower_perc, paired_differences_sum, upper_perc percentile = percentileofscore(sampled_paired_differences, paired_differences_sum) if paired_differences_sum < lower_perc: print "Group A is smaller!:", percentile elif paired_differences_sum > upper_perc: print "Group B is smaller!", percentile else: print "Nothing interesting to report. Percentile:", percentile
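# Quick synthetic check (data assumed, not from the original study): paired
# groups that differ by a small constant shift. numpy, random.choice and
# scipy's percentileofscore are imported here because the function uses them.
import numpy as np
from random import choice
from scipy.stats import percentileofscore

base = np.random.RandomState(7).randn(30)
stochastic_paired_permutation_test(base, base + 0.5, repetitions=2000)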
def get_pval(value_observed,sample_size,profile,iterations=1000): # random sampling of the profile ... sample_profile = lambda _profile: np.random.choice(_profile,size=sample_size,replace=False) # get fraction of structured features of the profile sample ... get_feature = lambda _profile: get_struct(sample_profile(_profile))*100.0/sample_size # return percentile of the observed value (same as p-value) in randomly sampled (#iteration) distribution... return st.percentileofscore( [get_feature(profile) for _ in range(iterations)], value_observed )
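# Usage sketch for get_pval; get_struct is project-specific, so a hypothetical
# stand-in (count of positive values) is used here purely for illustration.
import numpy as np
import scipy.stats as st

def get_struct(arr):
    return int(np.sum(np.asarray(arr) > 0))  # hypothetical stub

profile = np.random.RandomState(3).randn(1000)
print(get_pval(value_observed=55.0, sample_size=100, profile=profile))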
def testSignificances(self):
    """
    Test significances of d values.
    If one differs MUCH, its index is returned as a P300 target.
    Temporarily, as a test, normal-distribution boundaries for the pVal
    percentile are calculated. If exactly one d is larger than that pVal
    then that's the target.
    """
    print "++ testSignificances ++ "
    dMean = np.zeros(self.fields)
    for i in range(self.fields):
        dMean[i] = self.dArrTotal[i][:self.nLast].mean()
    self.per = [st.percentileofscore(self.pdf, x) for x in dMean]
    self.per = np.array(self.per)
    print "percentile: ", self.per
    # If only one value is significantly distant
    if np.sum(self.per > self.pPer) == 1:
        self.dec = np.arange(self.fields)[self.per == self.per.max()]
        self.dec = int(self.dec[0])
        print "selected -- {0}".format(self.dec)
        return self.dec
    else:
        return -1
def validar_matr_pred(matr_predic, vector_obs):
    # The axes of the prediction matrix
    eje_t, eje_estoc, eje_parám = [0, 1, 2]
    # Drop missing observations
    faltan = np.isnan(vector_obs)
    matr_predic = matr_predic[~faltan]
    vector_obs = vector_obs[~faltan]
    # The number of days of predictions and observations
    n_días = matr_predic.shape[eje_t]
    # Average over all prediction replicates
    vector_predic = matr_predic.mean(axis=(eje_estoc, eje_parám))
    # Compute R squared
    r2 = _r2(vector_obs, vector_predic)
    # Normalized root mean square error
    rcnep = _rcnep(vector_obs, vector_predic)
    # Validate the uncertainty interval
    confianza = np.empty_like(vector_obs, dtype=float)
    for n in range(n_días):
        perc = estad.percentileofscore(matr_predic[n], vector_obs[n]) / 100
        confianza[n] = abs(0.5 - perc) * 2
    confianza.sort()
    percentiles = np.divide(np.arange(1, n_días + 1), n_días)
    r2_percentiles = _r2(confianza, percentiles)
    rcnep_prcntl = _rcnep(confianza, percentiles)
    return {'r2': r2, 'rcnep': rcnep,
            'r2_prcntl': r2_percentiles, 'rcnep_prcntl': rcnep_prcntl}
def demo(request, county, state): state_f = State.objects.get(name=state).fips c = County.objects.get(name=county, state_fips=state_f) geom = c.geom.json center = c.geom.centroid.json extent = c.geom.extent area = {"l": c.land_area, "w": c.water_area, "t": c.land_area + c.water_area} q = County.objects.filter(state_fips=state_f) areas = { "l": [i.land_area for i in q], "w": [i.water_area for i in q], "t": [i.land_area + i.water_area for i in q], } area_p = {k: "{:.2f}".format(stats.percentileofscore(areas[k], area[k])) for k in area.keys()} results = { "lat": c.lat, "lon": c.lon, "area": area, "area_p": area_p, "geom": "__geom__", "center": "__center__", "extent": extent, } resultsJSON = json.dumps(results) resultsJSON = resultsJSON.replace('"__geom__"', geom) resultsJSON = resultsJSON.replace('"__center__"', center) return HttpResponse(resultsJSON)
def UpdateMktCapScores():
    nSmMin = DB.sqlSelect('SELECT MIN(mktcap) FROM styleBox')[0][0] - 1
    nLgMax = DB.sqlSelect('SELECT MAX(mktcap) FROM styleBox')[0][0]
    nSmCut = 5000000000
    nMdCut = 17000000000
    count = 0
    mktCaps = {}
    rRngs = [[nSmMin, nSmCut], [nSmCut, nMdCut], [nMdCut, nLgMax]]
    for rRng in rRngs:
        Sql = ('SELECT ticker, mktcap FROM styleBox WHERE '
               'mktcap>%s AND mktcap<=%s') % (rRng[0], rRng[1])
        di = dict(DB.sqlSelect(Sql))
        lCaps = list(di.values())
        for stock in di:
            pct = stats.percentileofscore(lCaps, di[stock],
                                          kind='weak') / 100.0
            # Offset each size band by a third so scores span (0, 1].
            di[stock] = pct / 3.0 + (count / 3.0)
            Sql = ("UPDATE styleBox SET "
                   "scoreSize=%s "
                   "WHERE ticker='%s'") % (di[stock], stock)
            DB.sqlExecute(Sql)  # this takes forever
            mktCaps[stock] = di[stock]
        count += 1
def add_original(self, P): """ P : array Parameter map of the statistic of interest. """ self.clusters = [] # find clusters clusters, n = self._find_clusters(P) clusters_v = scipy.ndimage.measurements.sum(P, clusters, xrange(1, n + 1)) for i in xrange(n): v = clusters_v[i] p = 1 - percentileofscore(self.dist, np.abs(v), 'mean') / 100 if p <= self.pmax: im = P * (clusters == i + 1) name = 'p=%.3f' % p threshold = self.t_upper if (v > 0) else self.t_lower properties = {'p': p, 'unit': self.unit, 'threshold': threshold} ndv = ndvar(im, dims=self.dims, name=name, properties=properties) self.clusters.append(ndv) props = {'unit': self.unit, 'cs': self.cs, 'threshold_lower': self.t_lower, 'threshold_upper': self.t_upper} self.P = ndvar(P, dims=self.dims, name=self.name, properties=props)
def compute_pval_rsa(seed):
    stim, voxels = load_data(n_samples, n_features, model=model, seed=seed,
                             heteroscedastic=heteroscedastic)
    # compute similarity
    stim_ = stim
    if stim.shape[1] == 1:
        stim_ = np.hstack((stim, -stim))
    stim_similarity = square_pdist(stim_)     # alternative: np.corrcoef(stim_)
    voxels_similarity = square_pdist(voxels)  # alternative: np.corrcoef(voxels)
    # indices of the off-diagonal upper triangular part of a matrix
    lw_idx = np.triu_indices(n_samples, k=1)
    stim_vsim = stim_similarity[lw_idx]
    voxels_vsim = voxels_similarity[lw_idx]
    # compute the statistic (Spearman correlation of the two similarities)
    T = spearmanr(voxels_vsim, stim_vsim)[0]
    T_perm = []
    for i in range(n_draws):
        # permute the labels
        perm = np.random.permutation(n_samples)
        voxels_vsim_perm = square_pdist(voxels[perm])[lw_idx]
        # compute the test statistic on the permuted data
        T_perm.append(spearmanr(voxels_vsim_perm, stim_vsim)[0])
    pval = 1 - percentileofscore(np.array(T_perm), T) / 100.
    return pval
def get_per_rnd_pct(self, per_value):
    per_value = myround(per_value)
    if per_value in self.perCache:
        p = self.perCache[per_value]
    else:
        p = stats.percentileofscore(self.per, per_value)
        self.perCache[per_value] = p
    return 100 - p
def get_topct_rnd_pct(self, topct_value):
    topct_value = myround(topct_value)
    if topct_value in self.topctCache:
        p = self.topctCache[topct_value]
    else:
        # Scores against self.per, mirroring get_per_rnd_pct; if a separate
        # top-percentage distribution exists, it may be intended here instead.
        p = stats.percentileofscore(self.per, topct_value)
        self.topctCache[topct_value] = p
    return 100 - p
def percentile(self, variable, N, WHERE=None):
    assert isinstance(N, (int, float))
    if variable in self.percentile_dict:
        return stats.percentileofscore(self.percentile_dict[variable], N)
    RC = Recursive_Cursor(self.sqlite_path, self.table_name)
    if WHERE:
        sql = "SELECT {vr} FROM {tn} WHERE {W}".format(
            vr=variable, tn=self.table_name, W=WHERE)
    else:
        sql = "SELECT {vr} FROM {tn}".format(vr=variable, tn=self.table_name)
    results = RC.get_data(sql)
    self.percentile_dict[variable] = results
    RC.close_db()
    return stats.percentileofscore(results, N)
def print_permutation_results(self): for key in self.snpoverlappermuted.keys(): #print(key,self.snpoverlap[key]) #print(key,self.snpoverlappermuted[key]) percentile=stats.percentileofscore(self.snpoverlappermuted[key], self.snpoverlap[key]) meanoverlap= np.mean(self.snpoverlappermuted[key]) numiters=len(self.snpoverlappermuted[key]) print('{}\t{:d}\t{:.2f}\t{:d}\t{:.4f}'.format(key,self.snpoverlap[key],meanoverlap,numiters,1-(percentile/100)))
def percentileofscore(self, independent_data): """ return the percentile equivalent to the provided values the data provided are used to generate a prediction and residuals the residuals are then compared to the training residuals to generate percentile values e.g. given consumption which differs from the model by precisely the median value of the training residuals, 50 will be returned """ res = self.residuals(independent_data) return np.array([percentileofscore(self.training_residuals, r) for r in res])
def _fap_boot(time, flux, flux_error, freq, psd_best_period, n_bootstraps): """ Computes significance for the bootstrap-resampling """ from scipy import stats max_periods = _bootstrap(time, flux, flux_error, freq, n_bootstraps) fap = 1 - (stats.percentileofscore(max_periods, psd_best_period) / 100) return fap
def normaltest(self): r_percent = np.array([stats.percentileofscore(self.residuals, i) if i != np.max(self.residuals) else 99.9 for i in self.residuals]) # percentile of each obs r_znorm = stats.norm.ppf(r_percent/100) # Theoretical z-score in a normdist clf = linear_model.LinearRegression() clf.fit(r_znorm[:, np.newaxis], self.residuals[:, np.newaxis]) # Make inputs correct shape (X , 1) r_predicted = clf.predict(r_znorm[:, np.newaxis]) # Predicted residuals given normdist, z-score input return (self.residuals, r_znorm, r_predicted) # inputs for a normal probability plot
def findPercentile(self, dataArray, point):
    """
    Determines the percentile that a particular data value belongs to among the data

    :param dataArray: An array with all data points, even those not plotted
    :param point: The value of a particular point
    :return: The percentile (0-100) of point relative to dataArray
    """
    return stats.percentileofscore(dataArray, point)
import operator
import os

from scipy import stats

path = r"/home/gb2642/Tools-Project---Group-13/Lyrics"

song_lengths = []
song_length_dic = {}
for filename in os.listdir(path):
    song_file = path + r"/" + filename
    with open(song_file, encoding="latin1") as fp:
        data = fp.read().split()
    length_value = len(data)
    song_length_dic.update({filename: length_value})

sorted_d = sorted(song_length_dic.items(), key=operator.itemgetter(1))
number_of_songs = len(sorted_d)

song_length_dim = {}
length_values_sorted = sorted(song_length_dic.values())
for filename in os.listdir(path):
    dimension = (stats.percentileofscore(length_values_sorted,
                                         song_length_dic[filename]) / 100)
    song_length_dim.update({filename: dimension})
print(song_length_dim)
percentiles = [] hundreds_count = 0 zeros_count = 0 mc_num = 1000 for in_row,out_row in file_reader('../data/val_features.csv',train=False): test_length = len(in_row) preds = [] for i in range(mc_num): in_put = np.array([in_row]).astype(float) preds.append(model.predict(in_put)[0][0]) percent = stats.percentileofscore(np.array(preds), float(out_row)) percentiles.append(percent) if(percent==100.): hundreds_count +=1 elif(percent==0.): zeros_count += 1 print dropout_frac print hundreds_count print zeros_count print stats.kstest(percentiles, stats.uniform(loc=0.0, scale=100.0).cdf) #Plot the results res = stats.cumfreq(percentiles,numbins=mc_num) plt.ylim([0.0, 1.0]) plt.xlim([0.0, 1.0])
def get_percentiles(self, Player): percentiles = {} height = Player.height.split("-") height = (int((int(height[0]) * 12)) + int(height[1])) weight = int(''.join(x for x in Player.weight if x.isdigit())) if Player.current_stats['efg_per']: efg = float(Player.current_stats['efg_per']) percentiles['efg'] = round( stats.percentileofscore(self.effective_shootings, efg), 2) if Player.current_stats['ft_per']: ft_per = float(Player.current_stats['ft_per']) percentiles['ft_per'] = round( stats.percentileofscore(self.ft_pers, ft_per), 2) if Player.current_stats['fg_per']: fg = float(Player.current_stats['fg_per']) percentiles['fg'] = round( stats.percentileofscore(self.fg_pers, fg), 2) if Player.current_stats['3fg_per']: fg_three = float(Player.current_stats['3fg_per']) percentiles['fg_three'] = round( stats.percentileofscore(self.three_pers, fg_three), 2) if Player.current_stats['pts']: ppg = float(Player.current_stats['pts']) percentiles['ppg'] = round(stats.percentileofscore(self.ppgs, ppg), 2) if Player.current_stats['ast']: apg = float(Player.current_stats['ast']) percentiles['apg'] = round(stats.percentileofscore(self.apgs, apg), 2) if Player.current_stats['blk']: bpg = float(Player.current_stats['blk']) percentiles['bpg'] = round(stats.percentileofscore(self.bpgs, bpg), 2) if Player.current_stats['stl']: spg = float(Player.current_stats['stl']) percentiles['spg'] = round(stats.percentileofscore(self.spgs, spg), 2) if Player.current_stats['trb']: trpg = float(Player.current_stats['trb']) percentiles['trpg'] = round( stats.percentileofscore(self.trpgs, trpg), 2) if Player.current_stats['minutes_played']: minutes_played = float(Player.current_stats['minutes_played']) percentiles['minutes_played'] = round( stats.percentileofscore(self.minutes_playeds, minutes_played), 2) if Player.current_stats['pts']: pts = float(Player.current_stats['pts']) percentiles['pts'] = round(stats.percentileofscore(self.ptss, pts), 2) if Player.current_stats['tov']: tov = float(Player.current_stats['tov']) percentiles['tov'] = round(stats.percentileofscore(self.tovs, tov), 2) if Player.current_stats['age']: age = float(Player.current_stats['age']) percentiles['age'] = round(stats.percentileofscore(self.ages, age), 2) percentiles['Height'] = round( stats.percentileofscore(self.heights, height), 2) percentiles['Weight'] = round( stats.percentileofscore(self.weights, weight), 2) return percentiles
pts_ordinal_desc = wnba['PTS_ordinal_scale'].value_counts().iloc[[4, 3, 0, 2, 1, 5]]

## 5. Proportions and Percentages ##
wnba = pd.read_csv('wnba.csv')
age = wnba['Age'].value_counts(normalize=True).sort_index() * 100
proportion_25 = (age[25] / 100).round(2)
percentage_30 = age[30].round(2)
percentage_over_30 = age.loc[30:].sum().round(2)
percentage_below_23 = age.loc[:23].sum().round(2)

## 6. Percentiles and Percentile Ranks ##
wnba = pd.read_csv('wnba.csv')
from scipy.stats import percentileofscore
percentile_rank_half_less = percentileofscore(a=wnba['Games Played'],
                                              score=17, kind='weak')
percentage_half_more = 100 - percentile_rank_half_less

## 7. Finding Percentiles with pandas ##
wnba = pd.read_csv('wnba.csv')
age = wnba['Age'].describe(percentiles=[.5, .75, .95])
age_upper_quartile = age['75%']
age_middle_quartile = age['50%']
age_95th_percentile = age['95%']
question1 = True
question2 = False
question3 = True
def _percentile(arr): score = arr[-1] vals = arr[:-1] return stats.percentileofscore(vals, score)
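# One way this helper is typically applied (setup assumed, not from the
# original project): a rolling window in which the last element of each
# window is scored against the preceding values.
import numpy as np
import pandas as pd
from scipy import stats

s = pd.Series(np.random.RandomState(0).randn(100))
rolling_pct = s.rolling(20).apply(_percentile, raw=True)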
def calculate(score): melbUsers = pd.read_csv('Melbourne_airbnb_users.csv') reviewsArr = melbUsers['Number of reviews'].to_numpy() percentile = stats.percentileofscore(reviewsArr, score) return percentile
def save_location(self, path, filename, data, p="percentile"): """ Function for creating a heatmap that visualizes the distribution of the sampled location of the generated position and orientation the sensor uses during the performance run :param path: Path for output file :param filename: Name of the output file :param data: used data for generating the plot :param p: Use Percent or Percentile :return: """ from scipy import stats import math import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt from mpl_toolkits.axes_grid1 import make_axes_locatable plt.rc('text', usetex=True) fontsize = 12 plt.rc('font', family='serif') results_fn = os.path.join(path, filename) if not os.path.exists(path): os.makedirs(path) matrix = np.zeros((20, 20)) count = 0 max = 0 # Save 20x20 matrix for better visualization for d in data: index_l = int(np.minimum((1. + d[0]) * 10., 19)) index_e = int(np.minimum((1. + d[1]) * 10., 19)) count += 1 matrix[index_e][index_l] = matrix[index_e][index_l] + 1 if max < matrix[index_e][index_l]: max = matrix[index_e][index_l] matrix = np.asarray(matrix) if p == "percentile": # In order to have comparable heatmaps for each object, # the percentile of the visits relative to the list of all visits # is computed for each entry of the matrix # Everything is then mapped between [0,1] matrix_map = [] max = 100 for m in matrix: matrix_map.append([ stats.percentileofscore(matrix.flatten(), a, 'rank') for a in m ]) matrix_map = np.asarray(matrix_map) np.testing.assert_array_equal(matrix.shape, matrix_map.shape, 'Arrays have different shape!') elif p == "percent": matrix_map = matrix * (100. / float(count)) max = int(max * (100. / float(count))) max = int(math.ceil(max / 10.0)) * 10 if max < 1: max = 1 elif max < 2: max = 2 elif max < 3: max = 3 elif max < 4: max = 4 elif max < 5.: max = 5 elif max < 10.: max = 10 elif p == "absolute": matrix_map = matrix else: import sys print("Wrong measure option: {}".format(p)) sys.exit(0) fig = plt.figure(filename) ax = fig.add_subplot(111) im = ax.imshow(matrix_map, cmap='coolwarm', vmin=0, vmax=max, interpolation='nearest') tx = [0, 5, 9.5, 15, 19] ty = [-1, -0.5, 0, 0.5, 1] plt.xticks(tx, ty) plt.yticks(tx, [ r"$-\frac{\pi}{2}$", r"$-\frac{\pi}{4}$", r"$0$", r"$\frac{\pi}{4}$", r"$\frac{\pi}{2}$" ]) plt.xlabel(r"Position $x$", fontsize=fontsize) plt.ylabel(r"Angle $\varphi$", fontsize=fontsize) plt.tight_layout() # create an axes on the right side of ax. The width of cax will be 5% # of ax and the padding between cax and ax will be fixed at 0.05 inch. divider = make_axes_locatable(ax) cax = divider.append_axes("right", size="5%", pad=0.05) plt.colorbar(im, cax=cax) plt.savefig(results_fn + ".pdf") fig.clear() plt.clf() fig = None
def updtChart(basinName, basinSites):
    basin = basinName
    print('Working on WTEQ Projection Chart for ' + basinName)
    statsData = []
    minData = []
    maxData = []
    lowestData = []
    highestData = []
    lowData = []
    highData = []
    sliderDates = []
    meanData = []
    trace = []
    plotData = []
    basinPlotData = []
    PORplotData = []
    basinNormData = []
    basinPlotNormData = []
    validTrip = []
    networks = [r'SNTL', r'SCAN', r'SNTLT']
    sensor = r"WTEQ"
    dataPath = path.join(this_dir, 'data', 'metaData', sensor, 'metaData.json')
    with open(dataPath, 'r') as j:
        meta = json.load(j)
    meta[:] = [x for x in meta
               if str.split(x['stationTriplet'], ":")[2] in networks
               and str.split(x['stationTriplet'], ":")[0] in basinSites]
    validTrip = [x['stationTriplet'] for x in meta]
    # could use any year with a leap day
    date_series = [date(2015, 10, 1) + datetime.timedelta(days=x)
                   for x in range(0, 366)]
    if validTrip:
        normData = []
        for triplet in validTrip:
            dataPath = path.join(this_dir, 'data', 'norms', sensor,
                                 triplet.replace(':', '_') + '.json')
            with open(dataPath, 'r') as j:
                jTemp = json.load(j)
            normData.append(jTemp)
        basinNormData = [np.array(x['values'], dtype=float)
                         for x in normData if x['values']]
        if basinNormData:
            basinPlotNormData = list(
                np.nanmean(np.array([i for i in basinNormData]), axis=0))
        validTrip[:] = [x for index, x in enumerate(validTrip)
                        if normData[index]['values']]
        beginDateDict = {}
        for siteMeta in meta:
            beginDateDict.update({
                str(siteMeta['stationTriplet']):
                    dt.strptime(str(siteMeta['beginDate']),
                                "%Y-%m-%d %H:%M:%S")})
        basinBeginDate = min(beginDateDict.values())
        sYear = basinBeginDate.year
        if basinBeginDate.year > sYear:
            if basinBeginDate.month < 10:
                sYear = basinBeginDate.year
            else:
                if basinBeginDate.month == 10 and basinBeginDate.day == 1:
                    sYear = basinBeginDate.year
                else:
                    sYear = basinBeginDate.year + 1
        sDate = date(sYear, 10, 1).strftime("%Y-%m-%d")
        eDate = (today.date() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        data = []
        for triplet in validTrip:
            dataPath = path.join(this_dir, 'data', sensor,
                                 triplet.replace(':', '_') + '.json')
            with open(dataPath, 'r') as j:
                jTemp = json.load(j)
            data.append(jTemp)
        for dataSite in data:
            if dataSite:
                padMissingData(dataSite, sDate, eDate)
        plotData = [np.array(x['values'], dtype=float) for x in data]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            basinPlotData = list(
                np.nanmean(np.array([i for i in plotData]), axis=0))
        PORplotData = list([basinPlotData[i:i + 366]
                            for i in range(0, len(basinPlotData), 366)])
        allButCurrWY = list(PORplotData)
        del allButCurrWY[-1]
        statsData = list(map(list, zip(*allButCurrWY)))
        if len(statsData[0]) > 1:
            statsData[151] = statsData[150]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=RuntimeWarning)
                minData = [np.nanmin(a) for a in statsData]
                maxData = [np.nanmax(a) for a in statsData]
                meanData = [np.nanpercentile(a, 50) for a in statsData]
                lowestData = [np.nanpercentile(a, 10) for a in statsData]
                highestData = [np.nanpercentile(a, 90) for a in statsData]
                lowData = [np.nanpercentile(a, 30) for a in statsData]
                highData = [np.nanpercentile(a, 70) for a in statsData]
            future_date_pad = 14
            if len(PORplotData[-1]) > 351:
                future_date_pad = 366 - len(PORplotData[-1]) - 1
                sliderDates = list(chain(
                    [(date_series[0])] +
                    [date_series[get_last_non_zero_index(maxData[0:305])
                                 + future_date_pad]]))
            else:
                sliderDates = list(chain([(date_series[0])] +
                                         [date_series[-1]]))
        jDay = len(PORplotData[-1]) - 1
        lastValue = PORplotData[-1][-1]
        nanList = [np.nan] * jDay
        projData = [createSWEProjTrace(a, jDay, lastValue, nanList)
                    for a in allButCurrWY]
        statsProj = list(map(list, zip(*projData)))
        cleanStatsProj = list(statsProj)
        if cleanStatsProj:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=RuntimeWarning)
                minProj = [np.nanmin(a) for a in cleanStatsProj]
                maxProj = [np.nanmax(a) for a in cleanStatsProj]
                medianProj = [np.nanpercentile(a, 50) for a in cleanStatsProj]
                lowestProj = [np.nanpercentile(a, 10) for a in cleanStatsProj]
                highestProj = [np.nanpercentile(a, 90) for a in cleanStatsProj]
                lowProj = [np.nanpercentile(a, 30) for a in cleanStatsProj]
                highProj = [np.nanpercentile(a, 70) for a in cleanStatsProj]
        if len(PORplotData) > 0:
            for index, i in enumerate(PORplotData):
                if index == len(PORplotData) - 1:
                    trace.extend([go.Scatter(
                        x=date_series, y=i, name=str(sYear + index + 1),
                        visible=True, connectgaps=True,
                        line=dict(color='rgb(0,0,0)'))])
                elif np.nansum(i) > 0:
                    trace.extend([go.Scatter(
                        x=date_series, y=projData[index],
                        name=str(sYear + index + 1),
                        visible='legendonly', connectgaps=True)])
        if medianProj:
            if minProj:
                trace.extend([go.Scatter(
                    x=date_series, y=minProj, name=r'Min Proj', visible=True,
                    connectgaps=True, line=dict(color='rgba(237,0,0,0.4)'))])
            if lowestProj:
                trace.extend([go.Scatter(
                    x=date_series, y=lowestProj, name=r'10% Proj',
                    visible=True, connectgaps=True,
                    line=dict(color='rgba(237,0,1,0.4)'))])
            if lowProj:
                trace.extend([go.Scatter(
                    x=date_series, y=lowProj, name=r'30% Proj', visible=True,
                    connectgaps=True, line=dict(color='rgba(0,237,0,0.4)'))])
            if medianProj:
                trace.extend([go.Scatter(
                    x=date_series, y=medianProj, name=r'50% Proj',
                    connectgaps=True, visible=True,
                    line=dict(color='rgba(0,237,0,0.4)'))])
            if highProj:
                trace.extend([go.Scatter(
                    x=date_series, y=highProj, name=r'70% Proj', visible=True,
                    connectgaps=True,
                    line=dict(color='rgba(115,237,115,0.4)'))])
            if highestProj:
                trace.extend([go.Scatter(
                    x=date_series, y=highestProj, connectgaps=True,
                    name=r'90% Proj', visible=True,
                    line=dict(color='rgba(1,237,237,0.4)'))])
            if maxProj:
                trace.extend([go.Scatter(
                    x=date_series, y=maxProj, name=r'Max Proj', visible=True,
                    connectgaps=True, line=dict(color='rgba(0,0,237,0.4)'))])
        if meanData:
            if lowestData:
                trace.extend([go.Scatter(
                    x=date_series, y=minData, legendgroup='centiles',
                    name=r'Min', visible=True, mode='line',
                    line=dict(width=0), connectgaps=True,
                    fillcolor='rgba(237,0,1,0.15)', fill='none',
                    showlegend=False, hoverinfo='none')])
                trace.extend([go.Scatter(
                    x=date_series, y=lowestData, legendgroup='centiles',
                    name=r'10%', visible=True, mode='line',
                    line=dict(width=0), connectgaps=True,
                    fillcolor='rgba(237,0,1,0.15)', fill='tonexty',
                    showlegend=False, hoverinfo='none')])
            if lowData:
                trace.extend([go.Scatter(
                    x=date_series, y=lowData, legendgroup='centiles',
                    name=r'30%', visible=True, mode='line',
                    line=dict(width=0), connectgaps=True,
                    fillcolor='rgba(237,237,0,0.15)', fill='tonexty',
                    showlegend=False, hoverinfo='none')])
            if highData:
                trace.extend([go.Scatter(
                    x=date_series, y=highData, legendgroup='centiles',
                    name=r'Stats. Shading', visible=True, mode='line',
                    line=dict(width=0), connectgaps=True,
                    fillcolor='rgba(115,237,115,0.15)', fill='tonexty',
                    showlegend=True, hoverinfo='none')])
            if highestData:
                trace.extend([go.Scatter(
                    x=date_series, y=highestData, legendgroup='centiles',
                    connectgaps=True, name=r'90%', visible=True, mode='line',
                    line=dict(width=0), fillcolor='rgba(0,237,237,0.15)',
                    fill='tonexty', showlegend=False, hoverinfo='none')])
                trace.extend([go.Scatter(
                    x=date_series, y=maxData, legendgroup='centiles',
                    name=r'Max', visible=True, mode='line',
                    line=dict(width=0), connectgaps=True,
                    fillcolor='rgba(1,0,237,0.15)', fill='tonexty',
                    showlegend=False, hoverinfo='none')])
        if basinPlotNormData:
            trace.extend([go.Scatter(
                x=date_series, y=basinPlotNormData, name=r"Normal ('81-'10)",
                connectgaps=True, visible=True, hoverinfo='none',
                line=dict(color='rgba(0,237,0,0.4)'))])
        if meanData:
            if basinPlotNormData:
                trace.extend([go.Scatter(
                    x=date_series, y=meanData, name=r'Normal (POR)',
                    visible='legendonly', hoverinfo='none', connectgaps=True,
                    line=dict(color='rgba(0,237,0,0.4)', dash='dash'))])
            else:
                trace.extend([go.Scatter(
                    x=date_series, y=meanData, name=r'Normal (POR)',
                    connectgaps=True, visible=True, hoverinfo='none',
                    line=dict(color='rgba(0,237,0,0.4)'))])
        annoText = str(
            r"Statistical shading breaks at 10th, 30th, 50th, 70th, and 90th "
            r"Percentiles<br>Normal ('81-'10) - Official median calculated "
            r"from 1981 thru 2010 data <br>Normal (POR) - Unofficial mean "
            r"calculated from Period of Record data <br>For more information "
            r"visit: "
            r"<a href='https://www.wcc.nrcs.usda.gov/normals/30year_normals_data.htm'>"
            r"30 year normals calculation description</a>")
        asterisk = ''
        if not basinPlotNormData:
            basinPlotNormData = meanData
            annoText = annoText + ('<br>*POR data used to calculate Normals '
                                   'since no published 30-year normals '
                                   'available for this basin')
            asterisk = '*'
        if basinPlotNormData[jDay] == 0:
            perNorm = r'N/A'
        else:
            perNorm = str('{0:g}'.format(
                100 * round(PORplotData[-1][jDay] / basinPlotNormData[jDay],
                            2)))
        perPeak = str('{0:g}'.format(
            100 * round(PORplotData[-1][jDay] / max(basinPlotNormData), 2)))
        if not math.isnan(PORplotData[-1][jDay]):
            centile = ordinal(int(round(stats.percentileofscore(
                statsData[jDay], PORplotData[-1][jDay]), 0)))
        else:
            centile = 'N/A'
        dayOfPeak = basinPlotNormData.index(max(basinPlotNormData))
        if jDay > dayOfPeak:
            tense = r'Since'
        else:
            tense = r'Until'
        daysToPeak = str(abs(jDay - dayOfPeak))
        annoData = str(r"Current" + asterisk + ":<br>% of Normal - " +
                       perNorm + r"%<br>" + r"% Normal Peak - " + perPeak +
                       r"%<br>" + r"Days " + tense + r" Normal Peak - " +
                       daysToPeak + r"<br>" + r"Percentile Rank - " + centile)
        layout = go.Layout(
            images=[dict(
                source="https://upload.wikimedia.org/wikipedia/commons/thumb/7/7f/US-NaturalResourcesConservationService-Logo.svg/2000px-US-NaturalResourcesConservationService-Logo.svg.png",
                xref="paper", yref="paper", x=0, y=0.9,
                xanchor="left", yanchor="bottom",
                sizex=0.4, sizey=0.1, opacity=0.5, layer="above")],
            annotations=[
                dict(font=dict(size=10), text=annoText, x=0, y=-0.41,
                     yref='paper', xref='paper', align='left',
                     showarrow=False),
                dict(font=dict(size=10), text=annoData, x=0, y=0.9,
                     yref='paper', xref='paper', align='left',
                     xanchor="left", yanchor="top", showarrow=False)],
            legend=dict(traceorder='reversed', tracegroupgap=1,
                        bordercolor='#E2E2E2', borderwidth=2),
            showlegend=True,
            title='Snow Water Equivalent Projections in<br> ' + str(basin),
            height=622, width=700, autosize=False,
            yaxis=dict(title=r'Snow Water Equivalent (in.)',
                       hoverformat=".1f", tickformat="0f"),
            xaxis=dict(range=sliderDates, tickformat="%b %e",
                       rangeselector=dict(buttons=list([
                           dict(count=9, label='Jan', step='month',
                                stepmode='todate'),
                           dict(count=6, label='Apr', step='month',
                                stepmode='todate'),
                           dict(count=3, label='July', step='month',
                                stepmode='todate'),
                           dict(label='WY', step='all')])),
                       rangeslider=dict(thickness=0.1), type='date'))
        return {'data': trace, 'layout': layout}
"pctpm", "o3", "pcto3" ] for FIPS in percDict: curr_percentages = percDict[FIPS] outfile_line = [FIPS] for feature in perc_header: ind = perc_header.index(feature) outfile_line.append(curr_percentages[ind]) if ((curr_state == "Alaska" or curr_state == "Hawaii") and (perc_header[ind] == "pm" or perc_header[ind] == "o3")): # Don't add that to line pass else: outfile_line.append( stats.percentileofscore(feature_list[ind], curr_percentages[ind], kind="weak")) outfile_print.append(outfile_line) # Sort the entries outfile_print_Sorted = sorted(outfile_print[1:len(outfile_print)], key=lambda entry: int(entry[0])) outfile_print_Sorted.insert(0, outfile_header) print "Writing " + curr_state + " Percentiles for Pollution Data to CSV File..." outFileName = "CalEnviroScreen_tract_ALL_POLLUTION_Percentiles_" + curr_state + ".csv" with open(outFileName, "wb") as f: writer = csv.writer(f) writer.writerows(outfile_print_Sorted) f.close() print "Write complete! \n"
def home(request):
    #symbol = 'IBM'
    stocks = pd.read_csv('sp_500_stocks.csv')
    IEX_CLOUD_API_TOKEN = os.getenv('IEX_CLOUD_API_TOKEN')
    # with open('sp_500_stocks.csv', newline = '') as f:
    #     reader = csv.reader(f)
    #     symbols = list(reader)
    # my_columns = ['Ticker', 'Price']

    # Single API Call
    # final_dataframe = pd.DataFrame(columns = my_columns)
    # for symbol in stocks['Ticker']:
    #     api_url = f'https://sandbox.iexapis.com/stable/stock/{symbol}/quote?token={IEX_CLOUD_API_TOKEN}'
    #     data = requests.get(api_url).json()
    #     final_dataframe = final_dataframe.append(pd.Series([symbol, data['latestPrice']], index = my_columns), ignore_index = True)

    # Batch API Call S&P500
    # symbol_groups = list(chunks(stocks['Ticker'], 100))
    # symbol_strings = []
    # for i in range(0, len(symbol_groups)):
    #     symbol_strings.append(','.join(symbol_groups[i]))
    # final_dataframe = pd.DataFrame(columns = my_columns)
    # price_list = []
    # for symbol_string in symbol_strings:
    #     batch_api_url = f'https://sandbox.iexapis.com/stable/stock/market/batch/?types=quote&symbols={symbol_string}&token={IEX_CLOUD_API_TOKEN}'
    #     data = requests.get(batch_api_url).json()
    #     for symbol in symbol_string.split(','):
    #         final_dataframe = final_dataframe.append(
    #             pd.Series([symbol, data[symbol]['quote']['latestPrice']],
    #                       index = my_columns),
    #             ignore_index = True)

    symbol_groups = list(chunks(stocks['Ticker'], 100))
    symbol_strings = []
    for i in range(0, len(symbol_groups)):
        symbol_strings.append(','.join(symbol_groups[i]))
    my_columns = [
        'Ticker', 'Price', 'OneYearPriceReturn', 'OneYearReturnPercentile',
        'SixMonthPriceReturn', 'SixMonthReturnPercentile',
        'ThreeMonthPriceReturn', 'ThreeMonthReturnPercentile',
        'OneMonthPriceReturn', 'OneMonthReturnPercentile', 'HQMScore'
    ]
    final_dataframe = pd.DataFrame(columns=my_columns)
    for symbol_string in symbol_strings:
        batch_api_url = f'https://sandbox.iexapis.com/stable/stock/market/batch/?types=stats,quote&symbols={symbol_string}&token={IEX_CLOUD_API_TOKEN}'
        data = requests.get(batch_api_url).json()
        for symbol in symbol_string.split(','):
            final_dataframe = final_dataframe.append(pd.Series(
                [symbol,
                 data[symbol]['quote']['latestPrice'],
                 data[symbol]['stats']['year1ChangePercent'], 'N/A',
                 data[symbol]['stats']['month6ChangePercent'], 'N/A',
                 data[symbol]['stats']['month3ChangePercent'], 'N/A',
                 data[symbol]['stats']['month1ChangePercent'], 'N/A',
                 'N/A'],
                index=my_columns), ignore_index=True)
    time_periods = ['OneYear', 'SixMonth', 'ThreeMonth', 'OneMonth']
    final_dataframe.sort_values('OneYearPriceReturn', ascending=False,
                                inplace=True)
    hqm_dataframe = final_dataframe.mask(
        final_dataframe.astype(object).eq('None')).dropna()
    for row in hqm_dataframe.index:
        for time_period in time_periods:
            change_column = f'{time_period}PriceReturn'
            percentile_column = f'{time_period}ReturnPercentile'
            hqm_dataframe.loc[row, percentile_column] = \
                stats.percentileofscore(
                    hqm_dataframe[change_column],
                    hqm_dataframe.loc[row, change_column]) / 100
    for row in hqm_dataframe.index:
        momentum_percentiles = []
        for time_period in time_periods:
            momentum_percentiles.append(
                hqm_dataframe.loc[row, f'{time_period}ReturnPercentile'])
        hqm_dataframe.loc[row, 'HQMScore'] = mean(momentum_percentiles)
    hqm_dataframe.sort_values('HQMScore', ascending=False, inplace=True)
    json_records = hqm_dataframe.reset_index().to_json(orient='records')
    data = json.loads(json_records)
    context = {'d': data}
    #final_dataframe.drop(final_dataframe[final_dataframe['One-Year Price Return'].index == 'None'], inplace = True)
    # return render(request, 'quant_momentum/momentum.html', {
    #     'dataframe': final_dataframe,
    # })
    # symbol = 'IBM'
    # api_url = f'https://sandbox.iexapis.com/stable/stock/{symbol}/quote?token={IEX_CLOUD_API_TOKEN}'
    # data = requests.get(api_url).json()
    # #print(data)
    # final_dataframe = final_dataframe.append(
    #     pd.Series([symbol, data['latestPrice']], index = my_columns),
    #     ignore_index = True)
    # htmltable = hqm_dataframe.to_html()
    # return HttpResponse(htmltable)
    return render(request, 'quant_momentum/momentum.html', context)
def tfr_boot_sig_mask(curr_power,baseidx,n_perms=2000,alpha=0.05,method='fdr_bh',averagePower=True,useMedian=True): """ Bootstraps and significance masks time-frequency power (can perform multiple comparisons correction) """ #Bootstrap and significance mask if not(np.isnan(alpha)): if useMedian: curr_power_ave = np.median(curr_power,axis=2) else: curr_power_ave = np.mean(curr_power,axis=2) #take mean across trials #Create bootstrap distribution (based on EEGLAB's bootstat function) num_iters = ceil(n_perms/len(baseidx)) boot_dist = np.zeros([curr_power.shape[0],len(baseidx),num_iters]) for n in range(num_iters): # print(int((n+1)*len(baseidx))) #Shuffle time dimension, holding freq and trials fixed curr_power_tmp = curr_power.copy() curr_power_tmp = curr_power_tmp[:,baseidx,:] for j in range(curr_power_tmp.shape[0]): for k in range(curr_power_tmp.shape[2]): list_tmp = curr_power_tmp[j,:,k].tolist() random.shuffle(list_tmp) curr_power_tmp[j,:,k] = np.asarray(list_tmp) # np.random.shuffle(curr_power_tmp[j,:,k]) #Take median across trials boot_dist[:,:,n] = np.median(curr_power_tmp,2) #Reformat into n_perms x n_freqs boot_dist = boot_dist.reshape((curr_power.shape[0],len(baseidx)*num_iters)).T #Compute uncorrected p-values # alpha=0.05 p_raw = np.zeros(list(curr_power_ave.shape)) for j in range(p_raw.shape[0]): #freq for k in range(p_raw.shape[1]): #time percentile_temp = percentileofscore(boot_dist[:,j],curr_power_ave[j,k])/100 if percentile_temp>0.5: p_raw[j,k] = 2*(1-percentile_temp) else: p_raw[j,k] = 2*(percentile_temp) #Correct p-value with FDR if method!='none': rej, pval_corr = smm.multipletests(p_raw.flatten(), alpha=alpha, method=method)[:2] pval_corr = pval_corr.reshape(curr_power_ave.shape) else: pval_corr = p_raw.copy() #Significance mask the result curr_masked_power_ave = np.copy(curr_power_ave) curr_masked_power_ave[pval_corr>=alpha]=0 #set non-significant timepoints to 0 else: if averagePower: if useMedian: curr_power = np.median(curr_power,axis=2) else: curr_power = np.mean(curr_power,axis=2) #take mean across trials curr_masked_power_ave = np.copy(curr_power) #no significance mask return curr_masked_power_ave
def percentile(self, series): t = series.iloc[-1] p = stats.percentileofscore(series, t, kind='strict') return p
imax = np.argmax(res['lnprobability']) #N = 32768 # number of maximum lnprob values to include N = res['lnprobability'].shape[0] * res['lnprobability'].shape[1] # all values print('N is', N) imaxmult = np.argpartition(res['lnprobability'], -N, axis=None)[-N:] csz = res["chain"].shape i, j = np.unravel_index(imax, res['lnprobability'].shape) theta_max = res['chain'][i, j, :].copy() flatchain = res["chain"].reshape(csz[0] * csz[1], csz[2]) max_percentile = np.zeros(model.ndim) for i in range(model.ndim): max_percentile[i] = stats.percentileofscore(flatchain[:, i], theta_max[i]) print('max percentile', max_percentile[i]) sps = reader.get_sps(res) # generate fake obs to get full resolution spectra fake_obs = obs.copy() fake_obs['spectrum'] = None fake_obs['wavelength'] = None spec, phot, x = model.predict(theta_max, obs=obs, sps=sps) full_spec = model.predict(theta_max, obs=fake_obs, sps=sps)[0] wave_eff = [f.wave_effective for f in res['obs']['filters']]
appl_std250.plot() close_px.rolling('20D').mean() aapl_px = close_px.AAPL['2006':'2007'] ma60 = aapl_px.rolling(30, min_periods=20).mean() ewma60 = aapl_px.ewm(alpha=0.1).mean() aapl_px.plot() ma60.plot(style='k--', label='Simple MA') ewma60.plot(style='k-', label='EW MA') plt.legend() # ### Binary Moving Window Functions #%% spx_px = close_px_all['SPX'] spx_rets = spx_px.pct_change() returns = close_px.pct_change() corr = returns.AAPL.rolling(125, min_periods=100).corr(spx_rets) corr corr = returns.rolling(125, min_periods=100).corr(spx_rets) corr.plot() corr # ### User-Defined Moving Window Functions #%% from scipy.stats import percentileofscore score_at_2percent = lambda x: percentileofscore(x, 0.02) result = returns.AAPL.rolling(250).apply(score_at_2percent) result.plot()
def _map(self, val: float, metric_name: str) -> float: vals = self.percentiles[metric_name] mapped_val = stats.percentileofscore(vals, val, kind="weak") / 100.0 return mapped_val
wnba = pd.read_csv('wnba.csv') propr = wnba["Age"].value_counts(normalize=True).sort_index() * 100 proportion_25 = propr[25] / 100 percentage_30 = propr[30] percentage_over_30 = propr.loc[30:].sum() percentage_below_23 = propr.loc[:23].sum() ## 6. Percentiles and Percentile Ranks ## from scipy.stats import percentileofscore wnba = pd.read_csv('wnba.csv') percentile_rank_half_less = percentileofscore(wnba["Games Played"], 17, kind="weak") percentage_half_more = (100 - percentile_rank_half_less) ## 7. Finding Percentiles with pandas ## wnba = pd.read_csv('wnba.csv') percentiles = wnba['Age'].describe(percentiles=[.5, .75, .95]) age_upper_quartile = percentiles['75%'] age_middle_quartile = percentiles['50%'] age_95th_percentile = percentiles['95%'] question1 = True question2 = False question3 = True
def get_initial_bins(data): #t0 = [data.metric_val[owner].nonzero()[0][0] for owner in data.owner_i] y0 = get_y0(data) prc0 = [stats.percentileofscore(data.metric_val, i) for i in y0] return digitize(prc0, range(0, 101, 10))
def plot_bldg_avg_monthly_group(df, info, fuel='tot', year_range=None, figsize=(6, 5), ylabel=None): """Plot the average monthly EUI of a building by a specified fuel type compared against all other buildings in group""" # Parse building info building, full_addr, building_type, cz = _parse_building_info(df, info) # Get group group = get_group(df, building_type=building_type, cz=cz) # Define field name from fuel type and year range if year_range: start_year = str(year_range[0]) end_year = str(year_range[1]) field_prefix = '_' + str(start_year) + '_' + str(end_year) else: field_prefix = '' field_mean = ('summary', 'EUI_' + fuel + '_avg' + field_prefix) field_avg_mo = 'EUI_' + fuel + '_mo_avg' + field_prefix # Access data building_eui = building[field_mean].iloc[0] group_eui = group[field_mean] group_eui = group_eui[group_eui.notnull()] percentile = stats.percentileofscore(group_eui, building_eui) bldg_trace = building[field_avg_mo].iloc[0] group_traces = group[field_avg_mo] group_mean_trace = group[field_avg_mo].mean() # Define labels and title months = [int(mo) for mo in bldg_trace.index] if ylabel is None: ylabel = 'Average monthly EUI\nfrom 2009-2015\n(kBtu/sq. ft.)' title = full_addr + '\nType = ' + building_type + ', CZ = ' + cz # Plot fig = plt.figure(figsize=figsize) for (i, row) in group_traces.iterrows(): plt.plot(months, row, color='0.9', label='_nolegend_') plt.plot(months, bldg_trace, color='r', linewidth=3, label='Current building ' + terms[fuel]) plt.plot(months, group_mean_trace, color='b', linewidth=3, label='Group average ' + terms[fuel]) plt.xticks(months) ax = plt.gca() ax.text(12.2, bldg_trace.iloc[-1], '{:.1f}%'.format(percentile), va="center", fontsize=16) # Set miscel properties setproperties(xlabel='Month', ylabel=ylabel, title=title, xlim=(1, 12), legend=True, legend_bbox_to_anchor=(1, 1), legendloc=2, tickfontsize=16, labelfontsize=16, legendfontsize=16) return fig, ax
def cad_prob(cads, param=cad_param): import scipy.stats as stats return [("time_{}".format(time), stats.percentileofscore(cads, float(time) / (24.0 * 60.0)) / 100.0) for time in param]
dwarfs ) # these are height-units difference in mean height of dwarfs and smurfs origDiff = np.mean(smurfs) - np.mean(dwarfs) popN = smurfs + dwarfs allDiffs = [] for n in range(nResamp): random.shuffle(popN) # shuffle in place reSampDwarfs = popN[:len(dwarfs)] # resample our dwarf population; m reSampSmurfs = popN[len( dwarfs):] # assume the rest of the sample are smurfs reSampDiff = np.mean(reSampSmurfs) - np.mean(reSampDwarfs) allDiffs.append(reSampDiff) plt.hist( allDiffs ) # plot a histogram of the distribution under the null hypothesis; more accurate than real NH? #plt.show() allDiffs.sort() # sort in place bottomCrit = allDiffs[int( 0.025 * nResamp)] # bottom criterion value that must be exceeded topCrit = allDiffs[int(0.975 * nResamp)] # top criterion value that must be exceeded print bottomCrit, topCrit print stats.percentileofscore( allDiffs, origDiff ) # show percentage likelihood (i.e. p-value) of seeing these distributions under the null hypothesis
def plot_bldg_hist(df, info, value, histrange=None, figsize=(6, 5), xlabel=None): """Plot histogram of value with line indicating the value of current building""" # Parse building info building, full_addr, building_type, cz = _parse_building_info(df, info) # Extract rows from the specified building types and climate zones group = get_group(df, building_type=building_type, cz=cz) # Get values building_eui = building[value].iloc[0] group_eui = group[value] group_eui = group_eui[group_eui.notnull()] group_eui_mean = group_eui.mean() percentile = stats.percentileofscore(group_eui, building_eui) # Define xlabel and title if xlabel is None: if 'fit' in value[1]: xlabel = 'Change in annual EUI from 2009-2015\n(kBtu/ft2/year)' elif 'avg' in value[1]: xlabel = 'Average annual EUI from 2009-2015 \n(kBtu/ft2)' title = full_addr + '\nType = ' + building_type + ', CZ = ' + cz # Plot fig = plt.figure(figsize=figsize) ax = plt.gca() # num_bins = min(20, int(np.ceil(len(group_eui) / 3))) # to fix ax = sns.distplot(group_eui, hist_kws={'range': histrange}, kde_kws={'clip': histrange}) ylim = ax.get_ylim() ax.plot([building_eui, building_eui], ylim, color='r', linewidth=2, label='Current building') ax.plot([group_eui_mean, group_eui_mean], ylim, color='b', linewidth=2, label='Group average') ax.text(building_eui, ylim[1] * 1.05, '{:.1f}%'.format(percentile), ha="center", fontsize=16) # Set miscell properties setproperties(xlabel=xlabel, ylabel='Density', title=title, ylim=(ylim[0], ylim[1] * 1.15), legend=True, legend_bbox_to_anchor=(1, 1), legendloc=2, tickfontsize=18, labelfontsize=18, legendfontsize=16) return fig, ax
str(date) + "_results_random.p" for date in dates]: filename = file.split("\\")[-1] date = filename.split("_")[2] #print date summaries = pickle.load( open( label_accuracy_loc + "\\" + level + "_" + sbj_id + "_" + date + "_results.p", "rb")) summaries_random = pickle.load(open(file, "rb")) for s, summary in enumerate(summaries): ind = summary["track"] if summary['f1_score'] == -1: total_f1_percentile[ind].append(-1) else: percentile = percentileofscore(summaries_random[s]['f1_score'], summary['f1_score']) total_f1_percentile[ind].append(percentile) pickle.dump( total_f1_percentile, open(label_accuracy_loc + "percentile_f1_" + level + ".p", "wb")) percentile_save_file.writerow([sbj_id]) for key, val in total_f1_percentile.items(): percentile_save_file.writerow([key, val]) print "f1" #print_averages(total_f1_percentile) print total_f1_percentile print "------------------------Percentile of accuracy score in RANDOM------------------------------------" total_accuracy_percentile = {
plot_spikes = plot_spikes[:, :cutoff_post_sort]

#channel = 0
#stft_cut = stats.zscore(dat.amplitude_array[:,:],axis=-1)
#stft_cut = stft_cut[:,channel,...,time_lims[0]:time_lims[1]]
#stft_cut = np.reshape(stft_cut,(-1,*stft_cut.shape[2:]))
#stft_ticks = dat.time_vec[time_lims[0]:time_lims[1]]*1000
#stft_tick_inds = np.arange(0,len(stft_ticks),250)

# For every trial, score each mean changepoint (tau) against the posterior
# samples of every changepoint
percentile_array = np.zeros((*mean_tau.shape, mean_tau.shape[-1]))
for trial_num, (this_mean_tau, this_tau_dist) in \
        enumerate(zip(mean_tau, np.moveaxis(tau_samples, 0, -1))):
    for tau1_val, this_tau in enumerate(this_mean_tau):
        for tau2_val, this_dist in enumerate(this_tau_dist):
            percentile_array[trial_num, tau1_val, tau2_val] = \
                percentileofscore(this_dist, this_tau)

# Visually, a threshold of <1 percentile seems compelling.
# Find all trials where all the upper triangular elements are <1
# and all lower triangular elements are >99
lower_thresh = 1
upper_thresh = 100 - lower_thresh
good_trial_list = np.where(
    [all(x[np.triu_indices(states - 1, 1)] < lower_thresh)
     and all(x[np.tril_indices(states - 1, -1)] > upper_thresh)
     for x in percentile_array])[0]

# Plot only good trials
# Overlay raster with CDF of switchpoints
vline_kwargs = {'color': 'red', 'linewidth': 3, 'alpha': 0.7}
hline_kwargs = {
    'color': 'red',
def filtering(self, reservoirID=1987, method='NDWI'):
    landsatdate = []
    surfacearea = []
    landsatfile = open('./data/sarea/L8/' + str(reservoirID) + '.txt', 'r')
    landsatdata = csv.DictReader(landsatfile, delimiter=',')
    for row in landsatdata:
        landsatdate.append(
            datetime.datetime.strptime(row['Date'], "%Y-%m-%d"))
        surfacearea.append(float(row[method]))
    newdates = []
    newareas = []
    series = "Date,NDWI"
    if len(surfacearea) >= 3:
        for index in range(0, len(surfacearea)):
            # Tukey fence: keep points within 1.5*IQR of the quartiles
            iqr = stats.iqr(surfacearea)
            minsa = stats.scoreatpercentile(surfacearea, 25) - 1.5 * iqr
            maxsa = stats.scoreatpercentile(surfacearea, 75) + 1.5 * iqr
            if surfacearea[index] >= minsa and surfacearea[index] <= maxsa:
                if index == 0 or index == len(surfacearea) - 1:
                    newdates.append(landsatdate[index])
                    newareas.append(surfacearea[index])
                else:
                    xdate = landsatdate[index - 1]
                    ydate = landsatdate[index]
                    zdate = landsatdate[index + 1]
                    if float((ydate - xdate).days) <= 33.0 and \
                            float((zdate - ydate).days) <= 33.0:
                        x = stats.percentileofscore(surfacearea, surfacearea[index - 1])
                        y = stats.percentileofscore(surfacearea, surfacearea[index])
                        z = stats.percentileofscore(surfacearea, surfacearea[index + 1])
                        # drop isolated spikes: the neighbours agree with each
                        # other but the middle point jumps
                        if abs(x - z) <= 20.0 and abs(x - y) >= 50.0:
                            continue
                        elif abs(x - z) <= 20.0 and abs(y - z) >= 50.0:
                            continue
                        else:
                            newdates.append(landsatdate[index])
                            newareas.append(surfacearea[index])
                    else:
                        newdates.append(landsatdate[index])
                        newareas.append(surfacearea[index])
    for i in range(len(newdates)):
        series = series + '\n' + newdates[i].strftime("%Y-%m-%d") + "," + \
            "{0:.2f}".format(newareas[i])
    with open('./data/sarea/L8_mod/' + str(reservoirID) + '.txt', 'w') as txt:
        txt.write(series)
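# The core outlier rule in filtering() above is a Tukey fence: keep points
# within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. A minimal sketch of that rule on toy
# data, using the same scipy.stats calls:
from scipy import stats

areas = [10.2, 10.5, 9.8, 55.0, 10.1, 10.4]   # toy surface areas; 55.0 is spurious
iqr = stats.iqr(areas)
minsa = stats.scoreatpercentile(areas, 25) - 1.5 * iqr
maxsa = stats.scoreatpercentile(areas, 75) + 1.5 * iqr
kept = [a for a in areas if minsa <= a <= maxsa]
print(kept)   # the 55.0 reading is dropped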
def updtChart(site_triplet, siteName):
    print('Working on PREC POR Chart for ' + siteName)
    statsData = []
    minData = []
    maxData = []
    meanData = []
    lowestData = []
    highestData = []
    lowData = []
    highData = []
    sliderDates = []
    trace = []
    sitePlotData = []
    PORplotData = []
    sitePlotNormData = []
    validTrip = [site_triplet]
    sensor = r"PREC"
    # could use any year with a leap day
    date_series = [date(2015, 10, 1) + datetime.timedelta(days=x)
                   for x in range(0, 366)]
    if validTrip:
        normData = []
        for triplet in validTrip:
            url = '/'.join([dataUrl, 'normals', 'DAILY', sensor,
                            triplet.replace(':', '_') + '.json'])
            with request.urlopen(url) as d:
                jTemp = json.loads(d.read().decode())
            normData.append(jTemp)
        sitePlotNormData = np.array(normData[0]['values'], dtype=float)
        sitePlotNormData = sitePlotNormData.tolist()
        beginDateDict = {}
        for siteMeta in meta:
            beginDateDict.update(
                {str(siteMeta['stationTriplet']):
                 dt.strptime(str(siteMeta['beginDate']), "%Y-%m-%d %H:%M:%S")})
        siteBeginDate = min(beginDateDict.values())
        sYear = siteBeginDate.year
        if siteBeginDate.year > sYear:
            if siteBeginDate.month < 10:
                sYear = siteBeginDate.year
            else:
                if siteBeginDate.month == 10 and siteBeginDate.day == 1:
                    sYear = siteBeginDate.year
                else:
                    sYear = siteBeginDate.year + 1
        sDate = date(sYear, 10, 1).strftime("%Y-%m-%d")
        eDate = today.date().strftime("%Y-%m-%d")
        data = []
        for triplet in validTrip:
            url = '/'.join([dataUrl, 'DAILY', sensor,
                            triplet.replace(':', '_') + '.json'])
            with request.urlopen(url) as d:
                jTemp = json.loads(d.read().decode())
            data.append(trimToOct1(jTemp))
        for dataSite in data:
            if dataSite:
                padMissingData(dataSite, sDate, eDate)
        sitePlotData = np.array(data[0]['values'], dtype=float)
        PORplotData = list([sitePlotData[i:i + 366]
                            for i in range(0, len(sitePlotData), 366)])
        allButCurrWY = list(PORplotData)
        del allButCurrWY[-1]
        statsData = list(map(list, zip(*allButCurrWY)))
        if len(statsData[0]) > 1:
            # copy Feb 28 stats into the leap-day slot (day 152 of the WY)
            statsData[151] = statsData[150]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=RuntimeWarning)
                minData = [np.nanmin(a) for a in statsData]
                maxData = [np.nanmax(a) for a in statsData]
                meanData = [np.nanmean(a) for a in statsData]
                lowestData = [np.nanpercentile(a, 10) for a in statsData]
                highestData = [np.nanpercentile(a, 90) for a in statsData]
                lowData = [np.nanpercentile(a, 30) for a in statsData]
                highData = [np.nanpercentile(a, 70) for a in statsData]
            sliderDates = list(chain([(date_series[0])] + [date_series[-1]]))
        else:
            sliderDates = list(chain([(date_series[0])] + [date_series[-1]]))
    if len(PORplotData) > 0:
        for index, i in enumerate(PORplotData):
            if index == len(PORplotData) - 1:
                trace.extend([go.Scatter(
                    x=date_series, y=i, name=str(sYear + index + 1),
                    visible=True, connectgaps=True,
                    line=dict(color='rgb(0,0,0)'))])
            elif np.nansum(i) > 0:
                trace.extend([go.Scatter(
                    x=date_series, y=i, name=str(sYear + index + 1),
                    visible='legendonly', connectgaps=True)])
    if meanData:
        if lowestData:
            trace.extend([go.Scatter(
                x=date_series, y=minData, legendgroup='centiles', name=r'Min',
                visible=True, mode='line', line=dict(width=0),
                connectgaps=True, fillcolor='rgba(237,0,1,0.15)',
                fill='none', showlegend=False, hoverinfo='none')])
            trace.extend([go.Scatter(
                x=date_series, y=lowestData, legendgroup='centiles', name=r'10%',
                visible=True, mode='line', line=dict(width=0),
                connectgaps=True, fillcolor='rgba(237,0,1,0.15)',
                fill='tonexty', showlegend=False, hoverinfo='none')])
        if lowData:
            trace.extend([go.Scatter(
                x=date_series, y=lowData, legendgroup='centiles', name=r'30%',
                visible=True, mode='line', line=dict(width=0),
                connectgaps=True, fillcolor='rgba(237,237,0,0.15)',
                fill='tonexty', showlegend=False, hoverinfo='none')])
        if highData:
            trace.extend([go.Scatter(
                x=date_series, y=highData, legendgroup='centiles',
                name=r'Stats. Shading', visible=True, mode='line',
                line=dict(width=0), connectgaps=True,
                fillcolor='rgba(115,237,115,0.15)', fill='tonexty',
                showlegend=True, hoverinfo='none')])
        if highestData:
            trace.extend([go.Scatter(
                x=date_series, y=highestData, legendgroup='centiles',
                connectgaps=True, name=r'90%', visible=True, mode='line',
                line=dict(width=0), fillcolor='rgba(0,237,237,0.15)',
                fill='tonexty', showlegend=False, hoverinfo='none')])
            trace.extend([go.Scatter(
                x=date_series, y=maxData, legendgroup='centiles', name=r'Max',
                visible=True, mode='line', line=dict(width=0),
                connectgaps=True, fillcolor='rgba(1,0,237,0.15)',
                fill='tonexty', showlegend=False, hoverinfo='none')])
        if minData:
            trace.extend([go.Scatter(
                x=date_series, y=minData, name=r'Min', visible=True,
                hoverinfo='none', connectgaps=True,
                line=dict(color='rgba(237,0,0,0.5)'))])
    if len(sitePlotNormData) > 0:
        trace.extend([go.Scatter(
            x=date_series, y=sitePlotNormData, name=r"Normal ('81-'10)",
            connectgaps=True, visible=True, hoverinfo='none',
            line=dict(color='rgba(0,237,0,0.4)'))])
    if meanData:
        if len(sitePlotNormData) > 0:
            trace.extend([go.Scatter(
                x=date_series, y=meanData, name=r'Normal (POR)',
                visible='legendonly', hoverinfo='none', connectgaps=True,
                line=dict(color='rgba(0,237,0,0.4)', dash='dash'))])
        else:
            trace.extend([go.Scatter(
                x=date_series, y=meanData, name=r'Normal (POR)',
                connectgaps=True, visible=True, hoverinfo='none',
                line=dict(color='rgba(0,237,0,0.4)'))])
    if maxData:
        trace.extend([go.Scatter(
            x=date_series, y=maxData, name=r'Max', visible=True,
            hoverinfo='none', connectgaps=True,
            line=dict(color='rgba(0,0,237,0.4)'))])
    annoText = str(
        r"Statistical shading breaks at 10th, 30th, 50th, 70th, and 90th "
        r"Percentiles<br>Normal ('81-'10) - Official mean calculated from "
        r"1981 thru 2010 data<br>Normal (POR) - Unofficial mean calculated "
        r"from Period of Record data<br>For more information visit: "
        r"<a href='https://www.wcc.nrcs.usda.gov/normals/30year_normals_data.htm'>"
        r"30 year normals calculation description</a>")
    asterisk = ''
    if len(sitePlotNormData) == 0:
        sitePlotNormData = meanData
        annoText = (annoText + '<br>*POR data used to calculate Normals '
                    'since no published 30-year normals available for this site')
        asterisk = '*'
    jDay = len(PORplotData[-1]) - 1
    if sitePlotNormData[jDay] == 0:
        perNorm = r'N/A'
    else:
        perNorm = str('{0:g}'.format(
            100 * round(PORplotData[-1][jDay] / sitePlotNormData[jDay], 2)))
    perPeak = str('{0:g}'.format(
        100 * round(PORplotData[-1][jDay] / max(sitePlotNormData), 2)))
    if not math.isnan(PORplotData[-1][jDay]):
        centile = ordinal(int(round(
            stats.percentileofscore(statsData[jDay], PORplotData[-1][jDay]), 0)))
    else:
        centile = 'N/A'
    dayOfPeak = sitePlotNormData.index(max(sitePlotNormData))
    if jDay > dayOfPeak:
        tense = r'Since'
    else:
        tense = r'Until'
    daysToPeak = str(abs(jDay - dayOfPeak))
    annoData = str(r"Current" + asterisk + ":<br>% of Normal - " + perNorm +
                   r"%<br>" + r"% of Yearly Avg - " + perPeak + r"%<br>" +
                   r"Days " + tense + r" End of WY - " + daysToPeak + r"<br>" +
                   r"Percentile Rank - " + centile)
    layout = go.Layout(
        images=[dict(
            source=("https://upload.wikimedia.org/wikipedia/commons/thumb/7/7f/"
                    "US-NaturalResourcesConservationService-Logo.svg/"
                    "2000px-US-NaturalResourcesConservationService-Logo.svg.png"),
            xref="paper", yref="paper",
            x=0, y=0.9, xanchor="left", yanchor="bottom",
            sizex=0.4, sizey=0.1, opacity=0.5, layer="above")],
        annotations=[
            dict(font=dict(size=10), text=annoText, x=0, y=-0.41,
                 yref='paper', xref='paper', align='left', showarrow=False),
            dict(font=dict(size=10), text=annoData, x=0, y=0.9,
                 yref='paper', xref='paper', align='left',
                 xanchor="left", yanchor="top", showarrow=False)],
        legend=dict(traceorder='reversed', tracegroupgap=1,
                    bordercolor='#E2E2E2', borderwidth=2),
        showlegend=True,
        title='Precipitation at ' + siteName,
        height=622, width=700, autosize=False,
        yaxis=dict(title=r'Precipitation (in.)', hoverformat='.1f',
                   tickformat="0f"),
        xaxis=dict(
            range=sliderDates,
            tickformat="%b %e",
            rangeselector=dict(buttons=list([
                dict(count=9, label='Jan', step='month', stepmode='todate'),
                dict(count=6, label='Apr', step='month', stepmode='todate'),
                dict(count=3, label='July', step='month', stepmode='todate'),
                dict(label='WY', step='all')])),
            rangeslider=dict(thickness=0.1),
            type='date'))
    return {'data': trace, 'layout': layout}
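# updtChart calls an ordinal() helper that is not shown in this snippet. A
# plausible, hypothetical implementation (not the original) that turns the
# integer percentile rank into "1st", "2nd", "73rd", etc.:
def ordinal(n):
    # 11th-13th are special-cased; otherwise the suffix follows the last digit
    if 10 <= n % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return str(n) + suffix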
def score(x):
    return percentileofscore(dat[col], x[col])
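# score() closes over module-level `dat` and `col`, which suggests row-wise
# application. A hedged usage sketch with toy pandas data (names assumed):
import pandas as pd
from scipy.stats import percentileofscore

dat = pd.DataFrame({'karma': [10, 20, 30, 40, 50]})
col = 'karma'
dat['karma_pct'] = dat.apply(score, axis=1)   # 20.0, 40.0, ..., 100.0
print(dat)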
def baseline_calc_pyemis_old(df_new, tdf, energy_type, iters=16):
    freq = calculate_frequency(df_new)
    if not freq:
        return {'error': "the frequency of the timeseries can't be analyzed"}
    if freq < timedelta(hours=1):
        model = 'Weekly30Min'
    else:
        model = 'Weekly60Min'
    if model == 'Weekly30Min':
        n = -336     # one week of 30-minute samples
        frequ = 30
    else:
        n = -168     # one week of hourly samples
        frequ = 60
    # if None, we need an empty dataframe
    if not isinstance(df_new, pd.DataFrame):
        df_new = pd.DataFrame()
    # join dataframes (already aligned and curated)
    final = df_new.join(tdf)
    final = final.dropna()
    if final.empty:
        return {
            "error": "the df is empty",
            #"df_new": df_new.reset_index().to_dict(orient="records"),
            #"temp": tdf.reset_index().to_dict(orient="records")
        }
    # final lists
    ts_list = []
    value_list = []
    temps_list = []
    for k, value in final.iterrows():
        ts_list.append(k)
        value_list.append(float(value.value))
        temps_list.append(float(value.temperature))
    # calculate model using old pyemis code
    # create numpy structured array
    res = []
    for idx, _ in enumerate(ts_list):
        res.append((temps_list[idx], value_list[idx],
                    mktime(ts_list[idx].timetuple())))
    arr = np.array(res, dtype=[('temperature', 'float'),
                               ('consumption', 'float'),
                               ('timestamp', 'float')])
    if model != 'Weekly30Min':
        factory = ConsumptionModels.WeeklyModelFactory(
            ConsumptionModels.AnyModelFactory(
                models=[ConsumptionModels.ConstantModel]),
            timedelta(minutes=frequ))
    else:
        if not energy_type or energy_type != 'waterConsumption':
            factory = ConsumptionModels.WeeklyModelFactory(
                ConsumptionModels.AnyModelFactory(), timedelta(minutes=frequ))
        else:
            factory = ConsumptionModels.WeeklyModelFactory(
                ConsumptionModels.AnyModelFactory(
                    models=[ConsumptionModels.ConstantModel]),
                timedelta(minutes=frequ))
    levels = {}
    smileys = []
    prediction = []
    ts_list_final = []
    temps_list_final = []
    value_list_final = []
    for i in range(iters, 0, -1):
        # fit the model on all the data except the last i weeks
        model_fine = len(arr[:n * i]) > abs(n)
        try:
            Model = factory(arr[:n * i])
        except Exception:
            model_fine = False
        # continue only if the model was fitted successfully
        if model_fine:
            # record the portion of the data we can evaluate
            if i > 1:
                ts_list_final.extend(ts_list[n * i:n * (i - 1)])
                temps_list_final.extend(temps_list[n * i:n * (i - 1)])
                value_list_final.extend(value_list[n * i:n * (i - 1)])
            else:
                ts_list_final.extend(ts_list[n * i:])
                temps_list_final.extend(temps_list[n * i:])
                value_list_final.extend(value_list[n * i:])
            # parameters = Model.parameters()
            # compute the percentiles for this model and its training data
            percentiles = Model.percentiles(arr[:n * i], [5, 25, 75, 95])
            # compute the prediction for the next week, which is not in the model
            predict = Model.prediction(arr[n * i:n * (i - 1)]) if i > 1 \
                else Model.prediction(arr[n * i:])
            prediction.extend(predict)
            for key in ['5', '25', '75', '95']:
                level_val = (predict[n:] + percentiles[key][n:]).tolist()
                try:
                    levels[key].extend(level_val)
                except KeyError:
                    levels[key] = level_val
            # smiley faces: score last week's residuals against the model's
            # training residuals
            res_model = Model.residuals(arr[:n * i])
            res_last_week = Model.residuals(arr[n * i:n * (i - 1)]) if i > 1 \
                else Model.residuals(arr[n * i:])
            smiley = np.array(
                [percentileofscore(res_model, r) for r in res_last_week])
            smileys.extend(smiley.tolist())
    # hack for the hourly models to avoid negative percentiles. Review the models!
    if model != 'Weekly30Min':
        for key in ['5', '25', '75', '95']:
            if key in levels:
                for i, val in enumerate(levels[key]):
                    if val < -10:
                        if i == 0:
                            levels[key][i] = levels[key][i + 1]
                        elif i == len(levels[key]) - 1:
                            levels[key][i] = levels[key][i - 1]
                        else:
                            levels[key][i] = (levels[key][i - 1] +
                                              levels[key][i + 1]) / 2
    # save model @ mongo!!
    return {
        'timestamps': ts_list_final,
        'temperatures': temps_list_final,
        'values': value_list_final,
        'P5': levels['5'] if '5' in levels else None,
        'P25': levels['25'] if '25' in levels else None,
        'P75': levels['75'] if '75' in levels else None,
        'P95': levels['95'] if '95' in levels else None,
        'prediction': prediction,
        'smileys': smileys,
    }
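# The "smileys" in baseline_calc_pyemis_old score each new residual against
# the distribution of the model's training residuals. A standalone sketch of
# that idea on toy arrays (the pyemis ConsumptionModels factory itself is not
# reproduced here):
import numpy as np
from scipy.stats import percentileofscore

res_model = np.random.normal(0, 1.0, size=500)    # toy training residuals
res_last_week = np.array([0.1, 2.5, -1.8, 0.4])   # toy new residuals
smileys = [percentileofscore(res_model, r) for r in res_last_week]
# values near 50 mean "as expected"; near 0 or 100 flag unusual consumption
print(smileys)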
import pickle

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

BINS = 10
WORD_COUNT = 1000

(X, ids, reliability) = pickle.load(open(
    r'D:\JanneK\Documents\git_repos\LaureaTextAnalyzer\results\analyze_embeddings\word2vec_document_vectors.pickle',
    'rb'))
# sort documents by reliability and convert each score to its percentile rank
ind = np.argsort(reliability)
reliability = reliability[ind]
ids = np.array(ids)[ind]
X = X[ind, :]
prc_score = np.array(
    [stats.percentileofscore(reliability, a, 'rank') for a in reliability])
# pool the document vectors into BINS reliability bins
X_pooled = np.zeros((BINS, 300))
reliability_pooled = np.zeros(BINS)
doc_count = np.zeros(BINS)
k = 0
s = 100 / BINS
for i in range(0, BINS):
    target = s * (i + 1)
    k1 = k
    while k < len(reliability) - 1:
        k += 1
        if prc_score[k] >= target:
            break
    if i == 0:
def countyInfo(X):
    """ Summaries of initial county income data. """
    Print = False
    if Print:
        print("""
# =====================================================================
# Initial County Data
# =====================================================================
        """)
    incomeP = X.TP.cols.R_wages_NP_dollars[:] + X.TP.cols.R_wages_SB_dollars[:] + \
        X.TP.cols.R_gov_support_dollars[:]
    if Print:
        print("\nfamily income, median = ${0:,.9g}".format(
            np.round(np.median(X.TF.cols.R_dollars[:]), 0).item()))
        print("family income, mean = ${0:,.9g}".format(
            np.round(X.TF.cols.R_dollars[:].mean(), 0).item()))
    We1 = X.TP.get_where_list("(work_status >=4)")
    ave_working_income = incomeP[We1].mean()
    if Print:
        print("\nmean working income = ${0:,.9g}".format(
            np.round(ave_working_income, 0).item()))
    ave_income = incomeP.mean()
    if Print:
        print(("\nmean person income = ${0:,.9g}, percentile of person income at " +
               "mean = {1:,.9g}").format(
            np.round(ave_income, 0).item(),
            stats.percentileofscore(incomeP, ave_income)))
    W0 = np.where(incomeP < ave_income)[0]
    W1 = np.where(incomeP >= ave_income)[0]
    if Print:
        print("total person income < mean person income: ${0:,.9g}, size= {1:,d}".format(
            np.round(incomeP[W0].sum() * X.populationRatio, 0).item(),
            int(W0.size * X.populationRatio)))
        print("total person income >= mean person income: ${0:,.9g}, size= {1:,d}".format(
            np.round(incomeP[W1].sum() * X.populationRatio, 0).item(),
            int(W1.size * X.populationRatio)))
        print("total county income = ${0:,.9g}\n".format(
            np.round(incomeP.sum() * X.populationRatio, 0).item()))
        for i in np.linspace(0, 100, 21):
            print(" percentile = {0:>5.4g}, person income = ${1:>12,.9g}".format(
                i, np.round(stats.scoreatpercentile(incomeP, i), 0).item()))
        print("\n")
    famRecTot = X.TF.cols.R_dollars[:]
    totalNIWF = float(X.TP.get_where_list(
        "((work_status==0) | (work_status==1))").size)
    totalUnemp = float(X.TP.get_where_list(
        "((work_status==2) | (work_status==3))").size)
    if 1 == 2:  # for testing only, otherwise takes too long
        for i in np.linspace(0, 100, 21):
            famIncCut = np.round(stats.scoreatpercentile(famRecTot, i), 0).item()
            Wc = np.where(famRecTot <= famIncCut)[0]
            countNIWF = 0
            countUnemp = 0
            for wc in Wc:
                fid = X.TF.cols.fid[wc]
                wfid = X.TP.get_where_list("fid=={0:d}".format(fid))
                assert len(wfid) == 2
                for pid in wfid:
                    if X.TP.cols.work_status[pid] in [0, 1]:
                        countNIWF += 1
                    if X.TP.cols.work_status[pid] in [2, 3]:
                        countUnemp += 1
            fNIWF = countNIWF / totalNIWF
            fUnemp = countUnemp / totalUnemp
            print((" percentile = {0:>5.4g}, family income = ${1:>12,.9g}, " +
                   "fract of NIWF = {2:.4f}, fract of Unemp = {3:.4f}").format(
                i, famIncCut, fNIWF, fUnemp))
        print("\n")
    # calculate thresholds for family income target. Could increase family
    # income target by 3% to ensure that all families choose Wage Option 1
    # (and not token bonus, Wage Option 2)
    threshold_family = X.family_income_target_final
    threshold_person = threshold_family / 2.
    persons_below_threshold = checkFamilyIncome(X, 0, threshold_family, Membership=0)
    percentile_threshold_person = stats.percentileofscore(
        X.TP.cols.R_dollars[:], threshold_person)
    if Print:
        print(("\nthreshold for membership, person = ${0:,.9g}, percentile of county " +
               "person income = {1:,.4g}").format(
            np.round(threshold_person, 0).item(), percentile_threshold_person))
    X.percentile_threshold_family = stats.percentileofscore(
        X.TF.cols.R_dollars[:], threshold_family)
    if Print:
        print(("threshold for membership, family = ${0:,.9g}, percentile of county " +
               "family income = {1:,.4g}").format(
            np.round(threshold_family, 0).item(), X.percentile_threshold_family))
    ws0 = X.TP.get_where_list("work_status==0")
    NIWF_below = np.intersect1d(persons_below_threshold, ws0, assume_unique=True)
    if Print:
        print("\nfraction of total NIWF below family threshold = {0:,.4g}".format(
            NIWF_below.size / float(ws0.size)))
        print("fraction below family threshold that are NIWF = {0:,.4g}".format(
            NIWF_below.size / float(persons_below_threshold.size)))
        print(("fraction below family threshold that are NIWF or unemployed " +
               "(1% member unemployment) = {0:,.4g}").format(
            (NIWF_below.size / float(persons_below_threshold.size)) +
            (X.population * X.Config.labor_participation_rate * .01) /
            float(persons_below_threshold.size)))
    ave_income_threshold = X.TP.cols.R_dollars[:][persons_below_threshold].mean()
    if Print:
        print("\ntotal family income <= threshold_family = ${0:,.9g}".format(
            np.round(X.TP.cols.R_dollars[:][persons_below_threshold].sum() *
                     X.populationRatio, 0).item()))
        print("mean of total family income <= threshold_family = ${0:,.9g}".format(
            np.round(ave_income_threshold, 0).item()))
    #income_below_threshold = X.TP.cols.R_dollars[:][persons_below_threshold].sum()
    W4 = X.TP.get_where_list("(work_status >=4)")
    persons_below_threshold_working = np.intersect1d(W4, persons_below_threshold)
    ave_income_working_county_threshold = \
        X.TP.cols.R_dollars[:][persons_below_threshold_working].mean()
    if Print:
        print("\naverage income, working, county, below threshold = ${0:,.9g}".format(
            np.round(ave_income_working_county_threshold, 0).item()))
    Wnp = X.TP.get_where_list("(work_status ==4) | (work_status ==6)")
    if Print:
        print(("average income nonprofit = ${0:,.9g}, fraction of county income = " +
               "{1:,.4g}").format(
            np.round(X.TP.cols.R_dollars[:][Wnp].mean(), 0).item(),
            X.TP.cols.R_dollars[:][Wnp].sum() / X.TP.cols.R_dollars[:].sum()))
    W3 = X.TP.get_where_list("(work_status <=3)")
    if Print:
        print(("average income NIWF and unemployed = ${0:,.9g}, fraction of county " +
               "income = {1:,.4g}\n").format(
            np.round(X.TP.cols.R_dollars[:][W3].mean(), 0).item(),
            X.TP.cols.R_dollars[:][W3].sum() / X.TP.cols.R_dollars[:].sum()))
from scipy import stats
import matplotlib.pyplot as plt

# (1) Generate normally distributed random numbers with the scipy.stats package.
generated = stats.norm.rvs(size=900)

# (2) Fit a normal distribution to the generated data, obtaining its mean and
# standard deviation:
print("Mean", "Std", stats.norm.fit(generated))

# (3) Skewness measures how skewed (asymmetric) a probability distribution is.
print("Skewtest", "pvalue", stats.skewtest(generated))

# (4) Kurtosis measures how peaked a probability distribution curve is.
print("Kurtosistest", "pvalue", stats.kurtosistest(generated))

# (5) A normality test checks to what degree a dataset follows a normal
# distribution. Let's run one:
print("Normaltest", "pvalue", stats.normaltest(generated))

# (6) With SciPy we can easily find the value at a given percentile of the data:
print("95 percentile", stats.scoreatpercentile(generated, 95))

# (7) Reversing the previous step, we can also start from the value 1 and find
# the corresponding percentile:
print("Percentile at 1", stats.percentileofscore(generated, 1))

# (8) Plot a histogram of the generated data with Matplotlib.
plt.hist(generated)
plt.show()
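# Steps (6) and (7) are inverses of each other, up to the discreteness of the
# sample: scoreatpercentile maps a percentile to a value, and percentileofscore
# maps a value back to a percentile. A quick round-trip check on the same data:
v = stats.scoreatpercentile(generated, 95)
print("round trip:", stats.percentileofscore(generated, v))   # ~95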