def get5num(self):
    """Return the five-number summary (min, 25th percentile, median, 75th percentile, max) of each row of self.data."""
    mins = [min(row) for row in self.data]
    p25s = [pylab.percentile(row, 25) for row in self.data]
    medians = [pylab.percentile(row, 50) for row in self.data]
    p75s = [pylab.percentile(row, 75) for row in self.data]
    maxs = [max(row) for row in self.data]
    return list(zip(mins, p25s, medians, p75s, maxs))
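# Usage sketch for get5num (the RowStats holder class below is hypothetical;
# pylab is assumed to be imported at module level, as the snippet implies).
import pylab

class RowStats(object):
    def __init__(self, data):
        self.data = data
    get5num = get5num  # reuse the function above as a method

stats = RowStats([[1, 2, 3, 4, 5], [10, 20, 30, 40, 50]])
print(stats.get5num())  # [(1, 2.0, 3.0, 4.0, 5), (10, 20.0, 30.0, 40.0, 50)]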
def randomIntensity(im):
    '''Rescale the intensity of the image to a random interval of the image's intensity distribution.'''
    return rescale_intensity(
        im,
        in_range=tuple(pl.percentile(im, (randRange(0, 10), randRange(90, 100)))),
        out_range=tuple(pl.percentile(im, (randRange(0, 10), randRange(90, 100)))))
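# Usage sketch for randomIntensity (assumptions: rescale_intensity comes from
# skimage.exposure, pl is pylab, and randRange(a, b) is a helper returning a
# random float in [a, b], defined here hypothetically via numpy).
import numpy as np
import pylab as pl
from skimage.exposure import rescale_intensity

def randRange(a, b):
    # hypothetical helper: random float drawn uniformly from [a, b]
    return np.random.uniform(a, b)

im = np.random.rand(64, 64)       # dummy grayscale image
augmented = randomIntensity(im)   # intensities stretched to a random interval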
def plot_fits(func, axis, xvar, popt, pcov, color='grey'):
    '''Plot func at the best-fit parameters popt, with an uncertainty band obtained by
    sampling parameter sets from the covariance matrix pcov.'''
    axis.plot(xvar, func(xvar, *popt), color=color, lw=2, alpha=0.8)
    xsample = np.random.multivariate_normal(popt, pcov, 50000)
    ysample = np.asarray([func(xvar, *pi) for pi in xsample])
    lower = percentile(ysample, 0.3, axis=0)
    upper = percentile(ysample, 99.7, axis=0)
    axis.plot(xvar, lower, color=color, alpha=0.4, lw=1.0)
    axis.plot(xvar, upper, color=color, alpha=0.4, lw=1.0)
    axis.fill_between(xvar, lower, upper, color=color, alpha=0.2)
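# Usage sketch for plot_fits (assumption: popt/pcov come from
# scipy.optimize.curve_fit, and `percentile` is numpy's, imported into the
# module namespace as the snippet implies).
import numpy as np
import matplotlib.pyplot as plt
from numpy import percentile
from scipy.optimize import curve_fit

def line(x, a, b):
    return a * x + b

x = np.linspace(0, 10, 50)
y = 2.0 * x + 1.0 + np.random.normal(scale=0.5, size=x.size)
popt, pcov = curve_fit(line, x, y)

fig, ax = plt.subplots()
ax.scatter(x, y, s=10)
plot_fits(line, ax, x, popt, pcov, color='steelblue')
plt.show()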
def save_sonogram(
    self,
    replace=False,
    n_fft=settings.N_FFT,
    min_freq=settings.MIN_FREQ,
    max_freq=settings.MAX_FREQ,
    dpi=100,
    width=1000,
    height=350,
    max_framerate=settings.MAX_FRAMERATE,
):
    filename = self.get_sonogram_name()
    name = os.path.join(settings.SONOGRAM_DIR, filename)
    path = os.path.join(settings.MEDIA_ROOT, name)
    try:
        if not os.path.exists(path):
            replace = True
    except (ValueError, SuspiciousOperation, AttributeError):
        replace = True
    if replace:
        audio, framerate = self.get_audio(max_framerate=max_framerate)
        Pxx, freqs, bins, im = specgram(audio, NFFT=n_fft, Fs=framerate)
        # keep only frequency rows within (min_freq, max_freq]
        f = where(logical_and(freqs > min_freq, freqs <= max_freq))[0]
        # clip power values to the 0.01st/99.99th percentiles to tame outliers
        Pxx[where(Pxx > percentile(Pxx[f].flatten(), 99.99))] = percentile(Pxx[f].flatten(), 99.99)
        Pxx[where(Pxx < percentile(Pxx[f].flatten(), 0.01))] = percentile(Pxx[f].flatten(), 0.01)
        clf()
        fig = figure(figsize=(float(width) / dpi, float(height) / dpi), dpi=dpi)
        imshow(
            flipud(10 * log10(Pxx[f, :])),
            extent=(bins[0], bins[-1], freqs[f][0], freqs[f][-1]),
            aspect="auto",
            cmap=cm.gray,
        )
        gca().set_ylabel("Frequency (Hz)")
        gca().set_xlabel("Time (s)")
        axis_pixels = gca().transData.transform(np.array((gca().get_xlim(), gca().get_ylim())).T)
        st, created = SonogramTransform.objects.get_or_create(
            n_fft=n_fft,
            framerate=framerate,
            min_freq=min_freq,
            max_freq=max_freq,
            duration=self.duration,
            width=width,
            height=height,
            dpi=dpi,
            top_px=max(axis_pixels[:, 1]),
            bottom_px=min(axis_pixels[:, 1]),
            left_px=min(axis_pixels[:, 0]),
            right_px=max(axis_pixels[:, 0]),
        )
        savefig(open(path, "wb"), format="jpg", dpi=dpi)
        sonogram, created = Sonogram.objects.get_or_create(snippet=self, transform=st, path=name)
        close()
def create_uncertainty(model, rate_type):
    '''Data without valid uncertainty is given the 10th-percentile effective sample size of the data set.

    Parameters
    ----------
    model : data.ModelData
      dismod model
    rate_type : str
      a rate model: 'neg_binom', 'binom', 'normal', 'log_norm', 'poisson', 'beta'

    Returns
    -------
    model : data.ModelData
      dismod model with measurements of uncertainty for all data
    '''
    # fill any missing covariate data with 0s
    for cv in list(model.input_data.filter(like='x_').columns):
        model.input_data[cv] = model.input_data[cv].fillna(0)

    # find indices with missing or negative standard error and
    # calculate standard error from the confidence interval, then from the effective sample size
    missing_se = pl.isnan(model.input_data['standard_error']) | (model.input_data['standard_error'] < 0)
    if True in set(missing_se):
        model.input_data['standard_error'][missing_se] = (model.input_data['upper_ci'][missing_se] - model.input_data['lower_ci'][missing_se]) / (2*1.96)
        missing_se_still = pl.isnan(model.input_data['standard_error']) | (model.input_data['standard_error'] < 0)
        if True in set(missing_se_still):
            model.input_data['standard_error'][missing_se_still] = pl.sqrt(model.input_data['value'][missing_se_still]*(1-model.input_data['value'][missing_se_still])/model.input_data['effective_sample_size'][missing_se_still])

    # find indices that contain nan for effective sample size
    missing_ess = pl.isnan(model.input_data['effective_sample_size']) == 1
    # calculate effective sample size from standard error
    model.input_data['effective_sample_size'][missing_ess] = model.input_data['value'][missing_ess]*(1-model.input_data['value'][missing_ess])/(model.input_data['standard_error'][missing_ess])**2

    # find the 10th-percentile effective sample size of the entire dataset
    non_missing_ess_still = pl.isnan(model.input_data['effective_sample_size']) == 0  # finds all real numbers
    if False in non_missing_ess_still:
        percent = pl.percentile(model.input_data['effective_sample_size'][non_missing_ess_still], 10.)
        missing_ess_still = pl.isnan(model.input_data['effective_sample_size']) == 1  # finds all nan
        # replace nan effective sample size with the 10th percentile
        model.input_data['effective_sample_size'][missing_ess_still] = percent

    # change values of 0 in the lognormal model to 1 observation
    if rate_type == 'log_normal':
        # find indices where values are 0
        zero_val = (model.input_data['value'] == 0)
        # add 1 observation so no values are zero, also change effective sample size
        model.input_data['effective_sample_size'][zero_val] = model.input_data['effective_sample_size'][zero_val] + 1
        model.input_data['value'][zero_val] = 1.0/model.input_data['effective_sample_size'][zero_val]
        # update standard error
        model.input_data['standard_error'][zero_val] = pl.sqrt(model.input_data['value'][zero_val]*(1-model.input_data['value'][zero_val])/model.input_data['effective_sample_size'][zero_val])

    return model
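# Standalone sketch of the same uncertainty-filling rules on a toy table
# (pl is pylab, pd is pandas; the column names mirror the snippet, the numbers
# are made up, and assignment uses .loc instead of the snippet's chained indexing).
import pandas as pd
import pylab as pl

df = pd.DataFrame({
    'value':                 [0.10, 0.20, 0.05],
    'lower_ci':              [0.08, pl.nan, pl.nan],
    'upper_ci':              [0.12, pl.nan, pl.nan],
    'standard_error':        [pl.nan, pl.nan, 0.01],
    'effective_sample_size': [pl.nan, 400., pl.nan],
})

# missing standard error: first from the 95% CI width, then from value and ESS
missing_se = pl.isnan(df['standard_error'])
df.loc[missing_se, 'standard_error'] = (df['upper_ci'] - df['lower_ci'])[missing_se] / (2 * 1.96)
still = pl.isnan(df['standard_error'])
df.loc[still, 'standard_error'] = pl.sqrt(df['value'] * (1 - df['value']) / df['effective_sample_size'])[still]

# missing effective sample size: back it out from value and standard error
missing_ess = pl.isnan(df['effective_sample_size'])
df.loc[missing_ess, 'effective_sample_size'] = (df['value'] * (1 - df['value']) / df['standard_error'] ** 2)[missing_ess]
print(df)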
def add_weights(self, dict_, map_components):
    """Add tree-density weights and minimum park distances to the segment dictionary."""
    is_sidewalk = pl.array(dict_['type']) == 'sidewalk'
    is_street = pl.array(dict_['type']) == 'street'

    # relative tree density
    dict_['tree_density'] = pl.array(dict_['tree_number']).astype(float)
    dict_['tree_density'] /= pl.array(dict_['distance'])

    # sidewalks set to maximum
    # max_density = max(dict_['tree_density'])
    # dict_['tree_density'][is_sidewalk] = max_density
    # dict_['tree_density'] /= max(dict_['tree_density'])
    dict_['tree_density'] /= pl.percentile(dict_['tree_density'][is_street], 85.)
    pl.hist(dict_['tree_density'], bins=pl.arange(0., 1.5, 0.05))

    # randomize weight of sidewalks
    dict_['tree_density'][is_sidewalk] = 0.75 + 0.15 * pl.randn(
        len(dict_['tree_density'][is_sidewalk]))
    dict_['tree_density'][dict_['tree_density'] > 1.] = 1.

    # randomize streets with 0 trees
    zero_tree = pl.array(dict_['tree_number']) == 0
    dict_['tree_density'][zero_tree] = 0.25 + 0.15 * pl.randn(sum(zero_tree))

    # tree density defined as 1 in parks since no tree data there
    dict_['tree_density'][dict_['tree_density'] > 0.999] = 0.999
    dict_['tree_density'][dict_['tree_density'] < 0.] = 0.
    pl.hist(dict_['tree_density'], bins=pl.arange(0., 1.5, 0.05))

    dict_['min_dist_to_park'] = []
    for i in range(len(dict_['vertex_start'])):
        # mean location of the segment
        lon = dict_['geometry'][i].representative_point().x
        lat = dict_['geometry'][i].representative_point().y
        # distance park - segment
        dist = angles.ang_dist(
            lon, lat,
            angles.rad_to_deg(map_components['park']['rep_x_rad']),
            angles.rad_to_deg(map_components['park']['rep_y_rad']))
        dist = min(dist)
        dict_['min_dist_to_park'].append(dist)
    return dict_
def get_binned_data_2d(self, n_bins=10):
    # generate an instance of the class BinnedData2D
    bd_in = BinnedData2D()
    # set the source of the output equal to the source of the dataset
    bd_in.source = self.source
    # set the filter of the output equal to the filter of the dataset
    bd_in.filter = self.filter
    # set the number of bins of the output equal to the number of bins the method receives as input
    bd_in.n_bins = n_bins
    # extract pi and imp
    database_reduced = self.db_fil.loc[:, ['pi', 'imp']]
    # generate the bin extremes (equally populated bins of pi)
    bin_end_imp_pi = pl.percentile(database_reduced.pi, list(100.*pl.arange(bd_in.n_bins + 1.)/(bd_in.n_bins)))
    # adjust the last bin extreme
    bin_end_imp_pi[-1] = bin_end_imp_pi[-1] + 0.00001
    # assign each point to a bin
    database_reduced['fac_pi'] = pl.digitize(database_reduced.pi, bin_end_imp_pi)
    # use a groupby to compute the average pi and imp for each bin, assigning the output to df_imp
    df_gp = database_reduced[['pi', 'imp', 'fac_pi']].groupby('fac_pi')
    df_imp = pd.concat([df_gp.mean(), df_gp.imp.std(), df_gp.imp.count()], axis=1)
    df_imp.columns = ['pi', 'imp', 'stdd', 'nn']
    # set the data of the output equal to the result of the binning procedure
    bd_in.data = df_imp
    # return the filled instance of the class BinnedData2D
    return bd_in
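# Standalone sketch of the same equal-population binning idea on a toy DataFrame
# (pl is pylab, pd is pandas; the pi/imp column names follow the snippet, the data are synthetic).
import pandas as pd
import pylab as pl

n_bins = 10
df = pd.DataFrame({'pi': pl.rand(1000), 'imp': pl.randn(1000)})

# bin edges at the 0%, 10%, ..., 100% quantiles of pi, so each bin holds roughly the same number of points
edges = pl.percentile(df.pi, list(100. * pl.arange(n_bins + 1.) / n_bins))
edges[-1] += 1e-5                        # nudge the last edge so the maximum falls inside the top bin
df['fac_pi'] = pl.digitize(df.pi, edges)

grouped = df.groupby('fac_pi')
binned = pd.concat([grouped.mean(), grouped.imp.std(), grouped.imp.count()], axis=1)
binned.columns = ['pi', 'imp', 'stdd', 'nn']
print(binned)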
print(dictionary[id], freq)

# step 3: create similarity matrix
index = similarities.Similarity('/tmp/tst', corpus_tfidf.corpus, num_features=corpus.num_terms + 1)
sims = index[corpus_tfidf]

# step 3.1: zero out similarities below a per-book percentile threshold
percentile = {
    'sicp': 90,
    'sicm': 95,
    'dirac': 60,
    'dirac_sections': 95,
    'som': 98
}[prefix]
sims[sims < pylab.percentile(sims, percentile)] = 0

# step 4: convert datatype to networkx Graph
print("converting similarity matrix to networkx Graph")
sims = networkx.Graph(sims, node_list=list(range(len(book))))
networkx.set_node_attributes(sims, 'name', {x: y for x, y in enumerate(labels)})
networkx.set_node_attributes(sims, 'group', {x: y for x, y in enumerate(groups)})
wordcount_normalize = {'sicp': 1000, 'sicm': 500}.get(prefix, 1000)
networkx.set_node_attributes(
    sims, 'wordcount',
    {x: float(y) / wordcount_normalize for x, y in enumerate(wordcount)})
for t in range(0, num_timesteps):
    if t == 0:
        # First year starting with initial assumptions
        for k in range(num_simulations):
            u[k] = triangular(marketsize_min, marketsize_mode, marketsize_max)  # triangular distribution of current number of potential users
            s[k] = calc_marketshare(u[k], marketshare_init)  # market share for product
            r[k] = calc_revenue(u[k], s[k])  # revenue
        # store values in first row of matrices:
        rev[t, :] += r
        usr[t, :] += u
        sha[t, :] = s
        # percentiles of the complete revenue row at time t
        percentiles_rev[t, :] = percentile(rev[t, :], perc_selection)
        percentiles_usr[t, :] = percentile(usr[t, :], perc_selection)
        percentiles_sha[t, :] = percentile(sha[t, :], perc_selection)
    else:
        # Following years starting with the previous year's data
        for k in range(num_simulations):
            # estimate how much the market has grown:
            loc = triangular(1, 2, 4)
            scale = triangular(1, 2, 3)
            factor = 3
            marketgrowth = logist(t, loc, scale, factor)
            u[k] += u[k] * marketgrowth  # apply market growth
            s[k] = calc_marketshare(u[k], s[k]) + logist(t, 4, 5, 1)  # apply market share increase
            r[k] = calc_revenue(u[k], s[k])  # calculate revenue
        # store values in following rows of matrices
if plotting_tmp:
    plot_ge('ge')

###########################
## Generating a histogram of the daily rate Q/V_D
##
## Generating evenly populated bins by means of pl.percentile
## Producing a histogram
## Plotting
###########################

print('Generating histogram of the daily rate Q/V_D...')
n_bins_h_pi = 500
bin_end_h_pi = pl.percentile(df_in.pi, list(100.*pl.arange(n_bins_h_pi + 1.)/(n_bins_h_pi)))
pdf_pi, bins_pi, patches = pl.hist(pl.array(df_in.pi), bin_end_h_pi, normed=1, histtype='step')
bins_pi_cent = (bins_pi[:-1] - bins_pi[1:])/2. + bins_pi[1:]  # finding the centers of the bins
if plotting_tmp:
    plot_hist('stat_pi')

###########################
## Measuring temporary impact as a function of the daily rate Q/V_D
##
## Generating evenly populated bins of \pi by means of percentile
## Assigning each metaorder to the corresponding bin in df_in.fac_pi
## Evaluating the average daily rate and impact for each bin, with standard deviation and counts, by means of a groupby
## Fitting a power-law and a logarithmic function
## Plotting
# example model0, to test vars and test-train
model = mu.load_new_model(model_num, test_area, data_type)
nan_ix = list(model.input_data['effective_sample_size'][pl.isnan(model.input_data['effective_sample_size']) == 1].index)  # list of nan in effective sample size
model = mu.create_uncertainty(model, 'binom')
for cv in list(model.input_data.filter(like='x_').columns):  # fill missing with 0
    model.input_data[cv] = model.input_data[cv].fillna(0)

# example model1, to test test-train
model1 = mu.load_new_model(model_num, test_area, data_type)
model1 = mu.create_uncertainty(model1, 'normal')

# example model2, to test loading and uncertainty
model2 = mu.load_new_model(model_num, test_area, data_type)
non_nan_ix2 = list(model2.input_data['effective_sample_size'][pl.isnan(model2.input_data['effective_sample_size']) == 0].index)  # list of non-nan in effective sample size
ten_percent = pl.percentile(model2.input_data.ix[non_nan_ix2, 'effective_sample_size'], 10.)
model2 = mu.create_uncertainty(model2, 'normal')

# find official areas of western europe
areas = [test_area]
for i in model.hierarchy.edges(test_area):
    areas.append(i[1])

# create data for math functions
pred = pandas.DataFrame(pl.arange(10), columns=['mean'])
pred_ui = pandas.DataFrame(pl.hstack((pred - 1, pred + 1)), columns=['lower', 'upper'])
obs = pandas.DataFrame(pl.arange(10) + 1, columns=['value'])

def test_load_area():
    # find model unique areas
def filter1d(x, mask_only=True, algos=['2sigma']):
    """
    Filter vector with selected algorithms.

    In:
        x : ndarray, input vector
        mask_only : bool, do not touch the input vector, just find "bad" values
        algos : list of str, algorithms to apply to the input vector,
            applied in the order given
    Out:
        xnew : ndarray, filtered input vector (returned only if mask_only=False)
        mask : ndarray of bool, vector of the same length as x, where "0" marks masked values
    """
    xnew = pl.array(x, dtype='float')
    mask = pl.ones(len(x), dtype='bool')
    for algo in algos:
        if algo == 'diff02':
            for i in range(0, len(xnew) - 1):
                if abs(xnew[i + 1] / xnew[i] - 1) > .2:
                    # current rr differs by more than 20% from the previous one
                    mask[i] = False
            if not mask_only:
                xnew = xnew * mask
                xnew = pl.ma.masked_equal(xnew, 0)
                xnew = pl.ma.compressed(xnew)
        elif algo == '2sigma':
            mean = pl.mean(xnew)
            std = pl.std(xnew)
            for i in range(0, len(xnew)):
                if pl.logical_or(xnew[i] < mean - 2 * std, mean + 2 * std < xnew[i]):
                    mask[i] = False
            if not mask_only:
                xnew = xnew * mask
                xnew = pl.ma.masked_equal(xnew, 0)
                xnew = pl.ma.compressed(xnew)
        elif algo == '5per95':
            per5 = pl.percentile(xnew, 5)
            per95 = pl.percentile(xnew, 95)
            for i in range(0, len(xnew)):
                if pl.logical_or(xnew[i] < per5, per95 < xnew[i]):
                    mask[i] = False
            if not mask_only:
                xnew = xnew * mask
                xnew = pl.ma.masked_equal(xnew, 0)
                xnew = pl.ma.compressed(xnew)
        elif algo == '3per97':
            per3 = pl.percentile(xnew, 3)
            per97 = pl.percentile(xnew, 97)
            for i in range(0, len(xnew)):
                if pl.logical_or(xnew[i] < per3, per97 < xnew[i]):
                    mask[i] = False
            if not mask_only:
                xnew = xnew * mask
                xnew = pl.ma.masked_equal(xnew, 0)
                xnew = pl.ma.compressed(xnew)
        elif algo == '1per99':
            per1 = pl.percentile(xnew, 1)
            per99 = pl.percentile(xnew, 99)
            for i in range(0, len(xnew)):
                if pl.logical_or(xnew[i] <= per1, per99 <= xnew[i]):
                    mask[i] = False
            if not mask_only:
                xnew = xnew * mask
                xnew = pl.ma.masked_equal(xnew, 0)
                xnew = pl.ma.compressed(xnew)
        elif algo == 'ho_moody':
            for i in range(2, len(xnew) - 2):
                mean = pl.mean([xnew[i - 2], xnew[i - 1], xnew[i + 1], xnew[i + 2]])
                if pl.logical_or(xnew[i] < .8 * mean, 1.2 * mean < xnew[i]):
                    mask[i] = False
            if not mask_only:
                xnew = xnew * mask
                xnew = pl.ma.masked_equal(xnew, 0)
                xnew = pl.ma.compressed(xnew)
        else:
            warn("Don't know anything about such a filtration algorithm")
    print("NB: Deleted %0.3f%% of array" % (float(len(x) - pl.sum(mask)) / len(x) * 100,))
    if mask_only:
        return mask
    else:
        return xnew, mask
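# Usage sketch for filter1d (assumptions: pl is pylab and warn comes from the
# warnings module, both imported at module level; the data below are synthetic).
import pylab as pl
from warnings import warn

rr = pl.concatenate([pl.randn(100) + 10., pl.array([25., -5.])])  # signal with two obvious outliers
mask = filter1d(rr, mask_only=True, algos=['5per95'])
clean = rr[mask]   # keep only values inside the 5th-95th percentile range
print(len(rr), len(clean))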