def plot_correlations(self):
    smoothing_window = max(1, int(len(self.raw_data[self.targets[0]]) / 10))
    for c_f in self.features:
        grouped_features = mlab.rec_groupby(self.normalized_data, [c_f],
                                            [(c_f, len, 'count')])
        is_discrete = len(grouped_features) < 100
        if c_f in self.raw_data:
            self.raw_data.sort(order=c_f)
        is_date = c_f.find('date') >= 0
        if is_date:
            self.raw_data.sort(order=c_f)
        else:
            self.normalized_data.sort(order=c_f)
        for c_t in self.targets:
            try:
                f = plt.figure()
                if is_discrete:
                    d = mlab.rec_groupby(self.normalized_data, [c_f],
                                         [(c_t, numpy.average, 'avg')])
                    plt.bar(d[c_f], d['avg'])
                else:
                    if c_t in self.raw_data and self.raw_data[c_t].dtype == '>f4':
                        y = self.raw_data[c_t]
                    else:
                        y = self.normalized_data[c_t]
                    # Smooth the target with a simple moving average
                    convolved_y = numpy.convolve(
                        numpy.ones(smoothing_window, 'd') / smoothing_window,
                        y, mode='valid')
                    x = self.raw_data[c_f] if is_date else self.normalized_data[c_f]
                    plt.plot(x[0:convolved_y.shape[0]], convolved_y)
                    if is_date:
                        f.autofmt_xdate()
                plt.ylabel(c_t)
                plt.xlabel(c_f)
                plt.savefig("%s/%s_x_%s" % ('plots', c_f, c_t))
                plt.close(f)  # release the figure so repeated plots don't accumulate
            except Exception:
                print "Error creating plot (%s, %s)" % (c_f, c_t)
def get_recommended_pricing(price_history):
    # Pick the availability zone whose worst-case price statistic
    # (the larger of mean + 3 sigma and the 99th percentile) is lowest
    zones_stats = []
    for zone in price_history:
        price_arr = mlab.rec_groupby(
            price_history[zone],
            ('timestamp',),
            (('price', np.max, 'max_price'),))['max_price']
        three_sigma = np.mean(price_arr) + (3 * np.std(price_arr))
        percentile = np.percentile(price_arr, 99)
        zones_stats.append((zone, max(three_sigma, percentile)))
    zone, stat = sorted(zones_stats, key=lambda x: x[1])[0]

    price_arr = mlab.rec_groupby(
        price_history[zone],
        ('timestamp',),
        (('price', np.max, 'max_price'),))['max_price']
    # Weight recent prices more heavily with a shifted, normalized tanh ramp
    n = len(price_arr)
    base = np.arange(n, dtype=np.float) / n
    weights = np.tanh((base - 0.7) / 0.2)
    weights = (weights - np.min(weights)) / (np.max(weights) - np.min(weights)) + 1
    weights /= np.sum(weights)
    weighted = np.multiply(weights, price_arr)
    price = np.max(
        price_arr[np.where(weighted >= np.percentile(weighted, 99,
                                                     interpolation='nearest'))])
    return (zone, price)
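A minimal, hypothetical call sketch for get_recommended_pricing. It assumes price_history maps zone names to record arrays with 'timestamp' and 'price' fields, which is what the rec_groupby calls above expect; the zone name and prices below are made up.

import numpy as np
from matplotlib import mlab

dt = [('timestamp', 'datetime64[h]'), ('price', np.float64)]
price_history = {
    'us-east-1a': np.rec.fromrecords(         # hypothetical spot-price samples
        [(np.datetime64('2015-06-01T00'), 0.12),
         (np.datetime64('2015-06-01T01'), 0.15),
         (np.datetime64('2015-06-01T02'), 0.11)], dtype=dt),
}
zone, bid = get_recommended_pricing(price_history)
print('bid ${:.2f} in {}'.format(bid, zone))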
def estimate_var(new_data, groupbyyear=False):
    summary_list = [(nm, summarize_simple, nm + '_var')
                    for nm in new_data.dtype.names
                    if nm not in ['year', 'weekoftheyear']]
    if groupbyyear:
        return rec_groupby(new_data, ['year'], summary_list)
    else:
        return rec_groupby(new_data, [], summary_list)
def gethistprices(query, numrows=1000, **kwargs):
    import matplotlib.mlab as mlab
    import numpy as np

    rec_arr = sqlite2rec(query, **kwargs)
    (syms, posuniq, pos) = np.unique(rec_arr.sym, True, True)
    new_rec_arr = mlab.rec_append_fields(rec_arr, 'idx', pos)
    nosym = mlab.rec_drop_fields(new_rec_arr, ['sym'])
    # Count rows per symbol and keep only symbols with at least numrows records
    recnumrecs = mlab.rec_groupby(nosym, ('idx',), (('idx', len, 'idxcount'),))
    idx = np.nonzero(recnumrecs.idxcount >= numrows)[0]
    idxcount = len(recnumrecs[idx])
    xs = np.empty((idxcount, numrows, len(nosym[0]) - 1), dtype=float)
    for i in xrange(idxcount):
        if kwargs.get('verbose') and i % 50 == 0:
            print '%d of %d' % (i, idxcount)
        curdata = nosym[nosym.idx == idx[i]]
        curdata_arr = np.array(curdata.tolist(), dtype=float)
        xs[i] = curdata_arr[0:numrows, 0:-1]  # drop the trailing idx column
    return (syms[idx], xs)
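A hedged usage sketch for gethistprices: the SQL text and column names are invented, since sqlite2rec and the underlying schema aren't shown here; syms is the array of symbols with at least numrows records, and xs stacks their per-symbol history.

# Hypothetical query; the table must expose a 'sym' column so it can be
# factored out above, plus numeric price columns.
syms, xs = gethistprices('SELECT sym, open, high, low, close FROM prices '
                         'ORDER BY sym, date', numrows=1000, verbose=True)
print '%d symbols, data block shape %s' % (len(syms), str(xs.shape))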
def write_hex_stats(self, data, id_field, stat_fields, min_plots_per_hex,
                    out_file):
    # Summarize the observed output
    stats = mlab.rec_groupby(data, (id_field,), stat_fields)

    # Filter so that the minimum number of plots per hex is maintained
    stats = stats[stats.PLOT_COUNT >= min_plots_per_hex]

    # Write out the file
    utilities.rec2csv(stats, out_file)
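A hypothetical call showing the stat_fields convention write_hex_stats relies on: mlab.rec_groupby takes (input_field, function, output_field) triples, and one triple must emit PLOT_COUNT for the filter step to work. The field names and the writer instance below are invented.

stat_fields = (
    ('PLOT_ID', len, 'PLOT_COUNT'),              # rows per hex, used by the filter
    ('BASAL_AREA', np.mean, 'BASAL_AREA_MEAN'),  # hypothetical summary field
)
writer.write_hex_stats(data, 'HEX_ID', stat_fields,
                       min_plots_per_hex=3, out_file='hex_stats.csv')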
def print_price_history(price_history, recommend=True):
    fields = ['Zone', 'Current', 'Min', 'Max', 'Mean', 'Std']
    print('{:13}{:9}{:8}{:8}{:8}{:8}'.format(*fields))
    for zone in sorted(price_history.keys()):
        price_arr = mlab.rec_groupby(
            price_history[zone],
            ('timestamp',),
            (('price', np.max, 'max_price'),))['max_price']
        row = [zone, price_arr[-1], np.min(price_arr), np.max(price_arr),
               np.mean(price_arr), np.std(price_arr)]
        print('{:13}${:<8.2f}${:<7.2f}${:<7.2f}${:<7.2f}${:<7.2f}'.format(*row))
    if recommend:
        rec_zone, rec_price = get_recommended_pricing(price_history)
        print('RECOMMENDED BIDDING (ZONE: {}, PRICE: {:.2f})'.format(rec_zone,
                                                                     rec_price))
def compute_probability_distribution(arr, is_cum_sum=True):
    # stats used - count
    agg_data = utils.array2recarray(arr)
    stats = (('key', len, 'total'),)
    res = mlab.rec_groupby(agg_data, ('key',), stats)
    if is_cum_sum:
        cumsum = np.cumsum(res.total / float(len(arr)))
    else:
        cumsum = res.total / float(len(arr))
    # return array with the keys and the cumulative distribution
    return np.array([res.key, cumsum])
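Hypothetical usage of compute_probability_distribution, assuming utils.array2recarray wraps a 1-D array as a record array with a single 'key' field (that helper isn't shown here):

arr = np.array([1, 1, 2, 3, 3, 3])
keys, cdf = compute_probability_distribution(arr)
# keys -> [1, 2, 3]; cdf -> [0.333..., 0.5, 1.0]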
def plot_price_history(price_history, plot_hist=False):
    num_zones = len(price_history.keys())
    plt.ion()
    fig_price = plt.figure(figsize=(15, 8))
    ax_price = fig_price.add_subplot(111)
    ax_price.xaxis.set_major_locator(DayLocator())
    ax_price.xaxis.set_minor_locator(HourLocator())
    ax_price.xaxis.set_major_formatter(DateFormatter('%b %d'))
    ax_price.autoscale_view()
    ax_price.grid(True)

    if plot_hist:
        num_rows = int(num_zones / 2) + (num_zones % 2)
        fig_hist = plt.figure(figsize=(15, 8))
        fig_hist.set_tight_layout(True)

    colors = plt.cm.Spectral(np.linspace(0, 1, num_zones))
    min_date, max_date = datetime.today(), datetime(1970, 1, 1)
    for zone, color, i in zip(sorted(price_history.keys()), colors,
                              range(1, num_zones + 1)):
        # Plot price history curves
        grouped = mlab.rec_groupby(
            price_history[zone],
            ('timestamp',),
            (('price', np.max, 'max_price'),))
        price_arr, date_arr = grouped['max_price'], [
            x.astype(datetime) for x in grouped['timestamp']]
        price_stats = [zone, price_arr[-1], np.min(price_arr),
                       np.max(price_arr), np.mean(price_arr), np.std(price_arr)]
        label = ('{:14}current: ${:<6.2f}min: ${:<6.2f}max: ${:<6.2f}'
                 'mean: ${:<6.2f}std: ${:<6.2f}').format(*price_stats)
        ax_price.plot_date(date_arr, price_arr, '-', color=color,
                           linewidth=1.5, label=label)

        # Plot price history histogram
        if plot_hist:
            ax_hist = fig_hist.add_subplot(num_rows, 2, i)
            ax_hist.hist(price_arr, 200, range=(0, np.max(price_arr) + 0.5),
                         color=color, alpha=0.7)
            ax_hist.set_title('{} (examples: {})'.format(zone, price_arr.size))
            ax_hist.set_xlabel("Price")
            ax_hist.set_ylabel("Frequency")

        # Calculating time boundaries
        min_date = min(min_date, np.min(date_arr))
        max_date = max(max_date, np.max(date_arr))

    rec_zone, rec_price = get_recommended_pricing(price_history)
    label = 'RECOMMENDED BIDDING (ZONE: {}, PRICE: {:.2f})'.format(rec_zone,
                                                                   rec_price)
    ax_price.plot_date([min_date, max_date], [rec_price, rec_price], 'r-',
                       linewidth=2, label=label)
    ax_price.legend(loc='upper center', fancybox=True, shadow=True,
                    ncol=1).draggable()
    return fig_price, ax_price, fig_hist if plot_hist else None
def measure_fit(self):
    ''' Provide metrics of fit to determine how well the model performed '''
    # TODO: code up RMSE for non-holdout predictions
    if self.training_type == 'make predictions':
        print 'RMSE for non-holdout data not yet implemented'
    # calculate age-adjusted rates on the test data
    else:
        predicted = self.predictions[
            ['country', 'year', 'age', 'pop', 'actual_deaths',
             'mean_deaths', 'upper_deaths', 'lower_deaths']].view(np.recarray)
        predicted = recfunctions.append_fields(
            predicted, 'mean_rate',
            predicted.mean_deaths / predicted.pop * 100000.).view(np.recarray)
        predicted = recfunctions.append_fields(
            predicted, 'actual_rate',
            predicted.actual_deaths / predicted.pop * 100000.).view(np.recarray)
        predicted = recfunctions.append_fields(
            predicted, 'weight',
            np.ones(predicted.shape[0])).view(np.recarray)
        for a in self.age_list:
            predicted.weight[np.where(predicted.age == a)[0]] = \
                self.age_weights.weight[np.where(self.age_weights.age == a)[0]]
        predicted.mean_rate = predicted.mean_rate * predicted.weight
        predicted.actual_rate = predicted.actual_rate * predicted.weight
        from matplotlib import mlab
        adj_rates = mlab.rec_groupby(
            predicted, ('country', 'year'),
            (('mean_rate', np.sum, 'adj_mean_rate'),
             ('actual_rate', np.sum, 'adj_actual_rate')))

        # calculate RMSE/RMdSE
        err = adj_rates.adj_mean_rate - adj_rates.adj_actual_rate
        sq_err = err ** 2.
        mse = np.mean(sq_err)
        mdse = np.median(sq_err)
        rmse = np.sqrt(mse)
        rmdse = np.sqrt(mdse)

        # calculate AARE/MdARE
        abs_rel_err = np.abs(err / adj_rates.adj_actual_rate)
        aare = np.mean(abs_rel_err)
        mdare = np.median(abs_rel_err)

        # calculate coverage (age-specific, not age-adjusted)
        coverage = np.array(
            (predicted.upper_deaths >= predicted.actual_deaths) &
            (predicted.lower_deaths <= predicted.actual_deaths)
        ).astype(np.int).mean()

        # output fit metrics
        print 'Root Mean Square Error: ' + str(rmse), \
            '\nRoot Median Square Error: ' + str(rmdse), \
            '\nAverage Absolute Relative Error: ' + str(aare), \
            '\nMedian Absolute Relative Error: ' + str(mdare), \
            '\nCoverage: ' + str(coverage)
        pl.rec2csv(
            np.core.records.fromarrays(
                [np.array(('rmse', 'rmdse', 'aare', 'mdare', 'coverage')),
                 np.array((rmse, rmdse, aare, mdare, coverage))],
                names=['metric', 'value']),
            '/home/j/Project/Causes of Death/CoDMod/tmp/' + self.name +
            '_fits_' + self.cause + '_' + self.sex + '.csv')
# Generate numpy arrays from the log data
log_np = log_util.log_data_to_np_arrays(log_data, tx_log_index)
tx_recs = log_np['TX']  # entry-type key assumed; the source line was garbled

# Define the fields to group by
group_fields = ('addr1',)

# Define the aggregation functions
stat_calc = (('retry_count', np.mean, 'avg_num_tx'),
             ('length', len, 'num_pkts'),
             ('length', np.mean, 'avg_len'),
             ('length', sum, 'tot_len'),
             ('time_to_done', np.mean, 'avg_time'))

# Calculate the aggregate statistics
tx_stats = mlab.rec_groupby(tx_recs, group_fields, stat_calc)

# Display the results
print('\nTx Statistics for {0}:\n'.format(LOGFILE))
print('{0:^18} | {1:^8} | {2:^10} | {3:^14} | {4:^11} | {5:^5}'.format(
    'Dest Addr', 'Num MPDUs', 'Avg Length', 'Total Tx Bytes', 'Avg Tx Time',
    'Avg Num Tx'))

for ii in range(len(tx_stats)):
    print('{0:<18} | {1:8d} | {2:10.1f} | {3:14} | {4:11.3f} | {5:5.1f}'.format(
        wlan_exp_util.mac_addr_to_str(tx_stats['addr1'][ii]),
        tx_stats['num_pkts'][ii],
        tx_stats['avg_len'][ii],
        tx_stats['tot_len'][ii],
        tx_stats['avg_time'][ii],
        tx_stats['avg_num_tx'][ii]))
log_np = log_util.log_data_to_np_arrays(log_data, tx_log_index)
log_tx = log_np['TX_HIGH']

# Define the fields to group by
group_fields = ('addr1',)

# Define the aggregation functions
stat_calc = (('num_tx', np.mean, 'avg_num_tx'),
             ('length', len, 'num_pkts'),
             ('length', np.mean, 'avg_len'),
             ('length', sum, 'tot_len'),
             ('time_to_done', np.mean, 'avg_time'))

# Calculate the aggregate statistics
tx_stats = mlab.rec_groupby(log_tx, group_fields, stat_calc)

# Display the results
print('\nTx Statistics for {0}:\n'.format(os.path.basename(LOGFILE)))
print('{0:^18} | {1:^9} | {2:^10} | {3:^14} | {4:^11} | {5:^5}'.format(
    'Dest Addr', 'Num MPDUs', 'Avg Length', 'Total Tx Bytes', 'Avg Tx Time',
    'Avg Num Tx'))

for ii in range(len(tx_stats)):
    print('{0:<18} | {1:9d} | {2:10.1f} | {3:14} | {4:11.3f} | {5:5.2f}'.format(
        wlan_exp_util.mac_addr_to_str(tx_stats['addr1'][ii]),
        tx_stats['num_pkts'][ii],
        tx_stats['avg_len'][ii],
        tx_stats['tot_len'][ii],
        tx_stats['avg_time'][ii],
        tx_stats['avg_num_tx'][ii]))
def profile(aCGH, signal=None,  # chromosomes=None,
            ymin=None, ymax=None, cmin=-1, cmax=1, cmap=plt.cm.RdYlBu_r):
    """
    Plots the aCGH signal (the log ratio of intensities) so that values are
    ordered according to the distribution of the probes throughout the genome.

    Parameters
    ----------
    aCGH : :class:`pycgh.datatypes.ArrayCGH`
        The object representing the Array CGH.

    signal : array_like, optional (default: None)
        The signal representing the log 2 ratio of the intensities of the
        test and the reference signal. If not provided, it is reconstructed
        using the test and reference signal contained in the aCGH object.

    ymin : float, optional (default: None)
        The lower limit of the y axis. If None, it is set as -ymax.

    ymax : float, optional (default: None)
        The upper limit of the y axis. If None, the value of the signal
        array having maximum absolute value is chosen.

    cmin : float, optional (default: -1)
        The lower color limit of the scatter plot.

    cmax : float, optional (default: +1)
        The upper color limit of the scatter plot.

    cmap : :class:`matplotlib.colors.Colormap`, optional
           (default: matplotlib.pylab.cm.RdYlBu_r)
        The color map to be passed to the actual plotting function.

    Returns
    -------
    coords : array_like
        The *position* of the probes shifted by an amount which depends on
        the location of the DNA fragment in the genome.
    """
    chr_map = dict((str(c), c) for c in xrange(1, 25))
    chr_map['X'] = 23
    chr_map['Y'] = 24

    inverse_chr_map = dict((c, str(c)) for c in xrange(1, 23))
    inverse_chr_map[23] = 'X'
    inverse_chr_map[24] = 'Y'

    ## Chromosomes Filtering (currently disabled)
    #if not chromosomes is None:
    #    chr_filtered = np.unique(chr_map[str(chr).strip().upper()]
    #                             for chr in chromosomes)
    #
    #    chridx = np.array(np.zeros(len(aCGH.F['chromosome'])), dtype=bool)
    #    for chr in chr_filtered:
    #        np.logical_or(chridx, aCGH.F['chromosome'] == chr, chridx)
    #
    #    chromosomes = sorted(chr_filtered)
    #
    #    # Signal and segmentation filtering
    #    if not signal is None:
    #        signal = signal[chridx]
    #    if not segmentation is None:
    #        segmentation = segmentation[chridx]

    # Signal Calculation
    if signal is None:
        test_signal = aCGH.M['test_signal']
        reference_signal = aCGH.M['reference_signal']
        signal = (np.log2(test_signal) - np.log2(reference_signal)).compressed()
    elif isinstance(signal, (str, unicode)):
        signal = aCGH.F[signal]
    elif np.ma.getmask(signal) is np.ma.nomask:
        if len(signal) > aCGH.size:  # Not filtered signal wrt current aCGH
            signal = signal[~aCGH['mask']]  # Manual extraction
    else:
        signal = signal.compressed()

    # Plot-Coordinates Calculation
    from matplotlib import mlab as ml
    positions = aCGH.F[['chromosome', 'start_base']]  # A compressed copy
    chromosomes = np.unique(positions['chromosome'])

    # Calculation of the max start_base for each chromosome as shift
    coords = positions['start_base']
    summary = ml.rec_groupby(positions,
                             groupby=('chromosome',),
                             stats=(('start_base', np.max, 'shifts'),))
    shifts = np.zeros(len(summary) + 1)
    shifts[1:] = np.cumsum(summary['shifts'])

    # Applying shifts
    for i, chr in enumerate(chromosomes):
        coords[positions['chromosome'] == chr] += shifts[i]

    # Chromosomes ticks and separators
    ticks = (shifts[:-1] + shifts[1:]) / 2.0
    separators = shifts[1:-1]

    # CGH Scatter plot
    plt.scatter(coords, signal, c=signal, cmap=cmap, vmin=cmin, vmax=cmax,
                s=1, edgecolors='none')

    # Axis limits
    if ymax is None:
        ymax = max(abs(min(np.nanmin(signal), -1.1)),
                   max(np.nanmax(signal), 1.1))
    if ymin is None:
        ymin = -ymax
    plt.axis([coords.min(), coords.max(), ymin, ymax])

    # Visual effects
    plt.colorbar(extend='both')
    plt.axhline(0.0, lw=1, color='gray', ls='--')
    plt.axhline(np.log2(3 / 2.), lw=1, color='orange', ls='--')  # gain
    plt.axhline(1.0, lw=1, color='red', ls='--')
    plt.axhline(-1.0, lw=1, color='blue', ls='--')

    for sep in separators:
        plt.axvline(sep - 1, lw=1, color='gray', ls='-')

    label_ticks = ['Chr %s' % inverse_chr_map[k] for k in chromosomes]
    plt.xticks(ticks, label_ticks, rotation=90, size=8)

    return coords
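A hypothetical invocation of profile(); the ArrayCGH loader and input file below are assumptions about the pycgh API, not something shown in this snippet.

# Hypothetical: assumes pycgh exposes a loader returning an ArrayCGH object
from pycgh.datatypes import ArrayCGH

acgh = ArrayCGH.load('sample_acgh.txt')  # hypothetical loader and input file
coords = profile(acgh)                   # draws the scatter, returns positions
plt.show()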
def volume_code(volume):
    'code the continuous volume data categorically'
    ind = np.searchsorted([1e5, 1e6, 5e6, 10e6, 1e7], volume)
    return ind

summaryfuncs = (
    ('date', lambda x: [thisdate.year for thisdate in x], 'years'),
    ('date', lambda x: [thisdate.month for thisdate in x], 'months'),
    ('date', lambda x: [thisdate.weekday() for thisdate in x], 'weekday'),
    ('adj_close', daily_return, 'dreturn'),
    ('volume', volume_code, 'volcode'),
)

rsum = mlab.rec_summarize(r, summaryfuncs)

# stats is a list of (dtype_name, function, output_dtype_name).
# rec_groupby will summarize the attribute identified by the
# dtype_name over the groups in the groupby list, and assign the
# result to the output_dtype_name
stats = (
    ('dreturn', len, 'rcnt'),
    ('dreturn', np.mean, 'rmean'),
    ('dreturn', np.median, 'rmedian'),
    ('dreturn', np.std, 'rsigma'),
)

# you can summarize over a single variable, like years or months
print 'summary by years'
ry = mlab.rec_groupby(rsum, ('years',), stats)
print mlab.rec2txt(ry)

print 'summary by months'
rm = mlab.rec_groupby(rsum, ('months',), stats)
print mlab.rec2txt(rm)

# or over multiple variables like years and months
print 'summary by year and month'
rym = mlab.rec_groupby(rsum, ('years', 'months'), stats)
print mlab.rec2txt(rym)

print 'summary by volume'
rv = mlab.rec_groupby(rsum, ('volcode',), stats)
print mlab.rec2txt(rv)
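For readers unfamiliar with the API these snippets share, a self-contained toy example of mlab.rec_groupby on synthetic data (no assumptions about the snippets above):

import numpy as np
from matplotlib import mlab

r = np.rec.fromrecords([('a', 1.0), ('a', 3.0), ('b', 2.0)],
                       names=['grp', 'val'])
stats = (('val', np.mean, 'val_mean'), ('val', len, 'n'))
out = mlab.rec_groupby(r, ('grp',), stats)
print mlab.rec2txt(out)   # one row per group: a -> (2.0, 2), b -> (2.0, 1)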
def agg_compactness(array):
    stats = (('traj_compactness', len, 'total'),)
    return mpl.rec_groupby(array, ('traj_compactness',), stats)
colors = brewer2mpl.get_map('YlGnBu', 'sequential', 9).mpl_colors
colors = [colors[5], colors[8]]
i = 0
dows = ["wd", "we"]
labels = ["WD", "WE"]
stats = (('node_count', len, 'total'),)

for dow in dows:
    table = ("/Users/igobrilhante/Documents/workspace/research/ComeTogether/"
             "experiments/network." + city + "_fs_poiclusterf_traj_" + dow +
             "_" + str(hours) + "h_trajs_communities.csv")
    data = mlab.csv2rec(table)
    agg = mlab.rec_groupby(data, ('node_count',), stats)
    print agg
    plt.plot(agg.node_count, agg.total, color=colors[i], label=labels[i],
             marker=markers[i], markersize=marker_size, linewidth=line_width,
             markeredgewidth=0.0, alpha=alpha)
    i += 1

plt.tick_params(axis='both', which='major', labelsize=16, colors="#000000")
# plt.xlim([-100, 1000])
# plt.ylim([-0.1, 1.1])
plt.xlabel('Number of nodes')
plt.ylabel('Number of communities')
fig.tight_layout()
def agg_degree(array):
    stats = (('degree', len, 'total'),)
    return mpl.rec_groupby(array, ('degree',), stats)
def make_aggregation(self, lista, grouping):
    # Grab the timestamp from the first taxi
    time = lista.values()[0].state['dtime']
    #time = calendar.timegm(time.timetuple())

    # Build the record array numpy needs for the aggregation;
    # this could be made much more efficient
    pars_list = []
    for item in Car.PARS:
        pars_list.append(item)
    dtype = []
    for par in pars_list:
        dtype.append((par, Car.PARS[par]))

    all_states = []
    for car in lista.values():
        all_states.append(tuple([car.state[par] for par in pars_list]))
    arr = np.rec.array(all_states, dtype)

    def queue(x):
        return np.sum((x > 1))

    # Aggregate by area or pickup area depending on the layer
    # if layer == "Money":
    #     grouping = ('areanumPick', 'statusf')
    # else:
    #     grouping = ('areanum', 'statusf')

    # define which metrics to calculate with which function
    metrics = (('areanum', np.count_nonzero, 'aggregate'),
               ('waitingtimef', np.min, 'minwait'),
               ('waitingtimef', np.mean, 'meanwait'),
               ('waitingtimef', np.max, 'maxwait'),
               ('runlengthf', np.mean, 'meanrun'),
               ('runlengthf', np.max, 'maxrun'),
               ('runDistance', np.mean, 'meanRunDistance'),
               ('searchlengthf', np.mean, 'meansearch'),
               ('searchlengthf', np.max, 'maxsearch'),
               ('waitingtimef', queue, 'queuelength'),
               ('areaSearchingTime', np.min, 'minareaSearchingTime'),
               ('areaSearchingTime', np.mean, 'meanareaSearchingTime'),
               ('areaSearchingTime', np.max, 'maxareaSearchingTime'),
               ('areaTime', np.min, 'minareaTime'),
               ('areaTime', np.mean, 'meanareaTime'),
               ('areaTime', np.max, 'maxareaTime'),
               ('areaIniOccupiedTime', np.min, 'minareaIniOccupiedTime'),
               ('areaIniOccupiedTime', np.mean, 'meanareaIniOccupiedTime'),
               ('areaIniOccupiedTime', np.max, 'maxareaIniOccupiedTime'),
               ('areaPercentage', np.min, 'minareaPercentage'),
               ('areaPercentage', np.mean, 'meanareaPercentage'),
               ('areaPercentage', np.max, 'maxareaPercentage'))

    # and actually make the calculations
    result = mlab.rec_groupby(arr, grouping, metrics)

    # and convert it back to a list of dictionaries
    keys = result.dtype.names
    aggregates = [dict(zip(keys, record)) for record in result]
    # for aggregate in aggregates:
    #     aggregate['dtime'] = time
    f.insertColumAggregado(aggregates, 'dtime', time)
    return aggregates