def build_model():
    #df = get_training_data()
    df = get_sampling_training()
    targets = np.array(df['success'])
    del df['success']
    del df['name']
    columns = df.columns
    data = np.array(df)
    model = randomforest(data, targets, tree_num=200)
    pickle.dump(model, open("data/rf.model", "wb"))  # pickle needs binary mode, not "w"
    # feature importance
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, columns[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.savefig('plots/feature_imp.jpg')
def get_related_features(input_df, target_feature, related_feature_size):
    features = np.array(input_df.columns)
    # remove target_feature from all features
    index = np.argwhere(features == target_feature)
    features = np.delete(features, index)
    ## feature selection
    train_x, test_x, train_y, test_y = train_test_split(
        input_df[features], input_df[target_feature], test_size=0.25)
    clf = RandomForestClassifier()
    clf.fit(train_x, train_y)
    # from the calculated importances, order them from most to least important
    # and make a barplot so we can visualize what is/isn't important
    importances = clf.feature_importances_
    sorted_idx = np.argsort(importances)
    # Return only the top features up to the related_feature_size.
    related_features = features[sorted_idx[-related_feature_size:]]
    padding = np.arange(len(features)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
    #pl.show()
    return related_features
def page_freq_graph(self, transactions):
    import config
    import numpy
    freq_item_list = {}
    for trans in transactions:
        for item in trans:
            if item in freq_item_list:
                freq_item_list[item] += 1
            else:
                freq_item_list[item] = 1
    from operator import itemgetter
    pages = []
    counts = []
    pos = []
    i = 1.0
    for p, c in sorted(freq_item_list.items(), key=itemgetter(1)):
        pages.append(p)
        counts.append(c)
        pos.append(i)
        i = i + 2.0
    import pylab
    pylab.cla()
    pylab.clf()
    pylab.figure(1)
    pylab.barh(numpy.array(pos), numpy.array(counts), align='center')
    #pylab.yticks(numpy.array(pos), tuple(pages))
    pylab.xlabel("Page count")
    #pylab.grid(True)
    pylab.savefig(config.OUTPUT + "page_distribution.pdf")
def _plot_histogram(self, gs, y, scale, y_mean=None, show_len=None,
                    label=None, sharex=None):
    if show_len is None:
        show_len = self.indicators[0].m
    else:
        scale = scale * show_len * 1. / self.indicators[0].m
    ax = plt.subplot(gs, sharex=sharex)
    price = self.history['last_price'][self.now - show_len:self.now]
    plt.plot(price)
    floor = price.min()
    ceil = price.max()
    # floor = self.history['last_price'].min()
    # ceil = self.history['last_price'].max()
    y = y[floor:ceil + 1] * scale
    y[y > show_len * 1.2] = show_len * 1.2
    plt.barh(np.arange(floor, ceil + 1), y, 1.0, label=label,
             alpha=0.2, color='r', edgecolor='none')
    if y_mean is not None:
        y_mean = int(y_mean * 2. * scale)
        ax.set_xticks(np.arange(0, show_len, y_mean))
    plt.grid()
    plt.legend(loc='upper right')
    return ax
def plot_variable_importance(feature_importance, names_cols, save_name, save):
    """Show Variable importance graph."""
    # scale by max importance and keep the 20 most important variables
    feature_importance = feature_importance / feature_importance.max()
    sorted_idx = np.argsort(feature_importance)[::-1][:20]
    barPos = np.arange(sorted_idx.shape[0]) + .8
    barPos = barPos[::-1]
    #plot.figure(num=None, facecolor='w', edgecolor='r')
    plot.figure(num=None, facecolor='w')
    plot.barh(barPos, feature_importance[sorted_idx] * 100, align='center')
    plot.yticks(barPos, names_cols[sorted_idx])
    plot.xticks(np.arange(0, 120, 20),
                ['0 %', '20 %', '40 %', '60 %', '80 %', '100 %'])
    plot.margins(0.02)
    plot.subplots_adjust(bottom=0.15)
    plot.title('Variable Importance')
    if save:
        plot.savefig(save_name, bbox_inches='tight', dpi=300)
        plot.close("all")
    else:
        plot.show()
def test_feature(train_path):
    data = np.genfromtxt(train_path, delimiter=',')
    y = data[:, 0]
    X = data[:, 1:]
    sample_size = len(y)
    train_size = int(sample_size * .95)
    params = {'n_estimators': 100, 'max_depth': 2, 'random_state': 1,
              'min_samples_split': 5}
    params.update({'learning_rate': 0.02, 'subsample': 1.0})  # was 'learn_rate', the pre-0.14 scikit-learn name
    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X, y)
    pl.figure()
    feature_names = np.array(['type', 'type', 'type', 'main', 'log_main',
                              'evi', 'log_evi', 'df1', 'log_df1', 'dfu8',
                              'log_dfu8', 'dfband', 'log_dfband'])
    feature_importance = clf.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)[-8:]
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, feature_names[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.show()
def barh(pl, x, h, title=''):
    pl.figure()  # original had `pl.figure` without the call parentheses
    if title != '':
        pl.title(title)
    pl.barh(x, h, height=0.1)
    pl.show()
    pl.close()
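# Usage sketch for the wrapper above (assumed setup, not from the original
# source): `pl` is expected to be the pylab module, `x` the bar positions,
# and `h` the bar widths. The names below are illustrative only.
def demo_barh():
    import pylab
    import numpy as np
    positions = np.arange(3) + 0.5  # one y slot per bar
    widths = [10, 25, 40]           # bar lengths
    barh(pylab, positions, widths, title='demo')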
def plot(self, gs):
    unit_len = self.show_len * 1. / 5.
    if self.s.now - self.show_len < 0:
        return
    price = self.price[0][self.s.now - self.show_len:self.s.now]
    profile_range = [price.min(), price.max() + 1]
    floor, ceil = profile_range[0] - 1, profile_range[1] + 1
    d = self.output(3, profile_range)
    ax = plt.subplot(gs)
    plt.plot(price)
    day_begin = np.where(self.s.history['time_in_ticks'][
        self.s.now - self.show_len:self.s.now] == 0)[0]
    for x in day_begin:
        plt.axvline(x, color='r', linestyle=':')
    y = self.smoothed_pivot_profile[floor:ceil]
    plt.barh(np.arange(floor, ceil) - 0.5, y * unit_len, 1.0,
             label=self.name, alpha=0.2, color='r', edgecolor='none')
    last_price = int(get(self.price))
    support = last_price + int(round((d['S_offset']) * self.volatility))
    resistance = last_price + int(round((d['R_offset']) * self.volatility))
    highlighted = [support, resistance]
    plt.barh(np.array(highlighted) - 0.5,
             self.smoothed_pivot_profile[highlighted] * unit_len, 1.0,
             alpha=1.0, color='r', edgecolor='none')
    ax.set_xticks(np.arange(0, self.show_len * 1.22, unit_len))
    ax.xaxis.grid(b=True, linestyle='--')
    ax.yaxis.grid(b=False)
    plt.legend(loc='upper right')
    return ax
def do_scaplots(distance_dict, after_dict, before_dict, bins, xtext, option=0):
    for count, name, ylims in ((0, 'm_diff', (-0.5, 0.5)),
                               (1, 'n diff', (-1, 0.5)),
                               (2, 'r diff', (-0.5, 0.5)),
                               (3, 'ba diff', (-0.05, 0.05))):
        pl.subplot(2, 2, count + 1)
        if 0:  # count == 2:
            ns = np.array([after_dict[a][count] /
                           np.max([before_dict[a][count], 0.0000001]) - 1.0
                           for a in before_dict.keys()]).T
        else:
            ns = np.array([after_dict[a][count] - before_dict[a][count]
                           for a in before_dict.keys()]).T
        bars, edges = np.histogram(ns, bins=100, range=ylims)
        bars = bars / float(ns.size)
        print(ns)
        #pl.step(bars, edges, *args, **kwargs)
        pl.barh((edges[0:-1] + edges[1:]) / 2, bars, align='center',
                height=(edges[1:] - edges[0:-1]), alpha=0.4)
        #pl.scatter(ns[0,:], ns[1,:], s=3, edgecolor='none', zorder=-900)
        nstats = bin_stats.bin_stats(0.25 * np.ones_like(ns), ns,
                                     (0.0, 0.5), -1000.0, 1000.0)
        nstats.lay_bounds(color='r', sigma_choice=[68, 95])
        nstats.plot_ebar('median', 'med95ci', color='r', ecolor='r',
                         marker='s', markersize=3, lw=2, linestyle='none')
        pl.xlabel(xtext)
        pl.ylabel(name)
        pl.ylim(ylims)
        pl.xlim(0, 0.5)
    #ax = pl.subplot(2,2,3)
    #pl.ylim(-10,10)
    pl.subplots_adjust(wspace=0.4, hspace=0.4)
    return
def arbolesRegresion(caract):
    clf = DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15,
                                max_depth=13, compute_importances=True)
    importancias = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    mae = mse = r2 = 0
    kf = KFold(len(boston_Y), n_folds=10, indices=True)
    for train, test in kf:
        trainX, testX, trainY, testY = (boston_X[train], boston_X[test],
                                        boston_Y[train], boston_Y[test])
        nCar = len(caract)
        train = np.zeros((len(trainX), nCar))
        test = np.zeros((len(testX), nCar))
        trainYNuevo = trainY
        for i in range(nCar):
            for j in range(len(trainX)):
                train[j][i] = trainX[j][caract[i]]
            for k in range(len(testX)):
                test[k][i] = testX[k][caract[i]]
        trainYNuevo = np.reshape(trainYNuevo, (len(trainY), -1))
        clf.fit(train, trainYNuevo)
        prediccion = clf.predict(test)
        # clf.fit(trainX, trainY)
        # prediccion = clf.predict(testX)
        mae += metrics.mean_absolute_error(testY, prediccion)
        mse += metrics.mean_squared_error(testY, prediccion)
        r2 += metrics.r2_score(testY, prediccion)
        feature_importance = clf.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        for i in range(13):
            importancias[i] = importancias[i] + feature_importance[i]
    print('Abs error: ', mae / len(kf),
          'Squared error: ', mse / len(kf),
          'R squared: ', r2 / len(kf))
    for i in range(13):
        importancias[i] = importancias[i] / 10
    sorted_idx = np.argsort(importancias)
    pos = np.arange(sorted_idx.shape[0]) + .5
    importancias = np.reshape(importancias, (len(importancias), -1))
    boston = datasets.load_boston()
    pl.barh(pos, importancias[sorted_idx], align='center')
    pl.yticks(pos, boston.feature_names[sorted_idx])
    pl.xlabel('Relative importance')
    pl.show()
    import StringIO, pydot
    dot_data = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("bostonTree.pdf")
def plot_predictions(self):
    data = self.get_next_batch(train=False)[2]  # get a test batch
    num_classes = self.test_data_provider.get_num_classes()
    NUM_ROWS = 2
    NUM_COLS = 4
    NUM_IMGS = NUM_ROWS * NUM_COLS
    NUM_TOP_CLASSES = min(num_classes, 4)  # show this many top labels
    label_names = self.test_data_provider.batch_meta['label_names']
    if self.only_errors:
        preds = n.zeros((data[0].shape[1], num_classes), dtype=n.single)
    else:
        preds = n.zeros((NUM_IMGS, num_classes), dtype=n.single)
        rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
        print(rand_idx)
        data[0] = n.require(data[0][:, rand_idx], requirements='C')
        data[1] = n.require(data[1][:, rand_idx], requirements='C')
    data += [preds]
    temp = data[0]
    print(data)
    print(temp.ndim, temp.shape, temp.size)
    # Run the model
    self.libmodel.startFeatureWriter(data, self.sotmax_idx)
    self.finish_batch()
    fig = pl.figure(3)
    fig.text(.4, .95, '%s test case predictions'
             % ('Mistaken' if self.only_errors else 'Random'))
    if self.only_errors:
        # what the net got wrong
        err_idx = nr.permutation(
            n.where(preds.argmax(axis=1) != data[1][0, :])[0])[:NUM_IMGS]
        data[0], data[1], preds = (data[0][:, err_idx], data[1][:, err_idx],
                                   preds[err_idx, :])
    data[0] = self.test_data_provider.get_plottable_data(data[0])
    for r in range(NUM_ROWS):
        for c in range(NUM_COLS):
            img_idx = r * NUM_COLS + c
            if data[0].shape[0] <= img_idx:
                break
            pl.subplot(NUM_ROWS * 2, NUM_COLS, r * 2 * NUM_COLS + c + 1)
            pl.xticks([])
            pl.yticks([])
            try:
                img = data[0][img_idx, :, :, :]
            except IndexError:
                # maybe greyscale?
                img = data[0][img_idx, :, :]
            pl.imshow(img, interpolation='nearest')
            true_label = int(data[1][0, img_idx])
            img_labels = sorted(zip(preds[img_idx, :], label_names),
                                key=lambda x: x[0])[-NUM_TOP_CLASSES:]
            pl.subplot(NUM_ROWS * 2, NUM_COLS,
                       (r * 2 + 1) * NUM_COLS + c + 1, aspect='equal')
            ylocs = n.array(range(NUM_TOP_CLASSES)) + 0.5
            height = 0.5
            width = max(ylocs)
            pl.barh(ylocs, [l[0] * width for l in img_labels], height=height,
                    color=['r' if l[1] == label_names[true_label] else 'b'
                           for l in img_labels])
            pl.title(label_names[true_label])
            pl.yticks(ylocs + height / 2, [l[1] for l in img_labels])
            pl.xticks([width / 2.0, width], ['50%', ''])
            pl.ylim(0, ylocs[-1] + height * 2)
def visualize_chi2(summaries, genres, n_gram=(1, 3), top_features=20):
    """
    Visualize the most discriminative features for each genre
    :param summaries:
    :param genres:
    :param n_gram:
    :param top_features:
    :return:
    """
    vectorizer = TfidfVectorizer(ngram_range=n_gram, lowercase=True,
                                 norm=None, smooth_idf=True, sublinear_tf=True)
    new_summaries = []
    new_genres = []
    # was `for (summary, genre) in (summaries, genres)`, which iterates
    # over the two lists themselves rather than pairing them up
    for (summary, genre) in zip(summaries, genres):
        for sentence in summary.split('.'):
            new_summaries.append(sentence)
            new_genres.append(genre)
    X_train = vectorizer.fit_transform(new_summaries)
    chi2score = chi2(X_train, new_genres)[0]
    figure(figsize=(6, 6))
    wscores = list(zip(vectorizer.get_feature_names(), chi2score))
    wchi2 = sorted(wscores, key=lambda x: x[1])
    topchi2 = list(zip(*wchi2[-top_features:]))
    x = range(len(topchi2[1]))
    labels = topchi2[0]
    barh(x, topchi2[1], align='center', alpha=.2, color='g')
    plot(topchi2[1], x, '-o', markersize=2, alpha=.8, color='g')
    yticks(x, labels)
    xlabel(r'$\chi^2$')
    ylabel('Top discriminative features')
    show()
def plot_occs_by_motif(by_motif):
    """Plot # occurrences for each motif.
    """
    sizes = [
        (len(occs), sum(occ.Z for occ in occs), name)
        for name, occs in by_motif.items()]  # `iteritems()` on Python 2
    # expected = [(len(occs), name) for name, occs in by_motif.items()]
    sizes.sort()
    bar_positions = numpy.arange(len(sizes))
    num_occs = numpy.asarray([s for s, e, n in sizes])
    total_Z = numpy.asarray([e for s, e, n in sizes])
    pylab.barh(
        bar_positions,
        num_occs,
        # left=total_Z,
        height=.8,
        align='center',
        label='Sites',
        color='blue',
    )
    pylab.barh(
        bar_positions,
        total_Z,
        height=.8,
        align='center',
        label='Total Z',
        color='blue',
        edgecolor='white',
        hatch='/',
    )
    pylab.yticks(bar_positions, [n for x, e, n in sizes])
    pylab.ylim(ymin=-.5, ymax=len(sizes) - .5)
    pylab.xlabel('occurrences')
    pylab.legend(loc='lower right')
def length_stats_chart(path, prefixes, sortby=1):
    stats = []
    for prefix in prefixes:
        med, m, s = length_stats(prefix)
        stats.append((prefix, med, m, s))
    stats.sort(key=operator.itemgetter(sortby))
    prefixes, med_list, mean_list, std_list = zip(*stats)
    blockSize = 8
    ind = p.arange(0, blockSize * len(prefixes), blockSize)  # y location for groups
    height = 3  # bar height
    p3 = p.barh(ind, std_list, 2 * height, color='b', linewidth=0)
    p2 = p.barh(ind, med_list, height, color='g', linewidth=0)
    p1 = p.barh(ind + height, mean_list, height, color='r', linewidth=0)
    p.ylim(-height, len(prefixes) * blockSize)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size='smaller')
    p.xlabel('Unicode Codepoints')
    p.ylabel('Language Code')
    p.title('Descriptive Statistics for Document Lengths')
    p.gca().yaxis.tick_left()
    p.yticks(ind + height, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    p.xticks(p.arange(xmin, xmax, 1000), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.legend((p1[0], p2[0], p3[0]),
             ('Mean', 'Median', 'Standard Deviation'),
             prop=xfontprop, loc='lower right')
    p.savefig(path, dpi=300)
    p.close()
    p.clf()
def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
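# Hedged setup sketch for plot_feature_importances_cancer (assumed, not from
# the original source): the function relies on a module-level `cancer` bunch
# and any fitted model exposing feature_importances_.
def demo_plot_feature_importances_cancer():
    global cancer
    from sklearn.datasets import load_breast_cancer
    from sklearn.ensemble import RandomForestClassifier
    cancer = load_breast_cancer()
    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    forest.fit(cancer.data, cancer.target)
    plot_feature_importances_cancer(forest)
    plt.show()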
def barh(x, y=None, title='', xlabel='', ylabel=''):
    import pylab as P
    import numpy as np
    L = (tuple, list, np.ndarray)
    # separate arrays
    if isinstance(x, L) and isinstance(y, L):
        xylist = list(zip(x, y))  # list of two-tuples
    elif isinstance(x, L) and isinstance(x[0], L) and len(x[0]) == 2:
        xylist = x
    else:
        raise TypeError("expected two sequences or one sequence of pairs")
    P.figure(figsize=(10, 5))  # image dimensions
    P.title(title, size='medium')
    P.xlabel(xlabel)
    P.ylabel(ylabel)
    # add bars
    for i, item in enumerate(xylist):
        P.barh(i + 0.25, item[1])
    # set xlim (indexing a zip object fails on Python 3, so unpack explicitly)
    width = np.max([yv for xv, yv in xylist])
    P.xlim(0, width * 1.1)
    # axis setup
    P.yticks(np.arange(0.65, len(xylist)),
             ['%s' % xv for xv, yv in xylist], size='medium')
def plotNogazeDuration():
    plt.figure(figsize=(12, 12))
    for vp in range(100, 120):
        print(vp)
        plt.subplot(5, 4, vp - 99)
        plt.ion()
        data = readTobii(vp, 0, ETDATAPATH)
        datT = []
        datF = []
        for trl in data:
            trl.extractBasicEvents()
            miss = np.int32(np.logical_and(np.isnan(trl.gaze[:, 7]),
                                           np.isnan(trl.gaze[:, 8])))
            miss = removeShortEvs(miss, 2 * 60)
            miss = 1 - removeShortEvs(1 - miss, 1 * 60)
            datT += map(lambda x: (x[1] - x[0]) / 60., tseries2eventlist(miss))
            datF += map(lambda x: (x[1] - x[0]) / 60., tseries2eventlist(1 - miss))
        x = np.linspace(0, 10, 21)
        h = x[-1] / float(x.size - 1)
        a = np.histogram(datT, bins=x, density=True)  # `normed=` in older numpy
        plt.barh(x[:-1], -a[0], ec='k', fc='k', height=h, lw=0)
        a = np.histogram(datF, bins=x, density=True)
        plt.barh(x[:-1], a[0], ec='g', fc='g', height=h, lw=0)
        plt.xlim([-0.7, 0.7])
        plt.gca().set_yticks(range(0, 10, 2))
        plt.ylim([0, 10])
        #plt.grid(False, axis='y')
        if vp == 10:
            plt.legend(['blink', 'gaze'])  # was 'blikn'
def main(args):
    # tell the interpreter we want to use the global 'jobs' list
    global jobs
    # tell the interpreter we want to use the 'pl' module
    global pl
    # parse the command line arguments
    # note that the first command line argument is always the name of the script
    if len(args) < 2:
        print("Usage: python plot_jobtimes.py NUMBER_OF_THREADS")
        exit()
    num_threads = int(args[1])
    # do the actual plotting
    # loop over the number of threads
    for i in range(num_threads):
        # get the data
        times = get_times("jobtimes_{i}.txt".format(i=i))
        # plot each data group in the corresponding colour
        for time in times:
            for key in jobs:
                if time[0] == key:
                    pl.barh(
                        i,
                        time[2] - time[1],
                        left=time[1] - times[0][1],
                        color=jobs[key],
                    )
    # show the plot
    # the program will resume when the window is closed by the user
    pl.show()
def wiki_sizes_chart(path, prefixes, upperlimit=None):
    prefixes, sizes = zip(*sorted([(pr, dumpSize(pr)) for pr in prefixes],
                                  key=operator.itemgetter(1)))
    blockSize = 5
    ind = p.arange(0, blockSize * len(prefixes), blockSize)  # y location for groups
    height = 4  # bar height
    #colors = ['g','r','c','m','y']
    colors = html_colors
    thresholds = [5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
    #colors = [str(float(i+1) / (len(thresholds)+1)) for i in range(len(thresholds))]
    #colors.reverse()
    overall = p.barh(ind, sizes, height, color='b', linewidth=0, align='center')
    subbars = []
    for i, thresh in enumerate(thresholds):
        subbars.append(p.barh(ind,
                              [docs_under_thresh(pr, thresh) for pr in prefixes],
                              height,
                              color=colors[i % len(colors)],
                              linewidth=0,
                              align='center'))
    p.ylim(-height, len(prefixes) * blockSize)
    if upperlimit:
        p.xlim(0, upperlimit)
    yfontprop = FontProperties(size=4)
    xfontprop = FontProperties(size=4)
    p.xlabel('Documents')
    p.ylabel('Language Code')
    p.title('Number of Documents Under Threshold')
    p.yticks(ind, prefixes, fontproperties=yfontprop)
    xmin, xmax = p.xlim()
    xtick_interval = rounded_interval(xmin, xmax, 20, 2)
    p.xticks(p.arange(xmin, xmax, xtick_interval), fontproperties=xfontprop)
    p.gca().xaxis.grid(linestyle='-', linewidth=0.15)
    p.gca().yaxis.tick_left()
    p.legend([b[0] for b in subbars], [str(t) for t in thresholds],
             prop=xfontprop, loc='lower right')
    p.savefig(path, dpi=300)
    p.close()
    p.clf()
def plot_predictions(self):
    data = self.get_next_batch(train=False)[2]  # get a test batch
    num_classes = self.test_data_provider.get_num_classes()
    NUM_ROWS = 2
    NUM_COLS = 4
    NUM_IMGS = NUM_ROWS * NUM_COLS
    NUM_TOP_CLASSES = min(num_classes, 4)  # show this many top labels
    label_names = self.test_data_provider.batch_meta['label_names']
    if self.only_errors:
        preds = n.zeros((data[0].shape[1], num_classes), dtype=n.single)
    else:
        preds = n.zeros((NUM_IMGS, num_classes), dtype=n.single)
        rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
        data[0] = n.require(data[0][:, rand_idx], requirements='C')
        data[1] = n.require(data[1][:, rand_idx], requirements='C')
    data += [preds]
    # Run the model
    self.libmodel.startFeatureWriter(data, self.sotmax_idx)
    self.finish_batch()
    fig = pl.figure(3)
    fig.text(.4, .95, '%s test case predictions'
             % ('Mistaken' if self.only_errors else 'Random'))
    if self.only_errors:
        # what the net got wrong
        err_idx = nr.permutation(
            n.where(preds.argmax(axis=1) != data[1][0, :])[0])[:NUM_IMGS]
        data[0], data[1], preds = (data[0][:, err_idx], data[1][:, err_idx],
                                   preds[err_idx, :])
    data[0] = self.test_data_provider.get_plottable_data(data[0])
    pl.subplots_adjust(hspace=.3)
    for r in range(NUM_ROWS):
        for c in range(NUM_COLS):
            img_idx = r * NUM_COLS + c
            if data[0].shape[0] <= img_idx:
                break
            pl.subplot(NUM_ROWS * 2, NUM_COLS, r * 2 * NUM_COLS + c + 1)
            pl.xticks([])
            pl.yticks([])
            #pl.title('test')
            try:
                img = data[0][img_idx, :, :, :]
            except IndexError:
                # maybe greyscale?
                img = data[0][img_idx, :, :]
            pl.imshow(img, interpolation='nearest')
            true_label = int(data[1][0, img_idx])
            img_labels = sorted(zip(preds[img_idx, :], label_names),
                                key=lambda x: x[0])[-NUM_TOP_CLASSES:]
            pl.subplot(NUM_ROWS * 2, NUM_COLS,
                       (r * 2 + 1) * NUM_COLS + c + 1, aspect='equal')
            ylocs = n.array(range(NUM_TOP_CLASSES)) + 0.5
            height = 0.5
            width = max(ylocs)
            pl.barh(ylocs, [l[0] * width for l in img_labels], height=height,
                    color=['r' if l[1] == label_names[true_label] else 'b'
                           for l in img_labels])
            pl.title(label_names[true_label])
            pl.yticks(ylocs + height / 2, [l[1] for l in img_labels])
            pl.xticks([width / 2.0, width], ['50%', ''])
            pl.ylim(0, ylocs[-1] + height * 2)
def plot_cascade(self, vertical=True):
    if vertical:
        fig_size = (12, 12)
        ax_size = [0.45, 0.05, 0.5, 0.9]
    else:
        fig_size = (16, 8)
        ax_size = [0.05, 0.45, 0.9, 0.5]
    df = sc.dcp(self.data)
    cutoff = 200e3
    fig = pl.figure(figsize=fig_size)
    df.sort(col='icer', reverse=False)
    DA_data = hp.arr(df['opt_spend'])
    inds = sc.findinds(DA_data > cutoff)
    DA_data = DA_data[inds]
    DA_data /= 1e6
    DA_labels = df['shortname'][inds]
    npts = len(DA_data)
    colors = sc.gridcolors(npts, limits=(0.25, 0.75))
    x = np.arange(len(DA_data))
    pl.axes(ax_size)
    for pt in range(npts):
        loc = x[pt:]
        this = DA_data[pt]
        start = sum(DA_data[:pt])
        prop = 0.9
        color = colors[pt]
        amount = sum(DA_data[:pt + 1])
        amountstr = '%0.1f' % amount
        if vertical:
            pl.barh(loc, width=this, left=start, height=prop, color=color)
            pl.text(amount, x[pt], amountstr,
                    verticalalignment='center', color=colors[pt])
        else:
            pl.bar(loc, height=this, bottom=start, width=prop, color=color)
            pl.text(x[pt], amount + 1, amountstr,
                    horizontalalignment='center', color=colors[pt])
    if vertical:
        pl.xlabel('Spending for optimized investment cascade')
        pl.gca().set_yticks(x)
        ticklabels = pl.gca().set_yticklabels(DA_labels)
    else:
        pl.ylabel('Optimized investment cascade')
        pl.gca().set_xticks(x)
        ticklabels = pl.gca().set_xticklabels(DA_labels, rotation=90)
    for t, tl in enumerate(ticklabels):
        tl.set_color(colors[t])
    pl.gca().set_facecolor('none')
    pl.title('Investment cascade')
    return fig
def histogram(c, plot_name="test", plot_title="", plot_xlabel=""): import pylab pylab.figure(1) pos = pylab.arange(len(c)) + .5 pylab.barh(pos, c, align='center') pylab.yticks(pos, range(1, len(c) + 1)) pylab.xlabel(plot_xlabel) pylab.title(plot_title) pylab.grid(True) pylab.savefig(plot_name + ".png")
def histogram(c, plot_name="test", plot_title="", plot_xlabel=""): import pylab pylab.figure(1) pos = pylab.arange(len(c))+.5 pylab.barh(pos, c, align='center') pylab.yticks(pos, range(1, len(c)+1)) pylab.xlabel(plot_xlabel) pylab.title(plot_title) pylab.grid(True) pylab.savefig(plot_name+".png")
def plot_feature_importance(feature_importance, feature_names):
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, feature_names[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.show()
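# Hedged usage sketch for plot_feature_importance (assumed data, not from the
# original source): any fitted scikit-learn ensemble supplies the
# feature_importances_ array the function expects. The `np`/`pl` imports are
# assumed at module level, as in the snippet above.
def demo_plot_feature_importance():
    from sklearn.datasets import load_iris
    from sklearn.ensemble import GradientBoostingClassifier
    iris = load_iris()
    clf = GradientBoostingClassifier().fit(iris.data, iris.target)
    plot_feature_importance(clf.feature_importances_,
                            np.array(iris.feature_names))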
def print_figure(result_db, label_txt):
    """
    Print inference results
    """
    if result_db is None or label_txt is None:
        return
    db = h5py.File(result_db, 'r')
    if db is not None:
        labels = np.loadtxt(label_txt, dtype='object')
        NUM_COLS = 6
        NUM_IMGS = len(db['input_ids'])
        NUM_ROWS = NUM_IMGS // NUM_COLS + (NUM_IMGS % NUM_COLS > 0)
        NUM_TOPK_CLASSES = 3
        fig = pl.figure(figsize=(16, 4))
        fig.set_canvas(pl.gcf().canvas)
        for row in range(NUM_ROWS):
            for col in range(NUM_COLS):
                idx = row * NUM_COLS + col
                if idx == NUM_IMGS:
                    break
                pl.subplot(NUM_ROWS * 2, NUM_COLS, row * 2 * NUM_COLS + col + 1)
                pl.xticks([])
                pl.yticks([])
                pl.imshow(db['input_data'][idx], interpolation='nearest')
        res = db['outputs'][list(db['outputs'].keys())[0]]  # `keys()[0]` fails on Python 3
        for elem_id, elem_data in enumerate(res):
            row = elem_id // NUM_COLS
            col = elem_id % NUM_COLS
            img_labels = sorted(zip(elem_data, labels),
                                key=lambda x: x[0])[-NUM_TOPK_CLASSES:]
            ax = pl.subplot(NUM_ROWS * 2, NUM_COLS,
                            (row * 2 + 1) * NUM_COLS + col + 1, aspect='equal')
            ax.yaxis.set_label_position("right")
            ax.yaxis.set_label_coords(1.25, 0.5)
            height = 10
            margin = 1
            ylocs = np.array(range(NUM_TOPK_CLASSES)) * (height + margin) + margin
            width = max(ylocs)
            top_class = img_labels[-1][1]
            pl.barh(ylocs, [l[0] * width for l in img_labels], height=height,
                    color=['r' if l[1] == top_class else 'b' for l in img_labels])
            #color=['r' if l[1] == labels[true_label] else 'b' for l in img_labels])
            pl.yticks(ylocs + (height + margin) / 2.0,
                      [l[1].replace('_', '\n') for l in img_labels],
                      fontsize=16)
            pl.xticks([0, width / 2.0, width], ['0%', '50%', '100%'])
            pl.ylim(0, ylocs[-1] + height + margin)
        pl.tight_layout()
        pl.show()
def essay_char(essay):
    from pylab import xlabel, ylabel, show, savefig, title,\
        yticks, xlim, ylim, xticks, arange, figure, barh, grid, rcParams
    from string import ascii_letters
    global config
    cnt = {x: 0 for x in ascii_letters}
    for c in essay:
        if c in cnt:  # `cnt.has_key(c)` on Python 2
            cnt[c] += 1
    titlestr = "Essay Char"
    figure(figsize=(max(cnt.values()) / 4, 15), dpi=60)
    rcParams['font.size'] = 17
    rcParams['text.color'] = 'c'
    rcParams['xtick.color'] = 'r'
    rcParams['ytick.color'] = 'y'
    rcParams['figure.facecolor'] = 'k'
    rcParams['figure.edgecolor'] = 'b'
    rcParams['savefig.facecolor'] = rcParams['figure.facecolor']
    rcParams['savefig.edgecolor'] = rcParams['figure.edgecolor']
    rcParams['savefig.dpi'] = rcParams['figure.dpi']
    xlim(0, max(cnt.values()) * 2)  # was `max(cnt.values()*2)`
    ylim(0, len(cnt) * 2)
    kbuf = sorted(cnt.keys())
    xticks(range(int(xlim()[0]), int(xlim()[1]), 2), rotation=45)
    yticks(range(int(ylim()[0]), int(ylim()[1]), 2), kbuf, rotation=-45)
    vbuf = [cnt[c] for c in kbuf]
    grid()
    for n, w in zip(range(len(vbuf) + 1), vbuf):
        barh(n * 2, w, height=1.5, left=0, align='center')
    """
    bar(range(1, len(vbuf)+1), height=vbuf, width=[1]*len(vbuf),
        bottom=[0]*len(vbuf), align='center')  # orientation='horizontal')
    # hist(vbuf, bins=range(1, len(vbuf)+1), #rwidth=1, bottom=0,
    #      align='mid', orientation='horizontal', alpha=0.7)
    """
    title(titlestr)
    xlabel('Characters Count')
    ylabel('Essay Characters')
    # show()
    savefig(config['/img']['tools.staticdir.dir'] + '/'
            + titlestr.replace(' ', '-').lower(),
            bbox_inches='tight', pad_inches=0)
def plot_occupancy(occupancy, offset=0.0, cm=None, n_cages=None,
                   n_animals=None, label_left=None):
    if cm is None:
        cm = default_cm
    # [enter, exit, cage, animal]
    # get all animals
    aids = numpy.unique(occupancy[:, 3])
    aids.sort()
    if n_animals is None:
        n_aids = len(aids)
    else:
        n_aids = n_animals
    # find # of cages
    if n_cages is None:
        n_cages = len(numpy.unique(occupancy[:, 2]))
    # give each cage a color
    colors = {
        cid: cm(cid / float(n_cages - 1.0))
        for cid in numpy.arange(n_cages)
    }
    bar_height = 1. / n_aids
    # plot each animal
    for (i, aid) in enumerate(aids):
        # get occupancy for this animal
        ao = occupancy[occupancy[:, 3] == aid]
        # add label
        ty = i * bar_height + offset
        tx = ao[0, 0] if label_left is None else label_left
        pylab.text(tx, ty, str(aid), ha='right', va='center', color='k')
        # barh(bottom, width, height, left, **kwargs)
        cs = [colors[b] for b in ao[:, 2]]
        l = numpy.ones_like(ao[:, 1] - ao[:, 0]) * i * bar_height + offset
        pylab.barh(l, ao[:, 1] - ao[:, 0], bar_height, ao[:, 0],
                   color=cs, linewidth=0)
    yl = pylab.ylim()
    ylmin = min(yl[0], offset)
    ylmax = max(yl[1], 1 + offset)
    if yl != (ylmin, ylmax):
        pylab.ylim(ylmin, ylmax)
def plot_importance(clf, train_df, features):
    feature_importance = clf.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, train_df[features].columns[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    pl.show()
def question_a():
    logging.info("<Question A> Plotting histogram")
    # dicts containing count of files of the given type
    train_count = {}
    test_count = {}
    for i in range(len(proc_train_set.target)):
        if train_set.target_names[train_set.target[i]] in train_count:
            train_count[train_set.target_names[train_set.target[i]]] += 1
        else:
            train_count[train_set.target_names[train_set.target[i]]] = 1
    for i in range(len(test_set.target)):
        if test_set.target_names[test_set.target[i]] in test_count:
            test_count[test_set.target_names[test_set.target[i]]] += 1
        else:
            test_count[test_set.target_names[test_set.target[i]]] = 1
    # plot histogram for number of documents vs. topic name
    pl.figure(1)
    # horizontal bars: counts run along x, topic names along y
    pl.xlabel('Number of Documents')
    pl.ylabel('Topic Name')
    yloc = pl.arange(len(train_count.keys()))
    pl.title('Histogram of Number of Documents Per Topic')
    pl.yticks(yloc, train_count.keys())
    pl.barh(yloc, list(train_count.values()), align='center', color='green')
    pl.tight_layout()
    # get number of docs of each category
    CT_count_train = 0
    CT_count_test = 0
    RA_count_train = 0
    RA_count_test = 0
    for i in category_CT:
        CT_count_train += train_count[i]
        CT_count_test += test_count[i]
    for j in category_RA:
        RA_count_test += test_count[j]
        RA_count_train += train_count[j]
    logging.info('Computer Technology - train data: {0}'.format(CT_count_train))
    logging.info('Computer Technology - test data: {0}'.format(CT_count_test))
    logging.info('Recreational Activity - train data: {0}'.format(RA_count_train))
    logging.info('Recreational Activity - test data: {0}'.format(RA_count_test))
    pl.show()
def question_a():
    logger.info("EXECUTING: QUESTION A")
    logger.info("Plotting histogram of the number of documents per topic (Training Dataset)")
    count_train = {}
    count_test = {}
    # count the number of documents for each topic name in training dataset
    for record in range(len(train_dataset.target)):
        if train_dataset.target_names[train_dataset.target[record]] in count_train:
            count_train[train_dataset.target_names[train_dataset.target[record]]] += 1
        else:
            count_train[train_dataset.target_names[train_dataset.target[record]]] = 1
    # count the number of documents for each topic name in testing dataset
    for record in range(len(test_dataset.target)):
        if test_dataset.target_names[test_dataset.target[record]] in count_test:
            count_test[test_dataset.target_names[test_dataset.target[record]]] += 1
        else:
            count_test[test_dataset.target_names[test_dataset.target[record]]] = 1
    logger.info("Histogram plotted")
    # plot histogram for number of documents vs. topic name
    pl.figure(1)
    pl.ylabel('Topic Name')
    jet = pl.get_cmap('jet')
    pl.xlabel('Number of Topics')
    pos = pl.arange(len(count_train.keys())) + 0.5
    pl.title('Histogram of Number of Documents Per Topic')
    pl.yticks(pos, list(count_train.keys()))
    pl.barh(pos, list(count_train.values()), align='center',
            color=jet(np.linspace(0, 1.0, len(count_train))))
    # count number of documents in CT and RA classes
    train_CT, train_RA, test_CT, test_RA = 0, 0, 0, 0
    for i, j in zip(category_CT, category_RA):
        train_CT += count_train[i]
        train_RA += count_train[j]
        test_CT += count_test[i]
        test_RA += count_test[j]
    logger.info("TRAINING DATASET")
    logger.info("Number of Documents in Computer Technology : {}".format(train_CT))
    logger.info("Number of Documents in Recreational Activity : {}".format(train_RA))
    logger.info("TESTING DATASET")
    logger.info("Number of Documents in Computer Technology : {}".format(test_CT))
    logger.info("Number of Documents in Recreational Activity : {}".format(test_RA))
    pl.show()
def summary_xyplot(df, var):
    # random forest
    features = np.array(df.loc[:, df.columns != var].describe().keys())  # `.ix` in older pandas
    clf = RandomForestClassifier()
    clf.fit(df[features], df[var])
    importances = clf.feature_importances_
    sorted_idx = np.argsort(importances)
    padding = np.arange(len(features)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
    return pl.show()
def plot_feature_importances(features, feature_importances):
    df = pd.DataFrame(feature_importances, index=features)
    # `df.sort(...)` in older pandas; sort rows by importance, descending
    df.sort_values(by=df.columns[0], ascending=False, inplace=True)
    df.columns = ['feature_importances']
    pos = np.arange(0, len(features)) + 0.5
    plt.figure(figsize=(20, 12))
    plt.barh(pos, df.feature_importances, color='darkorange', align='center')
    plt.yticks(pos, df.index)
    plt.xlabel('Importance')
    plt.title('Feature Importances')
    plt.axis([0, 0.25, 0, 12])
    #plt.show()
    plt.savefig('RF_featureimportances_2Species_3.png')
def drawChips(pl, df, df_close, title=""):
    """Draw a histogram along the price axis: the coordinates show price
    levels and the values are position (chip) ratios.
    df: df_chips
    """
    pl.figure()  # original had `pl.figure` without the call parentheses
    pl.subplot(121)
    pl.title(title)
    df_close['c'].plot()
    pl.subplot(122)
    chips = df[df.columns[1]].values
    pl.barh(df[df.columns[0]].values, chips)
    pl.show()
    pl.close()
def stacking_evaluation(Train, Test, comparative, treshold, fileModel,
                        label='FRAUDE', beta=2):
    yTrain = Train[label]
    xTrain = Train
    del xTrain[label]
    names = Train.columns.values.tolist()
    fileNames = np.array(names)
    from utils.model_utils import over_sampling
    xTrain, yTrain = over_sampling(xTrain, yTrain, model='ADASYN')
    fileModel.fit(xTrain.values, yTrain.values)
    y_hat_test = fileModel.predict_proba(Test.drop(label, axis=1).values)
    df_proba = pd.DataFrame(y_hat_test, index=Test.index)
    df_proba = pd.concat([Test, df_proba], axis=1)
    df_proba.columns = ['VALOR REAL', 'VALOR_PREDICHO_NO_FRAUDE',
                        'VALOR_PREDICHO_FRAUDE']
    df_proba.to_csv('final_files\\probabilidades_stacking.csv', sep=';',
                    index=False, encoding='latin1')
    y_hat_test = np.delete(y_hat_test, 0, axis=1)
    y_hat_test = (y_hat_test > treshold).astype(int)
    y_hat_test = y_hat_test.tolist()
    y_hat_test = [item for sublist in y_hat_test for item in sublist]
    print('Final threshold: %.3f' % treshold)
    print('Test Recall Score: %.3f'
          % recall_score(y_pred=y_hat_test, y_true=Test[label].values))
    print('Test Precision Score: %.3f'
          % precision_score(y_pred=y_hat_test, y_true=Test[label].values))
    print('Test F2 Score: %.3f'
          % fbeta_score(y_pred=y_hat_test, y_true=Test[label].values, beta=beta))
    for i in comparative.columns.values.tolist():
        if i != 'id_siniestro' and i in Test.columns.values.tolist():
            del comparative[i]
    Test = pd.merge(Test, comparative, how='left', on='id_siniestro')
    cnf_matrix = confusion_matrix(Test['FRAUDE_Clusters'].values, y_hat_test)
    plot_confusion_matrix(cnf_matrix, classes=['No Fraude', 'Fraude'],
                          title='Confusion matrix')
    cnf_matrix = confusion_matrix(Test['FRAUDE'].values, y_hat_test)
    plot_confusion_matrix(cnf_matrix, classes=['Normal', 'Anormal'],
                          title='Confusion matrix')
    featureImportance = fileModel.feature_importances_
    featureImportance = featureImportance / featureImportance.max()
    sorted_idx = np.argsort(featureImportance)
    barPos = np.arange(sorted_idx.shape[0]) + 0.5
    plot.barh(barPos, featureImportance[sorted_idx], align='center')
    plot.yticks(barPos, fileNames[sorted_idx])
    plot.xlabel('Variable Importance')
    plot.show()
def symhist(x1, x2, bins):
    '''
    symmetric histogram of two data sets
    >>> symhist(np.random.randn(100), np.random.randn(100)+1, np.linspace(-3, 4, 15))
    >>> plt.show()
    '''
    bw = bins[1] - bins[0]
    a1 = np.histogram(x1, bins=bins, density=True)  # `normed=` in older numpy
    plt.barh(bins[:-1], -a1[0], ec='w', fc='y', height=bw, lw=0.1)
    a2 = np.histogram(x2, bins=bins, density=True)
    plt.barh(bins[:-1], a2[0], ec='w', fc='y', height=bw, lw=0.1)
    xmax = max(plt.xlim())
    plt.xlim([-xmax, xmax])
    plt.ylim([bins[0], bins[-1]])
    ax = plt.gca()
def plot_rfid_events(events, timerange=None, ymin=-0.5, ymax=0.5, color='k',
                     label=False, animals=None):
    rfid = db.sel(events, event='rfid', timerange=timerange, data1=0)
    if len(rfid) == 0:
        return
    if animals is None:
        animals = numpy.unique(rfid[:, consts.DATA0_COLUMN])
    na = animals.size
    cs = numpy.arange(na) / (na - 1.)
    for (a, c) in zip(animals, cs):
        c = pylab.cm.jet(c)
        ae = db.sel(rfid, data0=a)
        if len(ae) == 0:
            continue
        pylab.vlines(ae[:, consts.TIME_COLUMN], ymin, ymax, color=c)
    return
    # --- everything below is unreachable legacy code kept from the original ---
    rfid = db.sel(events, event='rfid', timerange=timerange)
    if len(rfid) == 0:
        return
    # remove any read errors?
    #rfid = rfid[:, 4] >= 0
    #pylab.vlines(rfid[:, consts.TIME_COLUMN], ymin, ymax, color=color)
    idi = numpy.where(rfid[:, 4] == 0)[0]
    if idi[0] == 0:
        idi = idi[1:]
    if idi[-1] == rfid.shape[0] - 1:
        idi = idi[:-1]
    si = rfid[idi - 1]
    ei = rfid[idi + 1]
    assert numpy.all(si[:, 3] == 1)
    assert numpy.all(ei[:, 3] == 0)
    n = si.shape[0]
    b = numpy.ones(n) * ymin
    h = numpy.ones(n) * (ymax - ymin)
    w = ei[:, 0] - si[:, 0]
    l = si[:, 0]
    pylab.barh(b, w, h, l, color='pink')
    rfid_y = (ymin + ymax) * 0.5
    if not label:
        for ev in rfid[idi]:
            pylab.text(ev[consts.TIME_COLUMN], rfid_y,
                       '%s' % ev[consts.RFID_ID_COLUMN], color='k')
def display_importance(df, label, features):
    '''
    Given dataframe, label, and list of features,
    plot a graph to rank variable importance
    '''
    clf = RandomForestClassifier()
    clf.fit(df[features], df[label])
    importances = clf.feature_importances_
    sorted_idx = np.argsort(importances)
    padding = np.arange(len(features)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, np.asarray(features)[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
def plotCandidatoPor(atributo, candidatos):
    # `is` on string literals replaced with `==`: identity comparison against
    # strings is implementation-dependent and a bug here
    if atributo == 'partido':
        conjunto = [c.partido[1]['sigla'] for c in candidatos]
    elif atributo == 'ocupacao':
        conjunto = [c.ocupacao[0]['ocupacao'] for c in candidatos]
    elif atributo == 'cargo':
        conjunto = [c.cargo[0]['cargo'] for c in candidatos]
    elif atributo == 'grauInstrucao':
        conjunto = [c.grauInstrucao[0]['grauInstrucao'] for c in candidatos]
    elif atributo == 'estado':
        conjunto = [c.estado[1]['uf'] for c in candidatos]
    elif atributo == 'coligacao':
        conjunto = [c.coligacao[0]['coligacao'] for c in candidatos]
    elif atributo == 'estadoCivil':
        conjunto = [c.estadoCivil[0]['estadoCivil'] for c in candidatos]
    elif atributo == 'nacionalidade':
        conjunto = [c.nacionalidade[0]['nacionalidade'] for c in candidatos]
    elif atributo == 'situacao':
        conjunto = [c.situacao[0]['situacao'] for c in candidatos]
    elif atributo == 'sexo':
        conjunto = [c.sexo[0]['sexo'] for c in candidatos]
    elif atributo == 'resultadoEleicao':
        conjunto = [c.resultadoEleicao[0]['resultadoEleicao'] for c in candidatos]
    elif atributo == 'estadoNascimento':
        conjunto = [c.cidadeNascimento[1]['estado'][1]['uf'] for c in candidatos]
    elif atributo == 'cidadeNascimento':
        conjunto = [c.cidadeNascimento[0]['cidade'] for c in candidatos]
    s = [(x, len(list(y))) for x, y in groupby(sorted(conjunto))]
    s = sorted(s, key=lambda x: x[1])
    siglas = [x[0] for x in s]
    qtd = [x[1] for x in s]
    posicoesY = pylab.arange(len(siglas)) + .5
    posicoesX = qtd
    pylab.title('number of candidates by ' + atributo)
    pylab.barh(posicoesY, posicoesX, align='center')
    pylab.grid(True)
    pylab.yticks(posicoesY, tuple(siglas))
    pylab.ylabel(atributo)
    pylab.xlabel('number of candidates')
    y = 0
    for x in posicoesX:
        pylab.text(x + 5, posicoesY[y] - .5, x)
        y += 1
    pylab.show()
def plot_correlations(filename, names, x, as_text, colourscheme, mid0, invertsign):
    marker_to_colour = dict()
    marker_to_r = dict()
    marker_to_p = dict()
    with open(filename, 'rU') as infh:
        for line in infh:
            p = line.rstrip('\r\n').split('\t')
            marker = p[2].rstrip('+')
            if marker in marker_to_r:
                continue  # might change to raising an exception
            marker_to_r[marker] = float(p[4])
            pval = float(p[0])
            qval = float(p[1])
            marker_to_p[marker] = pval
            if marker_to_r[marker] > 0:
                marker_to_colour[marker] = (colourscheme[2] if pval > 0.05
                                            else colourscheme[1] if qval > 0.05
                                            else colourscheme[0])
            else:
                marker_to_colour[marker] = (colourscheme[5] if pval > 0.05
                                            else colourscheme[4] if qval > 0.05
                                            else colourscheme[3])
    for y, name in enumerate(names):
        if name == 'CD3' and name not in marker_to_colour:
            name = 'CD3e'
        if as_text:
            pylab.text(x, y, '%.2f' % marker_to_r[name],
                       color=marker_to_colour[name])
        elif mid0:  # original read `o.mid0`, but `o` is undefined; the parameter is `mid0`
            pylab.barh(y, marker_to_r[name] / (-2.0 if invertsign else 2.0),
                       color=marker_to_colour[name], linewidth=0, left=x + 0.5)
        else:
            pylab.barh(y, abs(marker_to_r[name]),
                       color=marker_to_colour[name], linewidth=0, left=x)
            if marker_to_p[name] <= 0.05:
                if marker_to_r[name] >= 0:
                    pylab.text(x + abs(marker_to_r[name]), y + 0.20, '+')
                else:
                    pylab.text(x + abs(marker_to_r[name]) + 0.01, y + 0.20, '-')
def plot_occupancy2(occupancy, offset=0.0, cm=None, n_cages=None, n_animals=None):
    if cm is None:
        if hasattr(pylab.cm, 'viridis'):
            cm = pylab.cm.viridis
        else:
            cm = pylab.cm.winter
    # [enter, exit, cage, animal]
    # give each animal a color
    aids = numpy.unique(occupancy[:, 3])
    aids.sort()
    if n_animals is None:
        n_aids = len(aids)
    else:
        n_aids = n_animals
    colors = {
        aid: cm(v) for (aid, v) in zip(aids, numpy.linspace(0., 1., n_aids))
    }
    # find # of cages
    if n_cages is None:
        n_cages = len(numpy.unique(occupancy[:, 2]))
    bar_height = 1. / n_aids
    # plot each animal
    for (i, aid) in enumerate(aids):
        # get occupancy for this animal
        ao = occupancy[occupancy[:, 3] == aid]
        # barh(bottom, width, height, left, **kwargs)
        pylab.barh(ao[:, 2] + i * bar_height + offset,
                   ao[:, 1] - ao[:, 0],
                   bar_height,
                   ao[:, 0],
                   color=colors[aid])
    # draw cage dividers
    for i in range(n_cages + 1):
        pylab.axhline(i + offset, color='k')
    yl = pylab.ylim()
    ylmin = min(yl[0], offset)
    ylmax = max(yl[1], n_cages + offset)
    if yl != (ylmin, ylmax):
        pylab.ylim(ylmin, ylmax)
def rfparameters(df, label, clf):
    features = np.array(df.loc[:, df.columns != label].describe().keys())  # `.ix` in older pandas
    print('Running RF')
    clf.fit(df[features], df[label])
    print('Plotting and Recording')
    importances = clf.feature_importances_
    # take the top 10; the original `[:10]` selected the 10 *least* important
    sorted_idx = np.argsort(importances)[-10:]
    padding = np.arange(10) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
    best_features = features[sorted_idx][::-1]
    ddf = pd.DataFrame(data={'Top Features by RF': best_features})
    return pl.savefig('importanceRF.png'), ddf.to_csv('importanceRF.txt', sep='\t')
def __init__(self, tree):
    import pylab
    import numpy as np
    items = sorted(tree.walk(), key=lambda item: item.cost)
    costs = [x.cost for x in items]
    names = [x.name for x in items]
    pos = np.arange(0, len(costs)) + 0.5
    pylab.barh(pos, costs, align="center")
    pylab.yticks(pos, names)
    pylab.subplots_adjust(left=0.5)
    pylab.show()
def plot_most_significant(labels):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(nndocs)
    chi2score = chi2(X, labels)[0]
    figure(figsize=(6, 6))
    wscores = list(zip(vectorizer.get_feature_names(), chi2score))
    wchi2 = sorted(wscores, key=lambda x: x[1])
    topchi2 = list(zip(*wchi2[-10:]))
    x = [i for i in range(len(topchi2[1]))]
    label = topchi2[0]
    barh(x, topchi2[1], align='center', alpha=.2, color='g')
    plot(topchi2[1], x, '-o', markersize=2, alpha=.8, color='g')
    yticks(x, label)
    xlabel(r'$\chi^2$')
    show()
def find_features(df, features):
    '''
    Use scikit-learn lib to determine which variables are the best at
    predicting risk. Then, from the calculated importances, order them
    from most to least important and make a barplot so we can visualize
    what is/isn't important
    '''
    clf = RandomForestClassifier()
    clf.fit(df[features], df[DEP_VAR])
    importances = clf.feature_importances_
    sorted_idx = np.argsort(importances)
    padding = np.arange(len(features)) + 0.5
    pl.barh(padding, importances[sorted_idx], align='center')
    pl.yticks(padding, features[sorted_idx])
    pl.xlabel("Relative Importance")
    pl.title("Variable Importance")
def plot_bar_chart(self, y_labels, x):
    x = x[::-1]
    y_pos = np.arange(len(y_labels))
    y_labels = y_labels[::-1]
    pylab.figure(figsize=(15, 5))
    pylab.barh(y_pos, x, align='center')
    pylab.yticks(y_pos, y_labels)
    for i, v in enumerate(x):
        pylab.text(v + 0.005, i, str(round(v, 3)),
                   color='black', fontweight='bold')
    pylab.xlabel(self.label)
    pylab.title(self.file_name)
    pylab.savefig('./figures/' + self.file_name + '_bar_chart' + self.ext)
    pylab.show()
def plot_histogram(freq, mean):
    # using dict comprehensions to remove not frequent words
    topwords = {word: count
                for word, count in freq.items()
                if count > round(8 * mean)}
    sorted_alpha = collections.OrderedDict(sorted(topwords.items()))
    # plotting
    y = list(sorted_alpha.values())
    x = range(len(y))
    # take the labels from the sorted dict so they line up with the values;
    # the original used the unordered `topwords.keys()`
    labels = list(sorted_alpha.keys())
    barh(x, y, align='center')
    yticks(x, labels)
    show()
def add_bar(self, fname, cname):
    fname = cdir + fname
    pnames = open(fname + '.paramnames').readlines()
    for i in range(3):
        loglsx = open(fname + '_' + str(i + 1) + '.maxlike').readlines()
        if (i == 0):
            logls2 = loglsx
        else:
            if float(loglsx[0].split()[1]) < float(logls2[0].split()[1]):
                logls2 = loglsx
    logls = logls2[0].split(' ')[2:]
    chi2d = {}
    for pname, logl in zip(pnames, logls):
        if "_like" in pname:
            ppname = pname.split(' ')
            if 'Betoule' in ppname[0]:
                chi2 = -2 * float(logl) - 30
                print("bchi2=", chi2)
            else:
                chi2 = -2 * float(logl)
            xname = ppname[0].replace('_like', '')
            chi2d[xname] = chi2
    left = 0
    for xname in nlist:
        chi2 = chi2d[xname]  # /defdof[xname]
        color = colors[xname]
        PP = pylab.barh(self.cy - 0.25, chi2, left=left, height=0.5,
                        color=color, linewidth=0)
        self.patches[xname] = PP[0]
        left += chi2
    self.ys.append(self.cy)
    self.cy += 1
    self.names.append(cname)
def _create_histogram(M_c, data, columns, mc_col_indices, filename):
    dir = S.path.web_resources_data_dir
    full_filename = os.path.join(dir, filename)
    num_rows = data.shape[0]
    num_cols = data.shape[1]
    p.figure()
    # col_i goes from 0 to number of predicted columns
    # mc_col_idx is the original column's index in M_c
    for col_i in range(num_cols):
        mc_col_idx = mc_col_indices[col_i]
        data_i = data[:, col_i]
        # subplot indices are 1-based; the original passed `col_i` directly
        ax = p.subplot(1, num_cols, col_i + 1, title=columns[col_i])
        if M_c['column_metadata'][mc_col_idx]['modeltype'] == 'normal_inverse_gamma':
            p.hist(data_i, orientation='horizontal')
        else:
            str_data = [du.convert_code_to_value(M_c, mc_col_idx, code)
                        for code in data_i]
            unique_labels = list(set(str_data))
            np_str_data = np.array(str_data)
            counts = []
            for label in unique_labels:
                counts.append(sum(np_str_data == label))
            num_vals = len(M_c['column_metadata'][mc_col_idx]['code_to_value'])
            rects = p.barh(range(num_vals), counts)
            heights = np.array([rect.get_height() for rect in rects])
            ax.set_yticks(np.arange(num_vals) + heights / 2)
            ax.set_yticklabels(unique_labels)
    p.tight_layout()
    p.savefig(full_filename)
def updateChart(self, ranks):
    teams = []   # y axis
    points = []  # x axis
    for rank in reversed(ranks):
        # generating axes values
        teams.append(rank.name)
        points.append(rank.points)
    pos = arange(len(teams)) + .5  # the bar centers on the y axis
    figure(1)
    barh(pos, points, align='center')  # used horizontal bar graph
    yticks(pos, teams)
    xlabel('Points')
    ylabel('Team')
    title('Ranking')
    grid(True)
    # saving chart in pics folder to show it later
    savefig("pics/chart.png", dpi=60)
    # don't forget to clear the figure to make a blank start for the next chart
    clf()
def updateChart(self, ranks):
    teams = []
    points = []
    for rank in reversed(ranks):
        teams.append(rank["team"])
        points.append(rank["pt"])
    pos = arange(len(teams)) + .5  # the bar centers on the y axis
    figure(1)
    barh(pos, points, align='center')
    yticks(pos, teams)
    xlabel('Points')
    ylabel('Team')
    title('Ranking')
    grid(True)
    savefig("pics/overviewChart.png", dpi=60)  # saving into a different pic
    clf()
def add_bar(self, fname, cname, model):
    chiT, dof = 0, 0
    fname = cdir + fname
    pnames = open(fname + '.paramnames').readlines()
    if 'Neff' in fname:
        loglsx = open(fname + '.maxlike').readlines()
        logls2 = loglsx
    else:
        for i in range(3):
            loglsx = open(fname + '_' + str(i + 1) + '.maxlike').readlines()
            if (i == 0):
                logls2 = loglsx
            else:
                if float(loglsx[0].split()[1]) < float(logls2[0].split()[1]):
                    logls2 = loglsx
    logls = logls2[0].split(' ')[2:]
    chi2d = {}
    print(' ')
    print('++++' + model)
    for pname, logl in zip(pnames, logls):
        if "_like" in pname:
            ppname = pname.split(' ')
            if 'SPlanck' in ppname[0]:
                chi2 = 0
            else:
                if 'Neff' in fname:
                    chi2 = float(logl)
                    if 'Betoule' in ppname[0]:
                        chi2 = chi2 - 692
                else:
                    chi2 = -2 * float(logl)
                    if 'Betoule' in ppname[0]:
                        chi2 = chi2 - 30
            chiT += chi2
            xname = ppname[0].replace('_like', '')
            print(xname, chi2)
            chi2d[xname] = chi2
    print('Min_chi2 = ', chiT + 30)
    param = mdof[model]
    for xname in nlist:
        dof += defdof[xname]
    left = 0
    for xname in nlist:
        chi2 = chi2d[xname]
        color = colors[xname]
        PP = pylab.barh(self.cy - 0.25, chi2, left=left, height=0.5,
                        color=color, linewidth=0)
        self.patches[xname] = PP[0]
        left += chi2
    if "SN" in dataset:
        pylab.text(position, self.cy - 0.25,
                   r' %.2f / %s' % (chiT + 30, dof - param + 30), fontsize=15)
    else:
        pylab.text(position, self.cy - 0.25,
                   r' %.2f / %s' % (chiT, dof - param), fontsize=15)
    self.ys.append(self.cy)
    self.cy += 1
    self.names.append(cname)
def plot_multiedge_graph(self, cmap="jet"): """Creates a multiedge graph and plots it :param cmap: a valid color map from matplotlib. jet, spring, hot, ... :return: CNOGraphMultiEdges object .. plot:: :include-source: :width: 50% # Get list of names from msdas import * from easydev import gsf m = MassSpecReader() m.read_annotations(gsf("msdas", "data", "YEAST_annotations_small.pkl")) n = network.NetworkFromUniProt(a.annotations) names = list(set(m.df.Protein)) n = network.CombineNetworks( {"Curated": gsf("msdas", "data", "PKN-yeastScaffold.sif"), "UniProt": "PKN-uniprot.sif", "PhosPho": "PKN-phospho.sif"}, signals=names[:], stimuli=["a", "NaCl"]) c = n.plot_multiedge_graph() c.plot() """ N = len(self.labels) values = pylab.linspace(.1,.9, N) # build network c = self.get_multiedge_graph() c.plot(edge_attribute="edgecolor", edge_attribute_labels=False, cmap=cmap) # #build legend for i, label in enumerate(self.labels): print label, c._get_hex_color_from_value(values[i], cmap) pylab.barh(0,0,1,color=c._get_hex_color_from_value(values[i], cmap), label=label) pylab.legend(title="edge legend", fontsize="small", loc="lower right") return c
def plot_results(regr, params, X_test, y_test, feature_names):
    """
    Plot the results from boosting iterations and feature evaluations,
    using PyLab.
    """
    ###########################################################################
    # Plot training deviance
    # Compute test set deviance
    """
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(regr.staged_decision_function(X_test)):
        test_score[i] = regr.loss_(y_test, y_pred)
    best = np.argmin(test_score)
    print "optimal", best, test_score[best]
    """
    pl.figure(figsize=(12, 10))
    pl.subplot(1, 2, 1)
    """
    pl.title('Deviance')
    pl.plot(np.arange(params['n_estimators']) + 1, regr.train_score_, 'b-',
            label='Training Set Deviance')
    pl.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
            label='Test Set Deviance')
    pl.legend(loc='upper right')
    pl.xlabel('Boosting Iterations')
    pl.ylabel('Deviance')
    """
    ###########################################################################
    # Plot feature importance
    feature_importance = regr.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, feature_names[sorted_idx])
    pl.xlabel('Relative Importance')
    pl.title('Feature Importance')
    pl.savefig('./working/foo.png', bbox_inches='tight')
def do_fit(train_path, model_path, test_path):
    params = {'n_estimators': 1500, 'max_depth': 3, 'min_samples_split': 4,
              'min_samples_leaf': 1, 'random_state': None,
              'do_consider_correct': 1, 'learn_rate': 0.1,
              'n1': 10000, 'n2': 1, 'n3': 100000, 'tau': 0.01}
    ranker = GradientBoostingRanker(**params)
    print('loading data...')
    X, dr, sr, groups = load_dataset(train_path)
    test_X, test_dr, test_sr, test_groups = load_dataset(test_path)
    print('starting fit...')
    ranker.fit(X, dr, sr, groups, test_X, test_dr, test_sr, test_groups)
    # ranker.fit(X, dr, sr, groups)
    # print(ranker.train_score_)
    pl.figure(figsize=(12, 6))
    pl.subplot(1, 2, 1)
    pl.title('Deviance')
    pl.plot(np.arange(params['n_estimators']) + 1, ranker.train_score_, 'b-',
            label='Training Set Deviance')
    pl.plot(np.arange(params['n_estimators']) + 1, ranker.oob_score_, 'r-',
            label='Test Set Deviance')
    pl.legend(loc='upper right')
    pl.xlabel('Boosting Iterations')
    pl.ylabel('Deviance')
    # Plot feature importance
    feature_importance = ranker.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    pl.subplot(1, 2, 2)
    pl.barh(pos, feature_importance[sorted_idx], align='center')
    pl.yticks(pos, np.array(range(len(feature_importance))))
    pl.xlabel('Relative Importance')
    pl.title('Variable Importance')
    print(feature_importance)
    print('storing to %s' % model_path)
    joblib.dump(ranker, model_path, 3)
    pl.show()