def get_hara_stats(df):
    # Plot the per-feature relative standard deviation of the Haralick features.
    # Expects a dataframe with fgHaralick (13-element vectors) and breed columns.
    x = list(range(1, 14))
    xs = []
    haraFG = []
    breed = []
    for i in range(df.shape[0]):
        a = df.iloc[i]
        xs.append(x)
        haraFG.append(a.fgHaralick)
        breed.append([a.breed] * 13)
    newDF = pd.DataFrame(columns=['Haralick feature', 'Haralick FG feature value', 'breed'])
    newDF['Haralick feature'] = np.array(xs).flatten()
    newDF['Haralick FG feature value'] = np.array(haraFG).flatten()
    newDF['breed'] = np.array(breed).flatten()
    stds = []
    for i in x:
        vals = newDF[newDF['Haralick feature'] == i]['Haralick FG feature value']
        stds.append(vals.std() / vals.mean())
    data = np.vstack((np.array(x), np.array(stds))).T
    pltDF = pd.DataFrame(columns=['Haralick feature', 'relative standard deviation'], data=data)
    sns.lmplot(x='Haralick feature', y='relative standard deviation', data=pltDF, fit_reg=False)
    plt.xticks(x)
    plt.show()
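# A more idiomatic variant of the relative-standard-deviation loop above;
# a minimal sketch, assuming a long-form frame shaped like newDF:
def hara_relative_std(newDF):
    grp = newDF.groupby('Haralick feature')['Haralick FG feature value']
    return grp.std() / grp.mean()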
def finalLayer(self, X, y, epochs=1):
    print("Final Layer")
    V = self.predict(X)
    softmax = Layers.SoftmaxLayer(self.Layers[-1].W.shape[1], y.shape[1])
    ###########################
    # Final layer of THE MODEL
    ###########################
    batchsize = X.shape[0]
    softmax, dA_avg_perm = Trainer().train([softmax], V, y, epochs, batchsize)
    self.Layers.append(softmax[0])
    # Format the per-epoch costs for plotting
    LLdata = [float(L) for L in dA_avg_perm]
    LLiter = [float(it) for it in range(epochs)]
    dfpredata = pd.DataFrame(LLdata)
    dfpredata.columns = ['dA_avg_costs']
    dfpredata['iter'] = LLiter
    # Plot the cost we minimize during training (the negative log likelihood)
    plt.figure()
    sns.lmplot(x='iter', y='dA_avg_costs', data=dfpredata, fit_reg=False)
    plt.xlabel('epoch', fontsize=14)
    plt.ylabel('softmax error', fontsize=14)
    plt.title('softmax_train_epochs_' + str(epochs), fontsize=9)
    plt.savefig('grid_searchResults/' + 'softmax_train_' + str(self.item) + '.png')
def fine_tune(self, X, y, epochs=1):
    print("Fine Tuning")
    ###########################
    # Fine-tune THE MODEL
    ###########################
    batchsize = X.shape[0]
    self.Layers, dA_avg_perm = Trainer().train(self.Layers, X, y, epochs, batchsize)
    # Format the per-epoch costs for plotting
    LLdata = [float(L) for L in dA_avg_perm]
    LLiter = [float(it) for it in range(epochs)]
    dfinedata = pd.DataFrame(LLdata)
    dfinedata.columns = ['dA_avg_costs']
    dfinedata['iter'] = LLiter
    # Plot the cost we minimize during training (the negative log likelihood)
    plt.figure()
    sns.lmplot(x='iter', y='dA_avg_costs', data=dfinedata, fit_reg=False)
    plt.xlabel('epoch', fontsize=14)
    plt.ylabel('finetune error', fontsize=14)
    plt.title('fine_tune_structure_' + str([self.structure]) + '_train_epochs_' + str(epochs), fontsize=9)
    plt.savefig('grid_searchResults/' + 'fine_tune_structure_' + str(self.item) + '.png')
def visualize_data(self):
    """
    Transform the DataFrame to the 2-dimensional case and visualize the data.
    The first tags are used as labels.
    :return:
    """
    logging.debug("Preparing visualization of DataFrame")
    # Reduce dimensionality to 2 features for visualization purposes
    X_visualization = self.reduce_dimensionality(self.X, n_features=2)
    df = self.prepare_dataframe(X_visualization)
    # Set X and Y coordinates for each article
    df['X coordinate'] = df['coordinates'].apply(lambda x: x[0])
    df['Y coordinate'] = df['coordinates'].apply(lambda x: x[1])
    # Create a list of markers; each tag gets its own marker
    n_tags_first = len(self.df['tags_first'].unique())
    markers_choice_list = ['o', 's', '^', '.', 'v', '<', '>', 'D']
    markers_list = [markers_choice_list[i % 8] for i in range(n_tags_first)]
    # Create scatter plot
    sns.lmplot(x="X coordinate", y="Y coordinate", hue="tags_first", data=df,
               fit_reg=False, markers=markers_list, scatter_kws={"s": 150})
    # Adjust borders and add title (sns.plt was removed from seaborn; use pyplot directly)
    sns.set(font_scale=2)
    plt.title('Visualization of TMT articles in a 2-dimensional space')
    plt.subplots_adjust(right=0.80, top=0.90, left=0.12, bottom=0.12)
    # Show plot
    plt.show()
def deal_rs():
    data_set = pd.read_csv('datas/result.csv')
    data_set.columns = ['AvH', 'AvD', 'AvA', 'Hc', 'Dc', 'Ac', 'R', 'P']
    sns.set(style='ticks')
    sns.lmplot(x='R', y='P', data=data_set)
    plt.show()
def view_timeline(df, x="unix_time", y="rate", plt=plt):
    asset = df.symbol.values[0]
    plt.figure(1, figsize=(15, 15))
    sns.lmplot(x=x, y=y, hue="type", data=df, palette=dict(sell="r", buy="g"))
    plt.ylim(df[y].min(), df[y].max())
    plt.title(y + ' over time (' + asset + ')')
    plt.show()
def plot_compare_median_consensus(output_dir, df_order, metric, type='ts', DISPLAY=0):
    plt.figure()
    if type == 'ts':
        # sb.tsplot(data=df_order, value=metric, time='order', unit="algorithm",
        #           condition="algorithm", err_style="unit_traces")
        ax = sb.boxplot(x=metric, y="algorithm", data=df_order, whis=np.inf, color="c")
        # Add in points to show each observation
        sb.stripplot(x=metric, y="algorithm", data=df_order, jitter=True,
                     size=3, color=".3", linewidth=0)
        ax.set_xscale("log")
        sb.despine(trim=True)
        # plt.xlabel('images sorted by the average neuron distance of the median reconstruction')
        plt.savefig(output_dir + '/ts_compare_median_with_consensus_' + metric + '.png', format='png')
    if type == 'lm':
        sb.lmplot(x="order", y=metric, hue="algorithm", data=df_order)
        plt.xlabel('images sorted by the average neuron distance of the median reconstruction')
        plt.savefig(output_dir + '/lm_compare_median_with_consensus_' + metric + '.lm.png', format='png')
    if DISPLAY:
        plt.show()
    plt.close()
def vcf_stats(vcfin, outdir, sample):
    inp = vcf.Reader(open(vcfin))
    variants = list()
    # splitext(...)[0] keeps the file name without its extension
    basename = os.path.splitext(os.path.basename(vcfin))[0]
    genotype = {'0/0': 'Homozygous Reference', '0/1': 'Heterozygous',
                '1/1': 'Homozygous Alternate', '1/2': 'Non Reference Heterozygous'}
    for lines in inp:
        try:
            var = {'Chrom': lines.CHROM, 'Pos': lines.POS, 'Ref': lines.REF,
                   'Alt': ','.join([str(alt) for alt in lines.ALT]),
                   'Sample': sample,
                   'Genotype': genotype[lines.genotype(sample)['GT']],
                   'Depth_at_reference': lines.genotype(sample)['AD'][0],
                   'Depth_at_alternate': lines.genotype(sample)['AD'][1]}
            variants.append(var)
        except KeyError:
            continue
    variants = pd.DataFrame(variants)
    plt.figure()
    sns.set(style='ticks', context='talk')
    sns.lmplot(x='Depth_at_reference', y='Depth_at_alternate', hue='Genotype',
               data=variants, fit_reg=False)
    max_depth = max([max(variants.Depth_at_reference), max(variants.Depth_at_alternate)])
    plt.xlim([0, max_depth])
    plt.ylim([0, max_depth])
    plt.xlabel('Depth at reference allele')
    plt.ylabel('Depth at alternate allele')
    plt.title('Allelic depth distribution')
    plt.savefig(basename + '_allele_depth')
    plt.close()
    return
def plot_building_temp():
    sns.set_context("paper", font_scale=1.5)
    b = 'AZ0000FF'
    s = 'KTUS'
    filelist = glob.glob(os.getcwd() + '/csv_FY/testWeather/{0}*.csv'.format(b))
    dfs = [pd.read_csv(csv) for csv in filelist]
    col = 'eui_gas'
    dfs2 = [df[[col, 'month', 'year']] for df in dfs]
    df3 = pd.concat(dfs2)
    temp = pd.read_csv(os.getcwd() + '/csv_FY/weather/weatherData_meanTemp.csv')
    temp['year'] = temp['Unnamed: 0'].map(lambda x: float(x[:4]))
    temp['month'] = temp['Unnamed: 0'].map(lambda x: float(x[5:7]))
    temp.set_index(pd.DatetimeIndex(temp['Unnamed: 0']), inplace=True)
    temp = temp[[s, 'month', 'year']]
    joint2 = pd.merge(df3, temp, on=['year', 'month'], how='inner')
    joint2.to_csv(os.getcwd() + '/csv_FY/testWeather/test_temp.csv', index=False)
    sns.lmplot(x=s, y=col, data=joint2, col='year', fit_reg=False)
    plt.xlim((joint2[s].min() - 10, joint2[s].max() + 10))
    plt.ylim((0, joint2[col].max() + 0.1))
    P.savefig(os.getcwd() + '/csv_FY/testWeather/plot/scatter_temp_byyear.png', dpi=150)
    plt.close()
    joint2 = joint2[(2012 < joint2['year']) & (joint2['year'] < 2015)]
    sns.regplot(x=s, y=col, data=joint2, fit_reg=False)
    plt.xlim((joint2[s].min() - 10, joint2[s].max() + 10))
    plt.ylim((0, joint2[col].max() + 0.1))
    P.savefig(os.getcwd() + '/csv_FY/testWeather/plot/scatter_temp_1314.png', dpi=150)
    plt.close()
def plot_scatter_n_accuracy_joint(self, data_objects, labels, label_self, markers):
    """Make plot from this and other data objects.

    Args:
        data_objects ([Data]): Other Data objects to include in plot.
        labels ([str]): Labels to use for Data objects.
        label_self (str): Label to use for this Data object.

    Returns:
        Axis object.
    """
    dataframes = [self.df] + [data.df for data in data_objects]
    labels = [label_self] + labels
    statistics = []
    for df, label in zip(dataframes, labels):
        acc = df.groupby('worker')['correct'].mean()
        n = df.groupby('worker')['question'].count()
        df_new = pd.concat([acc, n], axis=1)
        df_new['dataset'] = label
        statistics.append(df_new)
    df = pd.concat(statistics, axis=0)
    sns.lmplot(x='question', y='correct', data=df, hue='dataset',
               markers=markers, fit_reg=False)
    plt.xlabel('Number of questions answered')
    plt.ylabel('Accuracy')
    plt.xlim((0, None))
    plt.ylim((0, 1))
    plt.title('')
    return plt.gca()
def plot_data(data, has_label=True):
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.manifold import TSNE
    from sklearn.decomposition import PCA

    if not has_label:
        data = data.copy()
        data['label'] = np.zeros(len(data))

    LIMIT = 4000
    if data.shape[0] > LIMIT:
        dt = data.sample(n=LIMIT, replace=False)
        X = dt.iloc[:, :-1]
        labels = dt.iloc[:, -1]
    else:
        X = data.iloc[:, :-1]
        labels = data.iloc[:, -1]

    tsne_model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    points1 = tsne_model.fit_transform(X)
    df1 = pd.DataFrame(data=np.column_stack([points1, labels]), columns=["x", "y", "class"])
    sns.lmplot(x="x", y="y", data=df1, hue='class', fit_reg=False,
               palette=sns.color_palette('colorblind'))
    plt.title('t-SNE')

    pca = PCA(n_components=2)
    pca.fit(X)
    points2 = pca.transform(X)
    df2 = pd.DataFrame(data=np.column_stack([points2, labels]), columns=["x", "y", "class"])
    sns.lmplot(x="x", y="y", data=df2, hue='class', fit_reg=False,
               palette=sns.color_palette('colorblind'))
    plt.title('PCA')
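# A minimal usage sketch for plot_data(); the demo frame here is hypothetical,
# but matches the layout the function expects (feature columns first, label last):
import numpy as np
import pandas as pd
demo = pd.DataFrame(np.random.rand(60, 3), columns=['f1', 'f2', 'f3'])
demo['label'] = np.random.randint(0, 2, size=60)
plot_data(demo, has_label=True)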
def fig2(ppl, fname):
    '''For each contact, plot number of characters sent and received. (UNUSED)'''
    sns.lmplot(x="lensent", y="lenrec", data=ppl)
    plt.xlabel('Characters Sent')
    plt.ylabel('Characters Received')
    sns.despine()
    plt.savefig(fname)
def show_examples(idxs, printStd=True):
    # Print example dataset entries for the supplied indexes, idxs,
    # and plot the foreground Haralick features.
    x = list(range(1, 14))
    xs = []
    hara = []
    breed = []
    for idx in idxs:
        a = hNt.iloc[idx]
        xs.append(x)
        hara.append(np.log(abs(a.fgHaralick)))
        breed.append([a.breed] * 13)
        if printStd:
            print('breed:', a.breed)
            print('filename:', a.file)
            print('foreground Haralick:', a.fgHaralick)
            print('background Haralick:', a.bgHaralick)
    newDF = pd.DataFrame(columns=['Haralick feature', 'log(Haralick feature value)', 'breed'])
    newDF['Haralick feature'] = np.array(xs).flatten()
    newDF['log(Haralick feature value)'] = np.array(hara).flatten()
    newDF['breed'] = np.array(breed).flatten()
    newDF.sort_values(by='breed', inplace=True)
    sns.lmplot(x='Haralick feature', y='log(Haralick feature value)', data=newDF,
               fit_reg=False, hue='breed')
    plt.xticks(x)
    plt.show()
def relationship_spearman_size_source(dir, model="logreg3", feats="ecfps1", dset="bcrp"):
    """
    Plots the relationship between the size of the source and the average relevant
    Spearman correlation coefficient. One point per source on the plot.
    """
    small_dict = defaultdict(list)
    # List all spearman files
    for f in glob.glob(op.join(dir, "spearmans_*")):
        if "relfeats" in op.basename(f):
            source = op.basename(f).partition("_lso_relfeats_")[2].partition("_logreg")[0]
            print(source)
            small_dict["source"].append(source)
            small_dict["size"].append(len(ManysourcesDataset(dset).mols().sources2molids([source])))
            with open(f, "rb") as reader:
                dict_spearman = pickle.load(reader)
            spearmans = [x[0] for x in dict_spearman.values()]
            small_dict["average spearman"].append(np.mean(np.array(spearmans)))
    df = pd.DataFrame.from_dict(small_dict)
    import seaborn
    seaborn.set_style("ticks")
    seaborn.set_context("talk")
    seaborn.lmplot(
        x="size",
        y="average spearman",
        data=df,
        scatter_kws={"marker": "o", "color": "slategray"},
        line_kws={"linewidth": 1, "color": "seagreen"},
    )
    plt.show()
def _corr(self, sel, suffix):
    formula = str('model_accuracy ~ human_accuracy')
    logreg = smf.logit(formula=formula, data=sel).fit()
    summ = logreg.summary()
    if self.html is None:
        print(summ)
    else:
        summ = summ.as_html().replace('class="simpletable"', 'class="simpletable table"')
    sel = sel.rename(columns={'human_accuracy': 'human accuracy',
                              'model_accuracy': 'model accuracy'})
    sns.lmplot(x='human accuracy', y='model accuracy', data=sel, x_jitter=.01,
               y_jitter=.05, logistic=True, truncate=True)
    bins = np.digitize(sel['human accuracy'], np.arange(.05, 1, .1))
    # bins[bins==11] = 10
    count = sel['model accuracy'].groupby(bins).count()
    mean = sel['model accuracy'].groupby(bins).mean()
    plt.scatter(.1 * mean.index, mean, s=10 * count, c='.15', linewidths=0, alpha=.8)
    plt.title(models.NICE_NAMES[self.model_name])
    plt.xlim([-.1, 1.1])
    plt.ylim([-.1, 1.1])
    self.show(pref='corr_sil', suffix=self.model_name + '_' + suffix,
              caption=suffix + summ)
def aucplot(cls, df):
    # size= was renamed height= in seaborn 0.9
    seaborn.lmplot(data=df, x="b", y="auc", lowess=True, height=5, aspect=2)
def plot(data, total, title, width=800.0, unit='', dosort=True, target=None, target2=None):
    """A HTML bar plot given a dictionary and max value."""
    if len(data) > 30 and target is not None:
        df = pandas.DataFrame(index=data)
        df[title] = pandas.Series(data, index=df.index)
        df[target.name] = target.loc[df.index]
        if target2 is not None:
            df[target2.name] = target2.loc[df.index]
        if numpy.issubdtype(target.dtype, numpy.number):
            if target2 is None:
                seaborn.jointplot(x=target.name, y=title, data=df, kind='reg')
            else:
                seaborn.lmplot(x=target.name, y=title, data=df, hue=target2.name)
        else:  # X-axis is categorical
            df.sort_values(by=target.name, inplace=True)
            if target2 is None:
                seaborn.barplot(x=target.name, y=title, data=df)
            else:
                seaborn.barplot(x=target.name, y=title, data=df, hue=target2.name)
        fig = plt.gcf()
        fig.autofmt_xdate()
        # Convert to D3, SVG, javascript etc.
        # import mpld3
        # result = mpld3.fig_to_html(plt.gcf(), template_type='general',
        #                            use_http=True)
        # Convert to PNG
        figfile = io.BytesIO()
        plt.savefig(figfile, format='png')
        result = '<div><img src="data:image/png;base64, %s"/></div>' % (
            base64.b64encode(figfile.getvalue()).decode('utf8'))
        plt.clf()
        return result
    result = ['<div class=barplot>',
              ('<text style="font-family: sans-serif; font-size: 16px; ">'
               '%s</text>' % title)]
    if target is not None:
        data = OrderedDict([(key, data[key])
                            for key in target.sort_values().index if key in data])
    keys = {key.split('_')[0] if '_' in key else key[0] for key in data}
    color = {}
    if len(keys) <= 5:
        color.update(zip(keys, range(1, 6)))
    keys = list(data)
    if dosort:
        keys.sort(key=data.get, reverse=True)
    for key in keys:
        result.append('<br><div style="width:%dpx;" class=b%d></div>'
                      '<span>%s: %g %s</span>' % (
                          int(round(width * data[key] / total)) if data[key] else 0,
                          color.get(key.split('_')[0] if '_' in key else key[0], 1)
                          if data[key] else 0,
                          htmlescape(key), data[key], unit,))
    result.append('</div>\n')
    return '\n'.join(result)
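# A minimal usage sketch for plot(); the dict below is hypothetical and, having
# fewer than 30 entries and no target series, exercises the HTML bar-plot branch
# (which only requires the module-level htmlescape helper):
html = plot({'cats': 10, 'dogs': 25, 'birds': 5}, total=25, title='Pets', unit='obs')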
def plot_complexity(models=ALL_MODELS, save_path='../resources/cached_model_grid_scores.csv'):
    grid = pd.read_csv(save_path)
    grid = grid[grid['model_names'].isin(models)]
    plt.figure(figsize=(12, 12))
    sns.lmplot(data=grid, x='time_to_train', y='params', hue='model_names',
               fit_reg=False, legend=False)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='lower right', borderaxespad=0.)
    plt.show()
def plotScatterLabelled(data, x_param, y_param, huey, output_path, output_directory, output_filename):
    sns.lmplot(x=x_param, y=y_param, data=data, hue=huey, fit_reg=False)
    output_ = "%s/%s/%s" % (output_path, output_directory, output_filename)
    try:
        plt.savefig(output_)
    except IOError:
        os.makedirs('%s/%s/' % (output_path, output_directory))
        plt.savefig(output_)
    plt.close()
def grafico_l2(conjunto, xl=None, yl=None, titulox="", tituloy="", titulo="",
               filename="", tamanho=5):
    a = np.array(conjunto[0].map(_dic_cruzes))
    b = np.array(conjunto[1].map(_dic_cruzes))
    c = DataFrame([a, b]).transpose()
    c.columns = ["A", "B"]
    sns.lmplot(x="A", y="B", data=c, x_jitter=0.2, y_jitter=0.3, height=tamanho)
    plt.title(titulo, fontsize=16)
    # sns.axlabel was removed from seaborn; label the axes directly
    plt.xlabel(titulox, fontsize=fontetamanho)
    plt.ylabel(tituloy, fontsize=fontetamanho)
    plt.savefig(filename)
def seabornScatterPlot(data, xName, yName, titleIn):
    '''
    seabornScatterPlot plots a scatter plot using seaborn.

    :param data: DataFrame holding the x and y columns
    :param xName: name of the x axis column
    :param yName: name of the y axis column
    :param titleIn: plot title
    '''
    sns.lmplot(x=xName, y=yName, data=data, palette="Set1", fit_reg=False)
    plt.title(titleIn)
def lm_plot(df, dep_var, indep_var, grpby, units):
    if grpby:
        seaborn.lmplot(x=indep_var, y=dep_var, data=df, hue=grpby, fit_reg=False)
    else:
        seaborn.lmplot(x=indep_var, y=dep_var, data=df, fit_reg=False)
    # would be great to figure out how to remove '_cat'
    plt.xlabel(indep_var)
    plt.ylabel(dep_var + ", " + units)
    plt.title("Scatterplot of " + dep_var + " versus " + indep_var)
    plt.savefig(wd + "Scatterplot_" + dep_var + "_vs_" + indep_var + '.png')
    plt.close()
def plot_avg_width_exp(mark="H3K4me3"):
    def get_90quantile(arr):
        arr.sort()
        return arr[int(0.9 * len(arr))]

    gene_id, EID_list, exp_matrix = get_gene_exp_matrix()
    _, len_dict = get_len_num(mark)
    quantile_arr = np.array([get_90quantile(len_dict[EID]) for EID in EID_list])
    gene_avg = np.mean(exp_matrix, axis=0)
    print(quantile_arr)
    print(gene_avg)
    result = pd.DataFrame({'quantile90': quantile_arr, "gene_avg": gene_avg})
    sns.lmplot(x='quantile90', y='gene_avg', data=result)
    plt.show()
def plot_scatter_n_accuracy(self):
    ax = plt.gca()
    acc = self.df.groupby('worker')['correct'].mean()
    n = self.df.groupby('worker')['question'].count()
    condition = self.df.groupby('worker')['condition'].first()
    df = pd.concat([acc, n, condition], axis=1)
    sns.lmplot(x='question', y='correct', data=df, hue='condition', fit_reg=False)
    plt.xlabel('Number of questions answered')
    plt.ylabel('Accuracy')
    plt.xlim((0, None))
    plt.ylim((0, 1))
    plt.title('')
    return ax
def createScatter(self, event):
    dlg = GraphDialog(self.parent, "Scatterplot Input", ("X", "Y"), size=(700, 200), groups=False)
    regress = wx.CheckBox(dlg, label="Add Regression Polynomial?")
    regress.SetValue(True)
    jitter = wx.CheckBox(dlg, label="Jitter?")
    jitter.SetValue(False)
    dlg.Add(jitter)
    ci = dlg.AddSpinCtrl("Confidence (>=100 for None)", 0, 101, 95)
    order = dlg.AddSpinCtrl("Polynomial Degree", 1, 10, 1)
    regress.Bind(wx.EVT_CHECKBOX,
                 lambda e: ci.Enable(regress.GetValue()) and order.Enable(regress.GetValue()))
    dlg.Add(regress)
    if dlg.ShowModal() == wx.ID_OK:
        ds = dlg.GetName()
        dlg.Destroy()
        regress, ci = regress.GetValue(), ci.GetValue()
        order, jitter = order.GetValue(), jitter.GetValue()
        data = self.parent.data[list({b for bs in ds for b in bs})].astype(float)
        snData = pd.DataFrame()
        for x, y in ds:
            # Reshape into long form for seaborn
            d = {"x": data[x], "y": data[y], "group": np.repeat(y, len(data[x]))}
            d = pd.DataFrame(d)
            snData = pd.concat([snData, d], ignore_index=True)
        if jitter:
            xjitter = snData["x"].std() / 4
            yjitter = snData["y"].std() / 4
        else:
            xjitter, yjitter = 0, 0
        try:
            if ci < 100 and regress:
                sns.lmplot(x="x", y="y", data=snData, hue="group", ci=ci, order=order,
                           x_jitter=xjitter, y_jitter=yjitter)
            else:
                sns.lmplot(x="x", y="y", data=snData, fit_reg=regress, ci=None, order=order,
                           x_jitter=xjitter, y_jitter=yjitter)
            plt.show()
        except np.RankWarning:
            dlg = wx.MessageDialog(self.parent, "Polynomial Degree Too High",
                                   style=wx.OK | wx.ICON_ERROR)
            dlg.ShowModal()
            dlg.Destroy()
            plt.show()
def latmeanbw(self):
    # Take the log of the bandwidth array for better marker sizing
    self.load_block_times()
    normbws = np.array(self.df.bandwidths)
    g = sns.lmplot(x="latencies", y="means", data=self.df[['latencies', 'means']],
                   scatter_kws={"s": np.log2(normbws) * 10, "alpha": .5})
    g.set(ylim=(0, 400))
    g = self.with_title(g)
def bttime(self):
    print('loading block time vs time...')
    # Get block_time rows for the most recent run
    self.cur.execute('SELECT timestamp, time, runid FROM block_times '
                     'where runid=(select max(runid) from runs)')
    rows = self.cur.fetchall()
    rid = (rows[0][2],)
    # Get a tuple reflecting the run config to show under the graph
    self.cur.execute('SELECT * FROM runs where runid=?', rid)
    config = self.cur.fetchone()
    config = map(str, config)
    names = [i[0] for i in self.cur.description]
    desc = str(list(zip(names, config)))
    timestamps = []
    times = []
    for ts, time, rid in rows:
        timestamps.append(ts)
        times.append(time)
    timedf = pd.DataFrame.from_dict({'timestamps': timestamps, 'times': times})
    # Scale the nanosecond timestamps (dividing by 1e6 gives milliseconds)
    timedf['timestamps'] = timedf['timestamps'].astype(float) / (1000 * 1000)
    g = sns.lmplot(x="timestamps", y="times", data=timedf)
    print(desc)
    g.ax.set_title(self.wl)
    g.set_axis_labels("time (seconds)", "block times (ms)")
def latdur(self):
    print('latency vs duration')
    filtered = util.lock_float_field(self.df, 'bandwidths', self.bws)
    if filtered is None:
        return self.latmeanbw()
    g = sns.lmplot(x="latencies", y="durations",
                   data=filtered[['latencies', 'durations', 'bandwidths']].astype(float),
                   col='bandwidths')
def draw_boundary(power, l):
    """
    power: polynomial power for mapped feature
    l: lambda constant
    """
    density = 1000
    threshold = 2 * 10**-3

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshold)

    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6,
               fit_reg=False, scatter_kws={"s": 100})
    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
sns.boxplot(data=[data1, data2], whis=np.inf)
plt.show()

sns.violinplot(data=[data1, data2])
plt.show()

# Narrow the bandwidth.
sns.violinplot(data=data2, bw=0.01)
plt.show()

# Seaborn ships with sample datasets.
tips = sns.load_dataset("tips")
tips.head()

sns.lmplot(x="total_bill", y="tip", data=tips, height=10)
plt.show()

# Plot parameters can be changed per component.
sns.lmplot(x="total_bill", y="tip", data=tips,
           scatter_kws={'marker': 'o', 'color': 'indianred', 's': 10},
           line_kws={'linewidth': 1, 'color': 'blue'})
plt.show()

# A fourth-order polynomial regression curve can be fitted as well.
sns.lmplot(x="total_bill", y="tip", data=tips, order=4,
           scatter_kws={"marker": "o", "color": "indianred", "s": 8},
           line_kws={"linewidth": 1, "color": "blue"})
plt.show()
    return data_mean, lower, upper


if __name__ == "__main__":
    df = pd.read_csv('./salaries.csv')
    data = df.values.T[1]
    boots = []
    for i in range(100, 100000, 1000):
        boot = boostrap(data, data.shape[0], i)
        boots.append([i, boot[0], "mean"])
        boots.append([i, boot[1], "lower"])
        boots.append([i, boot[2], "upper"])

    df_boot = pd.DataFrame(boots, columns=['Boostrap Iterations', 'Mean', "Value"])
    sns_plot = sns.lmplot(x=df_boot.columns[0], y=df_boot.columns[1], data=df_boot,
                          fit_reg=False, hue="Value")
    sns_plot.axes[0, 0].set_ylim(0, )
    sns_plot.axes[0, 0].set_xlim(0, 100000)

    sns_plot.savefig("bootstrap_confidence.png", bbox_inches='tight')
    sns_plot.savefig("bootstrap_confidence.pdf", bbox_inches='tight')

    # print("Mean: %f" % np.mean(data))
    # print("Var: %f" % np.var(data))
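# The boostrap() helper called above is defined earlier in the file (only its
# return statement appears in this excerpt). A minimal sketch of a percentile
# bootstrap of the mean with the same (mean, lower, upper) contract, assuming
# a 95% interval:
import numpy as np

def boostrap_sketch(data, n, iterations, ci=95):
    # Resample with replacement, record each resample's mean, and take the
    # percentile interval of those means.
    means = [np.mean(np.random.choice(data, size=n, replace=True))
             for _ in range(iterations)]
    lower = np.percentile(means, (100 - ci) / 2)
    upper = np.percentile(means, 100 - (100 - ci) / 2)
    return np.mean(means), lower, upper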
def learn(self, message):
    try:
        for i in range(len(self.connections)):
            print('start learning')
            # Load the saved learning file
            learningFileName = 'learning_1_saved.sav'
            df = joblib.load(learningFileName)
            # Plot the DataFrame (df) loaded from that file
            sb.lmplot(x='x', y='y', data=df, fit_reg=False,
                      scatter_kws={"s": 150}, hue="cluster")
            plt.title('Before')
            # Get the last index of the existing df
            lastIndex = len(df.index) - 1
            # Extract the numbers from the message and append a new row,
            # with the initial cluster label set to 0.
            Xtext = message[message.index('_') + 1:message.index(',')]
            Ytext = message[message.index(',') + 1:message.index('.')]
            inputX = int(Xtext)
            inputY = int(Ytext)
            df.loc[lastIndex + 1] = [inputX, inputY, 0]
            print('Received input: ', inputX, ', ', inputY)
            # Re-run the clustering
            newpoints = df.values
            kmeans = KMeans(n_clusters=5).fit(newpoints)
            # Attach the new 'cluster' labels and plot the learning result
            df['cluster'] = kmeans.labels_
            sb.lmplot(x='x', y='y', data=df, fit_reg=False,
                      scatter_kws={"s": 150}, hue="cluster")
            plt.title('After')
            # Read the center of each cluster
            clusterData = kmeans.cluster_centers_
            # Sort the centers along the X axis
            sortedCluster = clusterData[clusterData[:, 0].argsort()]
            # Drop the third column (the cluster number)
            finalCutCluster = np.delete(sortedCluster, np.s_[2], axis=1)
            # Save the final results (learning file + cluster centers)
            joblib.dump(df, learningFileName)
            np.save('clusterCenter', finalCutCluster)
            print("Learning is complete!")
            # Finally, send a success message back to the client
            newMessage = "complete\n"
            print(type(newMessage))
            self.connections[i].sendall(newMessage.encode())
            print("(", newMessage, ") has been sent to client")
    except:
        pass
def plot_iris(iris, col1, col2):
    sns.lmplot(x=col1, y=col2, data=iris, hue="Species", fit_reg=False)
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.title("Iris species shown by colour")
    plt.show()
g.add_legend()

# Difference between M & W split fractions
sns.kdeplot(data.split_frac[data.gender == 'M'], label='men', shade=True)
sns.kdeplot(data.split_frac[data.gender == 'W'], label='women', shade=True)
plt.xlabel('split_frac')

# Bimodal distribution among M & W: violin plot
# sns.violinplot?  # IPython help
sns.violinplot(x='gender', y='split_frac', data=data,
               palette=['lightblue', 'lightpink'])

# Violin plot as a function of gender and age
data['age_dec'] = data.age.map(lambda age: 10 * (age // 10))
data.head()
sns.violinplot(x='age_dec', y='split_frac', data=data,
               palette=['lightblue', 'lightpink'])
# OR
sns.violinplot(x='age_dec', y='split_frac', hue='gender', data=data,
               palette=['lightblue', 'lightpink'])
# OR
sns.violinplot(x='age_dec', y='split_frac', hue='gender', data=data,
               split=True, inner='quartile', palette=['lightblue', 'lightpink'])
# OR
men = (data.gender == 'M')
women = (data.gender == 'W')
with sns.axes_style(style=None):
    sns.violinplot(x="age_dec", y="split_frac", hue="gender", data=data,
                   split=True, inner="quartile", palette=["lightblue", "lightpink"])

# Runners over 80
(data.age > 80).sum()

# lmplot fits a linear regression to the data automatically
g = sns.lmplot(x='final_sec', y='split_frac', col='gender', data=data,
               markers=".", scatter_kws=dict(color='c'))
g.map(plt.axhline, y=0.1, color="k", ls=":")
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

# Use a raw string so the backslashes in the Windows path are not treated as escapes
customers = pd.read_csv(r'H:\Ecommerce prediction\Datasets\dt1.csv')
customers.head()

sns.set_palette("GnBu_d")
sns.set_style('whitegrid')
sns.jointplot(x='Time on Website', y='Yearly Amount Spent', data=customers)
sns.jointplot(x='Time on App', y='Yearly Amount Spent', data=customers)
sns.jointplot(x='Time on App', y='Length of Membership', kind="hex", data=customers)
sns.pairplot(customers)
sns.lmplot(x='Length of Membership', y='Yearly Amount Spent', data=customers)
plt.show()
Print the min and max of each feature.
"""
columns = df_dataset.columns
for num in range(1, 31):
    print('Min value of ', columns[num], ' is', min(df_dataset[columns[num]]),
          ' and max value is ', max(df_dataset[columns[num]]), '\n')

# Find the best number of neighbors
find_best_K(10, dataset)

# See how the model scores depending on the number of folds.
fold_num_and_accuracy(11, dataset)

df_dataset.dtypes

# Plot tumor characteristics, split into benign and malignant classes.
df = pd.read_csv(
    'C:/Users/user/Desktop/ERGASIES_&_ARXEIA/Διαχείριση_Γνώσης_2/data.csv')
sns.lmplot(x='radius_mean', y='texture_mean', hue='diagnosis', data=df)
sns.lmplot(x='perimeter_mean', y='smoothness_mean', hue='diagnosis', data=df)
sns.lmplot(x='area_mean', y='compactness_mean', hue='diagnosis', data=df)

# Make predictions on data whose class we already know.
k_nearest_neighbors(dataset, dataset[0:10], 5)   # 1 error.
k_nearest_neighbors(dataset, dataset[0:20], 5)   # 3 errors.
k_nearest_neighbors(dataset, dataset[0:100], 5)  # 10 errors out of 100.
k_nearest_neighbors(dataset, dataset[0:200], 5)  # 16 errors.

add_new_patient_data_and_predict(df_dataset, dataset, 10)
add_new_patient_data_and_predict(df_dataset, dataset, 20)
add_new_patient_data_and_predict(df_dataset, dataset, 30)
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')

boston = load_boston()
boston_df = DataFrame(boston['data'])
boston_df.columns = boston['feature_names']
boston_df['Price'] = boston['target']
print(boston_df.head())

# plt.hist(boston['target'], bins=50)
# plt.scatter(boston['data'][:, 5], boston['target'])
# plt.xlabel('Prices in $1000s')
# plt.ylabel('Number of houses')
# plt.show()

sns.lmplot(x='RM', y='Price', data=boston_df)
plt.show()
# sns.lmplot(x='Flour', y='Sugar', data=recipes, hue='Type', palette='Set1',
#            fit_reg=False, scatter_kws={"s": 70})
# plt.plot(xx, yy, linewidth=2, color='black')
# plt.plot(xx, yy_down, 'k--')
# plt.plot(xx, yy_up, 'k--')
# plt.show()

# Create a function to predict muffin or cupcake
def muffin_or_cupcake(flour, sugar):
    if modal.predict([[flour, sugar]]) == 0:
        print("Muffin")
    else:
        print("Cupcake")

muffin_or_cupcake(10, 55)

# Plot the predicted point against the decision boundary
sns.lmplot(x='Flour', y='Sugar', data=recipes, hue='Type', palette='Set1',
           fit_reg=False, scatter_kws={"s": 70})
plt.plot(xx, yy, linewidth=2, color='black')
plt.plot(10, 55, 'yo', markersize=9)
plt.show()
# Import Data
# ===========
#
# Load data from Google Trends.
data = pd.read_csv('data/GoogleTrendsData.csv', index_col='Date', parse_dates=True)
data.head()

# Show DJIA vs. debt-related query volume.
display_charts(data, chart_type="stock", title="DJIA vs. Debt Query Volume",
               secondary_y="debt")
seaborn.lmplot(x="debt", y="djia", data=data, height=7)

# Detect if search volume is increasing or decreasing in any given week by
# forming a moving average and testing if the current value crosses the
# moving average of the past 3 weeks.
#
# Let's first compute the moving average.
data['debt_mavg'] = data.debt.rolling(window=3, center=False).mean()
data.head()

# Since we want to see if the current value is above the moving average of the
# *preceding* weeks, we have to shift the moving average timeseries forward by one.
data['debt_mavg'] = data.debt_mavg.shift(1)
data.head()
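# The crossover test described above then reduces to a single comparison;
# 'rising' is a hypothetical column name used here for illustration:
data['rising'] = data.debt > data.debt_mavg  # True where volume crosses above its shifted 3-week average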
def association_userInput():
    print("\n===============================================================================")
    print("a. Fixed Acidity")
    print("b. Volatile Acidity")
    print("c. Citric Acid")
    print("d. Residual Sugar")
    print("e. Chlorides")
    print("f. Free Sulfur")
    print("g. Dioxide")
    print("h. Total Sulfur Dioxide")
    print("i. Density")
    print("j. pH")
    print("k. Sulphates")
    print("l. Alcohol")
    print("m. Quality")
    print("===============================================================================")
    print("\nPlease select two characteristics from above to test an association for (enter the letter)")
    print("Note: If one of the characteristics you want to test for is quality, it is "
          "recommended you choose this characteristic for characteristic 1.")

    # Map each menu letter to its column name
    characteristics = {"a": "fixed acidity", "b": "volatile acidity", "c": "citric acid",
                       "d": "residual sugar", "e": "chlorides", "f": "free sulfur",
                       "g": "dioxide", "h": "total sulfur dioxide", "i": "density",
                       "j": "pH", "k": "sulphates", "l": "alcohol", "m": "quality"}

    while True:
        choice1 = input("\nCharacteristic 1: ").lower().strip()
        if choice1 in characteristics:
            choice1 = characteristics[choice1]
            break
        print("\nYou must select only one menu choice from above by typing the letter. Please try again.")

    while True:
        choice2 = input("\nCharacteristic 2: ").lower().strip()
        if choice2 in characteristics:
            choice2 = characteristics[choice2]
            break
        print("\nYou must select only one menu choice from above by typing the letter. Please try again.")

    while True:
        wine_choice = input("\nWould you like to test for red or white wine? (enter 'red' or 'white'): ").strip().lower()
        if wine_choice in ("red", "white"):
            try:
                WineCharX = choice1
                WineCharY = choice2
                allWines = pd.read_csv('winequality-both.csv', sep=',', header=0)
                # The red and white branches are symmetric, so filter on the chosen type
                wine = allWines.loc[allWines['type'] == wine_choice, :]
                getCorr = scipy.stats.pearsonr(wine[WineCharX], wine[WineCharY])
                correlation = str(getCorr[0])
                pValue = str(getCorr[1])
                print("\nFor " + wine_choice + " wine, the correlation between " + WineCharX +
                      " and " + WineCharY + " is: " + correlation)
                print("With p-value of: " + pValue)
                seaborn.lmplot(x=WineCharX, y=WineCharY, data=wine)
                plt.xlabel(WineCharX)
                plt.ylabel(WineCharY)
                plt.title(wine_choice.capitalize() + " Wine: " + WineCharX + " X " + WineCharY)
                plt.show()
            except KeyError:
                print("\nError. Please check that the spelling of the wine characteristic "
                      "you wish to test is correct.")
            break
        print("\nYou must enter either 'red' or 'white' based on which wine you want to "
              "test associations for. Please try again.")

    while True:
        after = input("\nWould you like to test more associations or return to the main menu? "
                      "(enter 'test' or 'main'): ").lower().strip()
        if after == "test":
            association_userInput()
            break
        if after == "main":
            break
        print("\nYou must enter either 'test' or 'main' based on what you want to do. Please try again.")
medals_all = round(
    medals_all.groupby(['NOC', 'Country']).Medal_Perc.mean(), 2).reset_index()
medals_all.columns = ['NOC', 'Country', 'Medal_Perc']  # remove season

host_medals = games_total_df[['Year', 'Host_NOC', 'Host_Medal_Perc']]  # remove season, games
host_medals.columns = ['Year', 'NOC', 'Host_Medal_Perc']  # remove season, games

host_difference = pd.merge(host_medals, medals_all, how='left')
print(host_difference)
print(noc_total_df)

# Plot of the difference made by hosting
facet = sns.lmplot(data=host_difference, x='Medal_Perc', y='Host_Medal_Perc',
                   robust=True, palette=['C1'])
plt.plot([0, 15], [0, 15], color='black', linewidth=2, linestyle='dashed')
facet.ax.set_xticks(np.arange(0, 15, 2.5))
facet.ax.set_yticks(np.arange(0, 36, 2.5))
plt.text(8, 7, 'x=y')
facet.ax.ticklabel_format(useOffset=False)
facet.ax.set_xlim(left=0)
facet.ax.set_ylim(bottom=0)
plt.title('The difference in percentage of medals won by host countries')
plt.show()

# Get the top 20 countries
noc_colors = sns.color_palette("Paired", n_colors=11)
noc_colors[-1] = (0.0, 0.0, 0.0)
def main():
    # input_dir = "/Users/odedkushnir/Google Drive/Studies/PhD/Stretch_analysis"
    mutation_lst = ["A>G", "T>C", "G>A", "C>T"]
    # ["A>G", "T>C", "G>A", "C>T", "A>C", "T>G", "A>T", "T>A", "G>C", "C>G", "C>A", "G>T"]
    input_dir = "C:/Users/odedku/Stretch_analysis"
    for mutation in mutation_lst:
        mutation_in_stretch = 13
        output_dir = input_dir + "_{0}".format(mutation.replace(">", ""))
        try:
            os.mkdir(output_dir)
        except OSError:
            print("Creation of the directory {0} failed".format(output_dir))
        else:
            print("Successfully created the directory {0}".format(output_dir))

        prefix = "20201012_q38/all_parts.blast"
        p2_1 = pd.read_table(input_dir + "/p2_1/{0}".format(prefix), sep="\t")
        p2_2 = pd.read_table(input_dir + "/p2_2/{0}".format(prefix), sep="\t")
        p5_1 = pd.read_table(input_dir + "/p5_1/{0}".format(prefix), sep="\t")
        p5_2 = pd.read_table(input_dir + "/p5_2/{0}".format(prefix), sep="\t")
        p8_1 = pd.read_table(input_dir + "/p8_1/{0}".format(prefix), sep="\t")
        p8_2 = pd.read_table(input_dir + "/p8_2/{0}".format(prefix), sep="\t")
        p10_1 = pd.read_table(input_dir + "/p10_1/{0}".format(prefix), sep="\t")
        p10_2 = pd.read_table(input_dir + "/p10_2/{0}".format(prefix), sep="\t")
        p12_1 = pd.read_table(input_dir + "/p12_1/{0}".format(prefix), sep="\t")
        p12_2 = pd.read_table(input_dir + "/p12_2/{0}".format(prefix), sep="\t")
        barcode_data = pd.read_csv(input_dir + "/barcode/PrimerID_barcode_Results.csv")
        # Dictionary of passage and number of PrimerIDs
        data_dict = {"p2_1": [p2_1, 23507], "p2_2": [p2_2, 38726], "p5_1": [p5_1, 17903],
                     "p5_2": [p5_2, 12395], "p8_1": [p8_1, 8666], "p8_2": [p8_2, 9990],
                     "p10_1": [p10_1, 6068], "p10_2": [p10_2, 40623], "p12_1": [p12_1, 9668],
                     "p12_2": [p12_2, 11110]}
        control_id = 27962

        """NOT from memory"""
        passage_lst = glob.glob(input_dir + "/p*")
        for passage in passage_lst:
            passage_num = passage.split("\\")[-1]
            try:
                os.mkdir(output_dir + "/{0}".format(passage_num))
                os.mkdir(output_dir + "/{0}/20201012_q38".format(passage_num))
            except OSError:
                print("Creation of the directory {0}/{1}/20201012_q38 failed".format(output_dir, passage_num))
            else:
                print("Successfully created the directory {0}/{1}/20201012_q38".format(output_dir, passage_num))
        create_crosstab_df(input_dir, output_dir, prefix, data_dict, control_id, mutation,
                           mutation_in_stretch)

        """from memory"""
        passage_lst = glob.glob(input_dir + "/p*")
        crosstab_lst = []
        for passage in passage_lst:
            passage_num = passage.split("\\")[-1]
            # Filename spelling matches what create_crosstab_df writes
            crosstab_df = pd.read_pickle(output_dir + "/{0}/20201012_q38/corsstab_df.pkl".format(passage_num))
            crosstab_lst.append(crosstab_df)

        """Creation of the final tables and figs"""
        crosstab_df_all = pd.concat(crosstab_lst, axis=1)
        crosstab_df_all = crosstab_df_all[
            ["Control", "p2_1", "p2_2", "p5_1", "p5_2", "p8_1", "p8_2", "p10_1", "p10_2",
             "p12_1", "p12_2"]]
        crosstab_df_all = crosstab_df_all.iloc[0:4, 9:]
        crosstab_df_all = crosstab_df_all.transpose()
        crosstab_df_all["Stretch_percentage"] = \
            crosstab_df_all["No._of_reads_with_stretch_{0}".format(mutation)] / \
            (crosstab_df_all["No._of_reads_with_stretch_{0}".format(mutation)] +
             crosstab_df_all["No._of_reads_without_stretch_{0}".format(mutation)])
        crosstab_df_all["Stretch_percentage"] = crosstab_df_all["Stretch_percentage"] * 100
        crosstab_df_all.reset_index(inplace=True, drop=False)
        crosstab_df_all = crosstab_df_all.rename(columns={"index": "Sample"})
        crosstab_df_all = crosstab_df_all.merge(barcode_data, on="Sample", how="inner")
        crosstab_df_all["Hyper mutation read frequency/sequenced genome"] = \
            crosstab_df_all["Stretch_percentage"] / crosstab_df_all["PrimerID_barcode"]
        crosstab_df_all["Hyper mutation read frequency/sequenced genome"] = crosstab_df_all[
            "Hyper mutation read frequency/sequenced genome"].astype(float)
        crosstab_df_all["passage"] = np.where(
            crosstab_df_all["Sample"] != "Control",
            crosstab_df_all.apply(lambda x: str(x["Sample"]).split("_")[0].split("p")[-1], axis=1),
            0)
        crosstab_df_all["replica"] = np.where(
            crosstab_df_all["Sample"] != "Control",
            crosstab_df_all.apply(lambda x: str(x["Sample"]).split("_")[-1], axis=1),
            1)
        crosstab_df_all["passage"] = crosstab_df_all["passage"].astype(int)
        crosstab_df_all.to_csv(output_dir + "/crosstab_df_all.csv", sep=",")

        mean_crosstab_df_all = crosstab_df_all.groupby("passage", as_index=False).mean()
        mean_crosstab_df_all["sem"] = crosstab_df_all.groupby("passage", as_index=False).sem()[
            "Hyper mutation read frequency/sequenced genome"]
        mean_crosstab_df_all["PrimerID_barcode"] = round(mean_crosstab_df_all["PrimerID_barcode"])
        mean_crosstab_df_all.to_csv(output_dir + "/mean_crosstab_df_all.csv", sep=",")

        try:
            os.mkdir(output_dir + "/figs")
        except OSError:
            print("Creation of the directory {0}/figs failed".format(output_dir))
        else:
            print("Successfully created the directory {0}/figs".format(output_dir))

        slope1, intercept1, r_value1, p_value1, std_err1 = stats.linregress(
            crosstab_df_all['passage'], crosstab_df_all['Stretch_percentage'])
        fig1 = sns.lmplot(x="passage", y="Stretch_percentage", data=crosstab_df_all,
                          fit_reg=True, line_kws={'label': "Linear Reg"})
        fig1.set(xlabel="Passage", ylabel="Stretch Percentage [%]", xlim=(0, 12))
        ax = fig1.axes[0, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        label_line_1 = "y={0:.3g}x+{1:.3g}\nstderr={2:.3g} Rsq={3:.3g}".format(
            slope1, intercept1, std_err1, r_value1 ** 2)
        L_labels[0].set_text(label_line_1)
        plt.savefig(output_dir + "/figs/points.png", dpi=300)

        slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(
            mean_crosstab_df_all['passage'], mean_crosstab_df_all['Stretch_percentage'])
        fig2 = sns.lmplot(x="passage", y="Stretch_percentage", data=mean_crosstab_df_all,
                          fit_reg=True, line_kws={'label': "Linear Reg"})
        fig2.set(xlabel="Passage", ylabel="Stretch Percentage [%]", xlim=(0, 12))
        ax = fig2.axes[0, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        label_line_2 = "y={0:.3g}x+{1:.3g}\nstderr={2:.3g} Rsq={3:.3g}".format(
            slope2, intercept2, std_err2, r_value2 ** 2)
        L_labels[0].set_text(label_line_2)
        plt.savefig(output_dir + "/figs/mean.png", dpi=300)
df['Hour'] = df['timeStamp'].apply(lambda time: time.hour)
df['Month'] = df['timeStamp'].apply(lambda time: time.month)
df['Day of Week'] = df['timeStamp'].apply(lambda time: time.dayofweek)

dmap = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
df['Day of Week'] = df['Day of Week'].map(dmap)

sns.countplot(x='Day of Week', data=df, hue='Reason', palette='viridis')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

sns.countplot(x='Month', data=df, hue='Reason', palette='viridis')

byMonth = df.groupby('Month').count()
byMonth.head()
byMonth['twp'].plot()
sns.lmplot(x='Month', y='twp', data=byMonth.reset_index())

df['Date'] = df['timeStamp'].apply(lambda t: t.date())
df.groupby('Date').count()['twp'].plot()
plt.tight_layout()

df[df['Reason'] == 'Traffic'].groupby('Date').count()['twp'].plot()
plt.title('Traffic')
plt.tight_layout()

df[df['Reason'] == 'Fire'].groupby('Date').count()['twp'].plot()
plt.title('Fire')
plt.tight_layout()

df[df['Reason'] == 'EMS'].groupby('Date').count()['twp'].plot()
plt.title('EMS')
catMTCARS = ['gear', 'cyl', 'am', 'carb', 'vs']
mtcars[catMTCARS] = mtcars[catMTCARS].astype('category')

plt.figure(figsize=(5, 2))
sns.countplot(data=mtcars, x='gear')
plt.figure(figsize=(3, 5))
sns.countplot(data=mtcars, x='gear')
# ---
# needs one numeric
g = sns.catplot(data=mtcars, x='gear', y='mpg', hue='am')
g.fig.set_figheight(6)
g.fig.set_figwidth(3)
# ---
# For figure-level functions (e.g. sns.lmplot()), use the height and aspect parameters
# sns.catplot(data=mtcars, x='gear', y='mpg', hue='am', height=5, aspect=1/1)
# sns.countplot(data=mtcars, x='gear')
plt.gcf().set_size_inches(4, 3)
# ---
fig, ax = plt.subplots()
fig.set_size_inches(5, 4)  # the size of A4 paper
sns.violinplot(data=mtcars[['mpg', 'wt']], inner="points", ax=ax)
sns.despine()
def single_data(df):
    sns.lmplot(x='square', y='price', data=df, fit_reg=True)
    plt.show()
    print(df.head())
    print(df.info())
geo = geo[f1]
geo     # DataFrame of regions
lifeEx  # DataFrame of life expectancy

dataset1 = lifeEx.merge(geo, how='inner', on='CountryCode')
type(dataset1)

stats.columns = ['CountryName', 'CountryCode', 'BirthRate', 'InternetUsers', 'IncomeGroup']
type(dataset1)

merged_dataset = stats.merge(dataset1, how='inner', on='CountryCode')
merged_dataset

# Create the final dataset
final_dataset = merged_dataset[['CountryName_x', 'CountryCode', 'BirthRate', 'InternetUsers',
                                'IncomeGroup', 1960, 2013, 'Region']]
final_dataset.columns = ['CountryName', 'CountryCode', 'BirthRate', 'InternetUsers',
                         'IncomeGroup', 'Year_1960', 'Year_2013', 'Region']
final_dataset

# Visualizations
# Regression plot: life expectancy in 1960 vs BirthRate per Region
vis3 = sns.lmplot(x='BirthRate', y='Year_1960', data=final_dataset, fit_reg=False,
                  hue='Region', height=10, aspect=1)
# Regression plot: life expectancy in 2013 vs BirthRate per Region
vis3 = sns.lmplot(x='BirthRate', y='Year_2013', data=final_dataset, fit_reg=False,
                  hue='Region', height=10, aspect=1)
# Regression plot: BirthRate vs InternetUsers per Region
vis3 = sns.lmplot(x='BirthRate', y='InternetUsers', data=final_dataset, fit_reg=False,
                  hue='Region', height=10, aspect=1)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
# %matplotlib inline

# size= was renamed height= in seaborn 0.9
sns.lmplot(x='weight', y='mpg', data=df, fit_reg=False, aspect=1, height=5,
           hue='cylinders', col='origin')
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

sns.set()

df = pd.read_csv("times_bt_podas_mejor_caso.csv")
df_p = pd.read_csv("times_bt_podas_peor_caso.csv")

# Nanoseconds -> milliseconds
df['time'] /= 1000000.0
df_p['time'] /= 1000000.0

df['peor'] = df_p['time']
df.plot('n', ['time', 'peor'], title='Mejor vs Peor')  # best vs. worst case

r = np.corrcoef(df['time'], df['peor'])[0, 1]
print(r)
# out: r = 0.9366349410059747

ax1 = sns.lmplot(x='time', y='peor', data=df)
plt.xlabel("N")
plt.ylabel("tiempo")  # time
plt.show()
vectors_set.append([np.random.normal(3.0, 0.5), np.random.normal(1.0, 0.5)])

import matplotlib.pyplot as plt
# data-manipulation package
import pandas as pd
# visualization package
import seaborn as sns

# Plot the random data
df = pd.DataFrame({
    "x": [v[0] for v in vectors_set],
    "y": [v[1] for v in vectors_set]
})
sns.lmplot(x="x", y="y", data=df, fit_reg=False, height=6)
plt.show()

# K-means clustering into 4 groups
import tensorflow as tf

# Build a constant tensor from the random data
vectors = tf.constant(vectors_set)
# To pick K data points at random from the input, let TensorFlow shuffle the
# data and select the first K as centroids. The K points are stored as a 2-D tensor.
k = 4
centroids = tf.Variable(tf.slice(tf.random_shuffle(vectors), [0, 0], [k, -1]))

# Check the tensor shapes
print(vectors.get_shape())
print(centroids.get_shape())
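# Note: tf.random_shuffle and this graph-construction style are TensorFlow 1.x
# APIs. Under TensorFlow 2 the equivalent centroid initialization could be
# written eagerly, e.g.:
# centroids = tf.Variable(tf.random.shuffle(vectors)[:k])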
# No outliers, so no operation is needed for Item_Weight
# full.loc[full['Item_Weight'].isin(outlier), 'Item_Weight'] = full['Item_Weight'].mean()

# Item_Outlet_Sales
BoxPlot = boxplot(full[0:8522]['Item_Outlet_Sales'])
outlier = BoxPlot['fliers'][0].get_data()[1]
full.loc[full['Item_Outlet_Sales'].isin(outlier), 'Item_Outlet_Sales'] = \
    full[0:8522]['Item_Outlet_Sales'].mean()

# -----------------Step 5: Exploration analysis of data-----------------------
# Create a copy of the train portion of full and assign it to full1
full1 = full[0:8522].copy()

# Analyzing the relation between Item_Weight & Item_Outlet_Sales
sns.lmplot(x='Item_Weight', y='Item_Outlet_Sales', data=full1)

# Analyzing the relation between Item_MRP & Item_Outlet_Sales
sns.lmplot(x='Item_MRP', y='Item_Outlet_Sales', data=full1)

# Analyzing the relation between Item_Visibility & Item_Outlet_Sales
full2 = full1[(full1['Item_MRP'] >= 240) & (full1['Item_MRP'] <= 241)]
sns.lmplot(x='Item_Visibility', y='Item_Outlet_Sales', data=full2)

# Analyzing the relation between Item_Id & Item_Outlet_Sales
# Retrieve the numeric part of Item_Identifier and create a new column
full1['Item_Id'] = full1['Item_Identifier'].str[3:].astype(int)
full2 = full1[(full1['Item_MRP'] >= 240) & (full1['Item_MRP'] <= 241)]
print("Grafica para ver que genero de pelicula obtuvo mas likes en facebook.\n") print(separador) df.groupby('genres')['movie_facebook_likes'].sum().plot(kind='barh',legend='Reverse',color="green") plt.xlabel("Suma de likes") plt.show() elif opcion =="2": print("Grafica para ver el promedio de ganancias.\n") print(separador) df.gross.groupby(df.genres).mean().plot(kind='pie',cmap="Paired") plt.axis("equal") plt.ylabel("") plt.title("Promedio de ganancias") plt.show() elif opcion =="3": print("Grafica para comparar el presupuesto con la calificacion de la pelicula.\n") print(separador) df.groupby('budget')['imdb_score'].sum().plot(kind='bar',legend='Reverse',color="Black") plt.xlabel("Presupuesto") plt.ylabel("Calificación") plt.show() elif opcion =="4": print("Grafica de Dispercion para ver la pelicula con mas likes.\n") print(separador) sns.lmplot(x="num",y="movie_facebook_likes",data=df,fit_reg=False,hue="num",legend=False,palette="Paired") plt.show() elif opcion =="5": darInicio=False else: print("Debes de elegir una opción valida\n ") else: print("Programa Terminado.")
    https://en.wikipedia.org/wiki/Median_absolute_deviation
    http://stackoverflow.com/questions/8930370/where-can-i-find-mad-mean-absolute-deviation-in-scipy
    """
    arr = np.ma.array(arr).compressed()  # should be faster to not use masked arrays.
    med = np.median(arr)
    return np.median(np.abs(arr - med))


# -------------------------------------------------------------------------------
# Main program.
# -------------------------------------------------------------------------------
if __name__ == "__main__":
    df = pd.read_csv('./customers.csv')
    print(df.columns)

    sns_plot = sns.lmplot(x=df.columns[0], y=df.columns[1], data=df, fit_reg=False)
    sns_plot.axes[0, 0].set_ylim(0, )
    sns_plot.axes[0, 0].set_xlim(0, )

    sns_plot.savefig("s_scaterplot.png", bbox_inches='tight')
    sns_plot.savefig("s_scaterplot.pdf", bbox_inches='tight')

    data = df.values.T[1]
    print("Mean: %f" % np.mean(data))
    print("Median: %f" % np.median(data))
    print("Var: %f" % np.var(data))
    print("std: %f" % np.std(data))
    print("MAD: %f" % mad(data))
# fig, axes = plt.subplots(1, 3)

# plot learning rate vs CRPS
# ax = sns.lmplot(x="hp_learning_rate", y="metric_CRPS", hue="task", data=df)
# ax = sns.scatterplot(data=df, x='hp_learning_rate', y='metric_CRPS', hue='task')
# ax.set(xscale="log")
# ax.set_xlabel("x (learning rate)")
# ax.set_ylabel("y")

height = 4
aspect = 1.2
ax = sns.lmplot(x="hp_learning_rate", y="metric_CRPS", hue="task", ci=None, data=df,
                height=height, aspect=aspect, legend_out=False, fit_reg=False)
ax.set(xscale="log", yscale="log")
ax.ax.set_ylim(0.02, )
ax.ax.set_xlabel("x (learning rate)")
ax.ax.set_ylabel("y")
plt.tight_layout()
plt.savefig("y_plot.jpg")
plt.show()

# plot learning rate vs CRPS mapped through psi = Phi^{-1} o F
for task in df.task.unique():
from sklearn import svm
from sklearn.model_selection import train_test_split
import pandas
import matplotlib.pyplot as plt
import seaborn as sns

df = pandas.read_csv('flowers.csv')
df.columns = ['X1', 'X2', 'X3', 'X4', 'Y']
df.head()

support = svm.SVC()
X = df.values[:, 0:2]
Y = df.values[:, 4]
# print(Y)
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)
support.fit(trainX, trainY)
print('Accuracy: \n', support.score(testX, testY))
pred = support.predict(testX)
print("!")

sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
sns.lmplot(x='X1', y='X2', scatter=True, fit_reg=False, data=df, hue='Y')
plt.ylabel('X2')
plt.xlabel('X1')
plt.show()
                 ress_HC_meta]).reset_index(drop=True)
sns.pairplot(ress, hue="Label")

# ------------------------------------------------------------------------------
from scipy.stats import spearmanr
import seaborn as sns

automl = AutoML_Regression()
lasso_best, _, _, _ = automl.XGBoost(X_train, y_train, X_test, y_test)
lasso_best.fit(X_train, y_train)
y_pred = lasso_best.predict(X_test)

dt = {"True RRS_Brooding": y_test, "Predicted RRS_Brooding": y_pred}
df = pd.DataFrame(dt)
g = sns.lmplot(x="True RRS_Brooding", y="Predicted RRS_Brooding", data=df)
g.set(ylim=(min(y_test), max(y_test)))
g.set(xlim=(min(y_test), max(y_test)))
plt.text(-3.9, max(y_test) - 1, r'MSE = %.2f' % (mean_squared_error(y_test, y_pred)))
plt.text(-3.9, max(y_test) - 2, r'Corr = %.2f' % (spearmanr(y_test, y_pred)[0]))

plt.scatter(y_pred, y_test, s=8)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k')
plt.xlim(min(y_test), max(y_test))
plt.ylim(min(y_test), max(y_test))
plt.ylabel('True RRS_Brooding')
plt.xlabel('Predicted RRS_Brooding')
# plt.text(s='Random Forest without Forward variable', x=1,
#          y=2, fontsize=12, multialignment='center')
testje.rolling(3).mean().plot(figsize=(20, 10), linewidth=5, fontsize=20)
plt.xlabel('Date', fontsize=20)

# In[29]:
testje.diff().plot(figsize=(20, 10), linewidth=5, fontsize=20)
plt.xlabel('Date', fontsize=20)

# In[30]:
import seaborn as sns
sns.set()

# In[31]:
sns.lmplot(x='Br_Mean', y='Gr_Mean', fit_reg=False, data=tableJoin, hue='OBJECTID')

# In[32]:
tableJoin.corr()

# In[33]:
tableJoin.groupby(['OBJECTID']).corr()

# In[34]:
test[["Br_Mean", "Gr_Mean", "We_Mean"]].diff().plot(figsize=(20, 10), linewidth=5, fontsize=20)
plt.xlabel('Date', fontsize=20)
plot = sns.catplot(x="BldgType", y="SalePrice", data=df, kind="boxen")
plot.savefig(path)

# Still, the type of a dwelling seems like it should be important information.
# Investigate whether BldgType produces a significant interaction with either of the following:
#   GrLivArea - Above ground living area
#   MoSold    - Month sold
feature = "GrLivArea"
path = "../../../data/kaggleTutorials/output/figures/miE3.png"
plot = sns.lmplot(
    x=feature,
    y="SalePrice",
    hue="BldgType",
    col="BldgType",
    data=df,
    scatter_kws={"edgecolor": 'w'},
    col_wrap=3,
    height=4,
)
plot.savefig(path)

print(mi_scores.head(10))

# Do you recognize the themes here? Location, size, and quality.
# You needn't restrict development to only these top features,
# but you do now have a good place to start.
# Combining these top features with other related features,
# especially those you've identified as creating interactions,
# is a good strategy for coming up with a highly informative set of features
# to train your model on.
# Boxplot of tip by sex
sns.boxplot(x='sex', y='tip', data=tips)
plt.show()

# Scatter plot of total_bill and tip
sns.regplot(x='total_bill', y='tip', data=tips)
plt.show()

############################################
# Facet plots in Seaborn
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of total_bill and tip, faceted by smoker and colored by sex
sns.lmplot(x='total_bill', y='tip', data=tips, hue='sex', col='smoker')
plt.show()

# FacetGrid of time and smoker, colored by sex
facet = sns.FacetGrid(tips, col="time", row='smoker', hue='sex')
# Map the scatter plot of total_bill and tip onto the FacetGrid
facet.map(plt.scatter, 'total_bill', 'tip')
plt.show()

############################################
# Univariate and bivariate plots in Matplotlib
import matplotlib.pyplot as plt

# Univariate histogram
X = pd.to_datetime(CATdf.Date)
y = CATdf.Close

# Plot
plt.plot(X, y)
plt.gcf().autofmt_xdate()
plt.show()

# In[32]:
# Linear plot of Volume and HLcat against market up/down days.
# Illustrates that low-volatility days are more likely to finish net positive,
# and that higher volume on low- and high-volatility days also makes a net
# positive finish more likely.
sns.lmplot(x='Volume', y='NetUpDown', data=CATdf, hue='HLcat')

# In[33]:
# Graph DJIA Close with HLdifference and Volume for insight
index = pd.read_csv('djia_df_cat.csv')
index.Date = pd.to_datetime(index.Date)
plt.figure(figsize=(10, 8))
plt.plot(index.Date, index.Close, label="DJIA closing price")
plt.plot(index.Date, index.HLdifference * 10, label="HLDifference")
# Scale volume for readability
plt.plot(index.Date, index.Volume / 100000, label="Volume")
plt.legend()
plt.title("DJIA stocks")