def plot_chemical_trajectory(self, environment, filename):
    """
    Plot the trajectory through chemical space.

    Writes a one-page PDF with two panels: a scatter plot of the visited
    chemical state against the iteration number, and a count plot of how
    often each state was visited.

    Parameters
    ----------
    environment : str
        the name of the environment for which the chemical space trajectory is desired
    filename : str
        path of the PDF file the figure is written to
    """
    chemical_state_trajectory = self.extract_state_trajectory(environment)

    # Number the states in order of first visit. A dict keeps the numbering
    # reproducible between runs (a plain set does not) and gives O(1) lookups
    # instead of a list.index() scan per iteration.
    state_index = {
        state: number
        for number, state in enumerate(dict.fromkeys(chemical_state_trajectory))
    }
    visited_states = list(state_index)
    state_trajectory = np.array(
        [state_index[state] for state in chemical_state_trajectory], dtype=float
    )

    with PdfPages(filename) as pdf:
        sns.set(font_scale=2)
        fig = plt.figure(figsize=(28, 12))

        # Left panel: state visited at each iteration.
        plt.subplot2grid((1, 2), (0, 0))
        # x/y must be passed by keyword: recent seaborn releases reject
        # positional data vectors.
        sns.scatterplot(x=np.arange(len(state_trajectory)), y=state_trajectory)
        plt.yticks(np.arange(len(visited_states)), visited_states)
        plt.title("Trajectory through chemical space in {}".format(environment))
        plt.xlabel("iteration")
        plt.ylabel("chemical state")
        plt.tight_layout()

        # Right panel: number of visits per state.
        plt.subplot2grid((1, 2), (0, 1))
        sns.countplot(y=state_trajectory)

        pdf.savefig(fig)
        plt.close(fig)
def map_plotter():
    """Plot a choropleth map of ward prices with a district scatterplot below it.

    Takes no arguments. Reads the module-level objects ``avg_ward_prices``,
    ``mapped_wards``, ``map_df``, ``curated_data``, ``highest_dist``,
    ``inputdict``, ``no_of_total_results``, ``rm`` and
    ``avg_district_prices``, annotates the figure with the search query and a
    results summary, saves it as JPEG and PDF under ``output/`` and finally
    stores the current time in the global ``timer2``.
    """
    print('Plotting the map... ', end='')
    fig, ax = plt.subplots(2,1, figsize=(10.7, 8.3),gridspec_kw={'height_ratios':[2, 1]})
    fig.tight_layout(pad=1, w_pad=0.5, h_pad=0.5)
    fig.patch.set_facecolor('#f2f2f2')
    # set colormap
    # NOTE(review): set_under() is called on the shared plt.cm.Reds object, so
    # the change presumably leaks to other users of that colormap - confirm.
    cmap = plt.cm.Reds
    cmap.set_under(color='white')
    # set min and max for the axes, adjust to create some space for legend etc.
    vmin = min(avg_ward_prices['price'])
    vmax = max(avg_ward_prices['price'])
    # The lambdas ignore their argument; they are called once just below and
    # the names xmin/xmax are then rebound to the resulting numbers.
    xmin = lambda x: round(vmin, -2) - 100 if round(vmin, -2) > vmin else round(vmin, -2)-50
    xmax = lambda x: round(vmax, -2) + 100 if round(vmax, -2) <= vmax else round(vmax, -2)+50
    xmin = xmin(xmin)
    xmax = xmax(vmax)
    # set limits on map axis (coordinates)
    ax[0].set_xlim([500000, 563000])
    ax[0].set_ylim([155000, 202000])
    # remove axes
    ax[0].axis('off')
    # use the ward results and obtain the centre of polygon for each ward
    map_pos_wards = mapped_wards[['DISTRICT', 'geometry', 'price']]
    map_pos_wards = map_pos_wards[map_pos_wards['price'] != 0]
    centrx = map_pos_wards['geometry'].apply(lambda c: c.centroid.x)
    centry = map_pos_wards['geometry'].apply(lambda c: c.centroid.y)
    # set up a kernel density estimation plot masked by the ward boundaries
    j_map = cascaded_union(map_pos_wards['geometry'])
    j = ax[0].add_patch(PolygonPatch(j_map, fc='none', ec='#c1c1c1'))
    try:
        sns.kdeplot(centrx, centry, ax=ax[0], n_levels=len(map_pos_wards.index), cmap=cmap, shade=True, shade_lowest=False, zorder=11, kernel='biw', gridsize=1000, alpha=1)
        # clip every collection on the map axis to the outline of the wards
        for col in ax[0].collections:
            col.set_clip_path(j)
        # plot map
        map_df.plot(linewidth=0.5, ax=ax[0], edgecolor='#c1c1c1', color='white')
        map_pos_wards.plot(linewidth=0, ax=ax[0], edgecolor='#c1c1c1', column=map_pos_wards['price'], vmin=xmin, vmax=xmax, cmap=cmap, zorder=10)
        ax[0].add_patch(PolygonPatch(j_map, fc='none', ec='#c1c1c1'))
    except ValueError:
        # Fallback: draw the map without the density layer and mark the
        # first ward centroid with an asterisk.
        print('Single result cannot be plotted on a kdeplot. Please broaden the search criteria.')
        ax[0].annotate('*', xy=(centrx[0], centry[0]), xycoords='data', horizontalalignment='left', verticalalignment='top', fontsize=8, color='black', zorder=11)
        ax[0].annotate('* Single result cannot be plotted on a kernel density estimation plot. Please broaden the search criteria.', xy=(0.25, 0.92), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
        map_df.plot(linewidth=0.5, ax=ax[0], edgecolor='#c1c1c1', color='white')
        map_pos_wards.plot(linewidth=0.8, ax=ax[0], edgecolor='#c1c1c1', column=map_pos_wards['price'], vmin=xmin, vmax=xmax, cmap=cmap, zorder=10)
        ax[0].add_patch(PolygonPatch(j_map, color='gray', ec='#c1c1c1'))
    # set up the colorbar
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=xmin, vmax=xmax))
    sm._A = []
    fig.colorbar(sm,format='£%.0f',ax=ax[0])
    print('Map plotted.\nCreating a scatter plot...', end='')
    # scatterplot - sorted by average district price
    sns.set(style="whitegrid")
    selected_properties = curated_data[curated_data.district.isin(highest_dist)].sort_values(by='district')
    avg_d_for_sorting = selected_properties[['district','price']].groupby('district').mean()
    avg_d_for_sorting.columns = ['avgprice']
    selected_properties = pd.merge(selected_properties,avg_d_for_sorting,left_on='district',right_on=avg_d_for_sorting.index).sort_values(by='avgprice')
    # created a simple formula for the marker size
    plot_markersize = [abs(int(-40*a+120)) for a in selected_properties['distance']]
    sns.scatterplot(x="price", y="district", hue='district', data=selected_properties, alpha=0.6, edgecolor='none',zorder=4, s=plot_markersize)
    # adding range bars to the scatterplot (min and max) - it's actually two overlapping bars, one white one colored
    barplotdata = selected_properties[['district','price']]
    # bpdata[0] / bpdata[1]: per row, the min / max taken over all rows of the
    # same district (recomputed for every row).
    bpdata = [[barplotdata[barplotdata['district']==row['district']].min().values[1] for index,row in barplotdata.iterrows()], [barplotdata[barplotdata['district'] == row['district']].max().values[1] for index, row in barplotdata.iterrows()]]
    # NOTE(review): barplotdata is a column selection of selected_properties;
    # these assignments presumably trigger pandas' SettingWithCopyWarning - confirm.
    barplotdata.loc[:,'min'] = bpdata[0]
    barplotdata.loc[:,'max'] = bpdata[1]
    # calculating the averages and ensuring each district has only 1 result
    # (to prevent multiple circles on top of each other)
    average_circles = [(row['district'], barplotdata[barplotdata['district'] == row['district']].mean().values[0]) for index, row in barplotdata.iterrows()]
    avgC = []
    for i in range(len(average_circles)):
        try:
            # keep the mean only on the last row of each run of equal districts
            if average_circles[i][0] == average_circles[i+1][0]:
                avgC.append(np.nan)
            else:
                avgC.append(int(average_circles[i][1]))
        except:
            # NOTE(review): bare except. It is reached through the IndexError
            # raised by [i+1] on the last row, but it also swallows any other
            # error raised above - consider narrowing it.
            if i+1 == len(average_circles):
                avgC.append(int(average_circles[i][1]))
            else:
                avgC.append(np.nan)
            pass
    barplotdata.loc[:,'avg'] = avgC
    # legend for the scatterplot
    legend1 = Line2D(range(1), range(1), linewidth=0, marker='o', markerfacecolor='gray', markeredgewidth=0, markersize=min(plot_markersize) ** (1 / 2.0), alpha=0.6)
    legend2 = Line2D(range(1), range(1), linewidth=0, marker='o', markerfacecolor='gray', markeredgewidth=0, markersize=max(plot_markersize) ** (1 / 2.0), alpha=0.6)
    legend3 = Line2D(range(1), range(1), linewidth=0, marker='o', markersize=5, markerfacecolor="white", fillstyle=None, markeredgecolor='gray', markeredgewidth=0.5, alpha=0.5)
    legend4 = Line2D(range(1), range(1), color="#c6e2ff", linewidth=5, alpha=0.5)
    ax[1].legend((legend1, legend2, legend3, legend4), ( "{0:.1f} miles\nfrom station".format(max(selected_properties['distance'])), "{0:.1f} miles\nfrom station".format(min(selected_properties['distance'])), 'District mean', 'Range'), numpoints=1, loc=1, fontsize=8)
    # actual barplots and scatterplot
    sns.barplot(x='max', y=barplotdata['district'], data=barplotdata, label="Total", alpha=0.5,ci=None,zorder=2)
    sns.barplot(x='min', y=barplotdata['district'], data=barplotdata, label="Total", color="white",zorder=3)
    sns.scatterplot(size=5, x=barplotdata['avg'], y=barplotdata['district'], legend=False, data=barplotdata, markers="o", color='white', edgecolor='black',alpha=0.5, zorder=5)
    # Leftover from an earlier, commented-out plot call:
    #   data=new_bar, join=False, markers="s", palette=['black'],zorder=5, alpha=0.5)
    #   sns.color_palette("hls", len(new_bar.index)) / sns.hls_palette(8, l=.3, s=.8)
    print('Scatter plot created. \nAdjusting axes and adding annotations...')
    ax[1].set_xlabel('Price', fontsize=10, color='#555555', fontweight='bold')
    ax[1].set_ylabel('District', fontsize=10, color='#555555', fontweight='bold')
    ax[1].set_xlim([xmin, xmax])
    plt.xticks(fontsize=8, rotation=90)
    ax[1].xaxis.set_major_formatter(FormatStrFormatter('£%.0f'))
    ax[1].xaxis.grid(False)
    plt.yticks(fontsize=8)
    # ANNOTATIONS
    # add a title...
    ax[0].set_title('Rental property pricing in London on %s\n' % str(datetime.now())[:10], fontsize=16, fontweight='bold', color='#333333')
    # create annotations for the logo description and the query details
    ax[0].annotate('Powered by:', xy=(0.022, 0.975), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    fig.figimage(plt.imread('data/rm_logo.png'), xo=50, yo=1520)
    ax[0].annotate('Search query details', xy=(0.022, 0.90), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555', fontweight='bold')
    ax[0].annotate('Price range:', xy=(0.022, 0.88), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('£%i - %i' % (inputdict['minprice'],inputdict['maxprice']), xy=(0.17, 0.88), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('Search radius:', xy=(0.022, 0.86), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate(str(inputdict['radius']), xy=(0.17, 0.86), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('Property type:', xy=(0.022, 0.84), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate(inputdict['propertytype'], xy=(0.17, 0.84), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('No. of bedrooms:', xy=(0.022, 0.82), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate(str(inputdict['min_number_bedrooms'])+' - '+str(inputdict['max_number_bedrooms']), xy=(0.17, 0.82), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    # optional query fields are only printed when they are non-empty
    if len(inputdict['furnishTypes'])>0:
        ax[0].annotate('Furnished:', xy=(0.022, 0.80), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
        ax[0].annotate(str(inputdict['furnishTypes']), xy=(0.17, 0.80), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    if len(inputdict['letType'])>0:
        ax[0].annotate('Let type:', xy=(0.022, 0.78), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
        ax[0].annotate(str(inputdict['letType']), xy=(0.17, 0.78), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    if len(inputdict['includeLetAgreed'])>0:
        ax[0].annotate('Incl. let agreed:', xy=(0.022, 0.76), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
        ax[0].annotate(str(inputdict['includeLetAgreed']), xy=(0.17, 0.76), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('Results summary', xy=(0.022, 0.72), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555', fontweight='bold')
    ax[0].annotate('No. of reported properties:', xy=(0.022, 0.70), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate(no_of_total_results, xy=(0.17, 0.70), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('No. of initial results:', xy=(0.022, 0.68), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate(len(rm.index), xy=(0.17, 0.68), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('No. of final results:', xy=(0.022, 0.66), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate(str(len(curated_data.index))+' ({0:.0f}%)'.format(100*len(curated_data.index)/no_of_total_results), xy=(0.17, 0.66), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('No. of wards:', xy=(0.022, 0.64), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate(len(avg_ward_prices.index), xy=(0.17, 0.64), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('No. of districts:', xy=(0.022, 0.62), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate(len(avg_district_prices.index), xy=(0.17, 0.62), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('Median price:', xy=(0.022, 0.60), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('£'+str(int(np.median(curated_data['price']))), xy=(0.17, 0.60), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=8, color='#555555')
    ax[0].annotate('Top results', xy=(0.18, 0.37), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=10, color='#555555', fontweight='bold')
    plt.subplots_adjust(left=0.17, bottom=0.09, right=0.95, top=0.9)
    # output files' formats; both names share the same stem built from the
    # date and the query parameters (note the doubled '_' before the type)
    plotdirs = {'pdf': 'output/' + str(datetime.now())[:10] + '_' + str(inputdict['minprice']) + '_' + str(inputdict['maxprice']) +'_'+ str(inputdict['min_number_bedrooms']) + '_' + str(inputdict['max_number_bedrooms']) + '_' + '_' + str(inputdict['propertytype']) + '.pdf', 'jpeg': 'output/' + str(datetime.now())[:10] + '_' + str(inputdict['minprice']) + '_' + str(inputdict['maxprice']) +'_'+ str(inputdict['min_number_bedrooms']) + '_' + str(inputdict['max_number_bedrooms']) + '_' + '_' + str(inputdict['propertytype']) + '.jpeg'}
    print('Saving figures to the following directories:')
    print('\t'+plotdirs['jpeg'])
    plt.savefig(plotdirs['jpeg'], dpi=200, bbox_inches='tight',edgecolor='black')
    print('\t'+plotdirs['pdf'])
    plt.savefig(plotdirs['pdf'], dpi=200, bbox_inches='tight', edgecolor='black')
    # timer measures the time from beginning to the plot saving
    global timer2
    timer2 = datetime.now()
def pairplot(df, **kwargs):
    """Draw a square grid of pairwise plots for every column of ``df``.

    The diagonal shows each variable's distribution (bars for categorical
    columns, histogram/density otherwise), the upper triangle shows a
    correlation ellipse, and the lower triangle shows a cross-tab bubble
    plot, a violin plot or a scatter plot depending on which of the two
    variables are categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; columns with dtype ``category`` are treated as
        categorical.
    **kwargs
        Optional settings, all looked up by name:
        ``hue`` (column name used for colouring), ``figsize``,
        ``bar_kws``, ``dist_kws`` (may hold a nested ``hist_kws``),
        ``corr_kws``, ``crosstab_kws`` (may hold nested ``scatter_kws`` and
        ``text_kws``), ``violin_kws`` and ``scatter_kws``.

    Returns
    -------
    numpy.ndarray
        The 2-D array of matplotlib axes created by ``plt.subplots``.
    """
    import numpy as np
    import pandas as pd  # needed by the diagonal branch below, not only by crosstabplot
    import matplotlib.pyplot as plt
    from matplotlib.figure import figaspect
    import seaborn as sns

    def corrplot(x, y, data, cmap='coolwarm', correlation='spearman', **kwargs):
        """Draw the correlation of ``data[x]`` and ``data[y]`` as an ellipse.

        Draws on ``ax``, the axes of the enclosing grid loop (closure).
        """
        from matplotlib.patches import Ellipse
        data_x = data[x]
        data_y = data[y]
        is_x_category = data_x.dtype.name == 'category'
        is_y_category = data_y.dtype.name == 'category'
        if is_x_category:
            data_x = data_x.cat.codes
        if is_y_category:
            data_y = data_y.cat.codes
        # Rank correlation is only used when both variables are categorical
        # and Pearson was not requested explicitly.
        if correlation == 'pearson' or not is_x_category or not is_y_category:
            method = 'pearson'
        else:
            method = 'spearman'
        r = data_x.corr(data_y, method=method)
        if isinstance(cmap, str):
            cmap = plt.get_cmap(cmap)
        color = cmap((r + 1) / 2)  # map r in [-1, 1] onto the colormap's [0, 1]
        ax.axis('off')
        ax.add_artist(
            Ellipse((0.5, 0.5),
                    width=np.sqrt(1 + r),
                    height=np.sqrt(1 - r),
                    angle=45,
                    color=color))
        ax.text(0.5,
                0.5,
                '{:.2f}'.format(r),
                size='x-large',
                horizontalalignment='center',
                verticalalignment='center')

    def crosstabplot(x, y, data, ax, **kwargs):
        """Draw the contingency table of two categoricals as sized bubbles."""
        cross = pd.crosstab(data[x], data[y]).values
        size = cross / cross.max() * 500
        crosstab_kws = kwargs.get('crosstab_kws', {})
        scatter_kws = dict(color=sns.color_palette()[0], alpha=0.3)
        scatter_kws.update(crosstab_kws.get('scatter_kws', {}))
        text_kws = dict(size='x-large')
        text_kws.update(crosstab_kws.get('text_kws', {}))
        for (xx, yy), count in np.ndenumerate(cross):
            ax.scatter(xx, yy, s=size[xx, yy], **scatter_kws)
            ax.text(xx,
                    yy,
                    count,
                    horizontalalignment='center',
                    verticalalignment='center',
                    **text_kws)

    def show_off_legend(ax):
        """Hide the legend of ``ax`` if it has one."""
        legend = ax.get_legend()
        if legend:
            legend.set(visible=False)

    n_variables = df.columns.size
    hue = kwargs.get('hue')
    if 'figsize' in kwargs:
        figsize = kwargs['figsize']
    else:
        figsize = figaspect(1) * 0.5 * n_variables
    _, axes = plt.subplots(n_variables, n_variables, figsize=figsize)
    plt.subplots_adjust(hspace=0.1, wspace=0.1)

    # Share x within each column (from the diagonal downwards) and y within
    # each row of the lower triangle.
    # NOTE(review): the .join() API of the shared-axes groupers is deprecated
    # in recent matplotlib releases - confirm against the pinned version.
    for i in range(n_variables):
        axes[i, i].get_shared_x_axes().join(*axes[i:n_variables, i])
        if i > 1:
            axes[i, 0].get_shared_y_axes().join(*axes[i, :i - 1])

    for (row, col), ax in np.ndenumerate(axes):
        x = df.columns[col]
        y = df.columns[row]
        x_data = df[x]
        y_data = df[y]
        x_dtype = x_data.dtype.name
        y_dtype = y_data.dtype.name
        is_x_category = x_dtype == 'category'
        is_y_category = y_dtype == 'category'
        if is_x_category:
            x_categories = x_data.cat.categories
        if is_y_category:
            y_categories = y_data.cat.categories

        if row == col:  # diagonal
            hue_data = df[hue] if hue else None
            if is_x_category:
                bar_kws = dict(alpha=0.4, orientation='vertical')
                bar_kws.update(kwargs.get('bar_kws', {}))
                if hue:
                    # Stacked bars: one layer per hue level.
                    cross = pd.crosstab(x_data, hue_data)
                    cross.index = cross.index.categories
                    if hue_data.dtype.name == 'category':
                        cross.columns = cross.columns.categories
                    else:
                        cross.columns = hue_data.unique()
                    cross.reset_index(inplace=True)
                    melt = pd.melt(cross, id_vars='index', var_name='hue')
                    hue_values = melt['hue'].unique()
                    colors = sns.color_palette(n_colors=hue_values.size)
                    for i in range(hue_values.size):
                        hue_value = hue_values[i]
                        color = colors[i]
                        subset = melt[melt['hue'] == hue_value]
                        if i == 0:
                            bottom = 0
                        else:
                            # height already occupied by the previous levels
                            bottom = melt.loc[melt['hue'].isin(
                                hue_values[:i])].groupby('index').sum().loc[
                                    subset['index']].values.ravel()
                        ax.bar(subset['index'],
                               subset['value'],
                               bottom=bottom,
                               color=color,
                               **bar_kws)
                else:
                    cross = pd.crosstab(x_data, []).values.ravel()
                    sns.barplot(x=x_data.cat.categories,
                                y=cross,
                                color=sns.color_palette()[0],
                                ci=None,
                                ax=ax,
                                **bar_kws)
            else:
                dist_kws = kwargs.get('dist_kws', {})
                if hue:
                    colors = sns.color_palette(n_colors=hue_data.unique().size)
                    hist_kws = dict(color=colors, alpha=0.4)
                    hist_kws.update(dist_kws.get('hist_kws', {}))
                    if hue_data.dtype.name == 'category':
                        hue_values = df[hue].cat.categories
                    else:
                        hue_values = df[hue].unique()
                    ax.hist([df.loc[df[hue] == v, x] for v in hue_values],
                            density=True,
                            histtype='barstacked',
                            **hist_kws)
                    for c, v in zip(colors, hue_values):
                        sns.distplot(df.loc[df[hue] == v, x],
                                     hist=False,
                                     color=c,
                                     ax=ax,
                                     **dist_kws)
                else:
                    sns.distplot(x_data, ax=ax, **dist_kws)
        elif row < col:  # upper
            corr_kws = kwargs.get('corr_kws', {})
            corrplot(x, y, data=df, **corr_kws)
        else:  # lower
            if is_x_category and is_y_category:
                # Forward crosstab_kws; crosstabplot looks it up in its kwargs.
                crosstabplot(x, y, data=df, ax=ax,
                             crosstab_kws=kwargs.get('crosstab_kws', {}))
            else:
                violin_kws = kwargs.get('violin_kws', {})
                if is_x_category or is_y_category:
                    orient = 'v' if is_x_category else 'h'
                    sns.violinplot(x=x,
                                   y=y,
                                   hue=hue,
                                   data=df,
                                   orient=orient,
                                   ax=ax,
                                   **violin_kws)
                    show_off_legend(ax)
                else:
                    scatter_kws = kwargs.get('scatter_kws', {})
                    sns.scatterplot(x=x, y=y, hue=hue, data=df, ax=ax,
                                    **scatter_kws)
                    show_off_legend(ax)

        # Only the bottom row keeps x labels/ticks, only the first column
        # keeps y labels/ticks.
        if row < n_variables - 1:
            ax.set(xlabel='')
            for ticklabel in ax.get_xticklabels():
                ticklabel.set(visible=False)
        else:
            ax.set(xlabel=x)
            if is_x_category:
                ax.set(xticks=np.arange(x_categories.size),
                       xticklabels=x_data.cat.categories)
        if col > 0:
            ax.set(ylabel='')
            for ticklabel in ax.get_yticklabels():
                ticklabel.set(visible=False)
        else:
            ax.set(ylabel=y)
            if row > 0 and is_y_category:
                ax.set(yticks=np.arange(y_categories.size),
                       yticklabels=y_data.cat.categories)
    return axes
verbose=True)
# Show the ten methylation results with the smallest p-values.
methylation_results_df.sort_values(by='p_value').head(n=10)

# In[7]:

# -log10 of the corrected p-value is the y axis of the volcano-like plot.
expression_results_df['nlog10_p'] = -np.log10(expression_results_df.corr_pval)
methylation_results_df['nlog10_p'] = -np.log10(
    methylation_results_df.corr_pval)
# NOTE(review): the dict is passed as the first positional argument of
# sns.set(); rc overrides are normally given as rc={...} - confirm this has
# the intended effect.
sns.set({'figure.figsize': (20, 8)})
fig, axarr = plt.subplots(1, 2)
# plot cancer type prediction from expression, in a volcano-like plot
sns.scatterplot(data=expression_results_df, x='delta_mean', y='nlog10_p', hue='reject_null', hue_order=[False, True], ax=axarr[0])
# add vertical line at 0
axarr[0].axvline(x=0, linestyle=':', color='black')
# add horizontal line at statistical significance threshold
l = axarr[0].axhline(y=-np.log10(SIG_ALPHA), linestyle=':')
# label horizontal line with significance threshold
# (matplotlib makes this fairly difficult, sadly)
axarr[0].text(0.9, -np.log10(SIG_ALPHA) + 0.3, r'$\alpha = {}$'.format(SIG_ALPHA), va='center', ha='center', color=l.get_color(), backgroundcolor=axarr[0].get_facecolor())
# the predicted values versus the true values.

# %%
# Collect the true and the predicted targets side by side in one frame.
predicted_actual = pd.DataFrame({
    "True values (k$)": target_test,
    "Predicted values (k$)": target_predicted,
})

# %%
import matplotlib.pyplot as plt
import seaborn as sns

sns.scatterplot(
    x="True values (k$)",
    y="Predicted values (k$)",
    data=predicted_actual,
    alpha=0.5,
    color="black",
)
# Reference diagonal: points on it are predicted exactly.
plt.axline((0, 0), slope=1, label="Perfect fit")
plt.axis('square')
_ = plt.title(
    "Regression using a model without \n"
    "target transformation"
)

# %% [markdown]
# On this plot, correct predictions would lie on the diagonal line. This plot
# allows us to detect if the model makes errors in a consistent way, i.e.
# has some bias.
#
# On this plot, we see that for the large True price values, our model tends to
# under-estimate the price of the house. Typically, this issue arises when the
# target to predict does not follow a normal distribution. In this case the
# model would benefit from target transformation.
# feature names
print(data["feature_names"])

# Append the target column to the feature matrix and label the columns.
dataframe = pd.DataFrame(
    np.column_stack((data["data"], data["target"])),
    columns=[*data["feature_names"], "target"],
)

# visualization
sns.pairplot(
    dataframe,
    hue="target",
    vars=[
        "mean radius", "mean area", "mean smoothness", "mean texture",
        "mean perimeter", "mean compactness", "mean symmetry"
    ],
)
# "mean radius" against three other features, coloured by target.
for y_feature in ("mean compactness", "mean smoothness", "mean symmetry"):
    sns.scatterplot(x="mean radius", y=y_feature, data=dataframe, hue="target")

# checking the correlation between the features.
feature_correlation = dataframe.corr("kendall")
sns.heatmap(feature_correlation)

# separating out the input and output for training our model.
X = dataframe.iloc[:, :30]
# NOTE(review): the indentation of this fragment was lost; the first
# statements presumably sit inside a frame-processing loop whose header is
# not visible here - confirm the nesting against the full file.
check = True
# Transition matrix whose last diagonal entry is -BOUNCE_COEFF.
kf.transitionMatrix = np.array([[1, 0, DELTA_T, 0], [0, 1, 0, DELTA_T], [0, 0, 1, 0], [0, 0, 0, -1 * BOUNCE_COEFF]])
kf.predict(const_mat)
if check:
    # Switch back to the matrix with 1 as the last diagonal entry.
    kf.transitionMatrix = np.array([[1, 0, DELTA_T, 0], [0, 1, 0, DELTA_T], [0, 0, 1, 0], [0, 0, 0, 1]])
cv2.imshow('frame', fullgray)
cv2.waitKey(50)
cv2.imshow('frame2', res2)
cv2.waitKey(50)
cap.release()
cv2.destroyAllWindows()
# NOTE(review): the unzipped columns of `graph` are passed positionally;
# recent seaborn releases expect x=/y= keywords - confirm the pinned version.
sns.scatterplot(*zip(*graph))
plt.show()
# OR all snapshots together into a single image and display it.
base = snapshots[0]
for img in snapshots:
    base = cv2.bitwise_or(base, img)
plt.imshow(base)
plt.show()
def graf_scatter(x, y):
    """Draw a seaborn scatter plot of ``y`` against ``x``.

    The "darkgrid" seaborn theme is applied before plotting. Nothing is
    returned.
    """
    sns.set_theme(style="darkgrid")
    plot_options = {"x": x, "y": y, "palette": 'deep'}
    sns.scatterplot(**plot_options)
from keras.layers import Dense, LSTM, Activation, Input
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

df = pd.read_csv(
    r"C:\Users\nisha_000\Documents\GitHub\WAIxASUA\MCI_2014_to_2018.csv")
# Keep the "Theft Over" rows and columns 0, 1 and 4 (4 = occurrencedate).
# .copy() makes `test` an independent frame so the column assignment below
# does not write into a slice of `df` (SettingWithCopyWarning).
test = df.loc[df["MCI"] == "Theft Over", df.columns[[0, 1, 4]]].copy()
test["occurrencedate"] = pd.to_datetime(
    test["occurrencedate"])  #.astype(int)/10**17
# Order chronologically; the date column is only needed for sorting.
test = test.sort_values('occurrencedate').drop('occurrencedate',
                                               axis='columns')
# x/y are passed by keyword: recent seaborn releases reject positional
# data vectors.
sns.scatterplot(x=test["X"], y=test["Y"])
dataset = test.values
dataset = dataset.astype('float32')
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
sns.scatterplot(x=dataset[:, 0], y=dataset[:, 1])
# test model 1: each row is used to predict the row that follows it
X = dataset[:-1]
Y = dataset[1:]
Y_og = Y
nrow = len(X)
# reshape into X=t and Y=t+1
# # improvement - plot sample name next to (or overlapping) points def label_point(x, y, val, ax): a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1) for i, point in a.iterrows(): ax.text(point['x'] + .02, point['y'], str(point['val']), fontsize='small') with PdfPages('multipage_pdf.pdf') as pdf: # scatter 1 sns.set_style("whitegrid") plt.figure(figsize=(7, 7)) ax = sns.scatterplot(data=pca_tot, x="pc1", y="pc2", hue="celltype") ax.set_xlabel('Principal Component 1', fontsize=15) ax.set_ylabel('Principal Component 2', fontsize=15) ax.set_title('Preliminary PCA for PCA generator, all genes', fontsize=20) pdf.savefig() # saves the current figure into a pdf page plt.close() # scatter 2 plt.figure(figsize=(7, 7)) ax = sns.scatterplot(data=pca_tot, x="pc1", y="pc2", hue="celltype") ax.set_xlabel('Principal Component 1', fontsize=15) ax.set_ylabel('Principal Component 2', fontsize=15) ax.set_title('Preliminary PCA for PCA generator, all genes', fontsize=20) label_point(pca_tot.pc1, pca_tot.pc2, pca_tot.id, plt.gca()) pdf.savefig() # saves the current figure into a pdf page plt.close() # scatter 3
def circumplex_scatter(
    data,
    ax=None,
    title="Soundscape Scatter Plot",
    hue=None,
    x="ISOPleasant",
    y="ISOEventful",
    prim_labels=True,
    diagonal_lines=False,
    figsize=(5, 5),
    palette=None,
    legend=False,
    legend_loc="lower left",
    s=10,
    **scatter_kwargs,
):
    """Scatter ISOcoordinates on a soundscape circumplex grid.

    Thin wrapper around seaborn.scatterplot that afterwards adds the
    circumplex grid, the axis labels and the title.

    Parameters
    ----------
    data : DataFrame-like
        data holding the ``x`` and ``y`` columns (and ``hue`` if given)
    ax : plt.Axes, optional
        existing matplotlib axes; a new figure is created when None,
        by default None
    title : str, optional
        by default "Soundscape Scatter Plot"
    hue : vector or key in data, optional
        Grouping variable that will produce points with different colors,
        by default None
    x : str, optional
        column name for x variable, by default "ISOPleasant"
    y : str, optional
        column name for y variable, by default "ISOEventful"
    prim_labels : bool, optional
        whether to include ISOPleasant and ISOEventful labels, by default True
    diagonal_lines : bool, optional
        whether to include diagonal dimension labels (e.g. calm, etc.),
        by default False
    figsize : tuple, optional
        size of the figure created when ``ax`` is None, by default (5, 5)
    palette : string, list, dict or matplotlib.colors.Colormap, optional
        Colors used for the hue mapping. When None, a "husl" palette with one
        color per unique hue value (or per row without hue) is generated,
        by default None
    legend : bool, optional
        whether to include legend with the hue values, by default False
    legend_loc : str, optional
        relative location of legend, by default "lower left"
    s : int, optional
        size of scatter points, by default 10
    **scatter_kwargs
        forwarded to seaborn.scatterplot

    Returns
    -------
    plt.Axes
        the axes returned by seaborn.scatterplot
    """
    if ax is None:
        ax = plt.subplots(1, 1, figsize=figsize)[1]

    if palette is None:
        # One color per hue level, or per data row when no hue is given.
        if hue:
            n_colors = len(data[hue].unique())
        else:
            n_colors = len(data)
        palette = sns.color_palette("husl", n_colors, as_cmap=False)

    scatter_axes = sns.scatterplot(
        x=x,
        y=y,
        hue=hue,
        data=data,
        s=s,
        palette=palette,
        legend=legend,
        ax=ax,
        zorder=data_zorder,
        **scatter_kwargs,
    )

    # Decorate the axes: labels first, then grid and title.
    ax = _deal_w_default_labels(ax, prim_labels)
    _circumplex_grid(ax, prim_labels, diagonal_lines)
    _set_circum_title(ax, prim_labels, title)
    if legend:
        _move_legend(ax, legend_loc)

    return scatter_axes
def circumplex_density(
    data,
    ax=None,
    title="Soundscape Density Plot",
    x="ISOPleasant",
    y="ISOEventful",
    prim_labels=True,
    diagonal_lines=False,
    incl_scatter=False,
    incl_outline=False,
    figsize=(5, 5),
    palette="Blues",
    scatter_color="black",
    outline_color="black",
    fill_color="blue",
    fill=True,
    hue=None,
    common_norm=False,
    bw_adjust=default_bw_adjust,
    alpha=0.95,
    legend=False,
    legend_loc="lower left",
    s=10,
    scatter_kwargs=None,
    **density_kwargs,
):
    """Create a bivariate distribution plot of ISOCoordinates

    This method works by overlaying a sns.kdeplot() of the ISOCoordinate data
    and then adding the circumplex grid. If a scatter is also included, a
    sns.scatterplot() with the given options is drawn first, underneath the
    density plot.

    If using a hue grouping, it is recommended to only plot the 50th percentile
    contour so as to not create a cluttered figure. This can be done with the
    options thresh = 0.5, levels = 2.

    Parameters
    ----------
    data : DataFrame-like
        data holding the ``x`` and ``y`` columns (and ``hue`` if given)
    ax : plt.Axes, optional
        existing subplot axes; a new figure is created when None,
        by default None
    title : str, optional
        by default "Soundscape Density Plot"
    x : str, optional
        column name for x variable, by default "ISOPleasant"
    y : str, optional
        column name for y variable, by default "ISOEventful"
    prim_labels : bool, optional
        whether to include ISOPleasant and ISOEventful axis labels, by default True
    diagonal_lines : bool, optional
        whether to include diagonal dimension axis labels (i.e. calm, etc.),
        by default False
    incl_scatter : bool, optional
        plot coordinate scatter underneath density plot, by default False
    incl_outline : bool, optional
        include an additional unfilled outline of the density plot,
        by default False
    figsize : tuple, optional
        size of the figure created when ``ax`` is None, by default (5, 5)
    palette : str, optional
        Method for choosing the colors to use when mapping the hue semantic.
        String values are passed to seaborn.color_palette(). List or dict
        values imply categorical mapping, while a colormap object implies
        numeric mapping. by default "Blues"
    scatter_color : str, optional
        define a color for the scatter points. Does not work with a hue
        grouping variable, by default "black"
    outline_color : str, optional
        define a color for the add'l density outline, by default "black"
    fill_color : str, optional
        define a color for the density fill, does not work with a hue grouping
        variable, by default "blue"
    fill : bool, optional
        whether to fill the density plot, by default True
    hue : vector or key in data, optional
        Grouping variable that will produce points with different colors.
        Can be either categorical or numeric, although color mapping will
        behave differently in latter case, by default None
    common_norm : bool, optional
        forwarded to sns.kdeplot as ``common_norm``, by default False
    bw_adjust : optional
        forwarded to sns.kdeplot as ``bw_adjust``, by default default_bw_adjust
    alpha : float, optional
        opacity of the main density plot (the outline always uses 1),
        by default 0.95
    legend : bool, optional
        whether to include the hue labels legend, by default False
    legend_loc : str, optional
        relative location of the legend, by default "lower left"
    s : int, optional
        size of the scatter points, by default 10
    scatter_kwargs : dict, optional
        additional arguments for sns.scatterplot(); None means no extra
        arguments, by default None
    **density_kwargs
        additional arguments forwarded to both sns.kdeplot() calls

    Returns
    -------
    plt.Axes
        the axes returned by the final sns.kdeplot call
    """
    # Avoid a shared mutable default: build a fresh dict per call.
    if scatter_kwargs is None:
        scatter_kwargs = {}

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    if incl_scatter:
        sns.scatterplot(
            data=data,
            x=x,
            y=y,
            hue=hue,
            s=s,
            ax=ax,
            legend=legend,
            color=scatter_color,
            palette=palette,
            zorder=data_zorder,
            **scatter_kwargs,
        )

    if incl_outline:
        # Unfilled, fully opaque contours drawn before the main density plot.
        sns.kdeplot(
            data=data,
            x=x,
            y=y,
            fill=False,
            ax=ax,
            alpha=1,
            color=outline_color,
            palette=palette,
            hue=hue,
            common_norm=common_norm,
            legend=legend,
            zorder=data_zorder,
            bw_adjust=bw_adjust,
            **density_kwargs,
        )

    d = sns.kdeplot(
        data=data,
        x=x,
        y=y,
        fill=fill,
        ax=ax,
        alpha=alpha,
        palette=palette,
        color=fill_color,
        hue=hue,
        common_norm=common_norm,
        legend=legend,
        zorder=data_zorder,
        bw_adjust=bw_adjust,
        **density_kwargs,
    )

    _circumplex_grid(ax, prim_labels, diagonal_lines)
    _set_circum_title(ax, prim_labels, title)
    _deal_w_default_labels(ax, prim_labels)
    if legend:
        _move_legend(ax, legend_loc)

    return d
"""Plot accuracy vs model size for our visual wake word trained models.""" import seaborn as sns from matplotlib import pyplot as plt from pathlib import Path import pandas as pd accuracy = Path.cwd() / 'accuracy.log' sizes = Path.cwd() / "sizes.log" acc_dict = eval(accuracy.read_text()) size_dict = eval(sizes.read_text()) data = [] for model_name in acc_dict.keys(): data.append((model_name, acc_dict[model_name], size_dict[model_name])) df = pd.DataFrame(data) df.columns = ['model', 'accuracy', 'size'] print(df) sns.scatterplot(y='accuracy', x='size', data=df) plt.title("Visual Wake Words Model Test") plt.xlabel("Bytes") plt.ylabel("Dev set accuracy") plt.show()
# I'd make their values all 0, but that would skew the model to cluster pitchers into what they DON'T pitch.
# Imputing means shouldn't affect the model, and doesn't pretend to know what a pitcher's unknown pitches would be like.
df3b = df3.fillna(df3.mean())

###### Clustering Analysis ######
from sklearn.cluster import KMeans
import seaborn as sns

n_clusters = [3, 5, 8, 10, 15, 20, 25, 30, 40, 50]
error = []
for clusters in n_clusters:
    print(clusters, 'clusters...')
    kmeans = KMeans(n_clusters=clusters, random_state=523).fit(df3b)
    # inertia_ = within-cluster sum of squared distances, used for the elbow plot
    error.append(kmeans.inertia_)

# x/y are passed by keyword: recent seaborn releases reject positional
# data vectors.
sns.scatterplot(x=n_clusters, y=error)

# Should really be looking for an elbow in the above scatterplot, but it is hard to see one...going with 15.
# Maybe use the average silhouette method, or the gap statistic method, to find a better number. TBD.
# fit_predict() already fits the model, so no separate .fit() call is needed;
# the fixed random_state keeps the result identical.
kmeans = KMeans(n_clusters=15, random_state=523)
idx = kmeans.fit_predict(df3b)

pitcher_years = df3b.index.tolist()
final = pd.DataFrame({'pitcher_year': pitcher_years, 'cluster': idx})
# The index label ends with the 4-digit year; everything except the last
# 7 characters is taken as the pitcher name.
final['pitcher'] = final['pitcher_year'].apply(lambda x: x[:len(x) - 7])
final['year'] = final['pitcher_year'].apply(lambda x: x[-4:])

### NEXT STEP: ATTACH FIPS FOR EACH PLAYER_YEAR, THEN LOOK AT TRENDS PER CLUSTER
# ALSO NEED TO INVESTIGATE WHY DATA ONLY UP TO 2017!
lahman_pitching = pd.read_csv('lahman/core/pitching.csv')
def ratioConc(ax, respDF, cell1, cell2, time, mutAffDF, pseudo=0.1, legend=False):
    """Plots Ratio of cell 1 to cell 2 over a range of concentrations.

    Draws three panels:

    * ``ax[0]``: the dose-response ratio curves (delegated to
      ``hillRatioDosePlot``).
    * ``ax[1]``: maximum ratio per ligand/valency against IL2Rα affinity,
      with a per-valency linear fit in log10(affinity).
    * ``ax[2]``: dose at which the maximum ratio occurs against IL2Rα
      affinity, with a per-valency log-log linear fit.

    Parameters
    ----------
    ax : sequence of matplotlib Axes
        At least three axes; indices 0, 1 and 2 are drawn on.
    respDF, time, pseudo
        Forwarded unchanged to ``hillRatioDosePlot``.
    cell1, cell2 : str
        Cell-type names; used in titles, labels and derived column names.
    mutAffDF : pandas.DataFrame
        Merged onto the per-ligand summary; must provide the
        ``"IL2Rα $K_{D}$ (nM)"`` column.
    legend : bool, optional
        If True, show a legend on ``ax[0]`` restricted to the last three
        handles.
    """
    affCol = "IL2Rα $K_{D}$ (nM)"
    maxCol = cell2 + " Max"
    doseCol = cell2 + " Dose"

    hillDF = hillRatioDosePlot(ax[0], respDF, time, cell1, cell2, pseudo=pseudo)

    # Summarise each (ligand, valency) curve by its peak ratio and the dose
    # at which that peak occurs. Frames are collected and concatenated once
    # instead of growing a DataFrame inside the loop.
    fitFrames = []
    for ligand in hillDF.Ligand.unique():
        for valency in hillDF.loc[hillDF.Ligand == ligand].Valency.unique():
            isoData = hillDF.loc[(hillDF.Ligand == ligand) & (hillDF.Valency == valency)]
            fitFrames.append(pd.DataFrame({
                "Ligand": [ligand],
                "Valency": valency,
                maxCol: isoData.Ratio.max(),
                doseCol: isoData.loc[isoData.Ratio == isoData.Ratio.max()].Dose.values
            }))
    fitDF = pd.concat(fitFrames) if fitFrames else pd.DataFrame()

    ax[0].set(title="Ratio of " + cell1 + " to " + cell2)

    if legend:
        h, l = ax[0].get_legend_handles_labels()
        ax[0].legend(h[-3:], l[-3:])

    fitDF = fitDF.merge(mutAffDF)

    # Fit lines are evaluated on log10(affinity) in [-1, 1], i.e. 0.1-10 nM.
    affs = np.linspace(-1, 1, 100)
    maxFrames = []
    doseFrames = []
    for valency in fitDF.Valency.unique():
        valData = fitDF.loc[fitDF.Valency == valency]
        logAff = np.log10(valData[affCol].values)
        mMax, bMax = np.polyfit(logAff, valData[maxCol], 1)
        mDose, bDose = np.polyfit(logAff, np.log10(valData[doseCol]), 1)
        maxFrames.append(pd.DataFrame({
            "Valency": valency,
            affCol: np.power(10, affs),
            maxCol: mMax * affs + bMax
        }))
        doseFrames.append(pd.DataFrame({
            "Valency": valency,
            affCol: np.power(10, affs),
            doseCol: np.power(10, mDose * affs + bDose)
        }))

    maxLineDF = (pd.concat(maxFrames) if maxFrames else pd.DataFrame()).reset_index()
    doseLineDF = (pd.concat(doseFrames) if doseFrames else pd.DataFrame()).reset_index()

    sns.scatterplot(data=fitDF, x=affCol, y=maxCol, hue="Ligand", style="Valency",
                    ax=ax[1], palette=ligDict, legend=False)
    sns.lineplot(data=maxLineDF, x=affCol, y=maxCol, style="Valency",
                 ax=ax[1], color="k", linewidth=1., legend=False)
    ax[1].set(xscale="log", title="Ratio of " + cell1 + " to " + cell2,
              xlim=(1e-1, 1e1), ylim=(0, None),
              ylabel=cell1 + " to " + cell2 + " Ratio Max Magnitude")

    sns.scatterplot(data=fitDF, x=affCol, y=doseCol, hue="Ligand", style="Valency",
                    ax=ax[2], palette=ligDict, legend=False)
    sns.lineplot(data=doseLineDF, x=affCol, y=doseCol, style="Valency",
                 ax=ax[2], color="k", linewidth=1., legend=False)
    ax[2].set(xscale="log", yscale="log", title="Ratio of " + cell1 + " to " + cell2,
              xlim=(1e-1, 1e1), ylim=(1e-2, 1e2),
              ylabel=cell1 + " to " + cell2 + " Ratio Max Dose")
def calibrateExperiment(folderName, secondPath, concUnit, concUnitPrefix,
                        numberOfCalibrationSamples, initialStandardVolume):
    """Fit Hill calibration curves for every cytokine of every calibration kit.

    For each calibration table returned by ``cleanUpFlowjoCSV`` a Hill curve
    is fitted (on log10 MFI) per cytokine, limits of detection are derived
    from the fit, and the standards plus fitted curves are plotted on
    log-log axes, one panel per kit.

    Parameters
    ----------
    folderName : str
        Forwarded to ``cleanUpFlowjoCSV`` and used in the output file names.
    secondPath
        Not used by this function.
    concUnit : number
        Multiplied by ``1 / molecular weight`` to convert g/L into the
        requested molar unit (only when every cytokine has a known weight).
    concUnitPrefix : str
        Unit label used in the x-axis title and in the output file names.
    numberOfCalibrationSamples : int
        Number of serially diluted standards per kit.
    initialStandardVolume : number
        Volume the standards were reconstituted in (mL, per the original
        comments; 2 mL corresponds to the nominal 5000 pg/mL).

    Side effects
    ------------
    Writes ``plots/calibrationCurves-<folder>-<unit>.png`` and pickles the
    fitting parameters and LOD tables under ``misc/``; prints progress
    information to stdout.
    """
    #Get cytokine calibration curve data
    tempExperimentParameters = {'overallPlateDimensions': [8, 12]}
    calibrationFileNames = glob.glob('inputData/bulkCSVFiles/Calibration*')
    print(calibrationFileNames)
    kitNames = []
    for calibrationFileName in calibrationFileNames:
        newName = calibrationFileName.split('.')[0].split('_')[0].split('/')[-1]
        kitNames.append(newName)
    print(kitNames)
    sortedData, sortedFiles = cleanUpFlowjoCSV(kitNames, folderName, 'cyt',
                                               tempExperimentParameters)
    #Keep only the part after the dash (if any) as the displayed kit name
    for i, newName in enumerate(kitNames):
        if '-' in newName:
            newName2 = newName.split('-')[1]
        else:
            newName2 = newName
        kitNames[i] = newName2

    rsquaredList = []
    concLODList = []
    fittingParametersList = []
    cbaStandardsMFIList = []
    cbaPlotPointsMFIList = []
    cbaStandardsConcentrationList = []
    cbaPlotPointsConcentrationList = []

    numberOfPlotPoints = 101
    xaxistitle = 'Concentration of Cytokine Standards Standards (' + concUnitPrefix + ')'
    yaxistitle = 'GeoMFI'

    #Unit conversion is only applied when every cytokine has a molecular weight
    allCytokinesHaveMWInDict = True
    for calibration in sortedData:
        cytokines = parseCytokineCSVHeaders(calibration.columns)
        for cytokine in cytokines:
            if cytokine[0] not in completeCytokineMWDict:
                allCytokinesHaveMWInDict = False
                print(cytokine[0])
                print('wat')
    print(allCytokinesHaveMWInDict)

    for calibration in sortedData:
        data = np.array(calibration.values[:, 1:-1], dtype=float)
        cytokines = parseCytokineCSVHeaders(calibration.columns)
        fittingParameters = np.zeros((data.shape[1], 4))
        concLOD = np.zeros((data.shape[1], 4))
        #Initial concentration all cytokine standards is given by CBA kit manual as 5000 pg/mL: when standards are diluted in 2mL
        conc = 5000  #pg/mL
        serialDilutionFactor = 2  #1:serialDilutionFactor dilution between each standard well
        #Smaller initial dilution (0.5mL instead of 2mL for example) increase the initial concentration of the first calibration sample
        initialConc = (conc * 1e-12) / (
            (initialStandardVolume * 1e-3) / 2
        )  #g/L (pg/mL * 1e-12 g/pg)/(1e-3 L/mL)
        #Calibration samples are always diluted by a factor of serialDilutionFactor (so with 12 calibration samples, the last sample is (serialDilutionFactor^-11) the concentration of the first, which is pure standard (2^0)
        cbaStandardsConcentrations = np.flipud(initialConc * np.power(
            serialDilutionFactor,
            np.linspace(-numberOfCalibrationSamples + 1, 0,
                        numberOfCalibrationSamples)))
        #More x values along the above concentration bounds are sampled to use to construct calibration curve. Plot points are extended slightly at high range to allow visualization of upper LOD not accessible with experimental dilution
        cbaStandardsConcentrationsPlotPoints = np.flipud(
            initialConc * np.power(
                serialDilutionFactor,
                np.linspace(-numberOfCalibrationSamples + 1, 4,
                            numberOfPlotPoints)))

        cbaStandardsConcentrationMatrix = np.zeros(
            [len(cytokines), cbaStandardsConcentrations.shape[0]])
        cbaStandardsConcentrationPlotPointsMatrix = np.zeros(
            [len(cytokines), cbaStandardsConcentrationsPlotPoints.shape[0]])
        cbaStandardsMFIMatrix = np.zeros(
            [len(cytokines), cbaStandardsConcentrations.shape[0]])
        cbaStandardsMFIPlotPointsMatrix = np.zeros(
            [len(cytokines), cbaStandardsConcentrationsPlotPoints.shape[0]])
        print(cytokines)
        print(data)
        for i, cytokineList in enumerate(cytokines):
            cytokine = cytokineList[0]
            #amplitude bounded from range/2 to range*2, EC50 bounded from minimum to maximum standard concentration tested, Hill coefficient bounded from 0 to 2, Background bounded from 0 to minimum GFI*2
            lowerCurveFitBounds = [
                (np.max(data[:, i]) - np.min(data[:, i])) / 2,
                np.min(cbaStandardsConcentrations), 0, 0
            ]
            upperCurveFitBounds = [
                (np.max(data[:, i]) - np.min(data[:, i])) * 2,
                np.max(cbaStandardsConcentrations), 2,
                np.min(data[:, i]) * 2
            ]
            #use scipy curve fit to determine best hill equation fit for data, searching within the bounds given above
            popt, pcov = curve_fit(Hill,
                                   cbaStandardsConcentrations,
                                   np.log10(data[:, i]),
                                   sigma=np.log10(data[:, i]),
                                   bounds=(lowerCurveFitBounds,
                                           upperCurveFitBounds))
            rsquared = round(
                r_squared(cbaStandardsConcentrations, np.log10(data[:, i]),
                          Hill, popt), 3)
            print(rsquared)
            rsquaredList.append(rsquared)
            for j in range(len(popt)):
                #Convert just ec50 value to desired units (nM,uM etc) if cytokine has a molar mass in dict
                if j == 1 and allCytokinesHaveMWInDict:
                    fittingParameters[i, j] = np.multiply(
                        popt[j], (concUnit / completeCytokineMWDict[cytokine]))
                #other values in 4 parameter logistic equation are tied to intensity y-value, which doesn't change, or are the hill coefficient, which is completely separate, so parameters are kept the same
                else:
                    fittingParameters[i, j] = popt[j]

            #Convert x values of experimental data points and curve fit points to desired units (nM,uM,etc.)
            if allCytokinesHaveMWInDict:
                cbaStandardsConcentrationMatrix[i, :] = np.multiply(
                    cbaStandardsConcentrations,
                    (concUnit / completeCytokineMWDict[cytokine]))
                cbaStandardsConcentrationPlotPointsMatrix[i, :] = np.multiply(
                    cbaStandardsConcentrationsPlotPoints,
                    (concUnit / completeCytokineMWDict[cytokine]))
            else:
                cbaStandardsConcentrationMatrix[
                    i, :] = cbaStandardsConcentrations
                cbaStandardsConcentrationPlotPointsMatrix[
                    i, :] = cbaStandardsConcentrationsPlotPoints

            cbaStandardsMFIMatrix[i, :] = data[:, i]
            print(fittingParameters[i, :])
            #The fit is done on log10(MFI), so the curve is mapped back with 10**
            cbaStandardsMFIPlotPointsMatrix[i, :] = np.power(
                10,
                Hill(cbaStandardsConcentrationPlotPointsMatrix[i, :],
                     *fittingParameters[i, :]))

            #Get LOD for each cytokine calibration curve (aka the linear range of the calibration curve)
            backgroundGFI = fittingParameters[i, 3]
            amplitudeGFI = fittingParameters[i, 0]

            #Approximate LOD by determining concentration values at LOD% and 1-LOD% (3% and 97%) of curve. Must be used on log10(curve), as calibration curve is plotted in logscale
            LODpercent = 0.03
            #LOD% more than background GFI used for lower LOD GFI
            lowerGFILOD = math.log10(10**((1 + LODpercent) *
                                          math.log10(backgroundGFI)))
            #LOD% less than maximum GFI (Background + amplitude) used for upper LOD GFI
            upperGFILOD = math.log10(
                10**((1 - LODpercent) *
                     math.log10(amplitudeGFI + backgroundGFI)))
            #Log10(upper/lowerGFILOD) fed into inverse hill equation with current cytokine fitting parameters to obtain corresponding concentration values
            lowerConcLOD = InverseHill(lowerGFILOD, fittingParameters[i, :])
            upperConcLOD = InverseHill(upperGFILOD, fittingParameters[i, :])
            #Row layout: MFI lower, MFI upper, concentration lower, concentration upper
            concLOD[i, :] = np.array(
                [10**lowerGFILOD, 10**upperGFILOD, lowerConcLOD, upperConcLOD])

        #Re-lay the (cytokine x point) matrices out as (point x cytokine) tables
        flattenedMatrix = cbaStandardsMFIMatrix.flatten()
        reshapedMatrix = np.reshape(
            flattenedMatrix, (numberOfCalibrationSamples, len(cytokines)),
            order='F')
        flattenedMatrix2 = cbaStandardsMFIPlotPointsMatrix.flatten()
        reshapedMatrix2 = np.reshape(flattenedMatrix2,
                                     (numberOfPlotPoints, len(cytokines)),
                                     order='F')
        flattenedMatrix3 = cbaStandardsConcentrationMatrix.flatten()
        reshapedMatrix3 = np.reshape(
            flattenedMatrix3, (numberOfCalibrationSamples, len(cytokines)),
            order='F')
        flattenedMatrix4 = cbaStandardsConcentrationPlotPointsMatrix.flatten()
        reshapedMatrix4 = np.reshape(flattenedMatrix4,
                                     (numberOfPlotPoints, len(cytokines)),
                                     order='F')

        realCytokineList = [cytokine[0] for cytokine in cytokines]
        dataValsList = [[j] for j in range(1, numberOfCalibrationSamples + 1)]
        plotPointsList = [[j] for j in range(1, numberOfPlotPoints + 1)]
        dataValsIndex = pd.MultiIndex.from_tuples(dataValsList,
                                                  names=['Standard'])
        plotPointsIndex = pd.MultiIndex.from_tuples(plotPointsList,
                                                    names=['Standard'])

        currentCBAStandardsMFIDf = pd.DataFrame(reshapedMatrix,
                                                index=dataValsIndex,
                                                columns=realCytokineList)
        currentCBAPlotPointsMFIDf = pd.DataFrame(reshapedMatrix2,
                                                 index=plotPointsIndex,
                                                 columns=realCytokineList)
        currentCBAStandardsConcentrationDf = pd.DataFrame(
            reshapedMatrix3, index=dataValsIndex, columns=realCytokineList)
        currentCBAPlotPointsConcentrationDf = pd.DataFrame(
            reshapedMatrix4, index=plotPointsIndex, columns=realCytokineList)

        currentCBAStandardsMFIDf.columns.name = 'Cytokine'
        currentCBAPlotPointsMFIDf.columns.name = 'Cytokine'
        currentCBAStandardsConcentrationDf.columns.name = 'Cytokine'
        currentCBAPlotPointsConcentrationDf.columns.name = 'Cytokine'

        mic1 = pd.MultiIndex.from_tuples(cytokines, names=['Cytokine'])
        print(cytokines)
        print(mic1)
        fittingParametersDf = pd.DataFrame(
            fittingParameters,
            index=mic1,
            columns=['Amplitude', 'EC50', 'HillCoeff', 'Background'])
        mic2 = pd.MultiIndex.from_tuples([['MFI', 'Lower'], ['MFI', 'Upper'],
                                          ['Concentration', 'Lower'],
                                          ['Concentration', 'Upper']])
        concLODDf = pd.DataFrame(concLOD, index=mic1, columns=mic2)

        concLODList.append(concLODDf)
        fittingParametersList.append(fittingParametersDf)
        cbaStandardsMFIList.append(currentCBAStandardsMFIDf)
        cbaPlotPointsMFIList.append(currentCBAPlotPointsMFIDf)
        cbaStandardsConcentrationList.append(
            currentCBAStandardsConcentrationDf)
        cbaPlotPointsConcentrationList.append(
            currentCBAPlotPointsConcentrationDf)

    fullFittingParametersDf = pd.concat(fittingParametersList)
    fullConcLODDf = pd.concat(concLODList)
    print(fullFittingParametersDf)
    print(fullConcLODDf)

    fullCBAStandardsMFIDf = pd.concat(cbaStandardsMFIList,
                                      keys=kitNames,
                                      names=['Kit Name'],
                                      axis=1)
    fullCBAPlotPointsMFIDf = pd.concat(cbaPlotPointsMFIList,
                                       keys=kitNames,
                                       names=['Kit Name'],
                                       axis=1)
    fullCBAStandardsConcentrationDf = pd.concat(cbaStandardsConcentrationList,
                                                keys=kitNames,
                                                names=['Kit Name'],
                                                axis=1)
    fullCBAPlotPointsConcentrationDf = pd.concat(
        cbaPlotPointsConcentrationList,
        keys=kitNames,
        names=['Kit Name'],
        axis=1)

    #Long-format tables: one row per (standard, cytokine, kit) with MFI and concentration columns
    fullCBAStandardsList = [
        fullCBAStandardsMFIDf.stack().stack(),
        fullCBAStandardsConcentrationDf.stack().stack()
    ]
    fullCBAPlotPointsList = [
        fullCBAPlotPointsMFIDf.stack().stack(),
        fullCBAPlotPointsConcentrationDf.stack().stack()
    ]
    fullCBAStandardsDf = pd.concat(fullCBAStandardsList,
                                   axis=1,
                                   keys=[yaxistitle, xaxistitle])
    fullCBAPlotPointsDf = pd.concat(fullCBAPlotPointsList,
                                    axis=1,
                                    keys=[yaxistitle, xaxistitle])

    plottingPointsDf = fullCBAPlotPointsDf.reset_index()
    plottingStandardsDf = fullCBAStandardsDf.reset_index()

    numCyt = len(pd.unique(plottingPointsDf['Cytokine']))
    if numCyt <= 10:
        fullpalette = sns.color_palette(sns.color_palette(), numCyt)
    else:
        fullpalette = sns.color_palette('hls', numCyt)
    g = sns.relplot(data=plottingPointsDf,
                    x=xaxistitle,
                    y=yaxistitle,
                    hue='Cytokine',
                    col='Kit Name',
                    kind='line',
                    col_order=pd.unique(plottingPointsDf['Kit Name']),
                    hue_order=pd.unique(plottingPointsDf['Cytokine']),
                    height=10,
                    palette=fullpalette)
    #Plot vertical lines at lower and upper concentration limits of detection
    colorDict = {}
    for j, cytokine in enumerate(pd.unique(plottingPointsDf['Cytokine'])):
        colorDict[cytokine] = fullpalette[j]
    for axis, kitName in zip(g.axes.flat,
                             pd.unique(plottingPointsDf['Kit Name'])):
        #Select this kit's standards with a mask built from the standards
        #table itself; the plot-points table has a different number of rows,
        #so its mask does not belong to plottingStandardsDf.
        kitStandardsDf = plottingStandardsDf[plottingStandardsDf['Kit Name']
                                             == kitName]
        currentpalette = []
        for cytokine in pd.unique(kitStandardsDf['Cytokine']):
            currentColor = colorDict[cytokine]
            currentpalette.append(currentColor)
            cytokineLODValues = fullConcLODDf.loc[cytokine, :]['Concentration']
            axis.axvline(x=cytokineLODValues['Lower'].values,
                         color=currentColor,
                         linestyle=':')
            axis.axvline(x=cytokineLODValues['Upper'].values,
                         color=currentColor,
                         linestyle=':')
        #Overlay the measured standards on the fitted curves
        sns.scatterplot(data=kitStandardsDf,
                        x=xaxistitle,
                        y=yaxistitle,
                        hue='Cytokine',
                        ax=axis,
                        legend=False,
                        palette=currentpalette)
        axis.set_xscale('log')
        axis.set_yscale('log')
    plt.savefig('plots/calibrationCurves-' + folderName + '-' +
                concUnitPrefix + '.png')
    #Save fitting parameters and LOD for curve fit for each cytokine
    with open(
            'misc/fittingParameters-' + folderName + '-' + concUnitPrefix +
            '.pkl', "wb") as f:
        pickle.dump(fullFittingParametersDf, f)
    with open(
            'misc/LODParameters-' + folderName + '-' + concUnitPrefix +
            '.pkl', "wb") as f:
        pickle.dump(fullConcLODDf, f)
# Finish the second count plot: re-apply its tick labels without rotation.
g2.set_xticklabels(labels, rotation=0)

# Panel 3 of the 2x2 grid: number of cars per transmission type.
fig.add_subplot(2, 2, 3)
g3 = sns.countplot(x='Transmission', data=dt_train)
loc, labels = plt.xticks()
g3.set_xticklabels(labels, rotation=0)

# Panel 4: number of cars per owner type.
fig.add_subplot(2, 2, 4)
g4 = sns.countplot(x='Owner_Type', data=dt_train)
loc, labels = plt.xticks()
g4.set_xticklabels(labels, rotation=0)
plt.show()

# New figure: price against three numeric features (three of the four grid
# cells are used here).
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(hspace=0.2, wspace=0.2)

ax1 = fig.add_subplot(2, 2, 1)
# Limit the x range to 0-100000 km; set before plotting on the current axes.
plt.xlim([0, 100000])
p1 = sns.scatterplot(x="Kilometers_Driven", y="Price", data=dt_train)
loc, labels = plt.xticks()
ax1.set_xlabel('Kilometer')

ax2 = fig.add_subplot(2, 2, 2)
#plt.xlim([0, 100000])
p2 = sns.scatterplot(x="Mileage_upd", y="Price", data=dt_train)
loc, labels = plt.xticks()
ax2.set_xlabel('Mileage')

ax3 = fig.add_subplot(2, 2, 3)
#plt.xlim([0, 100000])
p3 = sns.scatterplot(x="Engine_upd", y="Price", data=dt_train)
loc, labels = plt.xticks()
ax3.set_xlabel('Engine')
# Finish the HP-vs-Price panel: labels, right-hand y axis, fixed HP range.
axes[1, 1].set_xlabel('Price', fontsize=14)
axes[1, 1].set_ylabel('HP', fontsize=14)
axes[1, 1].yaxis.set_label_position("right")
axes[1, 1].yaxis.tick_right()
axes[1, 1].set(ylim=(40, 160))
plt.show()

# KM: distribution on the left, KM against Price on the right.
f, axes = plt.subplots(1, 2, figsize=(14, 4))
# NOTE(review): sns.distplot is deprecated in current seaborn; kept because
# replacing it changes what is drawn.
sns.distplot(dataset['KM'], ax=axes[0])
axes[0].set_xlabel('KM', fontsize=14)
axes[0].set_ylabel('Count', fontsize=14)
axes[0].yaxis.tick_left()

sns.scatterplot(x='Price', y='KM', data=dataset, ax=axes[1])
axes[1].set_xlabel('Price', fontsize=14)
axes[1].set_ylabel('KM', fontsize=14)
axes[1].yaxis.set_label_position("right")
axes[1].yaxis.tick_right()
plt.show()

# Fuel type: frequency table for later use, and a count plot.
fuel_list = Counter(dataset['FuelType'])
labels = fuel_list.keys()
sizes = fuel_list.values()

f, axes = plt.subplots(1, 2, figsize=(14, 4))
# Pass the column as x=: on seaborn >= 0.12 the first positional argument is
# `data`, so the positional form no longer means "count this vector".
sns.countplot(x=dataset['FuelType'], ax=axes[0], palette="Set1")
axes[0].set_xlabel('Fuel Type', fontsize=14)
# Finish the previous plot: rotate its x tick labels so they stay legible.
p.set_xticklabels(p.get_xticklabels(), rotation=90)
plt.tight_layout()
plt.show()

# In[27]:

# Sum of the HOUR column for every (district, offense group) pair,
# largest totals first.
df = data.groupby(["DISTRICT", "OFFENSE_CODE_GROUP"
                   ])["HOUR"].sum().reset_index().sort_values("HOUR",
                                                              ascending=False)
# Bare expression: only displays the frame when run as a notebook cell.
df

fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111)
# No ax= is passed, so seaborn draws on the current axes (the one just added).
p = sns.scatterplot(x="HOUR",
                    y="OFFENSE_CODE_GROUP",
                    hue="DISTRICT",
                    data=df,
                    palette="summer")
# NOTE(review): the y axis shows OFFENSE_CODE_GROUP, but the label below
# says "No. of Crimes Occurred" — confirm the intended label.
p.set_ylabel("No. of Crimes Occurred")
p.set_xlabel("Hours")
plt.tight_layout()
plt.show()

# In[41]:

# Share of records per year, drawn as a pie chart.
yrlbl = data['YEAR'].astype('category').cat.categories.tolist()
yrwisecount = data['YEAR'].value_counts()
sizes = [yrwisecount[year] for year in yrlbl]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=yrlbl, autopct='%1.1f%%', shadow=True)
def plot_classification(self, figsize=(16, 9)) -> Dict:
    """Plot predicted probability and gross loss for every target.

    One figure is built per target (per top-level column when ``self.df``
    has three column levels, otherwise a single figure for the whole frame).
    The upper panel shows the first prediction column with the probability
    cutoff as a horizontal line. The lower panel scatters the gross loss
    (clipped to at most 0), sized by the negated loss and coloured by
    outcome:

    * 1 (pale green): prediction above the cutoff and label above the cutoff
    * 2 (cerise): prediction at or below the cutoff and label above the cutoff
    * 0 (white): everything else

    Parameters
    ----------
    figsize : tuple, optional
        Size of each figure, by default (16, 9).

    Returns
    -------
    Dict
        Maps each target (``None`` for a single-target frame) to its closed
        matplotlib figure.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    from matplotlib import gridspec
    from pandas.plotting import register_matplotlib_converters

    # get rid of deprecation warning
    register_matplotlib_converters()

    pc = self.probability_cutoff
    plots = {}
    targets = unique_top_level_columns(
        self.df) if self.df.columns.nlevels == 3 else [None]

    for target in targets:
        # get target and frame
        df = self.df[target] if target is not None else self.df
        prediction = df[PREDICTION_COLUMN_NAME].iloc[:, 0]
        label = df[LABEL_COLUMN_NAME].iloc[:, 0]
        loss = df[GROSS_LOSS_COLUMN_NAME].iloc[:, 0]

        # define grid
        fig = plt.figure(figsize=figsize)
        gs = gridspec.GridSpec(2, 1, height_ratios=[1, 3])
        ax0 = plt.subplot(gs[0])
        ax1 = plt.subplot(gs[1])

        # plot probability
        sns.lineplot(x=range(len(df)), y=prediction, ax=ax0)
        ax0.hlines(pc, 0, len(df), color=sns.xkcd_rgb['silver'])

        # plot loss
        # `&` binds tighter than `>`, so each comparison needs its own
        # parentheses; without them the label was and-ed before comparing.
        color = pd.Series(0, index=df.index)
        color.loc[(prediction > pc) & (label > pc)] = 1
        color.loc[(prediction <= pc) & (label > pc)] = 2

        colors = {
            0: sns.xkcd_rgb['white'],
            1: sns.xkcd_rgb['pale green'],
            2: sns.xkcd_rgb['cerise']
        }
        # Only the classes that actually occur get a palette entry.
        palette = [
            colors[color_index] for color_index in np.sort(color.unique())
        ]

        sns.scatterplot(ax=ax1,
                        x=range(len(df)),
                        y=loss.clip(upper=0),
                        size=loss * -1,
                        hue=color,
                        palette=palette)

        # Close the figure so it is not displayed; the object is still returned.
        plt.close(fig)
        plots[target] = fig

    return plots
def plot(x, y):
    ''' plots points given coordinate lists and saves the figure

    x, y: equally long sequences of coordinates.
    The scatter plot is written to 'plot.png' in the working directory.
    '''
    # seaborn >= 0.12 only accepts x and y as keyword arguments.
    fig = sns.scatterplot(x=x, y=y).get_figure()
    fig.savefig('plot.png')
# Row mask: every condition in the list must hold (np.all over axis 0).
# Only the not-NaN condition is active; the others are kept for toggling.
# NOTE(review): `filter` shadows the builtin of the same name.
filter = np.all(
    np.array([
        joined_df['infections_based_on_cases_as_percent_of_infectious'].notna(
        ).values,
        #joined_df['infections_based_on_cases_as_percent_of_infectious'] < 10,
        #joined_df['case_based_infectious_population'] > 100,
    ]),
    axis=0)
data = joined_df.reset_index()[filter]

# Death-based infection percentage against maximum temperature:
# box plot, then scatter coloured by country, then by infectious population.
sns.boxplot(data=data,
            x='maxtempC',
            y='infections_based_on_deaths_as_percent_of_infectious')
sns.scatterplot(data=data,
                x='maxtempC',
                y='infections_based_on_deaths_as_percent_of_infectious',
                hue="Country/Region")
sns.scatterplot(data=data,
                x='maxtempC',
                y='infections_based_on_deaths_as_percent_of_infectious',
                hue="case_based_infectious_population")

# Time series for France only, indexed by date.
line_df = data[data['Country/Region'] == 'France'].set_index('date')[[
    'new_cases', 'infections_based_on_cases',
    'case_based_infectious_population'
]]
sns.lineplot(data=line_df)
# Second y axis sharing the x axis of the current plot.
ax2 = plt.twinx()
model = LinearRegression().fit(X.T, Y[i]) #model = LinearRegression().fit(X[2].reshape(-1,1), Y[0]) r_sq = model.score(X.T, Y[i]) #r_sq = model.score(X[2].reshape(-1,1), Y[2]) y_pred2 = model.predict(X2.T) #y_pred= model.predict(X[2].reshape(-1,1)) intercept, coefficients = model.intercept_, model.coef_ #--------------------------------------------------------------------------------------- x_labels = [0.1, 0.5, 1, 2, 5, 10] y_labels = [0, 0.2, 0.4, 0.6, 0.8, 1.0] fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True) #fig.suptitle(string1.format(c0,c1,c2,indir_01,indir_02,indir_21,r,R)) fig.suptitle("Linear regression for X") sns.scatterplot(ax=axes[0], x=y_pred2[:-2], y=Y2[i][:-2]) axes[0].set_title("x= y_pred; y=Y_ideal") sns.scatterplot(ax=axes[1], x=Y2[i][:-2], y=Y2[i][:-2]) axes[1].set_title("x=Y_ideal; y=Y_ideal") #sns.plot(ax=axes[2], x=y_pred2[:-2]-Y2[2][:-2],y=np.arange(10000)) #axes[2].set_title("x=delta t; error") plt.plot(y_pred2[:-2] - Y2[i][:-2], 'k') plt.xlabel('time_steps') plt.ylabel('error') plt.title('Linear regression')
) print(imdb['color'].value_counts(normalize=True)) # Normalizar os valores # print(imdb['director_name'].value_counts().tail(20)) # Lista os Diretores e quantos filmes fizeram e depois Pega as ultimas linhas a = 1 # Pegar a coluna das cores #### REfazer esta parte color_or_bw = imdb.query( "color in ['Color', ' Black and White']" ) # Criar nova tabela para estudar a importância/relação de filmes coloridos/PeB (remove os demais) color_or_bw = color_or_bw.dropna().query( 'budget > 0 | gross > 0' ) # Remove as linhas sem dados (dropna()) e as linhas com dados color_or_bw['color_0_ou_1'] = ( color_or_bw['color'] == 'Color' ) * 1 # Tentativa de Criar uma coluna que torna a variável Preto/Branco em binária sns.scatterplot(data=color_or_bw, x="color_0_ou_1", y="gross") plot.show() print('\n\nDataframe organizado por filmes coloridos Ou preto e branco') print(color_or_bw["color_0_ou_1"].value_counts()) # color_or_bw['color_0_ou_1'] = color_or_bw['color'] == 'Color' # Outra forma de fazer, mas não etá aceitando # df["b"] = df["value"] == 3 # Exemplo # print(f'\n\n Tamanho da variável: {len(color_or_bw)}') ### Verificar gasto e ganho apenas com os filmes dos USA, devido a imprecissão de conversão monetária imdb2 = imdb.drop_duplicates() print( f'\nHaviam {len(imdb)} colunas e após remover os duplicado, passamos a ter {len(imdb2)} colunas.\nOu seja, haviam {len(imdb)-len(imdb2)} filmes duplicados' ) imdb_usa = imdb2.query('country == "USA"') budget_gross = imdb_usa[[ 'budget', 'gross'
# Put the features and the target side by side in one frame.
x_merged_to_y = pd.concat([X_data_visualize, y], axis=1)
x_merged_to_y.shape

# In[12]:

print(x_merged_to_y)

# In[13]:

# Transaction amount against class, coloured by class.
ax = sns.scatterplot(data=x_merged_to_y, x="Amount", y="Class", hue="Class")

# In[15]:

# 2x4 grid: features V1..V8 against class, one panel per feature.
fig = plt.figure(figsize=(15, 8))
fig.subplots_adjust(hspace=0.6, wspace=0.8)
for i in range(1, 9):
    plt.subplot(2, 4, i)
    sns.scatterplot(data=x_merged_to_y, x=f"V{i}", y="Class", hue="Class")

# In[16]:
print(data.groupby(
    ['Team', 'Medal']).Medal.agg('count'))  # Calculates the medals won by each team
print(data.groupby(['Sex', 'Medal'
                    ]).Sex.agg('count'))  # Calculates the medals won by Sex
print(
    'The different types of sport for the athletes and their participation\n',
    data[data.Sport.fillna('None') != 'None'].Sport.value_counts())
# Calculates all the sports and their participation
print('Total number of women participants',
      len(data[data.Sex == 'F'].Name.unique()))  # Women participation
print('Total number of men participants',
      len(data[data.Sex == 'M'].Name.unique()))  # Men participation

# Number of (non-null) names per year, split by sex.
f_year_count = data[data.Sex == 'F'].groupby('Year').agg('count').Name
m_year_count = data[data.Sex == 'M'].groupby('Year').agg('count').Name
plot = (sns.scatterplot(data=m_year_count),
        sns.scatterplot(data=f_year_count))
plt.show()  # Data visualisation plot of male and female participants

# Selecting several columns from a groupby needs a list: the bare
# ['Weight', 'Height'] tuple form raises on pandas >= 2.0.
male_data = data[data.Sex == 'M']
print(
    male_data.groupby(['Sport'])[['Weight',
                                  'Height']].agg(['min', 'max', 'mean']))
female_data = data[data.Sex == 'F']
print(
    female_data.groupby(['Sport'])[['Weight',
                                    'Height']].agg(['min', 'max', 'mean']))
# It calculates the height and weight of male and female in the different sports

# First year each sport appears for men, sorted by that year. The old
# positional 'index' argument was only the (default) axis and is rejected
# by pandas >= 2.0, where sort_values is keyword-only.
sport_min_year = male_data.groupby('Sport').Year.agg(
    ['min', 'max'])['min'].sort_values()
year_count = Counter(sport_min_year)
year = list(year_count.keys())
def plot_residuals(actual, prediction):
    """Scatter the residuals (prediction - actual) against the actual values.

    The plot is drawn on the current axes and titled "residuals".
    """
    sns.scatterplot(x=actual, y=prediction - actual)
    plt.title("residuals")
# Project the data to two dimensions with t-SNE and attach the coordinates
# to the original frame, aligned on cdf's index.
tsne = TSNE(n_components=2, perplexity=15, random_state=1000)
data_tsne = tsne.fit_transform(sdf)
df_tsne = pd.DataFrame(data_tsne, columns=['x', 'y'], index=cdf.index)
dff = pd.concat([cdf, df_tsne], axis=1)

# Show the dataset
sns.set()
fig, ax = plt.subplots(figsize=(18, 11))
with sns.plotting_context("notebook", font_scale=1.5):
    # Marker size encodes 'Age'.
    # NOTE(review): a palette is passed without a hue variable — confirm
    # whether a hue was intended.
    sns.scatterplot(x='x',
                    y='y',
                    size='Age',
                    sizes=(30, 400),
                    palette=sns.color_palette("husl", 2),
                    data=dff,
                    ax=ax)
ax.set_xlabel(r'$x$', fontsize=14)
ax.set_ylabel(r'$y$', fontsize=14)
plt.show()

# Perform the preliminary analysis
# Accumulators filled by code outside this snippet.
n_clusters = []
n_noise_points = []
silhouette_scores = []
calinski_harabaz_scores = []
norm_count = counts[p] if p in shuffled_counts: shuf_count = shuffled_counts[p] diff_dict['predicate'].append(p) diff_dict['normal'].append(norm_count) diff_dict['difference'].append(norm_count - shuf_count) sns.barplot(diff_dict['predicate'], diff_dict['difference']) plt.xticks(rotation=90) plt.tight_layout() plt.ylabel('Difference (Normal Count - Shuffled Count)') plt.xlabel('Predicate') plt.savefig('Figures/AA_TOF_norm_shuffled_change.png') plt.show() sns.scatterplot(diff_dict['normal'], diff_dict['difference']) plt.xticks(rotation=90) plt.tight_layout() plt.ylabel('Normal vs Difference') plt.xlabel('Normal Count') plt.savefig('Figures/AA_TOF_difference_v_norm_scatter.png') plt.show() # plot path lengths: length_df = pd.concat( [pd.DataFrame(path_lengths), pd.DataFrame(shuffled_path_lengths)]) sns.boxplot(length_df['type'], length_df['length']) plt.savefig('Figures/AA_TOF_shuffled_length_boxplots.png') plt.show()
#print(columnGene2) #print(len(columnCodonPosition2)) #print(len(columnDepth2)) #print(len(columnGene2)) d = { "CodonPosition": columnCodonPosition2, 'CodonDepth': columnDepth2, "Genetype": columnGene2 } df = pd.DataFrame(data=d) #print(columnCodonPosition) #cmap = sns.cubehelix_palette(dark=color, light=.8, as_cmap=True) sns.scatterplot(x="CodonPosition", y="CodonDepth", hue="Genetype", palette=customPalette, s=30, alpha=0.9, data=df).set_title("strain " + listofSamples[x]) #print("test") #plt.show() plt.savefig("strain " + "_" + listofSamples[x] + "_" + str(gene) + '.png') plt.clf() #print(df) #df.to_csv("test2.csv") #break # cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True) # cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True) #d = {"CodonPosition": columnCodonPosition, 'CodonDepth':columnDepth, "Genetype":columnGene}
"""
Scatterplot with continuous hues and sizes
==========================================

_thumb: .45, .45

"""
import seaborn as sns

sns.set()

# Example planets dataset shipped with seaborn
planets = sns.load_dataset("planets")

# Continuous colormap for the discovery year; marker area follows the mass
cmap = sns.cubehelix_palette(rot=-.2, as_cmap=True)
ax = sns.scatterplot(
    data=planets,
    x="distance",
    y="orbital_period",
    hue="year",
    size="mass",
    sizes=(10, 200),
    palette=cmap,
)
iowa_train = pd.read_csv('train.csv') iowa_train.head() iowa_train['SalePrice'].describe() # ALl prices are greater than zero # Looking at Skewness and Kurtosis sns.distplot(iowa_train['SalePrice']) # Sknewness and Kurtosis print(iowa_train['SalePrice'].skew()) print(iowa_train['SalePrice'].kurt()) # Visualzing the Numeric Variables Through ScatterPlots # Extracting Numeric Variables from Data Frame numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] num_iowatrain = iowa_train.select_dtypes(include = numerics) num_iowatrain.head() num_iowatrain.columns Gr = sns.scatterplot(x = 'GrLivArea', y = 'SalePrice', data = num_iowatrain) Lot = sns.scatterplot(x = 'LotArea', y = 'SalePrice', data = num_iowatrain) Mas = sns.scatterplot(x = 'MasVnrArea', y = 'SalePrice', data = num_iowatrain
"""
Scatterplot with categorical and numerical semantics
====================================================

_thumb: .45, .5

"""
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

# Example diamonds dataset shipped with seaborn
diamonds = sns.load_dataset("diamonds")

# Square figure with the left and bottom spines removed
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(f, left=True, bottom=True)

# Colour encodes clarity (in the order listed here), marker size encodes depth
clarity_ranking = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
sns.scatterplot(
    data=diamonds,
    x="carat",
    y="price",
    hue="clarity",
    hue_order=clarity_ranking,
    size="depth",
    sizes=(1, 8),
    palette="ch:r=-.2,d=.3_r",
    linewidth=0,
    ax=ax,
)
def plot_data(data):
    """Scatter the first two feature rows, coloured by the first label row.

    ``data[0]`` holds the features (``X[0]`` on the x axis, ``X[1]`` on the
    y axis) and ``data[1]`` holds the labels (``Y[0]`` selects the colour).
    The figure is shown immediately.
    """
    X, Y = data[0], data[1]
    # seaborn >= 0.12 only accepts x and y as keyword arguments.
    sns.scatterplot(x=X[0], y=X[1], hue=Y[0])
    plt.show()