def evaluate_distribution_matrix(dis_matrix: sp.spmatrix,
                                 show: bool = True,
                                 tell: bool = True,
                                 save_path: str = None,
                                 row_name: str = "column",
                                 column_name: str = "row"):
    """
    Compute statistics over the columns and then the rows of a sparse matrix.

    Two passes are made over ``dis_matrix``: the first walks its columns, the
    second its rows. For every vector the number and fraction of zero
    entries, the minimum, maximum, mean, median and entropy are collected.
    Each list of per-vector values is summarised with ``stats_of_list``;
    optionally the summaries are written to a CSV file and the per-vector
    values are drawn as boxplots. The matrix itself is not modified and no
    pruning is performed.

    :param dis_matrix: distribution matrix to be evaluated.
    :param show: whether to show boxplots
    :param tell: whether to print statistics
    :param save_path: prefix of the files to save (``_<name>.csv`` and
        ``_<name>.png`` are appended), default is None, meaning no saving
    :param row_name: name of rows for printing
    :param column_name: name of columns for printing
    :return: tuple with one list per pass (columns first, then rows); each
        list holds the ``stats_of_list`` result of every statistic in
        ``stat_names`` order, followed by the number of empty vectors.

    NOTE(review): the defaults ``row_name="column"`` / ``column_name="row"``
    look swapped; they are kept as-is because callers may rely on them.
    """
    sb.set_theme(style="whitegrid")
    return_stats_combined = ()
    stat_names = [
        "Non-Zero", "Zero", "Zeros%", "Minimums", "Maximums", "Averages",
        "Medians", "Entropies"
    ]

    # loop over A-B distribution, then B-A distribution
    for ab in range(2):
        return_stats = []
        non_zeros, num_zeros, per_zeros, empties = [], [], [], []
        avgs, maxs, mins, medians, entropies = [], [], [], [], []

        # Fill out statistics for each row/column.
        # ab == 0 iterates over columns (shape[1]), ab == 1 over rows.
        max_loop = 1 if ab == 0 else 0
        for i in tqdm(range(dis_matrix.shape[max_loop])):
            vec = dis_matrix.getcol(i) if ab == 0 else dis_matrix.getrow(i)
            length = vec.shape[ab]
            non_vec = vec.nonzero()[ab]
            zero_count = length - len(non_vec)

            non_zeros.append(len(non_vec))
            num_zeros.append(zero_count)
            per_zeros.append(zero_count / length)
            avgs.append(vec.mean())
            maxs.append(vec.max())
            mins.append(vec.min())

            # Densify once and reuse for both the median and the entropy.
            dense = vec.toarray()
            medians.append(np.median(dense))
            if len(non_vec) == 0:
                empties.append(i)

            vec_array = dense.T[0] if ab == 0 else dense[0]
            # entropy is set to 1 if distribution is all zeros (which
            # returns NaN); it is computed only once per vector.
            ent = entropy(vec_array, base=length)
            entropies.append(1 if np.isnan(ent) else ent)

        # Print statistics
        print_name = (f"{column_name}-{row_name}"
                      if ab == 0 else f"{row_name}-{column_name}")
        if tell:
            print(print_name)
            print(f"{len(empties)} empty vectors")

        stats = {
            stat_names[0]: non_zeros,
            stat_names[1]: num_zeros,
            stat_names[2]: per_zeros,
            stat_names[3]: mins,
            stat_names[4]: maxs,
            stat_names[5]: avgs,
            stat_names[6]: medians,
            stat_names[7]: entropies
        }

        # Make stats ready for return
        for name, stat in stats.items():
            return_stats.append(stats_of_list(stat, name=name, tell=tell))
        return_stats.append(len(empties))
        return_stats_combined += (return_stats, )

        # Save stats. zip() stops after the named statistics, so the trailing
        # empty-vector count is not written to the file.
        if save_path is not None:
            with open(save_path + "_" + print_name + '.csv', "w+") as f:
                for name, stat in zip(stats.keys(), return_stats):
                    f.write(f"{name}, " + ", ".join(str(x) for x in stat) +
                            "\n")

        # Show stats
        if show or save_path is not None:
            # remove absolute number zero statistics, as they are not in
            # range [0,1]
            stats.pop(stat_names[0])
            stats.pop(stat_names[1])
            df = pd.DataFrame(data=stats)
            box = df.boxplot()
            box.set_title(print_name)
            if save_path is not None:
                plt.savefig(save_path + "_" + print_name + ".png")
            if show:
                plt.show()
            else:
                plt.clf()

    return return_stats_combined
def plot_toy(
    data,
    output_dir,
    annotate=False,
    site=None,
    zenith=None,
    obs_times=None,
    x_tick_labels="auto",
    y_tick_labels="auto",
    min_value=None,
    max_value=None,
    color_scheme="viridis",
    color_scale=None,
    as_percent=False,
    filetype="png",
    subtitle=None,
    filename_suffix="",
    show_only=False,
):
    """Draw a heatmap of the ``percent`` column over exposure time and delay.

    ``analyze`` is called on ``data`` with the given filters; its result is
    pivoted (rows: exposure time, columns: delay, values: percent) and drawn
    with ``sns.heatmap``. The figure is either saved below ``output_dir`` or
    shown interactively.

    :param data: input passed to ``analyze``; its index is used for the
        ``n=`` count in the title when ``subtitle`` is given.
    :param output_dir: directory the image is written to.
    :param annotate: write the cell values into the heatmap.
    :param site: site filter; ``None`` or ``"all"`` (any case) means no
        filter.
    :param zenith: zenith filter; ``None`` or ``"all"`` means no filter.
    :param obs_times: forwarded to ``analyze``.
    :param x_tick_labels: forwarded to ``sns.heatmap`` as ``xticklabels``.
    :param y_tick_labels: forwarded to ``sns.heatmap`` as ``yticklabels``.
    :param min_value: lower colour limit.
    :param max_value: upper colour limit.
    :param color_scheme: colormap name.
    :param color_scale: ``"log"`` for a logarithmic colour norm; any other
        value is passed to the heatmap unchanged as ``norm``.
    :param as_percent: multiply the ``percent`` column by 100 before plotting.
    :param filetype: file extension of the saved image.
    :param subtitle: optional extra text for the title.
    :param filename_suffix: appended to the output file name.
    :param show_only: show the plot instead of saving it.
    """
    sns.set_theme()

    # "all" is an alias for "no filter". str() keeps the default None from
    # raising AttributeError on .lower().
    if str(zenith).lower() == "all":
        zenith = None
    if str(site).lower() == "all":
        site = None

    df = analyze(data, site=site, zenith=zenith, obs_times=obs_times)
    df.rename(columns={"obs_time": "exposure time"}, inplace=True)

    if as_percent:
        df["percent"] = df["percent"] * 100

    # Keyword arguments: positional use of DataFrame.pivot is not accepted by
    # pandas 2.x.
    pivot = df.pivot(
        index="exposure time", columns="delay", values="percent"
    ).astype(float)

    _, ax = plt.subplots(figsize=(9, 9))

    cbar_kws = {
        "label": "Percentage of GRBs detected",
        "orientation": "vertical"
    }

    if color_scale == "log":
        from matplotlib.colors import LogNorm

        color_scale = LogNorm(vmin=min_value, vmax=max_value)

    heatmap_kwargs = {
        "ax": ax,
        "cmap": color_scheme,
        "vmin": min_value,
        "vmax": max_value,
        "xticklabels": x_tick_labels,
        "yticklabels": y_tick_labels,
        "cbar_kws": cbar_kws,
        "norm": color_scale,
    }
    if annotate:
        # Annotated cells get integer formatting and thin separators.
        heatmap_kwargs.update(annot=True, fmt=".0f", linewidths=0.5)
    else:
        heatmap_kwargs["annot"] = False

    heatmap = sns.heatmap(pivot, **heatmap_kwargs)
    heatmap.invert_yaxis()
    heatmap.set_facecolor("#1C1C1C")

    # From here on site/zenith are display strings used in title and filename.
    if not site:
        site = "Both sites"
    else:
        site = f"CTA {site.capitalize()}"

    if not zenith:
        zenith = "all zeniths"
    else:
        zenith = f"z{zenith}"

    if subtitle:
        plt.title(
            f"GRB Detectability for {site}, {zenith}: {subtitle} (n={len(np.unique(data.index))})"
        )
    else:
        plt.title(f"GRB Detectability for {site}, {zenith}")

    fig = heatmap.get_figure()

    if not show_only:
        output_file = f"{output_dir}/GW_{site.replace(' ','_')}_{zenith.replace(' ','_')}{filename_suffix}.{filetype}"
        fig.savefig(output_file)
        # print(f"Saved plot {output_file}")
    else:
        plt.show()
from matplotlib import pyplot as plt
import seaborn as sns

# Font settings forwarded to the matplotlib label/title calls below.
font_cs = {'fontname': 'Consolas'}


def entropy(x_, b=2):
    """Return ``-sum(x_ * log(x_))`` over all elements of ``x_``.

    The logarithm is base 2 when ``b == 2``; for any other value of ``b``
    the natural logarithm is used (``b`` is not otherwise read).
    """
    if b == 2:
        h_x = -np.sum(x_ * np.log2(x_))
    else:
        h_x = -np.sum(x_ * np.log(x_))
    return h_x


# NOTE(review): `np` and `pd` are used below but not imported in the visible
# part of the file - presumably imported elsewhere; confirm.
x = np.arange(0.01, 1.00, 0.01)
# NOTE(review): entropy() sums over the whole array, so `h` is a single
# number and the DataFrame column repeats it for every x - confirm that a
# constant line is what is meant to be plotted.
h = entropy(x)
df = pd.DataFrame(h, x, columns=['entropy(x)'])
# df = pd.DataFrame(h, x1, columns=['x*-logP(x)+(1-x)*(-logP(1-x))'])
print(df)
sns.set_theme(style='whitegrid')
sns.lineplot(data=df)
plt.xlabel('x', **font_cs)
plt.ylabel('value', **font_cs)
plt.title('Entropy', **font_cs)
plt.show()
async def main():
    """
    Main function of the application.

    Loads and smooths the training time series, trains (or loads) every
    model concurrently, prints test-set forecasts, then forecasts on data
    pulled from Influx and plots the averaged forecast next to the latest
    readings.

    :return: Nothing.
    """
    print_header()
    timer_main = Timer()
    config = default_config()

    # read and prepare dataset for training
    df_timeseries_complete = load_dataset("zurich_adapter", config)
    df_timeseries = chop_first_fringe(
        df_timeseries_complete)  # Chop first improper filled rows
    imputed_timeseries = impute_simple_imputer(df_timeseries)
    smooth_timeseries = moving_average(imputed_timeseries)
    smooth_timeseries.dropna(
        inplace=True
    )  # Make sure there really is no empty cell anymore, else drop row

    # Split training/testing data in 80%/20%
    df_train_val, df_test = temporal_train_test_split(smooth_timeseries,
                                                      test_size=.20)

    # Define all models at our disposal
    # NOTE(review): the third model is named "expsmooting" while the forecast
    # dictionaries below use the key "expsmoothing" - confirm whether the
    # name is looked up anywhere.
    models = [
        ModelHolder(name="arima", trainer=train_or_load_ARIMA, config=config),
        ModelHolder(name="autoarima",
                    trainer=train_or_load_autoARIMA,
                    config=config),
        ModelHolder(name="expsmooting",
                    trainer=train_or_load_expSmoothing,
                    config=config),
        ModelHolder(name="lstm", trainer=train_or_load_LSTM, config=config),
        ModelHolder(name="lstm_seq", trainer=train_or_load_LSTM, config=config)
    ]

    # Train the models: one worker thread per model, awaited together.
    trained_models = await gather(*[
        to_thread(train_model, model=model, data=df_train_val)
        for model in models
    ])
    # The comprehension is used only for its side effect; the resulting list
    # is discarded.
    [model.model.store(model.config) for model in trained_models
     ]  # Stores if not existing. Does NOT OVERWRITE!!!

    # Test the generalization performance of our models
    forecast_test = [
        model.model.predict(x=df_test, fh=5) for model in trained_models
    ]
    print(forecast_test)
    # plt.plot(forecast_test[0][['Zch_Stampfenbachstrasse.PM10', 'Zch_Stampfenbachstrasse.PM10_Pred']])
    # plt.plot(forecast_test[0][['Zch_Stampfenbachstrasse.Humidity', 'Zch_Stampfenbachstrasse.Temperature']])
    # plt.show()

    logger.info(f"Script completed in {timer_main}.")
    logger.info("Terminating gracefully...")

    logger.info("start predicting new time")
    # Placeholder values; the dictionary is rebuilt inside the block below.
    forecast_dict = {
        "arima": pd.Series(),
        "autoarima": pd.Series(),
        "expsmoothing": pd.Series(),
        "lstm": pd.Series(),
        "lstm_seq": pd.Series()
    }
    with InfluxSensorData(config=config, name="influx") as client:
        # Load the data from the server
        data = client.get_data().rename(
            columns={
                "humidity": "Live.Humidity",
                "pm10": "Live.PM10",
                "temperature": "Live.Temperature"
            })
        imputed_data = impute_simple_imputer(data)  # Impute
        avg_data = moving_average(imputed_data)  # Average input
        logger.debug("Forecasting")
        forecast_list = [
            model.model.predict(x=avg_data, fh=5) for model in trained_models
        ]  # Make predictions
        logger.info(forecast_list)
        # NOTE(review): the "lstm" and "lstm_seq" entries read the
        # "Live.PM10_Pred" column of forecasts 0 and 1, not 3 and 4 (see the
        # "was item" remarks) - confirm this substitution is still wanted.
        forecast_dict = {
            "arima":
            forecast_list[0],
            "autoarima":
            forecast_list[1],
            "expsmoothing":
            forecast_list[2],
            "lstm":
            forecast_list[0].iloc[:, forecast_list[0].columns.
                                  get_loc("Live.PM10_Pred")],  # was item 3
            "lstm_seq":
            forecast_list[1].iloc[:, forecast_list[1].columns.get_loc(
                "Live.PM10_Pred")]  # was item 4
        }

    forecast = pd.DataFrame(data=forecast_dict)
    logger.debug(forecast)
    # Row-wise mean across all model columns, limited to the first 50 rows.
    forecast = forecast.mean(axis=1).head(n=50)
    forecast.name = "forecast"
    logger.info(f"Forcasting finished with forecast value\n {forecast}")

    # Reconfigure the Influx query before fetching the comparison data.
    config["influx"]["limit"] = "150"
    config["influx"][
        "drops"] = '["pm1", "pm4.0", "pm2.5", "result", "table", "_time", "humidity", "temperature"]'
    with InfluxSensorData(config=config, name="influx") as client:
        # Load the data from the server
        data = client.get_data().tail(n=50)
        # Re-index from 0 and keep only the first column.
        data.index = range(len(data))
        data = data.iloc[:, 0]
        print(f"data {data}")

    sns.set_theme(style="darkgrid")
    sns.lineplot(data=[forecast, data])
def presentation_frequency_plot_figures(outcome):
    """Draw a frequency (count) plot of EMS call outcomes split by shift.

    The ``outcome`` argument selects what is plotted from ``df_q4``:

    * ``'Overall'`` - counts per fire station for all records.
    * ``'Outcome'`` - counts per patient outcome for all records.
    * ``'Top 4 Outcomes'`` - counts per patient outcome, restricted to four
      hard-coded outcomes, on a smaller figure with a smaller font.
    * any other value - treated as a single patient outcome; the records are
      filtered to that outcome and counted per fire station.
    """
    # Settings shared by every mode unless overridden below.
    x_label = 'Counts'
    y_label = 'Fire Station'
    y_sel = 'FireStation'
    gen_fig_size = (20, 20)
    gen_font_size = 24
    df = df_q4.copy(deep=True)

    if outcome in ('Outcome', 'Top 4 Outcomes'):
        # Both patient-outcome views share title, axis label and column.
        title = 'Patient Outcome Frequency'
        y_label = 'Patient Outcome'
        y_sel = 'PatientOutcome'
        if outcome == 'Outcome':
            sns.set_theme(style=seaborn_theme)
        else:
            out_list = [
                'Treated & Transported', 'Patient Refusal (AMA)',
                'No Treatment/Transport Required',
                'Canceled (Prior to Arrival)'
            ]
            in_top_four = df_q4['PatientOutcome'].isin(out_list)
            df = df_q4[in_top_four].copy(deep=True)
            gen_fig_size = (10, 5)
            gen_font_size = 18
    elif outcome == 'Overall':
        title = outcome + ' Fire Station Outcomes Across Shift'
    else:
        # A specific outcome name: keep only its records.
        title = outcome + ' Outcome Across Fire Station and Shift'
        matches_outcome = df_q4['PatientOutcome'] == outcome
        df = df_q4[matches_outcome].copy(deep=True)

    # Plot
    plt.subplots(figsize=gen_fig_size)
    sns.set_theme(style=seaborn_theme)

    shift_order = ['A - Shift', 'B - Shift', 'C - Shift']
    # Most frequent category first.
    category_order = df[y_sel].value_counts().index
    ax = sns.countplot(data=df,
                       y=y_sel,
                       hue='Shift',
                       palette=palette_sel_distinct,
                       order=category_order,
                       hue_order=shift_order)

    ax.set_title(title, fontsize=gen_font_size)
    ax.set_xlabel(x_label, fontsize=gen_font_size)
    ax.set_ylabel(y_label, fontsize=gen_font_size)
    ax.tick_params(labelsize=gen_font_size)
    ax.legend(fontsize=gen_font_size, loc='lower right')
def space_invaders(name_plot='r', ci='t', include_css=False, holder_melt=None):
    '''Point plot of one metric table with manually drawn error bars.

    name_plot: key into holder_melt, one of
        ['r', 'sd', 'iqr', 'range', 'no_norm'].
    ci: label of the confidence method ('t' vs 'bs'); only used in the title.
    include_css: keep rows whose 'metric' is 'CSS' when True.
    holder_melt: mapping from name_plot to a table with 'metric', 'model',
        'ci' and the plotted value column; required.

    Returns the pyplot module so the caller can show or save the figure.
    '''
    assert holder_melt is not None
    tables = holder_melt.copy()

    sns.set_theme(context='poster',
                  style='darkgrid',
                  font='sans-serif',
                  color_codes=True)
    plt.rcParams["figure.figsize"] = (16, 12)

    # 'r' is plotted from the correlation column, everything else from the
    # matching rmse column.
    name_y = 'mean_corr' if name_plot == 'r' else 'rmse_' + name_plot

    selected = tables[name_plot]
    if include_css:
        errors = selected.ci
        data = selected
    else:
        not_css = selected['metric'] != 'CSS'
        errors = selected.loc[not_css, 'ci']
        data = selected.loc[not_css, :]

    ax = sns.pointplot(x='metric',
                       y=name_y,
                       hue='model',
                       style='metric',
                       data=data,
                       dodge=0.6,
                       join=False,
                       ci=None,
                       scale=1,
                       palette=sns.color_palette('Paired', data.shape[0]))

    # Find the x,y coordinates for each drawn point.
    x_coords = []
    y_coords = []
    for collection in ax.collections:
        for point_x, point_y in collection.get_offsets():
            x_coords.append(point_x)
            y_coords.append(point_y)

    # The error values must be in the same order as the points collected
    # above.
    ax.errorbar(x_coords,
                y_coords,
                yerr=errors,
                fmt='none',
                c='black',
                elinewidth=4,
                markeredgewidth=4,
                zorder=-1,
                capsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')

    if name_plot == 'r':
        title = 'r ' + 'confidence calculated using ' + ci
    elif name_plot == 'no_norm':
        title = 'rmse not normed ' + 'confidence calculated using ' + ci
    else:
        title = ('rmse normed by ' + name_plot +
                 ' confidence calculated using ' + ci)
    plt.title(title)

    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.1)
    return plt
# Tag the second frame, stack both frames and keep only steps below 9.
df_opp["Type"] = "Opp"
df = pd.concat([df_same, df_opp], ignore_index=True)
df = df[df["Step"] < 9]
# Turn the numeric step into a label of the form "$i$+<step>".
df["Step"] = r"$i$+" + df["Step"].astype(int).astype(str)

# NOTE(review): barWidth, nodes and colors are only referenced by the
# commented-out code in this part of the file.
barWidth = 0.35
nodes = [0, 0.25, 0.5, 0.75, 1]
colors = ["#FDE725FF", "#440154FF", "#FDE725FF"]
# Regular red and green
# colors = ["#6BE585", "#DD3E54", "#6BE585"]
# degree_cmap = LinearSegmentedColormap.from_list("", list(zip(nodes, colors)))
# degree_cmap = LinearSegmentedColormap.from_list("", list(zip(nodes, colors)))
# Colormap made by concatenating the reversed and the regular viridis colours.
degree_cmap = mpl.colors.ListedColormap(mpl.cm.get_cmap('viridis_r').colors + mpl.cm.get_cmap('viridis').colors)
# print(degree_cmap.colors)
# print(len(degree_cmap.colors))

# grid = plt.GridSpec(3,6, wspace=0.4, hspace=0.1)
sns.set_theme(style="white", context="paper")
# sns.set(fontsize=14)
#print(df.groupby(["Type","Step"])["Step"].count())
# Five stacked axes: three tall panels separated by two thin ones.
f, axes = plt.subplots(5,1,figsize=(7, 9), gridspec_kw={"height_ratios":[72,1,72,1,72]})
# h = sns.histplot(df, x="Step", color="grey", hue="Type", discrete=True, multiple="dodge", shrink=.8, ax=axes[2])
# h = sns.histplot(df, x="Step", color="grey", hue="Type", discrete=True, hue_order=["Opp","Same"], multiple="dodge", shrink=.8, ax=axes[2])
# axes[2].get_legend().remove()
#axes[2].set_alpha(0.8)

# Lookup tables keyed 0-2 (hatch pattern and fill setting).
hatches = {0:"///", 1:"", 2:"|||"}
fill = {0:"#FFFFFF", 1:False, 2:False}
# Distinct colors 0-8
distinct_colors = ["#FFFFFF", "#773712", "#B3B3B3", "#EE7F31", "#FBE44D", "#B3B3B3", "#B3B3B3", "#D5C4AB", "#B3B3B3", "#B3B3B3"]
wheel_colors = ["#FFFFFF", "#773712", "#B3B3B3", "#EE7F31", "#FBE44D", "#B3B3B3", "#B3B3B3", "#D5C4AB", "#FFFFFF", "#B3B3B3"]
# for i in range(2):
#     for j in range(8):
#         color_index = (j*100 + 100) % 360
help='position on the hyperparameter list') parser.add_argument('-p', '--params', help='path to a list of hyperparameters') parser.add_argument('-r', '--random', default=123, type=int, help='random seed') parser.add_argument('session', help='name of the session') parser.add_argument('-v', '--verbose', action='count', default=0) args = parser.parse_args() # set matplotlib backend to batch use sns.set_theme(context='paper', palette='tab10') mpl.use('agg') # create relevant directory structure img_dir = os.path.join(args.session, 'img') mod_dir = os.path.join(args.session, 'mod') log_dir = os.path.join(args.session, 'log') os.makedirs(img_dir, exist_ok=True) # image directory os.makedirs(mod_dir, exist_ok=True) # models directory os.makedirs(log_dir, exist_ok=True) # log directory # create log file level = logging.ERROR if args.verbose == 1: level = logging.INFO
import csv import matplotlib.pyplot as plt import seaborn as sns sns.set_theme(color_codes=True) import numpy as np #creates lists for the ratings and shot quality that will be used to graph teamOffensiveRatings = [] teamOffShotQuality = [] teamDefensiveRatings = [] teamDefShotQuality = [] #opens CSV file for offense with open('pbpstats 2020-2021 Team Data Offense.csv') as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') line_count = 0 for row in csv_reader: #skips first line if line_count == 0: pass line_count += 1 else: #row 3 has total points data points = int(row[3]) #row 2 has total possessions data possessions = int(row[2]) #row 20 has shot quality data shotQuality = float(row[20]) #calculates offensive rating offRating = (points / possessions) * 100
if provide_mean_lengths == True: return L_32, L_43, D_32, D_43, unreal_D43, unreal_D32 else: return DSD #### TRIAL USAGE ######### DSD = CLDtoDSDMethod1('Experiment 2020-11-27 10-34 Default.csv', 'Last Time') DSDm2 = CLDtoDSDMethod2('Experiment 2020-11-27 10-34 Default.csv', 'Last Time') L_32, L_43, D_32, D_43, unreal_D43, unreal_D32 = CLDtoDSDMethod2( 'Experiment 2020-11-27 10-34 Default.csv', 'Last Time', provide_mean_lengths=True) CLD = ExperimentalCLD('Experiment 2020-11-27 10-34 Default.csv', 'Last Time') sns.set_theme(context='paper', style='ticks', font_scale=2, palette='bright') sns.set_style({'font.family': 'serif', 'font.serif': 'Times New Roman'}) plt.figure(figsize=(10, 10)) plt.semilogx(CLD.iloc[:, 0].values, CLD.iloc[:, 1].values, label='CLD', linewidth=2, color='black') plt.semilogx(DSD['Diameter'].values, DSD['Counts'].values, label='Method 1', linewidth=2, color='red') plt.semilogx(DSDm2['Diameter'].values, DSDm2['Counts'].values, label='Method 2',
# In[207]: #making a pivot table IsBorrowerHomeowner is column and row IncomeRange ct_counts = ct_counts.pivot(index='IncomeRange', columns='IsBorrowerHomeowner', values='count') # In[208]: fig, ax = plt.subplots(figsize=[14.70, 8.27]) sb.heatmap(ct_counts, annot=True, fmt='d', ax=ax) # In[209]: sb.set_theme(style="darkgrid") fig, ax = plt.subplots(figsize=[14.70, 8.27]) sb.countplot(data=df, x='IsBorrowerHomeowner', hue='IncomeRange', ax=ax) # #### different method is used to observe the relation between owning home and income range and liky found that the range of 100k dollar is person who owning home is more . # # In[210]: df['StatedMonthlyIncome'].head() # In[211]: #regression plot to show the line which represent the correlation . fig, ax = plt.subplots(figsize=[14.70, 8.27]) sb.regplot(data=df,
import math import os import h5py from Bio import SeqIO from tqdm import tqdm import seaborn as sns import pandas as pd import matplotlib.pyplot as plt import numpy as np sns.set_theme(style='whitegrid', rc={"xtick.bottom": True}, font_scale=0.9, font='Verdana') plt.rcParams["figure.figsize"] = (8, 4) plt.rcParams['figure.dpi'] = 300 font = {'family': 'normal', 'weight': 'bold', 'size': 10} plt.rc('font', **font) df = pd.read_csv('../data/results/paper_tables.CSV') remapping = { 'Baseline': 'Majority', 'LocTree2': 'LocTree2', 'MultiLoc2': 'MultiLoc2', 'SherLoc2': 'SherLoc2', 'Yloc': 'Yloc', 'CELLO': 'CELLO', 'iLoc-Euk': 'iLoc-Euk', 'WoLF PSORT': 'WolF PSORT',
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import streamlit as st

# Transparent axes background on top of the "white" style.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})


def label(x, color, label):
    """Write ``label`` in bold black at the left of the current axes.

    ``x`` and ``color`` are accepted but not used.
    """
    ax = plt.gca()
    ax.text(0, .4, label, fontweight="bold", color='black',
            ha="left", va="center", transform=ax.transAxes)


df = pd.read_csv('data/nsi.csv')
df = df.set_index("date")

# Column prefixes and their display names, index-aligned.
vars = ["st", "wl", "ml", "gm"]
labels = ["Stress", "Workload", "Motivation", "Mood"]

mainvar = "Stress"
v = vars[labels.index(mainvar)]

# Raw string so that \d reaches the regex engine without triggering an
# invalid-escape warning in the string literal.
tdf = df.filter(regex=v + r'_\d')  # I love regex
tdf["date"] = tdf.index
# Wide columns "<v>_<n>" become rows with the numeric suffix in "score".
tdf = pd.wide_to_long(tdf, stubnames=v + "_", i="date", j="score").reset_index().rename(columns={v + "_": v}).dropna()
# Repeat every row as many times as the value stored in column `v`.
tdf = tdf.loc[tdf.index.repeat(tdf[v])].reset_index()
tdf["date"] = pd.to_datetime(tdf["date"], format='%d/%m/%Y')
tdf["date"] = tdf["date"].dt.strftime('%Y-%m-%d')
tdf = tdf.sort_values(by='date', ascending=False)
import argparse import csv from functools import partial import gzip from pathlib import Path import sys import matplotlib.pyplot as plt import numpy as np import seaborn as sns from tqdm import tqdm sns.set_theme(style='white', context='paper') def extract_scores(scores_csv, score_col=1, title_line=True): if Path(scores_csv).suffix == '.gz': open_ = partial(gzip.open, mode='rt') else: open_ = open with open_(scores_csv) as fid: reader = csv.reader(fid) if title_line: next(reader) scores = [] for row in tqdm(reader): try: score = float(row[score_col]) except ValueError: continue
"""
Smooth kernel density with marginal histograms
==============================================

_thumb: .48, .41
"""
import seaborn as sns
sns.set_theme(style="white")

# Example dataset loaded through seaborn.
df = sns.load_dataset("penguins")

# Joint grid with no gap between the joint and marginal axes.
g = sns.JointGrid(data=df, x="body_mass_g", y="bill_depth_mm", space=0)
# Filled 2D KDE in the centre, clipped to the given x and y ranges.
g.plot_joint(sns.kdeplot,
             fill=True, clip=((2200, 6800), (10, 25)),
             thresh=0, levels=100, cmap="rocket")
# Opaque dark histograms on the margins.
g.plot_marginals(sns.histplot, color="#03051A", alpha=1, bins=25)
def Fig3_boxplot(start_yr, var_names, ylabels, ylabels_R, ranges, ranges_diff):
    """Draw a 2x2 grid of yearly GW-vs-FD boxplots and save it as a PNG.

    For each entry of ``var_names`` the two text files
    ``./txt/<var>_GW_rawdata_4_Python.txt`` and
    ``./txt/<var>_FD_rawdata_4_Python.txt`` are read with
    ``read_summer_heatwave``, tagged with their experiment name and drawn as
    grouped boxplots in the panel belonging to that variable. The figure is
    written to ``./plots/plot_boxplots.png``.

    :param start_yr: forwarded to ``read_summer_heatwave``.
    :param var_names: variable names, one per panel (up to four).
    :param ylabels: y-axis label per panel.
    :param ylabels_R: currently unused (belonged to a disabled GW-FD
        difference line on a secondary axis).
    :param ranges: ``(low, high)`` y-limits per panel.
    :param ranges_diff: currently unused (see ``ylabels_R``).
    """
    # set plots
    # NOTE(review): set_theme() below re-applies the "ticks" style, so the
    # two set_style() calls (inward ticks) presumably have no lasting effect;
    # the order is kept as-is - confirm the intended look.
    sns.set_style("ticks")
    sns.set_style({"xtick.direction": "in", "ytick.direction": "in"})
    sns.set_theme(style="ticks", palette="pastel")
    fig, axs = plt.subplots(2, 2, figsize=(10, 7))

    almost_black = '#262626'
    plt.rcParams.update({
        'text.usetex': False,
        'font.family': "sans-serif",
        'font.serif': "Helvetica",
        'axes.linewidth': 1.5,
        'axes.labelsize': 12,
        'font.size': 12,
        'legend.fontsize': 12,
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
        "legend.markerscale": 3.0,
        # tick, text and axis colours use the almost-black tone
        'ytick.color': almost_black,
        'xtick.color': almost_black,
        'text.color': almost_black,
        'axes.edgecolor': almost_black,
        'axes.labelcolor': almost_black,
    })

    # set the box type of sequence number
    props = dict(boxstyle="round", facecolor='white', alpha=0.0, ec='white')
    orders = ['(a)', '(b)', '(c)', '(d)']

    # Loop-invariant tick layout: 19 positions, every second one labelled
    # with a year from 2001 to 2019.
    xtickslocs = list(range(19))
    xticklabels = [
        str(2001 + pos) if pos % 2 == 0 else "" for pos in xtickslocs
    ]
    fill_color = (1., 0.972549, 0.862745)  # named color "cornsilk" in ncl

    for i, var_name in enumerate(var_names):
        row, col = divmod(i, 2)
        ax = axs[row, col]

        # -------------------- boxplot ---------------------
        # read box values
        filename_GW = "./txt/" + var_name + "_GW_rawdata_4_Python.txt"
        filename_FD = "./txt/" + var_name + "_FD_rawdata_4_Python.txt"

        df_gw = read_summer_heatwave(filename_GW, start_yr)
        df_gw['experiment'] = "GW"
        df_fd = read_summer_heatwave(filename_FD, start_yr)
        df_fd['experiment'] = "FD"

        # make one dataframe
        df = pd.concat([df_gw, df_fd])
        print(df)

        # Plotting boxplot. ax= is required: without it seaborn draws on the
        # current axes, which after plt.subplots() is always the last panel.
        sns.boxplot(x="year", y="var", data=df, showfliers=False,
                    palette=["m", "g"], hue="experiment", whis=0, ax=ax)

        ax.set_ylim(ranges[i])
        ax.set_ylabel(ylabels[i])

        # Adding shadings over two fixed x ranges
        ax.fill_between([0.5, 8.5], ranges[i][0], ranges[i][1],
                        facecolor=fill_color, alpha=0.5)
        ax.fill_between([16.5, 19.5], ranges[i][0], ranges[i][1],
                        facecolor=fill_color, alpha=0.5)

        # Only the first panel keeps its legend.
        if row == 0 and col == 0:
            ax.legend(numpoints=1, loc="best", frameon=False)
        else:
            ax.get_legend().remove()

        # Only the bottom row shows the x axis.
        if row == 1:
            ax.set(xticks=xtickslocs, xticklabels=xticklabels)
        else:
            ax.get_xaxis().set_visible(False)

        ax.text(0.05, 0.95, orders[i], transform=ax.transAxes,
                fontsize=14, verticalalignment='top', bbox=props)

    fig.savefig("./plots/plot_boxplots.png", bbox_inches='tight', dpi=300,
                pad_inches=2)
import pandas as pd
from kneed import KneeLocator
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
sns.set_theme()
from sklearn.metrics import accuracy_score

data = pd.DataFrame(pd.read_excel("advancedkmeans.xlsx"))
# Copy the column to the short name "RPN" and drop the original (the source
# column name is spelled exactly as in the spreadsheet).
data["RPN"] = data["Risk priortiy number"]
data = data.drop(axis=0, columns=["Risk priortiy number"])

# Cluster on the three columns S, O and D.
x = data[["S", "O", "D"]].values

# Inertia for k = 1..14, used to locate the elbow.
distance = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(x)
    distance.append(km.inertia_)

x_values = list(K)
y_values = distance
kene = KneeLocator(x_values, y_values, curve='convex',
                   direction='decreasing', interp_method='interp1d')
# NOTE: this name shadows the builtin breakpoint(); kept because later code
# in the file may read it.
breakpoint = kene.knee

km = KMeans(n_clusters=breakpoint, init="k-means++", random_state=17)
clusters = km.fit_predict(x)
# Store the cluster label of every row. The previous `list(x[clusters])`
# used the labels as row indices into `x` and stored feature rows instead.
data["Cluster Values"] = clusters
# Lab 7: Seaborn plotting tutorial import seaborn as sns sns.set_theme(style='darkgrid', font_scale=3) # older version of sns: sns.set() tips = sns.load_dataset('tips') # Distribution plots sns.displot(tips, x='total_bill', col='sex', kind='kde') sns.displot(tips, x='total_bill', kind='kde') sns.displot(tips, x='total_bill', kind='kde', cut=0) sns.displot(tips, x='total_bill', stat='density') sns.displot(tips, x='total_bill', y='size', kind='kde') sns.displot(tips, x='total_bill', col='sex', kind='kde') # Relational plots sns.relplot(x='total_bill', y='tip', data=tips) sns.relplot(x='total_bill', y='tip', hue='smoker', data=tips) sns.relplot(x='total_bill', y='tip', hue='smoker', style='sex', data=tips, s=100) sns.relplot(x='total_bill', y='tip', size='size', sizes=(15, 200), data=tips) # Categorical plots sns.catplot(x='day', y='total_bill', data=tips) sns.catplot(x='day', y='total_bill', kind='swarm', data=tips) sns.catplot(x='day', y='total_bill', hue='smoker', kind='swarm', data=tips)
print(iris_dataset.DESCR)

# Correlation heatmap of the iris frame.
correlation_matrix = iris.corr()
sns.heatmap(data=correlation_matrix, annot=True, cmap='Greys')

sns.set()
# One strip plot per feature, side by side.
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
features = [
    'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
    'petal width (cm)'
]
target = ['class']
# NOTE(review): the theme is changed after the axes were created -
# presumably it only affects later figures; confirm.
sns.set_theme(style="ticks")
for i, col in enumerate(features):
    sns.stripplot(ax=axes[i], x=target[0], y=col, data=iris)

# Missing values per column (the result is not stored or printed here).
iris.isnull().sum()

from sklearn.model_selection import train_test_split

# Columns 0-3 are used as inputs, column 4 as the target.
X = iris.iloc[:, [0, 1, 2, 3]].values
y = iris.iloc[:, 4].values
# 75% / 25% train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)
def set_sns_format(width=15, height=6):
    """Apply the shared seaborn theme and set the default figure size.

    The theme uses the 'pastel' palette and the 'notebook' context and saves
    figures at 300 dpi; ``width`` and ``height`` become matplotlib's default
    ``figure.figsize``. Returns ``None``.
    """
    theme_options = {
        'palette': 'pastel',
        'context': 'notebook',
        'rc': {'savefig.dpi': 300},
    }
    sns.set_theme(**theme_options)

    # Applied after the theme so the requested size is the one in effect.
    matplotlib.rcParams.update({'figure.figsize': (width, height)})
def run(_cfg,fout=None,source_data=None):
    """Compare adjacency degrees left/right and between two datasets.

    Reads the node lists and the left-to-right map from the ``mat`` section
    of the config file, loads the ``N2U`` and ``JSH`` adjacency data with
    ``from_db`` and collects three groups of degree differences:

    * ``'Adult L/R'`` - left minus right degree in N2U,
    * ``'L4 L/R'``    - left minus right degree in JSH,
    * ``'Adult/L4'``  - N2U minus JSH degree of every left and right cell.

    The differences are passed to ``print_wilcoxon`` and drawn as a boxplot.

    :param _cfg: path of the config file.
    :param fout: optional path the figure is saved to.
    :param source_data: optional path the underlying table is written to as
        CSV.
    """
    cfg = ConfigParser(interpolation=ExtendedInterpolation())
    cfg.read(_cfg)

    #_remove = aux.read.into_list(cfg['mat']['remove'])
    _remove = ['VC01','VD01','VB01','VB02','HSNL','HSNR','PVNL','PVNR','PLNL','PLNR','PVR','PVR.']
    left = aux.read.into_list(cfg['mat']['left_nodes'])
    right = aux.read.into_list(cfg['mat']['right_nodes'])
    lrmap = aux.read.into_lr_dict(cfg['mat']['lrmap'])
    data = []

    N2U = 'N2U'
    JSH = 'JSH'
    n2u = from_db(N2U,adjacency=True,remove=_remove)
    jsh = from_db(JSH,adjacency=True,remove=_remove)
    ndelta,jdelta,bdelta = [],[],[]

    # Left/right comparison within N2U.
    lnd = get_adj_deg(n2u,vertices = left)
    rnd = get_adj_deg(n2u,vertices = right)
    for lcell in sorted(lnd):
        rcell = lrmap[lcell]
        diff = lnd[lcell]-rnd[rcell]
        data.append(['Adult L/R',lcell,rcell,lnd[lcell],rnd[rcell],diff])
        ndelta.append(diff)

    # Left/right comparison within JSH.
    lnd = get_adj_deg(jsh,vertices = left)
    rnd = get_adj_deg(jsh,vertices = right)
    for lcell in sorted(lnd):
        rcell = lrmap[lcell]
        diff = lnd[lcell]-rnd[rcell]
        data.append(['L4 L/R',lcell,rcell,lnd[lcell],rnd[rcell],diff])
        jdelta.append(diff)

    # Every left cell followed by its right partner (taken from the JSH left
    # degrees computed just above).
    cells = []
    for lcell in sorted(lnd):
        cells.append(lcell)
        cells.append(lrmap[lcell])

    # Same cell compared across the two datasets.
    bnd = get_adj_deg(n2u,vertices = cells)
    bjd = get_adj_deg(jsh,vertices = cells)
    for c in cells:
        diff = bnd[c]-bjd[c]
        data.append(['Adult/L4',c,c,bnd[c],bjd[c],diff])
        bdelta.append(diff)

    df = pd.DataFrame(data,columns=["Comparison","Cell1","Cell2","Deg1","Deg2","Deg_diff"])

    print('Stats:')
    print_wilcoxon(ndelta,'Adult L/R')
    print_wilcoxon(jdelta,'L4 L/R')
    print_wilcoxon(bdelta,'Adult/L4',alternative="greater")
    #tval1,pval1 = ttest_ind(ndelta,jdelta)
    #tval2,pval2 = ttest_ind(jdelta,bdelta)
    #tval3,pval3 = ttest_ind(ndelta,bdelta)

    sns.set_theme(style="whitegrid")
    fig,ax = plt.subplots(1,1,figsize=(2.15,1.7))
    flierprops = dict(markersize=1,marker='d',markerfacecolor='k')
    medianprops = dict(linestyle='-',linewidth=0.5,color='k')
    # NOTE(review): whiskerprops is defined but was never passed to boxplot;
    # left unused so the figure does not change - confirm the intent.
    whiskerprops = dict(linestyle='-',linewidth=0.3,color='k')
    capprops = dict(linewidth=0.3)
    sns.boxplot(x="Comparison",y="Deg_diff",
                data=df,width=0.3,ax=ax,linewidth=0.3,color="#a5a5a5",
                flierprops=flierprops,medianprops=medianprops,capprops=capprops)
    ax.set_ylim([-30,30])
    ax.set_yticks([-30,-20,-10,0,10,20,30])
    #ax.set_yticklabels([-30,-20,-10,0,10,20,30],fontsize=5)
    # tick_params replaces the per-tick `tick.label.set_fontsize(...)` loops:
    # the Tick.label attribute no longer exists in current matplotlib.
    ax.tick_params(axis='x',labelsize=7)
    ax.tick_params(axis='y',labelsize=5)
    ax.axhline(0,color='r',linewidth=0.8,linestyle='--')
    ax.set_xlabel("")
    ax.set_ylabel("Degree difference",fontsize=7)
    plt.tight_layout()
    if fout: plt.savefig(fout)
    plt.show()

    if source_data: df.to_csv(source_data,index=False)
# Pairs Trading Strategy # - Use PCA to reduce dimensionality of the daily returns # - Then add the company details to do K-means clustering # - Then conduct time series analysis on each pair in each cluster to find appropriate trading pairs # Import packages import pandas as pd import seaborn as sns sns.set_theme(style = 'white', context = 'talk') import streamlit as st import matplotlib.pyplot as plt import numpy as np from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn import preprocessing from scipy import stats from statsmodels.tsa.stattools import coint import statsmodels.api as sm import os import base64 import warnings warnings.filterwarnings('ignore') # Set page name and icon st.set_page_config( page_title = 'Pairs Trading', page_icon = 'n.png', ) # Set page title
def _plot_metric_bars(predictions, column, file_stem, limits=None):
    """Draw tall and wide bar charts of one metric and emit download links.

    :param predictions: DataFrame indexed by model name holding ``column``.
    :param column: name of the metric column to plot.
    :param file_stem: middle part of the PDF names
        (``plot-<file_stem>-tall.pdf`` / ``plot-<file_stem>-wide.pdf``).
    :param limits: optional ``(low, high)`` applied to the metric axis.
    """
    # Tall layout (models on the y axis): only offered as a download.
    fig = plt.figure(figsize=(3, 9))
    sns.set_theme(style="whitegrid")
    ax = sns.barplot(y=predictions.index, x=column, data=predictions)
    if limits is not None:
        ax.set(xlim=limits)
    st.markdown(imagedownload(plt, f'plot-{file_stem}-tall.pdf'),
                unsafe_allow_html=True)
    # Close the figure so reruns of the app do not pile up open figures.
    plt.close(fig)

    # Wide layout (models on the x axis): shown in the page and downloadable.
    fig = plt.figure(figsize=(9, 3))
    sns.set_theme(style="whitegrid")
    ax = sns.barplot(x=predictions.index, y=column, data=predictions)
    if limits is not None:
        ax.set(ylim=limits)
    plt.xticks(rotation=90)
    st.pyplot(fig)
    st.markdown(imagedownload(plt, f'plot-{file_stem}-wide.pdf'),
                unsafe_allow_html=True)
    plt.close(fig)


def build_model(df):
    """Fit LazyRegressor on ``df`` and render the results in the Streamlit page.

    All columns except the last are used as features and the last column is
    the target. The train/test split reads ``split_size`` and ``seed_number``
    from module scope. The page receives the dataset summary, the training
    and test performance tables with CSV download links, and bar charts of
    R-squared, RMSE and calculation time for the test set.

    :param df: input DataFrame; the last column is the target.
    """
    # Label-based slice: keeps rows up to and including index label 100.
    df = df.loc[:100]  # FOR TESTING PURPOSE, COMMENT THIS OUT FOR PRODUCTION
    X = df.iloc[:, :-1]  # Using all column except for the last column as X
    Y = df.iloc[:, -1]  # Selecting the last column as Y

    st.markdown('**1.2. Dataset dimension**')
    st.write('X')
    st.info(X.shape)
    st.write('Y')
    st.info(Y.shape)

    st.markdown('**1.3. Variable details**:')
    st.write('X variable (first 20 are shown)')
    st.info(list(X.columns[:20]))
    st.write('Y variable')
    st.info(Y.name)

    # Build lazy model
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=split_size, random_state=seed_number)
    reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
    # First fit scores the models on the training data itself; the second
    # scores them on the held-out test split.
    models_train, predictions_train = reg.fit(X_train, X_train, Y_train, Y_train)
    models_test, predictions_test = reg.fit(X_train, X_test, Y_train, Y_test)

    st.subheader('2. Table of Model Performance')

    st.write('Training set')
    st.write(predictions_train)
    st.markdown(filedownload(predictions_train, 'training.csv'),
                unsafe_allow_html=True)

    st.write('Test set')
    st.write(predictions_test)
    st.markdown(filedownload(predictions_test, 'test.csv'),
                unsafe_allow_html=True)

    st.subheader('3. Plot of Model Performance (Test set)')

    # The headings are plain markdown calls: an element handle is not a
    # layout container and must not be used as a context manager.
    st.markdown('**R-squared**')
    # Negative R-squared values are shown as 0.
    predictions_test["R-Squared"] = predictions_test["R-Squared"].clip(lower=0)
    _plot_metric_bars(predictions_test, "R-Squared", 'r2', limits=(0, 1))

    st.markdown('**RMSE (capped at 50)**')
    predictions_test["RMSE"] = predictions_test["RMSE"].clip(upper=50)
    _plot_metric_bars(predictions_test, "RMSE", 'rmse')

    st.markdown('**Calculation time**')
    predictions_test["Time Taken"] = predictions_test["Time Taken"].clip(lower=0)
    _plot_metric_bars(predictions_test, "Time Taken", 'calculation-time')
palette_colors = [(c[0] / 255.0, c[1] / 255.0, c[2] / 255.0) for c in DESATURATED_PALETTE[2:6] + [(0, 0, 0)]] parser = argparse.ArgumentParser() parser.add_argument("--dataset", "-d", choices=["penguins", "dots", "mpg"], default="mpg") args = parser.parse_args() inky = Inky() saturation = 0 dpi = 80 buf = io.BytesIO() seaborn.set_theme(style="white") if args.dataset == "mpg": palette = seaborn.color_palette(palette_colors, n_colors=3) mpg = seaborn.load_dataset("mpg") plot = seaborn.relplot(x="horsepower", y="mpg", hue="origin", size="weight", sizes=(40, 400), alpha=1.0, palette=palette, data=mpg) if args.dataset == "penguins":
'color': 'black', 'weight': 'normal', 'verticalalignment': 'bottom' } plt.figure(figsize=(20, 10)) plt.xlabel(str(nb_epochs) + ' Epochs', **font) plt.ylabel('Accuracy', **font) plt.plot(epochs, accuracy, 'r', label='Training acc') plt.plot(epochs, val_acc, 'b', label='Validation acc') plt.title('Training and validation acc', **title_font) plt.legend() plt.savefig('../rel/figuras/' + filename + 'acc.png') sns.set_theme() # Tamanho máximo de uma sentença SEQUENCE_MAXLEN = 50 # Carrega os embeddings do word2vec word2vec_model = KeyedVectors.load_word2vec_format("../data/word2vec_200k.txt") # Carrega os datasets train = pd.read_csv('../data/train.csv', sep=';') val = pd.read_csv('../data/val.csv', sep=';') test = pd.read_csv('../data/test.csv', sep=';') x_train = train['review_text'].values y_train = train['overall_rating'].values x_train = word_to_index(x_train)
# -*- coding: utf-8 -*- """ Created on Tue Nov 03 17:04:37 2020 @author: vivek """ import pandas as pd import matplotlib.pyplot as plt plt.rcParams['figure.figsize'] = 6.4, 4.8 import numpy as np import seaborn as sns sns.set_theme(style="ticks", palette="pastel") import altair as alt summ_data = pd.read_csv( r'C:\OneDrive\OneDrive-GitHub\Challenges-and-Competitions\TidyTuesday\Data\2020-11-10\summ_data.csv' ) summ_data.columns = ['Country', 'Landline connections', 'Mobile connections'] ax = summ_data.plot( x='Landline connections', y='Mobile connections', kind='scatter', title='2017: Connections per 100 people (each point represents a country)') # Inference - there are a lot of countries with more connections than number of people import geopandas as gpd from shapely.geometry import Point, Polygon import adjustText as aT
# Run the KNN evaluation on PCA projections with 2 and then 3 components,
# collecting the mean and standard deviation of the scores for each.
for i in range(2, 4):
    # At each iteration a progressively larger dimension is selected
    pca = PCA(n_components=i)
    pca.fit(X)
    X_pca = pca.transform(X)
    scores = KNNclf(X_pca, y, param_list=param_list)
    print('Nested Cross validation Accuracy: %0.4f (+/- %0.4f)' %
          (scores.mean(), scores.std() * 2))
    CV_scores.append(scores.mean())
    CV_std.append(scores.std())

# Plot the results obtained
plt.figure()
sns.set_theme(style='darkgrid')
plt.ylabel('CV scores')
#plt.plot(labels, CV_scores, 'o', color = 'black')
# Error bars show one standard deviation (the printed interval above uses two).
plt.errorbar(labels, CV_scores, CV_std, fmt='.', color='black')
plt.show()

# Now look at a representation of the classifier in 2D
X = df2R.values

# This time split into a test set and a train set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

clf = KNeighborsClassifier(n_neighbors=4, weights='distance')
h = 0.2
import matplotlib.patches as mpatches import matplotlib.pyplot as plt import pandas as pd import seaborn as sns import torch sns.set() sns.set_theme(style="darkgrid") if __name__ == '__main__': conv_rewards_train = torch.load( 'ckpt_train/conv_test_rewards.ckpt').numpy() conv_rewards_test = torch.load('ckpt_test/conv_test_rewards.ckpt').numpy() print(conv_rewards_train.mean()) vit_rewards_train = torch.load('ckpt_train/vit_test_rewards.ckpt').numpy() vit_rewards_test = torch.load('ckpt_test/vit_test_rewards.ckpt').numpy() print(vit_rewards_train.mean()) levels = [i for i in range(0, 50)] levels = levels + levels convs = ['conv' for i in range(0, 50)] vits = ['vit' for i in range(0, 50)] model_names = convs + vits models_mean = list(conv_rewards_train)[0:50] + list( vit_rewards_train)[0:50] # #FF0000 # #ffcc66
"""
Dot plot with several variables
===============================

_thumb: .3, .3
"""
import seaborn as sns
sns.set_theme(style="whitegrid")

# Load the dataset
crashes = sns.load_dataset("car_crashes")

# Make the PairGrid
# Rows are ordered by "total" (largest first); every column except the last
# three gets its own facet, all sharing "abbrev" on the y axis.
g = sns.PairGrid(crashes.sort_values("total", ascending=False),
                 x_vars=crashes.columns[:-3], y_vars=["abbrev"],
                 height=10, aspect=.25)

# Draw a dot plot using the stripplot function
g.map(sns.stripplot, size=10, orient="h",
      palette="flare_r", linewidth=1, edgecolor="w")

# Use the same x axis limits on all columns and add better labels
g.set(xlim=(0, 25), xlabel="Crashes", ylabel="")

# Use semantically meaningful titles for the columns
import os from pathlib import Path import numpy as np import pandas as pd from collections import defaultdict from tensorboard.backend.event_processing.event_accumulator import EventAccumulator import seaborn as sns run_dir = Path(__file__).parents[2] / "runs" overleaf_dir = Path(__file__).resolve().parents[2] / "overleaf" / "figures" sns.set_theme("talk", style="whitegrid") def tabulate_events(dpath): summary_iterators = [ EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in os.listdir(dpath) ] tags = summary_iterators[0].Tags()["scalars"] for it in summary_iterators: assert it.Tags()["scalars"] == tags out = defaultdict(list) steps = [] for tag in tags: steps = [e.step for e in summary_iterators[0].Scalars(tag)]