def show(left, right, *the_rest):
    """Return a Markdown object printing an equation in a Jupyter notebook.

    Parameters
    ----------
    left : str
        Left-hand side of the equation (symbolic).
    right : str
        Right-hand side (symbolic or numeric).
    *the_rest : str
        Optional extra fragments (e.g. units) appended after the value.

    Returns
    -------
    IPython Markdown object rendering ``$left = right...$``.
    """
    if the_rest:
        # Bug fix: interpolating the tuple directly rendered its repr,
        # e.g. "('\\ m',)". Join the fragments into the equation instead.
        tail = "".join(str(part) for part in the_rest)
        return md(f"${left} = {right}{tail}$")
    return md(f"${left} = {right}$")
def on_select_change(self, change):
    """Handle the first click on the editor grid.

    Triggered via qgrid's ``_selected_rows`` observer. Reads the selected
    row (timestamp, editor id/name), loads that editor's revisions for the
    current frequency, and renders a second qgrid wired to
    ``on_select_revision``.

    Parameters
    ----------
    change : dict
        qgrid observer payload (unused; the selection is re-read from the grid).
    """
    with self.out:
        clear_output()
        # Values of the currently selected grid row.
        date_selected = self.qgrid_obj.get_selected_df().reset_index()["rev_time"].iloc[0]
        editor_selected = self.qgrid_obj.get_selected_df().reset_index()["editor_id"].iloc[0]
        editor_name = self.qgrid_obj.get_selected_df().reset_index()["editor"].iloc[0]
        page_title = self.all_tokens["article_title"].unique()[0]
        display(md("Loading revisions info..."))
        # Per-revision details for the chosen editor and timeframe.
        second_df = self.revision_manager.get_main(date_selected, editor_selected, self.current_freq)
        clear_output()
        display(md(f"Within **{self.current_freq}** timeframe, you have selected **{editor_name}** (id: {editor_selected})"))
        # Link to the corresponding Wikipedia page-history view.
        display(HTML(f"The revisions fall in <a href='https://{self.lng}.wikipedia.org/w/index.php?date-range-to={date_selected}&tagfilter=&title={page_title}&action=history' target='_blank'>{date_selected}</a>"))
        # Shorten column names so they fit the grid header.
        second_df.rename({"main_opponent": "main_op", "stopwords_ratio": "SW_ratio", "productivity": "prod"}, axis=1, inplace=True)
        # Fixed widths and tooltips for the qgrid widget.
        columns_set = {"rev_time": {"width": 165}, "rev_id": {"width": 85}, "adds": {"width": 50}, "dels": {"width": 50}, "reins": {"width": 50}, "prod": {"width": 50, "toolTip": "productivity"}, "conflict": {"width": 70}, "SW_ratio": {"width": 82, "toolTip": "stopwords ratio"}, "main_op": {"width": 80, "toolTip": "main opponent"}, "min_react": {"width": 132, "toolTip": "min reaction time"}, "Damaging": {"width": 92}, "Goodfaith": {"width": 90}}
        self.second_qgrid = qgrid.show_grid(second_df, grid_options={'forceFitColumns': True, 'syncColumnCellResize': True}, column_definitions=columns_set)
        display(self.second_qgrid)
        self.out2 = Output()
        display(self.out2)
        # Second-level click handler (per-revision comment view).
        self.second_qgrid.observe(self.on_select_revision, names=['_selected_rows'])
def get_protect(self, level="semi_edit"):
    """Build the protection tables for one protection level.

    Parameters
    ----------
    level : str
        One of {"semi_edit", "semi_move", "fully_edit", "fully_move",
        "unknown"}.

    Returns
    -------
    final_table : pd.DataFrame or None
        Detailed protection records for the requested level, or None when
        there are no records at all.
    plot_table : pd.DataFrame
        Gantt-chart-ready table (Task/Start/Finish/Resource); empty when
        there are no records.
    """
    empty_plot = pd.DataFrame(columns=["Task", "Start", "Finish", "Resource"])
    # Guard: nothing recorded at all.
    if len(self.df) == 0:
        display(md(f"No {level} protection records!"))
        return None, empty_plot
    # Move-protection entries are not relevant here; discard them first.
    move_rows = self.df[self.df["action"] == "move_prot"].index
    self.df = self.df.drop(move_rows).reset_index(drop=True)
    # Guard: everything was move-protection.
    if len(self.df) == 0:
        display(md(f"No {level} protection records!"))
        return None, empty_plot
    # Pipeline: expiry -> unknown -> unprotect check -> level filter ->
    # unprotect rows -> final / plot tables.
    staged = self._get_expiry()
    staged = self._check_unknown(staged)
    staged = self._check_unprotect(staged)
    staged = self._select_level(staged, level=level)
    staged = self._get_unprotect(staged)
    final_table = self._get_final(staged)
    plot_table = self._get_plot(final_table, level=level)
    return final_table, plot_table
def on_select_revision(self, change):
    """Handle the second click: show the revision's comment and a diff link.

    Parameters
    ----------
    change : dict
        qgrid observer payload (unused; the selection is re-read from the grid).
    """
    with self.out2:
        clear_output()
        # rev_id as plain text (encode/decode round-trip normalizes to utf-8 str).
        self.selected_rev = str(self.second_qgrid.get_selected_df()["rev_id"].iloc[0]).encode("utf-8").decode("utf-8")
        self.search_widget.value = self.selected_rev
        display(md("Loading comments..."))
        self.get_comments()
        clear_output()
        # Revisions without a recorded comment default to an empty string.
        self.rev_comments.setdefault(self.selected_rev, '')
        display(md(f"**Comment for the revision {self.selected_rev}:** {self.rev_comments[self.selected_rev]}"))
        # Fixed typo in the link text: "Cilck" -> "Click".
        display(HTML(f"<a href='https://{self.lng}.wikipedia.org/w/index.php?diff={self.selected_rev}&title=TITLEDOESNTMATTER&diffmode=source' target='_blank'>Click here to check revisions differences</a>"))
def fixMeshGrid(dataset, mystery_flag=False):
    '''Rebuild the XZ/YZ mesh-grid coordinates of *dataset* on a uniform grid.

    Grid steps and dimensions are derived from the dataset itself, so a
    uniform grid is assumed — a curvilinear grid will not work here. XZ and
    YZ are loaded eagerly because Dask reads the netCDF lazily.

    Parameters
    ----------
    dataset : Dataset with XZ and YZ coordinate arrays (xarray-like —
        TODO confirm exact type).
    mystery_flag : bool
        When True, subtract one grid step from the length instead of two.
        The original author did not know why both cases occur (possibly
        even vs. uneven grids) ¯\\_(ツ)_/¯.

    Returns
    -------
    The same dataset with XZ/YZ replaced by the regenerated mesh grid.
    '''
    print("● Fixing mesh grid, assuming a uniform grid ")
    dataset.XZ.load()
    dataset.YZ.load()
    # Grid steps from neighbouring coordinate values.
    x_gridstep = dataset.XZ.values[2][-1] - dataset.XZ.values[1][-1]
    y_gridstep = dataset.YZ.values[-2][-1] - dataset.YZ.values[-2][-2]
    width = (dataset.XZ.shape[0] - 2) * x_gridstep
    if mystery_flag:
        length = (dataset.XZ.shape[1] - 1) * y_gridstep  # eeehhh hmmmm -1? sometimes -2?
    else:
        length = (dataset.XZ.shape[1] - 2) * y_gridstep  # eeehhh hmmmm -1? sometimes -2?
    # NOTE(review): this Markdown object is built but never displayed or
    # returned, so the summary table below is effectively dead code —
    # confirm whether it should be wrapped in display(...).
    md(f"""
# Times
| Name | Value |
| --- | --- |
| x gridstep | {x_gridstep} |
| y gridstep | {y_gridstep} |
| Width | {width} |
| Length | {length} |
""")
    XZ, YZ = makeMeshGrid(length=length, width=width, x_gridstep=x_gridstep, y_gridstep=y_gridstep)
    # for debugging
    # print('original XZ', dataset.XZ.shape)
    # print('original YZ', dataset.YZ.shape)
    # print('new XZ', XZ.shape)
    # print('new YZ', YZ.shape)
    dataset.XZ.values = XZ
    dataset.YZ.values = YZ
    return dataset
def show_dashboard():
    """Assemble and populate the narrative/data dashboard.

    Reads and writes module-level widgets (``output``, ``data_output``,
    ``selected_persona``, ...) and publishes ``tab``, ``input_widgets`` and
    ``dashboard`` as globals.
    """
    output.clear_output()
    data_output.clear_output()
    item_layout = widgets.Layout(margin='0 0 10px 0', align_items='stretch')
    item_layout_tab = widgets.Layout(margin='0 0 10px 0')
    # NOTE(review): this is an alias, not a copy — the astype(int)
    # assignment below mutates the module-level range_table_all in place.
    # Confirm that is intended.
    explore_data = range_table_all
    explore_data['sum_in_area'] = explore_data['sum_in_area'].astype(int)
    with output:
        # Narrative tab content.
        display(
            md("> <font size = 3, font color = black> {}".format(
                resident_text)))
    with data_output:
        # Data tab content: the raw table.
        display(explore_data)
    global tab, input_widgets
    input_widgets = widgets.VBox(
        [selected_persona, selected_percentile, text_generation_button],
        layout=item_layout)
    tab = widgets.Tab([output, data_output], layout=item_layout_tab)
    tab.set_title(0, 'Narrative')
    tab.set_title(1, 'Data')
    global dashboard
    dashboard = widgets.VBox([input_widgets, tab])
def token_selection_change(self, change):
    """Handle the first click on the token grid.

    Builds a per-revision view of the selected token string and renders a
    second qgrid wired to ``revid_selection_change``.

    Parameters
    ----------
    change : dict
        qgrid observer payload (unused; the selection is re-read from the grid).
    """
    with self.out1:
        clear_output()
        # Process the involved dataframe.
        token_selected = self.qgrid_token_obj.get_selected_df().reset_index()['string'].iloc[0]
        selected_token = self._select_token(token_selected, self._range1, self._range2)
        df_selected_token = selected_token.drop(['page_id', 'o_editor', 'token', 'o_rev_id', 'article_title'], axis=1)
        new_cols = ['token_id', 'action', 'rev_time', 'editor', 'rev_id']
        df_selected_token = df_selected_token[new_cols].rename({'editor': 'editor_id'}, axis=1)
        # IDs as strings so qgrid does not render them as numbers.
        df_selected_token['token_id'] = df_selected_token['token_id'].astype(str)
        df_selected_token['rev_id'] = df_selected_token['rev_id'].astype(str)
        df_selected_token.set_index('token_id', inplace=True)
        qgrid_selected_token = qgrid.show_grid(df_selected_token)
        self.qgrid_selected_token = qgrid_selected_token
        display(md(f'**With string *{token_selected}*, select one revision you want to investigate:**'))
        display(self.qgrid_selected_token)
        self.out2 = Output()
        display(self.out2)
        # Second-level click handler.
        self.qgrid_selected_token.observe(self.revid_selection_change, names=['_selected_rows'])
def show_model_parameters(m_, idx_=-1):
    """Fit a statsmodels-style model and render its key statistics as Markdown.

    Parameters
    ----------
    m_ : unfitted model object; ``.fit()`` is called here and must expose
        params / pvalues / tvalues / df_model / df_resid on the result.
    idx_ : int
        Index of the coefficient to report (default: the last one).

    Returns
    -------
    IPython Markdown object with beta, the F statistic and the p value
    (exact up to three significant figures, scientific notation beyond).
    """
    # fit model
    m_ = m_.fit()

    def sigfigs(x):
        """Split *x* into (mantissa, positive decimal exponent)."""
        # Hoisted: the original formatted x with '%.2e' three times.
        sci = '%.2e' % x
        d = int(sci[sci.find('-') + 1:])
        n = np.round(float(sci[0:3]))
        return n, d

    # extract model parameters (unused rsquared local removed)
    beta, pval, df_model = m_.params[idx_], m_.pvalues[idx_], m_.df_model
    df_resid, tvalues = m_.df_resid, m_.tvalues[idx_]
    # show exact p values up to three significant figures
    if sigfigs(pval)[1] < 4:
        stat_str = "$\\beta = %.2f$, $F(%d, %d)$ = $%.02f, P = %.03f $"
        report = stat_str % (beta, df_model, df_resid, tvalues, pval)
    else:
        # very small p values: scientific notation with negative exponent
        stat_str = "$\\beta = %.2f$, $F(%d, %d)$ = $%.02f, P = %.0f $ x $ 10 ^{-%d} $"
        report = stat_str % (beta, df_model, df_resid, tvalues,
                             sigfigs(pval)[0], sigfigs(pval)[1])
    # return markdown visualization
    return md(report)
def listen(self, _range1, _range2, stopwords):
    """Display the conflicting-tokens grid for the selected date range.

    Results are cached per stopwords setting in ``self.conflicts_dict``
    under the keys "Not included" / "Included".

    Parameters
    ----------
    _range1, _range2 : date bounds passed through to ``get_displayed_df``.
    stopwords : str
        'Not included' to strip stopwords, anything else keeps them.
    """
    if stopwords == 'Not included':
        if self.conflicts_dict["Not included"] is None:
            # Strip stopwords once, add derived columns, cache the slice.
            conflicts_not_included = remove_stopwords(self.sources["tokens_source"]["conflicts_all"], self.lng).reset_index(drop=True)
            conflicts_not_included = self.add_columns(conflicts_not_included)
            self.conflicts_dict["Not included"] = self.get_displayed_df(_range1, _range2, conflicts_not_included)
        conflicts = self.conflicts_dict["Not included"]
    else:
        if self.conflicts_dict["Included"] is None:
            # Bug fixes vs. the original branch: add_columns was applied
            # twice, the result was cached under the wrong key
            # ("Not Included"), and the displayed dataframe was never
            # filtered by the date range. Now mirrors the branch above.
            conflicts_included = self.add_columns(self.sources["tokens_source"]["conflicts_all"])
            self.conflicts_dict["Included"] = self.get_displayed_df(_range1, _range2, conflicts_included)
        conflicts = self.conflicts_dict["Included"]
    if len(conflicts) > 0:
        qgrid_token_obj = qgrid.show_grid(conflicts, grid_options={'forceFitColumns': False})
        display(qgrid_token_obj)
    else:
        display(md('**There are no conflicting tokens in this page.**'))
        display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'))
def statsTable(muX, muY, sigX, sigY, rmsX, rmsY, nPts):
    '''Build a Markdown table of velocity-difference statistics.

    Returns the inputs unchanged plus the rendered Markdown table object.
    '''
    rows = [
        '|Statistic|$u_x - v_x$ (m/yr)|$u_y - v_y$ (m/yr)|N points|',
        '|------|------------|------------|---------|',
        f'|Mean|{muX:0.2}|{muY:0.2}|{nPts}|',
        f'|Std.Dev.|{sigX:0.2}|{sigY:0.2}|{nPts}|',
        f'|rms|{rmsX:0.2}|{rmsY:0.2}|{nPts}|',
    ]
    myTable = md('\n'.join(rows))
    return muX, muY, sigX, sigY, rmsX, rmsY, nPts, myTable
def df_summary(df, heading='Summary', heading_level=3):
    """Display a headed summary (dtype, null and non-null counts) of *df*."""
    # One row per column: dtype, nulls, non-nulls.
    summary = pd.concat(
        [df.dtypes, df.isnull().sum(), (~df.isnull()).sum()], axis=1)
    summary.reset_index(inplace=True)
    summary.columns = ('Columns', 'Dtype', 'Null', 'Non-Null')
    # Markdown heading at the requested level plus row/column counts.
    header = '#' * heading_level + f' {heading}\n'
    header += f'*Rows*: **{df.shape[0]}** \n'
    header += f'*Columns*: **{df.shape[1]}**'
    dp(md(header), summary)
def get_tokens(self, df_selected):
    """Show a per-token conflict grid for the editor/date selected in the grid.

    Parameters
    ----------
    df_selected : pd.DataFrame
        Single selected row from the editor grid; its index holds the
        timestamp and it carries 'editor_id' and 'name' columns.
    """
    with self.out:
        clear_output()
        editor_id = df_selected["editor_id"].values[0]
        # (year, month, day) of the selected row's timestamp.
        year_and_month = (df_selected.index[0].year, df_selected.index[0].month, df_selected.index[0].day)
        display(md(f"In **{year_and_month[2]}.{year_and_month[1]}.{year_and_month[0]}** you have selected the editor **{df_selected['name'].values[0]}**"))
        # All actions.
        selected_source_tokens = self.token_source.loc[self.__date_editor_filter(self.token_source, year_and_month, editor_id)].reset_index(drop=True)
        # Conflicts.
        selected_conflict_tokens = self.token_conflict.loc[self.__date_editor_filter(self.token_conflict, year_and_month, editor_id)].reset_index(drop=True)
        # Elegibles.
        selected_elegible_tokens = self.token_elegible.loc[self.__date_editor_filter(self.token_elegible, year_and_month, editor_id)].reset_index(drop=True)
        # Classification and merge: per-token revision / conflict / elegible counts.
        selected_source = selected_source_tokens.groupby(["token_id"]).agg({"rev_id": "count"}).rename({"rev_id": "revisions"}, axis=1)
        selected_conflicts = selected_conflict_tokens.groupby(["token_id"]).agg({"action": "count", "conflict": "sum"}).rename({"action": "conflicts"}, axis=1)
        selected_elegibles = selected_elegible_tokens.groupby(["token_id"]).agg({"action": "count"}).rename({"action": "elegibles"}, axis=1)
        selected_elegibles = selected_elegibles.merge(selected_source, on="token_id")
        in_out = self.__count_in_out(selected_source_tokens)
        selected_elegibles = selected_elegibles.merge(in_out, on="token_id")
        selected_df = selected_conflicts.merge(selected_elegibles, on="token_id", how="right")
        # Conflict score normalised by the number of elegible actions.
        selected_df["conflict"] = selected_df["conflict"] / selected_df["elegibles"]
        selected_df = selected_df.fillna(0)
        # Attach the token string and keep only conflicting tokens.
        selected_df = selected_df.merge(selected_elegible_tokens[["token_id", "token"]].drop_duplicates().set_index("token_id"), on="token_id")[["token", "elegibles", "conflicts", "conflict", "revisions", "in_actions", "out_actions"]]
        selected_df = selected_df[selected_df["conflict"] != 0]
        # Find the main opponent for each token.
        editor_to_id = self.get_editor_month()[["editor_id", "name"]]
        editor_id_dict = dict(zip(editor_to_id["editor_id"], editor_to_id["name"]))
        # Fall back to the raw id when the editor has no recorded name.
        for k, v in editor_id_dict.items():
            if str(v) == "nan":
                editor_id_dict[k] = k
            else:
                pass
        main_opponent = self.__get_main_opponent(editor_id=editor_id, token_indices=selected_df.index, editor_dict=editor_id_dict)
        selected_df = selected_df.merge(main_opponent, on="token_id").rename({"editor": "main_opponent", "token": "string"}, axis=1)
        # Hide HTML comment-open tokens from the grid.
        display(qgrid.show_grid(selected_df[selected_df["string"] != "<!--"]))
def listen(self, _range1, _range2, granularity):
    """Listener: render the per-editor activity grid for the date range.

    Parameters
    ----------
    _range1, _range2 : dates bounding the revisions to include.
    granularity : str
        Aggregation frequency, stored as ``self.current_freq``.
    """
    # Reject incomplete years (e.g. a partially typed date widget value).
    # Fixed: boolean `or` instead of bitwise `|` on comparison results.
    if len(str(_range1.year)) < 4 or len(str(_range2.year)) < 4:
        return display(md("Please enter the correct date!"))
    if _range1 > _range2:
        return display(md("Please enter the correct date!"))
    df = self.df[(self.df.rev_time.dt.date >= _range1) & (self.df.rev_time.dt.date <= _range2)]
    df_from_agg = self._get_ratios(df, freq=granularity)
    df_from_agg = df_from_agg.rename({"editor_str": "editor_id"}, axis=1)
    df_display = self._merge_main(df_from_agg, freq=granularity)
    # Conflict score normalised by elegible actions; NaN -> 0.
    df_display["conflict"] = (df_display["conflict"] / df_display["elegibles"]).fillna(0)
    df_display["main_opponent"] = df_display["main_opponent"].replace(self.names_id)
    # Shorten column names so they fit the grid header.
    df_display.rename({"main_opponent": "main_op", "stopwords_ratio": "SW_ratio", "revisions": "revs", "productivity": "prod"}, axis=1, inplace=True)
    displayed = df_display[["rev_time", "editor", "adds", "dels", "reins", "prod", "conflict", "SW_ratio", "main_op", "avg_reac_time", "revs", "editor_id"]].set_index("rev_time").sort_index(ascending=False)
    # Fixed widths and tooltips for the qgrid widget.
    columns_set = {"rev_time": {"width": 90}, "editor": {"width": 85}, "adds": {"width": 50}, "dels": {"width": 50}, "reins": {"width": 50}, "prod": {"width": 50, "toolTip": "productivity"}, "conflict": {"width": 70}, "SW_ratio": {"width": 80, "toolTip": "stopwords ratio"}, "main_op": {"width": 90, "toolTip": "main opponent"}, "avg_reac_time": {"width": 125, "toolTip": "average reaction time"}, "revs": {"width": 45, "toolTip": "revisions"}, "editor_id": {"width": 80}}
    self.qgrid_obj = qgrid.show_grid(displayed, grid_options={'forceFitColumns': True}, column_definitions=columns_set)
    display(self.qgrid_obj)
    self.out = Output()
    display(self.out)
    self.current_freq = granularity
    # Fixed: identity check (is not None) instead of `!= None`.
    if self.search_widget is not None:
        self.qgrid_obj.observe(self.on_select_change, names=['_selected_rows'])
def listen(self, _range1, _range2, granularity, trace):
    """Plot the tokens owned by this editor over time.

    Parameters
    ----------
    _range1, _range2 : dates bounding the period to plot.
    granularity : str
        Period label; only its first character is used as a pandas freq.
    trace : str
        'Tokens Owned' (absolute counts) or 'Tokens Owned (%)' (share).
    """
    df = self.summ
    if len(df) == 0:
        display(
            md("***It is not possible to plot the tokens owned because this editor has never owned any token.***"
               ))
        return
    df = df[(df.day.dt.date >= _range1) & (
        df.day.dt.date <= _range2 + datetime.timedelta(days=1))].copy()
    self.traces = []
    # Bucket days into the chosen granularity and keep one row per bucket.
    # (Previously duplicated in both trace branches.)
    df['time'] = df['day'].dt.to_period(
        granularity[0]).dt.to_timestamp(granularity[0])
    df = df[~df.duplicated(subset='time', keep='first')]
    if trace == 'Tokens Owned':
        _range = None
        _y = df['abs']
    elif trace == 'Tokens Owned (%)':
        _range = [0, 100]
        _y = df['res']
    self.traces.append(
        graph_objs.Scatter(x=df['time'],
                           y=_y,
                           name=trace,
                           marker=dict(color='rgba(255, 0, 0, .5)')))
    layout = graph_objs.Layout(hovermode='closest',
                               xaxis=dict(title=granularity,
                                          ticklen=5,
                                          zeroline=True,
                                          gridwidth=2),
                               yaxis=dict(ticklen=5,
                                          gridwidth=2,
                                          range=_range),
                               legend=dict(x=0.5, y=1.2),
                               showlegend=True,
                               barmode='group')
    self.df_plotted = df
    plotly.offline.init_notebook_mode(connected=True)
    plotly.offline.iplot({"data": self.traces, "layout": layout})
def __init__(self, pp_log, lng):
    """Store the protection log and pick language-specific marker strings."""
    self.lng = lng
    self.df = pp_log
    # (indefinite marker, expiry marker) for each supported language.
    markers = {
        "en": ("indefinite", "expires"),
        "de": ("unbeschränkt", "bis"),
    }
    if self.lng not in markers:
        display(md("This language is not supported yet."))
    # Unsupported languages fall back to the English markers.
    self.inf_str, self.exp_str = markers.get(self.lng, markers["en"])
def listen(self, stopwords, _range1, _range2):
    """Display the conflict-score grid for tokens in the selected range.

    Parameters
    ----------
    stopwords : str
        'Not included' selects the stopword-free conflict manager.
    _range1, _range2 : dates bounding the rows displayed.
    """
    if stopwords == 'Not included':
        conflict_calculator = self.sources["con_manager"]
    else:
        conflict_calculator = self.sources["con_manager_all"]
    # display the tokens, the difference in seconds and its corresponding conflict score
    self.conflicts = conflict_calculator.conflicts.copy()
    self.add_columns()
    if len(self.conflicts) > 0:
        # Select/rename the display columns, highest conflict first.
        conflicts_for_grid = self.conflicts[[
            'order', 'count', 'action', 'token', 'token_id', 'conflict',
            'rev_time', 'name', 'editor_id', 'time_diff_secs', 'rev_id'
        ]].rename(columns={
            'token': 'string',
            'rev_time': 'timestamp',
            'name': 'editor_name'
        }).sort_values('conflict', ascending=False)
        # Normalise timestamps to plain dates for range filtering.
        conflicts_for_grid['timestamp'] = pd.to_datetime(
            conflicts_for_grid['timestamp'], cache=False, utc=True).dt.date
        conflicts_for_grid = conflicts_for_grid[
            (conflicts_for_grid.timestamp >= _range1)
            & (conflicts_for_grid.timestamp <= _range2)]
        # IDs as strings so qgrid shows them verbatim.
        conflicts_for_grid['token_id'] = conflicts_for_grid[
            'token_id'].astype(int).astype(str)
        conflicts_for_grid['rev_id'] = conflicts_for_grid['rev_id'].astype(
            int).astype(str)
        conflicts_for_grid['editor_id'] = conflicts_for_grid[
            'editor_id'].astype(str)
        conflicts_for_grid.set_index('token_id', inplace=True)
        # Hide HTML comment-open tokens from the grid.
        self.df_for_grid = conflicts_for_grid.loc[
            conflicts_for_grid['string'] != '<!--'].copy()
        qgrid_token_obj = qgrid.show_grid(
            self.df_for_grid, grid_options={'forceFitColumns': False})
        self.qgrid_token_obj = qgrid_token_obj
        display(self.qgrid_token_obj)
        self.out21 = Output()
        display(self.out21)
        # Next-level click handler.
        self.qgrid_token_obj.observe(self.on_selection_change,
                                     names=['_selected_rows'])
    else:
        display(md(f'**There are no conflicting tokens in this page.**'))
        display(
            HTML(
                f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'
            ))
def __init__(self, all_actions, protection_plot, lng, wikipediadv_api, page):
    """Set up template tracking for a Wikipedia page.

    Parameters
    ----------
    all_actions : token actions dataframe for the page.
    protection_plot : pre-built protection rows to append to the plot.
    lng (str): language code; 'en' and 'de' have template lists.
    wikipediadv_api : API client used by later calls.
    page : page metadata (title is read elsewhere).
    """
    self.df = all_actions
    self.lng = lng
    self.api = wikipediadv_api
    self.page = page
    # Quality / dispute templates to search for, per language.
    if lng == "en":
        self.templates = ["Featured Article", "Good Article", "Disputed", "POV", "Pov", "PoV", "NPOV", "Npov", "Neutrality", "Neutral", "Point Of View", "Systemic bias"]
    elif lng == "de":
        self.templates = ["Exzellent", "Lesenswert", "Neutralität"]
    else:
        display(md("This language is not supported yet."))
        # HACK: placeholder string that will never match a real template.
        self.templates = ["oajdfoijelkjdf"]
    # First word of each template, lower-cased, used for token matching.
    self.tl = [tl.lower().split()[0] for tl in self.templates]
    self.plot_protect = protection_plot
def get_protect(self, level="semi"):
    """Return protection records for one protection level.

    Parameters
    ----------
    level (str): protection level to select (default "semi").

    Returns
    -------
    final_table (pd.DataFrame or None): detailed protection records, or
        None when there are no records at all.
    plot_table (pd.DataFrame): Gantt-chart-ready table with columns
        Task/Start/Finish/Resource (empty when there are no records).
    """
    if len(self.df) == 0:
        display(md(f"No {level} protection records!"))
        return None, pd.DataFrame(
            columns=["Task", "Start", "Finish", "Resource"])
    # Pipeline: expiry -> unknown -> unprotect check -> level filter ->
    # unprotect rows -> final / plot tables.
    df_with_expiry = self.__get_expiry()
    df_with_unknown = self.__check_unknown(df_with_expiry)
    df_checked_unprotect = self.__check_unprotect(df_with_unknown)
    df_select_level = self.__select_level(df_checked_unprotect, level=level)
    df_with_unprotect = self.__get_unprotect(df_select_level)
    final_table = self.__get_final(df_with_unprotect)
    plot_table = self.__get_plot(final_table, level=level)
    return final_table, plot_table
def exportar_pdf(df_ejercicios, fichero, titulo, tipo, letra='A', soluciones=False):
    """Write the exercises to a LaTeX file for PDF export.

    Parameters
    ----------
    df_ejercicios : dataframe of exercises, with an 'n_ejercicio' column.
    fichero (str): base output file name.
    titulo (str): document title.
    tipo (str): document type; anything other than 'ejercicios' is an exam.
    letra (str): exam variant letter appended to the file name.
    soluciones (bool): when True, append '_sol' to the file name.
    """
    # titulo = titulo + letra
    if tipo != 'ejercicios':
        df_ejercicios.n_columnas = 1  # for the exams
        fichero = fichero + letra
    # Fixed: truthiness check instead of `== True`.
    if soluciones:
        fichero = fichero + '_sol'
    escribir_preambulo(fichero, titulo, tipo, soluciones)
    # for s in df_ejercicios.groupby('n_ejercicio', sort=False).count().index
    for s in df_ejercicios.groupby('n_ejercicio').count().index:
        display(md("**Ejercicio: **" + s))
        display(df_ejercicios[df_ejercicios.n_ejercicio == s])
        escribir_ejercicios(df_ejercicios[df_ejercicios.n_ejercicio == s],
                            fichero, tipo)
    escribir_fin(fichero)
def __init__(self, pp_log, lng):
    """
    Class to analyse protection information.
    ...
    Attributes:
    -----------
    df (pd.DataFrame): raw data extracted from Wikipedia API.
    lng (str): language from {'en', 'de'}.
    inf_str / exp_str (str): "indefinite" / "expires" for English,
        "unbeschränkt" / "bis" for German. Unsupported languages fall
        back to the English strings after a warning is displayed.
    """
    self.lng = lng
    self.df = pp_log
    if self.lng == "en":
        self.inf_str = "indefinite"
        self.exp_str = "expires"
    elif self.lng == "de":
        self.inf_str = "unbeschränkt"
        self.exp_str = "bis"
    else:
        display(md("This language is not supported yet."))
        # Fall back to the English markers.
        self.inf_str = "indefinite"
        self.exp_str = "expires"
def show(df):
    """Display *df* as Markdown so it renders as a pretty table in PDF
    conversion.

    See: https://stackoverflow.com/questions/20685635/pandas-dataframe-as-latex-or-html-table-nbconvert
    """
    rendered = df.to_markdown()
    display(md(rendered))
# coding: utf-8 # get_ipython().magic(u'load_ext pyspecdata.ipy') from IPython.display import Markdown as md md('test *yes*') import pandas as pd # d = pd.read_excel('bridge12_diode_calib.xlsx') d = d.loc[2:] #truncate the stuff that's too low in power # d = nddata(d['"Rx" reading'].values, ['dBm']).labels({'dBm': double(d['Expected Real Value'].values)}) d # print d.data.min() # ## since I want to use inverse interpolation, check that it looks OK # plot(d, 'o') yvals = linspace(3, 390, 100) xvals = d.C.invinterp('dBm', yvals)
def example(ex_name):
    """Return a Markdown link for the requested example.

    Parameters
    ----------
    ex_name (str): key into the module-level ``examples_dict`` mapping
        example names to their targets.

    Returns
    -------
    IPython Markdown object rendering ``[ex_name](target)``.
    """
    # f-string instead of string concatenation (same output).
    return md(f'[{ex_name}]({examples_dict[ex_name]})')
# In[16]: x_train.head() # In[31]: # Datasets shape print('Train dataset:\n{} rows\n{} columns'.format(train_set.shape[0], train_set.shape[1])) print('\nTest dataset:\n{} rows\n{} columns'.format(test_set.shape[0], test_set.shape[1])) # In[32]: proportion = x / train_set.shape[0] # Compute the tweets proportion by target md("The percentual of disaster tweets is {}%, and {}% for not disaster.". format(round(proportion[1] * 100, 0), round(proportion[0] * 100, 0))) # In[33]: fig1, ax1 = plt.subplots() ax1.pie( proportion, explode=(0, 0.1), # only "explode" the 2nd slice labels=['Not disaster', 'Disaster'], autopct='%1.1f%%', shadow=True, startangle=90) ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle. plt.title("Percentual of tweets") plt.show()
def fit(self, features, target, features_test=None, target_test=None, random_state=random_state, bias=0.5, learning_rate=0.10, iterations=10000, verbose=False, feature_names=[], brake_callback=None, plot=False):
    """Train the single-layer model by gradient descent over Qobj matrices.

    Per epoch: forward pass, loss against *target*, backprop of the
    weight/bias updates, scoring on the test split, and an optional early
    stop via *brake_callback*.

    Returns
    -------
    list of per-epoch result dicts (weights, bias, epoch, test scores,
    and the brake callback state).
    """
    # NOTE(review): the default `random_state=random_state` captures a
    # module-level name at definition time, and `feature_names=[]` is a
    # shared mutable default — both worth fixing upstream. Also note that
    # `epoch >= brake_callback.patience` raises AttributeError when
    # brake_callback is left at its None default — confirm callers always
    # pass one.
    np.random.seed(self.__random_state__)
    # One column-vector weight per input feature.
    weights = np.array([[x] for x in np.random.random((features.dims[1][0]))])
    results = []
    for epoch in range(iterations):
        # Forward pass.
        inputs = Qobj(features)
        in_o = inputs * Qobj(weights) + bias
        out_o = Qobj(self.activation(in_o))
        loss = out_o - Qobj(target)
        if verbose:
            print(loss)
        # Backward pass.
        derror_douto = loss
        douto_dino = Qobj(self.activation_prime(out_o))
        deriv = derror_douto
        for r, c in enumerate(deriv):
            # does this do a hadamard product?
            deriv.data[r] *= douto_dino.data[r]
        inputs = inputs.trans()
        deriv_final = inputs * deriv
        weights -= learning_rate * deriv_final
        for i in deriv:
            bias -= learning_rate * i
        # Snapshot this epoch's parameters.
        epoch_results = {
            f'weight_{i}': weight[0][0]
            for i, weight in enumerate(weights)
        }
        epoch_results.update({
            'epoch': epoch,
            'bias': bias[0][0],
        })
        # Score on the held-out split.
        predict_test = self.predict(epoch_results, features_test)
        test_scoring = self.score_model(predict_test, target_test)
        epoch_results.update(test_scoring)
        print(
            f'Epoch:{epoch} Test Acc: {test_scoring["accuracy"]:.6f} Test MAE: {test_scoring["mae"]:.6f}',
            end='\r')
        # check for brake callback
        if epoch >= brake_callback.patience:
            all_scores = {
                'accuracy': [x['accuracy'] for x in results],
                'mae': [x['mae'] for x in results]
            }
            # TODO: throw these in a separate dict to reduce overhead, also make more abstract instead of hc scores
            if not brake_callback.should_continue(all_scores):
                display(md(f'***Early Stoppage***'))
                display(md(f'- Epoch: *{epoch}*'))
                display(
                    md(f'- Last Value: *{brake_callback.last_val:.6f}*'))
                display(
                    md(f'- Test Value: *{brake_callback.compare_to:.6f}*'))
                display(md(f'- Function: *{brake_callback.func}*'))
                display(md(f'- Score: *{brake_callback.stat}*'))
                display(
                    md(f'- Criterion: *{brake_callback.compare_type}*'))
                display(
                    md(f'- Tolerance: *{brake_callback.tolerance:.6f}*'))
                epoch_results.update({'brake_callback': brake_callback})
                break
            else:
                epoch_results.update({'brake_callback': None})
        results.append(epoch_results)
    if plot:
        self.plot(results, feature_names)
    return results
from load_data import loadall
from json import load
from os.path import join, sep
from sys import path
import matplotlib.pyplot as plt
from IPython.display import Markdown as md
import numpy as np

# Folder settings drive both the import path and the data location.
settings = load(open("foldersettings.json"))
path.append(join(f"{sep}".join(settings["projectdir"]), "from_scratch"))
from mystats import avg, describe_matrix

if __name__ == "__main__":
    # MNIST-style data: 60000 training / 10000 test images and labels.
    datadir = f"{sep}".join(settings["datadir"])
    data = loadall(datadir, prefix="*ubyte*")
    X_train, X_test = data["i60000"], data["i10000"]
    y_train, y_test = data["l60000"], data["l10000"]
    mdobj = md(describe_matrix(X_train))
    # non-square matrix
    # => no solution to Ax = b
    # What about A^T A x_hat = A^T b ?
    # If there would be a solution, it could be solved with:
    # np.linalg.solve(X_train, y_test)
    print(mdobj._repr_markdown_())
    # 1. Visualise and clean data
    plt.figure()
    plt.imshow(X_train[0, :].reshape(28, 28), cmap="gist_yarg")
    plt.savefig(join("img", "example_digit.png"))
def play_notebook_video(folder, filename):
    '''Return a Markdown video element for playback in a Jupyter notebook.

    Parameters
    ----------
    folder (str): directory containing the video file.
    filename (str): name of the video file.
    '''
    file_path = os.path.join(folder, filename)
    # f-string instead of str.format (same rendered HTML).
    return md(f'<video controls src="{file_path}"/>')
def listen(self, revid, stopwords):
    """Display the token grid for one revision.

    Parameters
    ----------
    revid : revision id to inspect.
    stopwords : str
        'Not included' strips stopwords from the token source.
    """
    # Get source data through ConflictManager; optionally strip stopwords.
    # (De-duplicated the two assign-then-del branches of the original.)
    if stopwords == 'Not included':
        self.token_source = remove_stopwords(self.sources["tokens_all"],
                                             self.lng)
    else:
        self.token_source = self.sources["tokens_all"]
    self.token_source = self.token_source.reset_index(drop=True)
    # Selected revision id; extract editor name and timestamp to display
    # before the table.
    self.rev_id = revid
    self.filtered_df = self.token_source[self.token_source['rev_id'] ==
                                         self.rev_id]
    if len(self.filtered_df) != 0:
        editor_name = self.editors.loc[self.editors['editor_id'] == self.
                                       filtered_df['editor'].values[0],
                                       'name'].values[0]
    else:
        return display(md("No tokens in this revision!"))
    timestamp = pd.DatetimeIndex(self.token_source[
        self.token_source['rev_id'] == self.rev_id]['rev_time'])[0]
    display(
        md(f"***Selected revision: ID: {self.rev_id}, editor name: {str(editor_name)}, timestamp: {str(timestamp.date())} {str(timestamp.time())}***"
           ))
    # Print URL to wikipedia diff.
    url = f"https://{self.lng}.wikipedia.org/w/index.php?title={self.page_title}&diff={self.rev_id}"
    display(
        HTML(
            f'<a href="{url}" target="_blank">Click here to see the Wikipedia Text DIFF</a>'
        ))
    # Fixed: identity check (is not None) instead of `!= None`.
    if self.rev_id is not None:
        # Add the necessary columns and process the dataframe.
        self.convert_oadd()
        self.get_editor_names()
        self.get_columns()
        # Sort the dataframe by timestamp and token_id.
        self.token_source.sort_values(['rev_time', 'token_id'],
                                      ascending=True,
                                      inplace=True)
        # Get tokens from the selected revision (from previous and future
        # revisions as well).
        rev_tokens = self.token_source.loc[self.token_source['rev_id'] ==
                                           self.rev_id, 'token_id'].values
        tokens_for_grid = self.token_source.loc[
            self.token_source['token_id'].isin(rev_tokens), [
                'token', 'token_id', 'action', 'rev_id', 'rev_time',
                'name', 'o_rev_id', 'reverted_editor', 'time_diff'
            ]].rename(columns={
                'token': 'string',
                'name': 'editor'
            })
        # Convert column formats for display.
        tokens_for_grid['rev_id'] = tokens_for_grid['rev_id'].astype(
            int).astype(str)
        tokens_for_grid['time_diff'] = tokens_for_grid['time_diff'].apply(
            lambda x: TokensListener.convert_time_diff(x))
        tokens_for_grid['time_diff'] = tokens_for_grid['time_diff'].astype(
            str)
        tokens_for_grid['token_id'] = tokens_for_grid['token_id'].astype(
            int).astype(str)
        tokens_for_grid.sort_values(["token_id", "rev_time"], inplace=True)
        tokens_for_grid.set_index('token_id', inplace=True)
        self.tokens_for_grid = tokens_for_grid.copy()
        # qgrid widget.
        columns_set = {
            "rev_time": {"width": 180},
            "action": {"width": 65},
            "string": {"width": 100},
            "token_id": {"width": 94}
        }
        qgrid_selected_revision = qgrid.show_grid(
            self.tokens_for_grid, column_definitions=columns_set)
        self.qgrid_selected_revision = qgrid_selected_revision
        display(self.qgrid_selected_revision)
        self.out213 = Output()
        display(self.out213)
        self.qgrid_selected_revision.observe(self.on_selection_change,
                                             names=['_selected_rows'])
    else:
        display(
            md('**The selected revision does not exist for this page. Try another**'
               ))
def listen(self):
    """Collect template edits, merge protection data, and render a Gantt chart.

    Gathers captured and possibly-missing template revisions per template,
    warns about the missing ones, and plots template/protection spans.
    """
    # plot_revs = []
    missing_revs = []
    df_templates = []
    for idx, tl in enumerate(self.tl):
        # For plot.
        captured, _, diff = self.get_template(tl)
        # For missing revisions.
        missing_revs.append(diff)
        # For captured revs.
        df_templates.append(captured)
    missing_revs = pd.concat(missing_revs).reset_index(
        drop=True).drop_duplicates()
    df_templates = pd.concat(df_templates).reset_index(
        drop=True).drop_duplicates()
    # Capture missing values
    if len(missing_revs) != 0:
        display(md("Checking if there are missing templates..."))
        missing_values = self.get_missing_tl(missing_revs)
        df_templates = pd.concat([missing_values, df_templates
                                  ]).sort_values(["token", "rev_time"
                                                  ]).reset_index(drop=True)
    clear_output()
    display(
        md(f"***Page: {self.page['title']} ({self.lng.upper()})***"))
    # Create plot df for missing values
    plot = []
    for tl in df_templates["token"].unique():
        name_idx = self.tl.index(tl)
        cap_one_tl = df_templates[df_templates["token"] == tl]
        plot.append(self.to_plot_df(cap_one_tl, name_idx))
    # For protection.
    plot.append(self.plot_protect)
    if len(plot) != 0:
        plot_merge_task = pd.concat(plot)
        # semi_plot = pd.concat([plot_merge_task, new_plot]).sort_values(["Task", "Start"]).reset_index(drop=True)
        # plot_merge_task = self.rebuild_plot_df(semi_plot)
        # Handle upgraded unknown protection while it doesn't expire.
        tasks = plot_merge_task["Task"].unique()
        if self.lng == "en":
            # Collapse the POV template aliases into a single task name.
            plot_merge_task["Task"] = plot_merge_task["Task"].replace([
                "POV", "PoV", "Pov", "Npov", "NPOV", "Neutrality",
                "Neutral", "Point Of View"
            ], "POV*")
        plot_merge_task["Resource"] = plot_merge_task["Task"]
        self.plot = plot_merge_task
        # Color.
        if self.lng == "en":
            templates_color = {
                "Featured Article": '#056ded',
                "Good Article": '#d9331c',
                "Disputed": '#ff0505',
                "POV*": '#5cdb9a',
                "Systemic bias": '#02f77a',
                "Semi-protection": '#939996',
                "Full-protection": '#939996',
                "Unknown protection": '#939996'
            }
        elif self.lng == "de":
            templates_color = {
                "Exzellent": '#056ded',
                "Lesenswert": '#d9331c',
                "Neutralität": '#5cdb9a',
                "Semi-protection": '#939996',
                "Full-protection": '#939996',
                "Unknown protection": '#939996'
            }
        else:
            templates_color = {
                "Semi-protection": '#939996',
                "Full-protection": '#939996',
                "Unknown protection": '#939996'
            }
        if len(missing_revs) != 0:
            display(
                md("**Warning: there are perhaps missing records for template editing!**"
                   ))
            display(md("The following revisions are possibly missing:"))
            display(qgrid.show_grid(missing_revs))
        else:
            pass
        if len(self.plot) != 0:
            display(md("The following revisions are captured:"))
            display(qgrid.show_grid(df_templates))
            display(
                ff.create_gantt(plot_merge_task,
                                colors=templates_color,
                                showgrid_x=True,
                                showgrid_y=True,
                                bar_width=0.1,
                                group_tasks=True,
                                index_col='Resource',
                                show_colorbar=False))
            if "POV*" in self.plot["Task"].unique():
                display(
                    md("\*Includes the templates [POV/NPOV/Neutrality/Neutral/Point Of View](https://en.wikipedia.org/wiki/Template:POV)"
                       ))
    else:
        display(md("No templates or protection records found!"))
def listen(self, _range1, _range2, stopwords, granularity):
    """Plot the top token owners over time as a stacked area chart.

    Parameters
    ----------
    _range1, _range2 : dates bounding the revisions to include.
    stopwords : str
        'Not included' strips stopwords from the token source.
    granularity : str
        Period label; only its first character is used as a pandas freq.
    """
    # Get source data through ConflictManager; optionally strip stopwords.
    # (De-duplicated the two assign-then-del branches of the original.)
    if stopwords == 'Not included':
        self.token_source = remove_stopwords(self.sources["tokens_all"],
                                             self.lng)
    else:
        self.token_source = self.sources["tokens_all"]
    self.token_source = self.token_source.reset_index(drop=True)
    # Reject incomplete or inverted date ranges.
    if len(str(_range1.year)) < 4 or len(str(_range2.year)) < 4:
        return display(md("Please enter the correct date!"))
    if _range1 > _range2:
        return display(md("Please enter the correct date!"))
    self.token_source = self.token_source[
        (self.token_source.rev_time.dt.date >= _range1)
        & (self.token_source.rev_time.dt.date <= _range2)]
    self.token_source['rev_time'] = pd.to_datetime(
        self.token_source['rev_time']).dt.tz_localize(None)
    self.get_editor_names()
    # Unique periods at the requested granularity, plus today.
    days = self.token_source['rev_time'].dt.to_period(
        granularity[0]).unique()
    today = pd.Period(datetime.today(), freq=granularity[0])
    days = pd.Series(np.append(days, today)).sort_values(ascending=False)
    if len(days) > 0:
        # Convert to timestamps and add one day so that rows belonging to
        # the period itself are included when filtering "previous" rows.
        days = days.dt.to_timestamp(granularity[0]) + pd.DateOffset(1)
        self.summ = pd.DataFrame(columns=['name', 'action', 'rev_time'])
        df = self.token_source
        for rev_time in days:
            df = df[df['rev_time'] <= rev_time]
            # Last recorded action per token id up to this period.
            last_action = df.groupby('token_id').last()
            # Tokens still owned (not 'out'), counted per editor.
            surv = last_action[last_action['action'] != 'out'].groupby(
                'name')['action'].agg('count').reset_index()
            surv['rev_time'] = rev_time - pd.DateOffset(1)
            # Fixed: DataFrame.append was deprecated and removed in
            # pandas 2.0; pd.concat is the supported equivalent.
            self.summ = pd.concat([self.summ, surv])
        # Top 15 token owners over the whole time span.
        top_editors = self.summ.groupby('name')['action'].agg(
            'sum').sort_values(ascending=False).reset_index()[:15]
        # First date of oadd for every editor.
        first_date = self.summ.groupby('name').last().reset_index()
        # Attach each editor's first date and sort by it.
        top_editors_merged = pd.merge(
            top_editors, first_date[['name', 'rev_time']],
            on='name').sort_values('rev_time')
        # Plot one stacked trace per top editor.
        fig = go.Figure()
        for editor in top_editors_merged['name']:
            x = self.summ.loc[self.summ['name'] == editor, 'rev_time']
            y = self.summ.loc[self.summ['name'] == editor, 'action']
            fig.add_trace(
                go.Scatter(x=x, y=y, name=editor, stackgroup='one'))
        fig.update_layout(hovermode='x unified',
                          showlegend=True,
                          margin=go.layout.Margin(l=50,
                                                  r=50,
                                                  b=150,
                                                  t=10,
                                                  pad=3))
        fig.show()