Example #1
0
def show(left, right, *the_rest):
    """Render an equation as a Markdown math string for Jupyter.

    Builds ``$left = right$``; any extra positional arguments are
    appended after the right-hand side (as their tuple repr).
    """
    tail = f"{the_rest}" if the_rest else ""
    return md(f"${left} = {right}{tail}$")
Example #2
0
 def on_select_change(self, change):
     """Handle the first qgrid row selection: fetch and display the
     revision-level table for the chosen editor and date.

     Parameters
     ----------
     change: qgrid selection-change event payload (not used directly;
         the selection is re-read from the grid widget).
     """
     with self.out:
         clear_output()
         # Pull the selected row's values back out of the qgrid widget.
         date_selected = self.qgrid_obj.get_selected_df().reset_index()["rev_time"].iloc[0]
         editor_selected = self.qgrid_obj.get_selected_df().reset_index()["editor_id"].iloc[0]
         editor_name = self.qgrid_obj.get_selected_df().reset_index()["editor"].iloc[0]
         page_title = self.all_tokens["article_title"].unique()[0]

         # Progress note while the revision data is fetched.
         display(md("Loading revisions info..."))
         second_df = self.revision_manager.get_main(date_selected, editor_selected, self.current_freq)
         clear_output()

         display(md(f"Within **{self.current_freq}** timeframe, you have selected **{editor_name}** (id: {editor_selected})"))
         display(HTML(f"The revisions fall in <a href='https://{self.lng}.wikipedia.org/w/index.php?date-range-to={date_selected}&tagfilter=&title={page_title}&action=history' target='_blank'>{date_selected}</a>"))

         # Abbreviate long column names so the grid fits; the toolTip
         # entries below carry the full names.
         second_df.rename({"main_opponent": "main_op", "stopwords_ratio": "SW_ratio",
                          "productivity": "prod"}, axis=1, inplace=True)
         columns_set = {"rev_time": {"width": 165}, "rev_id": {"width": 85}, "adds": {"width": 50}, "dels": {"width": 50},
            "reins": {"width": 50}, "prod": {"width": 50, "toolTip": "productivity"}, "conflict": {"width": 70},
            "SW_ratio": {"width": 82, "toolTip": "stopwords ratio"},
            "main_op": {"width": 80, "toolTip": "main opponent"},
            "min_react": {"width": 132, "toolTip": "min reaction time"},
            "Damaging": {"width": 92}, "Goodfaith": {"width": 90}}
         self.second_qgrid = qgrid.show_grid(second_df, grid_options={'forceFitColumns': True,
                                                 'syncColumnCellResize': True}, column_definitions=columns_set)
         display(self.second_qgrid)

         # Second output area + listener for drilling into one revision.
         self.out2 = Output()
         display(self.out2)
         self.second_qgrid.observe(self.on_select_revision, names=['_selected_rows'])
Example #3
0
 def get_protect(self, level="semi_edit"):
     """
     Main entry point of ProtectListener.

     Parameters:
     -----------
     level (str): one of {"semi_edit", "semi_move", "fully_edit",
         "fully_move", "unknown"}

     Returns:
     -----------
     final_table (pd.DataFrame): detailed protection records for the
         requested type/level, or None when there are none.
     plot_table (pd.DataFrame): companion dataframe for Gantt-chart plotting.
     """
     empty_gantt = pd.DataFrame(columns=["Task", "Start", "Finish", "Resource"])

     # Nothing logged at all for this page.
     if len(self.df) == 0:
         display(md(f"No {level} protection records!"))
         return None, empty_gantt

     # Drop move-protection entries; bail out if nothing remains.
     self.df = self.df.drop(self.df[self.df["action"] == "move_prot"].index).reset_index(drop=True)
     if len(self.df) == 0:
         display(md(f"No {level} protection records!"))
         return None, empty_gantt

     # Pipeline: expiry -> unknown flags -> unprotect checks -> level filter.
     with_expiry = self._get_expiry()
     with_unknown = self._check_unknown(with_expiry)
     checked_unprotect = self._check_unprotect(with_unknown)
     at_level = self._select_level(checked_unprotect, level=level)
     with_unprotect = self._get_unprotect(at_level)

     final_table = self._get_final(with_unprotect)
     plot_table = self._get_plot(final_table, level=level)

     return final_table, plot_table
Example #4
0
 def on_select_revision(self, change):
     """Handle the second qgrid click: show the stored comment and a
     Wikipedia diff link for the selected revision.

     Parameters
     ----------
     change: qgrid selection-change payload (not used directly; the
         selection is re-read from the grid widget).
     """
     with self.out2:
         clear_output()
         # Selected revision id as plain text (encode/decode round-trip
         # kept from the original to normalise the string).
         self.selected_rev = str(self.second_qgrid.get_selected_df()["rev_id"].iloc[0]).encode("utf-8").decode("utf-8")
         self.search_widget.value = self.selected_rev
         display(md("Loading comments..."))
         self.get_comments()
         clear_output()
         # Revisions without a comment get an empty-string entry.
         if self.selected_rev not in self.rev_comments.keys():
             self.rev_comments[self.selected_rev] = ''
         display(md(f"**Comment for the revision {self.selected_rev}:** {self.rev_comments[self.selected_rev]}"))
         # Fix: typo in the user-facing link text ("Cilck" -> "Click").
         display(HTML(f"<a href='https://{self.lng}.wikipedia.org/w/index.php?diff={self.selected_rev}&title=TITLEDOESNTMATTER&diffmode=source' target='_blank'>Click here to check revisions differences</a>"))
Example #5
0
def fixMeshGrid(dataset, mystery_flag=False):
    r'''
    Rebuild the XZ/YZ mesh-grid coordinates of a DataSet, assuming a
    uniform grid.

    Maybe this is not necessary if masks are applied properly.
    Derives gridsteps and dimensions from the passed DataSet.
    Assumes a uniform grid; curvilinear grids won't work here!
    XZ and YZ are loaded explicitly because Dask loads the netCDF lazily.

    Parameters
    ----------
    dataset: DataSet exposing XZ/YZ coordinate arrays (uniform spacing
        assumed -- TODO confirm for each input file).
    mystery_flag (bool): sometimes 1 and sometimes 2 gridsteps need to be
        subtracted from the length ¯\_(ツ)_/¯ , don't really know why
        (even vs uneven?).

    Returns
    -------
    The same dataset, with XZ/YZ values replaced by the regenerated grid.
    '''
    # Fix: raw docstring -- the original contained "\_", an invalid escape
    # sequence (DeprecationWarning today, SyntaxError in future Pythons).
    print("● Fixing mesh grid, assuming a uniform grid ")
    dataset.XZ.load()
    dataset.YZ.load()

    # Grid spacing taken from adjacent coordinate values.
    x_gridstep = dataset.XZ.values[2][-1] - dataset.XZ.values[1][-1]
    y_gridstep = dataset.YZ.values[-2][-1] - dataset.YZ.values[-2][-2]

    width = (dataset.XZ.shape[0] - 2) * x_gridstep
    if mystery_flag:
        length = (dataset.XZ.shape[1] -
                  1) * y_gridstep  # eeehhh hmmmm -1? sometimes -2?
    else:
        length = (dataset.XZ.shape[1] -
                  2) * y_gridstep  # eeehhh hmmmm -1? sometimes -2?

    # NOTE(review): this Markdown object is built but never displayed or
    # returned -- outside the last expression of a notebook cell it has no
    # visible effect. Wrap it in display() if the table should be shown.
    md(f"""
    # Times
    | Name | Value |
    | --- | --- |
    | x gridstep | {x_gridstep} |
    | y gridstep | {y_gridstep} |
    | Width | {width} |
    | Length | {length} |
    
    """)

    XZ, YZ = makeMeshGrid(length=length,
                          width=width,
                          x_gridstep=x_gridstep,
                          y_gridstep=y_gridstep)

    # for debugging
    # print('original XZ', dataset.XZ.shape)
    # print('original YZ', dataset.YZ.shape)
    # print('new XZ', XZ.shape)
    # print('new YZ', YZ.shape)
    dataset.XZ.values = XZ
    dataset.YZ.values = YZ

    return dataset
Example #6
0
def show_dashboard():
    """(Re)build the narrative/data dashboard widgets.

    Reads module-level globals (``output``, ``data_output``,
    ``range_table_all``, ``resident_text``, ``selected_persona``,
    ``selected_percentile``, ``text_generation_button``) and publishes
    ``tab``, ``input_widgets`` and ``dashboard`` as globals.
    """
    output.clear_output()
    data_output.clear_output()

    item_layout = widgets.Layout(margin='0 0 10px 0', align_items='stretch')
    item_layout_tab = widgets.Layout(margin='0 0 10px 0')

    # NOTE(review): this is an alias, not a copy -- the astype() below
    # also mutates the global range_table_all. Confirm that is intended.
    explore_data = range_table_all
    explore_data['sum_in_area'] = explore_data['sum_in_area'].astype(int)

    with output:
        display(
            md("> <font size = 3, font color = black> {}".format(
                resident_text)))
    with data_output:
        display(explore_data)

    global tab, input_widgets
    input_widgets = widgets.VBox(
        [selected_persona, selected_percentile, text_generation_button],
        layout=item_layout)

    # Two tabs: the rendered narrative text and the raw data table.
    tab = widgets.Tab([output, data_output], layout=item_layout_tab)
    tab.set_title(0, 'Narrative')
    tab.set_title(1, 'Data')

    global dashboard
    dashboard = widgets.VBox([input_widgets, tab])
Example #7
0
    def token_selection_change(self, change):
        """Handle the first qgrid click: show all revisions touching the
        selected token string.

        Parameters
        ----------
        change: qgrid selection-change payload (not used directly; the
            selection is re-read from the grid widget).
        """
        with self.out1:
            clear_output()

            # Process the involved dataframe.
            token_selected = self.qgrid_token_obj.get_selected_df(
            ).reset_index()['string'].iloc[0]
            selected_token = self._select_token(token_selected, self._range1,
                                                self._range2)
            # Drop identifying columns not needed in the drill-down view.
            df_selected_token = selected_token.drop(
                ['page_id', 'o_editor', 'token', 'o_rev_id', 'article_title'],
                axis=1)
            new_cols = ['token_id', 'action', 'rev_time', 'editor', 'rev_id']
            df_selected_token = df_selected_token[new_cols].rename(
                {'editor': 'editor_id'}, axis=1)
            # IDs as strings so qgrid doesn't format them as numbers.
            df_selected_token['token_id'] = df_selected_token[
                'token_id'].astype(str)
            df_selected_token['rev_id'] = df_selected_token['rev_id'].astype(
                str)
            df_selected_token.set_index('token_id', inplace=True)

            qgrid_selected_token = qgrid.show_grid(df_selected_token)
            self.qgrid_selected_token = qgrid_selected_token
            display(
                md(f'**With string *{token_selected}*, select one revision you want to investigate:**'
                   ))
            display(self.qgrid_selected_token)

            # Second output area + listener for the revision-level click.
            self.out2 = Output()
            display(self.out2)
            self.qgrid_selected_token.observe(self.revid_selection_change,
                                              names=['_selected_rows'])
Example #8
0
def show_model_parameters(m_, idx_=-1):
    """Fit a statsmodels-style regression model and return a Markdown
    report line with beta, F statistics and the p-value.

    Parameters
    ----------
    m_: an unfitted model object exposing ``.fit()`` whose result has
        ``params``, ``pvalues``, ``df_model``, ``rsquared``, ``df_resid``
        and ``tvalues`` (statsmodels-like -- TODO confirm).
    idx_: index of the coefficient to report (default: last).

    Returns
    -------
    An IPython Markdown object with the formatted statistics string.
    """
    # fit model
    m_ = m_.fit()

    # extract significant figures from float
    def sigfigs(x):
        # Parse x's scientific notation '%.2e':
        # d = exponent digits after the '-' (e.g. '1.23e-04' -> 4).
        # NOTE(review): assumes x < 1 so a '-' exists in the string; for
        # x >= 1, find('-') is -1 and int() would raise -- confirm callers
        # only pass p-values below 1.
        d = int(str('%.2e' % x)[('%.2e' % x).find('-') + 1:])
        # n = leading mantissa digit, rounded (e.g. '1.23e-04' -> 1.0).
        n = np.round(float(str('%.02e' % x)[0:3]))
        return n, d

    # extract model parameters
    beta, pval, df_model = m_.params[idx_], m_.pvalues[idx_], m_.df_model
    rsqrd, df_resid, tvalues = m_.rsquared, m_.df_resid, m_.tvalues[idx_]

    # show exact p values up to three significant figures
    if sigfigs(pval)[1] < 4:
        stat_str = "$\\beta = %.2f$, $F(%d, %d)$ = $%.02f, P = %.03f $"
        report = stat_str % (
            beta,
            df_model,
            df_resid,
            tvalues,
            pval,
        )
    else:
        # Very small p-values are reported in scientific notation.
        stat_str = "$\\beta = %.2f$, $F(%d, %d)$ = $%.02f, P = %.0f $ x $ 10 ^{-%d} $"
        report = stat_str % (beta, df_model, df_resid, tvalues,
                             sigfigs(pval)[0], sigfigs(pval)[1])

    # return markdown visualization
    return md(report)  #, report
Example #9
0
 def listen(self, _range1, _range2, stopwords):
     """Display the conflicting-tokens grid, with or without stopwords.

     Results are cached in ``self.conflicts_dict`` per stopwords choice.

     Parameters
     ----------
     _range1 / _range2: date bounds passed to ``get_displayed_df``.
     stopwords: 'Not included' to strip stopwords, anything else keeps them.

     Fixes vs. original: the "Included" branch (a) wrote the filtered
     frame under the misspelled key "Not Included", (b) applied
     ``add_columns`` twice, and (c) displayed the unfiltered frame.
     """
     if stopwords == 'Not included':
         if self.conflicts_dict["Not included"] is None:
             conflicts_not_included = remove_stopwords(self.sources["tokens_source"]["conflicts_all"],self.lng).reset_index(drop=True)
             conflicts_not_included = self.add_columns(conflicts_not_included)
             self.conflicts_dict["Not included"] = self.get_displayed_df(_range1, _range2, conflicts_not_included)
         conflicts = self.conflicts_dict["Not included"]
     else:
         if self.conflicts_dict["Included"] is None:
             conflicts_included = self.add_columns(self.sources["tokens_source"]["conflicts_all"])
             self.conflicts_dict["Included"] = self.get_displayed_df(_range1, _range2, conflicts_included)
         conflicts = self.conflicts_dict["Included"]

     if len(conflicts) > 0:
         qgrid_token_obj = qgrid.show_grid(conflicts,grid_options={'forceFitColumns':False})
         display(qgrid_token_obj)
     else:
         display(md(f'**There are no conflicting tokens in this page.**'))
         display(HTML(f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'))
Example #10
0
 def statsTable(muX, muY, sigX, sigY, rmsX, rmsY, nPts):
     """Build a Markdown table of velocity-difference statistics.

     Returns all the inputs unchanged plus the Markdown table object.
     """
     header = (f'|Statistic|$u_x - v_x$ (m/yr)|$u_y - v_y$ (m/yr)|'
               f'N points|\n'
               f'|------|------------|------------|---------|\n')
     body = (f'|Mean|{muX:0.2}|{muY:0.2}|{nPts}|\n'
             f'|Std.Dev.|{sigX:0.2}|{sigY:0.2}|{nPts}|\n'
             f'|rms|{rmsX:0.2}|{rmsY:0.2}|{nPts}|')
     myTable = md(header + body)
     return muX, muY, sigX, sigY, rmsX, rmsY, nPts, myTable
Example #11
0
def df_summary(df, heading='Summary', heading_level=3):
    """Display a rendered Markdown summary of a DataFrame.

    Parameters
    ----------
    df: the DataFrame to summarise.
    heading: title text of the summary section.
    heading_level: Markdown heading level (number of leading ``#``).
    """
    # Per-column dtype, null count and non-null count.
    # Fix: idiomatic df.notnull() instead of (~df.isnull()).
    table = pd.concat(
        [df.dtypes, df.isnull().sum(), df.notnull().sum()], axis=1)
    table.reset_index(inplace=True)
    table.columns = 'Columns', 'Dtype', 'Null', 'Non-Null'
    dp(
        md('#' * heading_level + f' {heading}\n'
           f'*Rows*: **{df.shape[0]}**  \n'
           f'*Columns*: **{df.shape[1]}**'), table)
Example #12
0
    def get_tokens(self, df_selected):
        """Build and display the per-token conflict grid for the editor
        and day selected in the parent grid.

        Parameters
        ----------
        df_selected: single-row DataFrame for the selected editor/date,
            indexed by timestamp, with 'editor_id' and 'name' columns.
        """
        with self.out:
            clear_output()
            editor_id = df_selected["editor_id"].values[0]
            # (year, month, day) of the selected row's timestamp index.
            year_and_month = (df_selected.index[0].year, df_selected.index[0].month, df_selected.index[0].day)
            display(md(f"In **{year_and_month[2]}.{year_and_month[1]}.{year_and_month[0]}** you have selected the editor **{df_selected['name'].values[0]}**"))

            # All actions.
            selected_source_tokens = self.token_source.loc[self.__date_editor_filter(self.token_source, 
                                                             year_and_month, editor_id)].reset_index(drop=True)

            # Conflicts.
            selected_conflict_tokens = self.token_conflict.loc[self.__date_editor_filter(self.token_conflict, 
                                                            year_and_month, editor_id)].reset_index(drop=True)

            # Elegibles.
            selected_elegible_tokens = self.token_elegible.loc[self.__date_editor_filter(self.token_elegible, 
                                                        year_and_month, editor_id)].reset_index(drop=True)

            # Classification and merge: per-token revision, conflict and
            # elegible-action counts joined into one frame.
            selected_source = selected_source_tokens.groupby(["token_id"]).agg({"rev_id": "count"}).rename({"rev_id": "revisions"}, axis=1)
            selected_conflicts = selected_conflict_tokens.groupby(["token_id"]).agg({"action": "count", 
                                                             "conflict": "sum"}).rename({"action": "conflicts"}, axis=1)

            selected_elegibles = selected_elegible_tokens.groupby(["token_id"]).agg({"action": "count"}).rename({"action": "elegibles"}, axis=1)

            selected_elegibles = selected_elegibles.merge(selected_source, on="token_id")

            in_out = self.__count_in_out(selected_source_tokens)
            selected_elegibles = selected_elegibles.merge(in_out, on="token_id")

            # Normalise the conflict score by eligible actions; tokens with
            # no conflict at all are dropped below.
            selected_df = selected_conflicts.merge(selected_elegibles, on="token_id", how="right")
            selected_df["conflict"] = selected_df["conflict"] / selected_df["elegibles"]
            selected_df = selected_df.fillna(0)
            selected_df = selected_df.merge(selected_elegible_tokens[["token_id", "token"]].drop_duplicates().set_index("token_id"), 
                            on="token_id")[["token", "elegibles", "conflicts", "conflict", "revisions", "in_actions", "out_actions"]]
            selected_df = selected_df[selected_df["conflict"] != 0]
            

            # Find the main opponent for each token.
            editor_to_id = self.get_editor_month()[["editor_id", "name"]]
            editor_id_dict = dict(zip(editor_to_id["editor_id"], editor_to_id["name"]))
            # Editors without a resolvable name fall back to their raw id.
            for k, v in editor_id_dict.items():
                if str(v) == "nan":
                    editor_id_dict[k] = k
                else:
                    pass

            main_opponent = self.__get_main_opponent(editor_id=editor_id, token_indices=selected_df.index, editor_dict=editor_id_dict)
            selected_df = selected_df.merge(main_opponent, on="token_id").rename({"editor": "main_opponent", "token": "string"}, axis=1)

            # Hide HTML-comment tokens ('<!--') from the displayed grid.
            display(qgrid.show_grid(selected_df[selected_df["string"] != "<!--"]))
Example #13
0
 def listen(self, _range1, _range2, granularity):
     """Build and display the per-editor activity grid for a date range.

     Parameters
     ----------
     _range1 / _range2: inclusive date bounds (from date-picker widgets).
     granularity: aggregation frequency forwarded to the ratio helpers.
     """
     # Reject partially-typed years and inverted ranges.
     # Fix: idiomatic `or` / `is not None` instead of bitwise `|` and
     # `!= None`.
     if len(str(_range1.year)) < 4 or len(str(_range2.year)) < 4:
         return display(md("Please enter the correct date!"))
     if _range1 > _range2:
         return display(md("Please enter the correct date!"))
     else:
         df = self.df[(self.df.rev_time.dt.date >= _range1) & (self.df.rev_time.dt.date <= _range2)]
     df_from_agg = self._get_ratios(df, freq=granularity)
     df_from_agg = df_from_agg.rename({"editor_str": "editor_id"}, axis=1)
     df_display = self._merge_main(df_from_agg, freq=granularity)
     # Conflict score normalised by the number of eligible actions.
     df_display["conflict"] = (df_display["conflict"] / df_display["elegibles"]).fillna(0)

     df_display["main_opponent"] = df_display["main_opponent"].replace(self.names_id)

     # Abbreviated column names keep the grid narrow; the toolTips below
     # carry the full names.
     df_display.rename({"main_opponent": "main_op", "stopwords_ratio": "SW_ratio",
                       "revisions": "revs", "productivity": "prod"}, axis=1, inplace=True)

     displayed = df_display[["rev_time", "editor", 
                   "adds", "dels", "reins",
                    "prod", "conflict",
                    "SW_ratio", "main_op",
                    "avg_reac_time", "revs", "editor_id"]].set_index("rev_time").sort_index(ascending=False)
     columns_set = {"rev_time": {"width": 90}, "editor": {"width": 85}, "adds": {"width": 50}, "dels": {"width": 50},
                    "reins": {"width": 50}, "prod": {"width": 50, "toolTip": "productivity"}, "conflict": {"width": 70},
                    "SW_ratio": {"width": 80, "toolTip": "stopwords ratio"},
                    "main_op": {"width": 90, "toolTip": "main opponent"},
                    "avg_reac_time": {"width": 125, "toolTip": "average reaction time"},
                    "revs": {"width": 45, "toolTip": "revisions"}, "editor_id": {"width": 80}}
     self.qgrid_obj = qgrid.show_grid(displayed, grid_options={'forceFitColumns':True}, column_definitions=columns_set)

     display(self.qgrid_obj)
     self.out = Output()
     display(self.out)

     self.current_freq = granularity
     # Only attach the drill-down listener when a search widget exists.
     if self.search_widget is not None:
         self.qgrid_obj.observe(self.on_select_change, names=['_selected_rows'])
Example #14
0
    def listen(self, _range1, _range2, granularity, trace):
        """Plot the tokens owned by this editor over time with plotly.

        Parameters
        ----------
        _range1 / _range2: inclusive date bounds of the plotted window.
        granularity: period label ('Year', 'Month', ...); only its first
            letter is used as the pandas period code.
        trace: 'Tokens Owned' (absolute) or 'Tokens Owned (%)' (relative).
        """
        df = self.summ

        if len(df) == 0:
            display(
                md("***It is not possible to plot the tokens owned because this editor has never owned any token.***"
                   ))
            return

        # Restrict to the requested window (upper bound padded by a day).
        upper = _range2 + datetime.timedelta(days=1)
        df = df[(df.day.dt.date >= _range1) & (df.day.dt.date <= upper)].copy()

        # Collapse to one row per period at the requested granularity.
        period_code = granularity[0]
        df['time'] = df['day'].dt.to_period(period_code).dt.to_timestamp(period_code)
        df = df[~df.duplicated(subset='time', keep='first')]

        self.traces = []
        if trace == 'Tokens Owned':
            _range = None
            _y = df['abs']
        elif trace == 'Tokens Owned (%)':
            _range = [0, 100]
            _y = df['res']

        self.traces.append(
            graph_objs.Scatter(x=df['time'],
                               y=_y,
                               name=trace,
                               marker=dict(color='rgba(255, 0, 0, .5)')))

        layout = graph_objs.Layout(hovermode='closest',
                                   xaxis=dict(title=granularity,
                                              ticklen=5,
                                              zeroline=True,
                                              gridwidth=2),
                                   yaxis=dict(ticklen=5,
                                              gridwidth=2,
                                              range=_range),
                                   legend=dict(x=0.5, y=1.2),
                                   showlegend=True,
                                   barmode='group')

        self.df_plotted = df

        plotly.offline.init_notebook_mode(connected=True)
        plotly.offline.iplot({"data": self.traces, "layout": layout})
    def __init__(self, pp_log, lng):
        """Store the protection log and pick language-specific markers."""
        self.lng = lng
        self.df = pp_log

        # "indefinite"/"expires" marker strings differ per Wikipedia
        # language edition; unsupported languages warn and fall back to
        # the English markers.
        markers = {"en": ("indefinite", "expires"),
                   "de": ("unbeschränkt", "bis")}
        if self.lng not in markers:
            display(md("This language is not supported yet."))
        self.inf_str, self.exp_str = markers.get(self.lng, markers["en"])
Example #16
0
    def listen(self, stopwords, _range1, _range2):
        """Display the conflicting-tokens grid for the chosen date range.

        Parameters
        ----------
        stopwords: 'Not included' to use the stopword-filtered conflict
            manager, anything else for the full one.
        _range1 / _range2: inclusive date bounds for the displayed tokens.
        """
        if stopwords == 'Not included':
            conflict_calculator = self.sources["con_manager"]
        else:
            conflict_calculator = self.sources["con_manager_all"]

        # display the tokens, the difference in seconds and its corresponding conflict score
        self.conflicts = conflict_calculator.conflicts.copy()
        self.add_columns()

        if len(self.conflicts) > 0:
            # User-facing column names, sorted by conflict score.
            conflicts_for_grid = self.conflicts[[
                'order', 'count', 'action', 'token', 'token_id', 'conflict',
                'rev_time', 'name', 'editor_id', 'time_diff_secs', 'rev_id'
            ]].rename(columns={
                'token': 'string',
                'rev_time': 'timestamp',
                'name': 'editor_name'
            }).sort_values('conflict', ascending=False)
            # Keep only the date part, then filter to the requested range.
            conflicts_for_grid['timestamp'] = pd.to_datetime(
                conflicts_for_grid['timestamp'], cache=False, utc=True).dt.date
            conflicts_for_grid = conflicts_for_grid[
                (conflicts_for_grid.timestamp >= _range1)
                & (conflicts_for_grid.timestamp <= _range2)]
            # IDs as strings so qgrid doesn't format them as numbers.
            conflicts_for_grid['token_id'] = conflicts_for_grid[
                'token_id'].astype(int).astype(str)
            conflicts_for_grid['rev_id'] = conflicts_for_grid['rev_id'].astype(
                int).astype(str)
            conflicts_for_grid['editor_id'] = conflicts_for_grid[
                'editor_id'].astype(str)
            conflicts_for_grid.set_index('token_id', inplace=True)
            # Hide HTML-comment tokens ('<!--') from the displayed grid.
            self.df_for_grid = conflicts_for_grid.loc[
                conflicts_for_grid['string'] != '<!--'].copy()
            qgrid_token_obj = qgrid.show_grid(
                self.df_for_grid, grid_options={'forceFitColumns': False})
            self.qgrid_token_obj = qgrid_token_obj
            display(self.qgrid_token_obj)
            # Output area + listener for the follow-up selection click.
            self.out21 = Output()
            display(self.out21)
            self.qgrid_token_obj.observe(self.on_selection_change,
                                         names=['_selected_rows'])

        else:
            display(md(f'**There are no conflicting tokens in this page.**'))
            display(
                HTML(
                    f'<a href="{get_previous_notebook()}" target="_blank">Go back to the previous workbook</a>'
                ))
Example #17
0
 def __init__(self, all_actions, protection_plot, lng, wikipediadv_api, page):
     """Collect the data needed for the template/protection analysis.

     Parameters
     ----------
     all_actions (pd.DataFrame): all token actions for the page.
     protection_plot: pre-built protection plot data.
     lng (str): language code, one of {'en', 'de'} (others warn).
     wikipediadv_api: Wikipedia API client.
     page: page object under analysis.
     """
     self.df = all_actions
     self.lng = lng
     self.api = wikipediadv_api
     self.page = page
     # Quality/neutrality template names to search for, per language.
     if lng == "en":
         self.templates = ["Featured Article", "Good Article", "Disputed", "POV", "Pov", "PoV", 
                     "NPOV", "Npov", "Neutrality", "Neutral", "Point Of View", "Systemic bias"]
     elif lng == "de":
         self.templates = ["Exzellent", "Lesenswert", "Neutralität"]
     else:
         display(md("This language is not supported yet."))
         # Nonsense sentinel that can never match a real template, so the
         # downstream search finds nothing for unsupported languages.
         self.templates = ["oajdfoijelkjdf"]

     # Lower-cased first word of each template name, used for matching.
     self.tl = [tl.lower().split()[0] for tl in self.templates]        
     self.plot_protect = protection_plot
    def get_protect(self, level="semi"):
        """Assemble the protection tables for one protection level.

        Parameters
        ----------
        level (str): protection level used by the level filter and in
            the "no records" message.

        Returns
        -------
        final_table (pd.DataFrame or None): detailed protection records,
            or None when the log is empty.
        plot_table (pd.DataFrame): companion dataframe for Gantt-chart
            plotting (empty skeleton when there are no records).
        """
        if len(self.df) == 0:
            display(md(f"No {level} protection records!"))
            return None, pd.DataFrame(
                columns=["Task", "Start", "Finish", "Resource"])

        # Pipeline: expiry -> unknown flags -> unprotect checks -> level
        # filter -> unprotect rows -> final/plot tables.
        df_with_expiry = self.__get_expiry()
        df_with_unknown = self.__check_unknown(df_with_expiry)
        df_checked_unprotect = self.__check_unprotect(df_with_unknown)
        df_select_level = self.__select_level(df_checked_unprotect,
                                              level=level)
        df_with_unprotect = self.__get_unprotect(df_select_level)

        final_table = self.__get_final(df_with_unprotect)
        plot_table = self.__get_plot(final_table, level=level)

        return final_table, plot_table
Example #19
0
def exportar_pdf(df_ejercicios,
                 fichero,
                 titulo,
                 tipo,
                 letra='A',
                 soluciones=False):
    """Export the exercise DataFrame to a LaTeX/PDF document.

    Parameters
    ----------
    df_ejercicios: DataFrame of exercises, grouped by 'n_ejercicio'.
    fichero: output file name stem (suffixes appended for exams/solutions).
    titulo: document title.
    tipo: document type; anything other than 'ejercicios' is treated as
        an exam (one column, variant letter appended to the file name).
    letra: exam variant letter.
    soluciones: when True, append '_sol' to the file name and include
        solutions.
    """
    if tipo != 'ejercicios':
        df_ejercicios.n_columnas = 1  # exams are laid out in one column
        fichero = fichero + letra

    # Fix: idiomatic truthiness test instead of `== True`.
    if soluciones:
        fichero = fichero + '_sol'

    escribir_preambulo(fichero, titulo, tipo, soluciones)
    for s in df_ejercicios.groupby('n_ejercicio').count().index:
        display(md("**Ejercicio: **" + s))
        display(df_ejercicios[df_ejercicios.n_ejercicio == s])
        escribir_ejercicios(df_ejercicios[df_ejercicios.n_ejercicio == s],
                            fichero, tipo)

    escribir_fin(fichero)
Example #20
0
 def __init__(self, pp_log, lng):
     """
     Class to analyse protection information.

     Attributes:
     -----------
     df (pd.DataFrame): raw data extracted from the Wikipedia API.
     lng (str): language, one of {'en', 'de'}.
     inf_str / exp_str (str): "indefinite" / "expires" for English,
                     "unbeschränkt" / "bis" for German.
     """
     self.lng = lng
     self.df = pp_log

     # Language-specific marker strings; unsupported languages warn and
     # fall back to the English markers.
     known_markers = {"en": ("indefinite", "expires"),
                      "de": ("unbeschränkt", "bis")}
     if self.lng not in known_markers:
         display(md("This language is not supported yet."))
     self.inf_str, self.exp_str = known_markers.get(self.lng, known_markers["en"])
Example #21
0
def show(df):
    """Render a DataFrame as Markdown so it survives PDF conversion.

    See: https://stackoverflow.com/questions/20685635/pandas-dataframe-as-latex-or-html-table-nbconvert
    """
    markdown_table = df.to_markdown()
    display(md(markdown_table))
# coding: utf-8

#

# Load the pyspecdata notebook extension; presumably it injects helpers
# such as nddata, double, plot and linspace into the global namespace --
# TODO confirm against the pyspecdata docs.
get_ipython().magic(u'load_ext pyspecdata.ipy')
from IPython.display import Markdown as md
md('test *yes*')  # NOTE(review): only rendered when last in a notebook cell
import pandas as pd

#

# Read the diode calibration table.
d = pd.read_excel('bridge12_diode_calib.xlsx')
d = d.loc[2:]  #truncate the stuff that's too low in power

#

# Wrap readings in an nddata labelled by the expected power axis.
d = nddata(d['"Rx" reading'].values,
           ['dBm']).labels({'dBm': double(d['Expected Real Value'].values)})
d

#

# Fix: Python 2 print statement converted to a Python 3 print() call.
print(d.data.min())

# ## since I want to use inverse interpolation, check that it looks OK

#

plot(d, 'o')
yvals = linspace(3, 390, 100)
xvals = d.C.invinterp('dBm', yvals)
def example(ex_name):
    """Return a Markdown link for the requested example name."""
    return md(f'[{ex_name}]({examples_dict[ex_name]})')
# In[16]:

# Peek at the training features (x_train defined in an earlier cell).
x_train.head()

# In[31]:

# Datasets shape
print('Train dataset:\n{} rows\n{} columns'.format(train_set.shape[0],
                                                   train_set.shape[1]))
print('\nTest dataset:\n{} rows\n{} columns'.format(test_set.shape[0],
                                                    test_set.shape[1]))

# In[32]:

# NOTE(review): `x` is presumably the per-target value counts computed in
# an earlier cell -- confirm against the full notebook.
proportion = x / train_set.shape[0]  # Compute the tweets proportion by target
md("The percentual of disaster tweets is {}%, and {}% for not disaster.".
   format(round(proportion[1] * 100, 0), round(proportion[0] * 100, 0)))

# In[33]:

# Pie chart of the class balance (disaster vs not-disaster tweets).
fig1, ax1 = plt.subplots()
ax1.pie(
    proportion,
    explode=(0, 0.1),  # only "explode" the 2nd slice
    labels=['Not disaster', 'Disaster'],
    autopct='%1.1f%%',
    shadow=True,
    startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title("Percentual of tweets")
plt.show()
    def fit(self,
            features,
            target,
            features_test=None,
            target_test=None,
            random_state=random_state,
            bias=0.5,
            learning_rate=0.10,
            iterations=10000,
            verbose=False,
            feature_names=None,
            brake_callback=None,
            plot=False):
        """Train the model with batch gradient descent, scoring on the
        test set every epoch.

        Parameters
        ----------
        features / target: training inputs and labels (Qobj-compatible).
        features_test / target_test: held-out data scored each epoch.
        random_state: kept for backward compatibility but unused --
            seeding actually uses ``self.__random_state__``.
        bias: initial scalar bias.
        learning_rate: gradient step size.
        iterations: maximum number of epochs.
        verbose: print the raw loss each epoch.
        feature_names: labels forwarded to ``self.plot``; defaults to an
            empty list (fix: was a shared mutable default ``[]``).
        brake_callback: optional early-stopping controller exposing
            ``patience`` and ``should_continue``; ``None`` disables early
            stopping (fix: previously crashed with AttributeError when
            left at its default of None).
        plot: redraw the training curves each epoch.

        Returns
        -------
        list of per-epoch result dicts (weights, bias, epoch, test scores).
        """
        if feature_names is None:
            feature_names = []

        np.random.seed(self.__random_state__)
        weights = np.array([[x]
                            for x in np.random.random((features.dims[1][0]))])
        results = []
        for epoch in range(iterations):
            # Forward pass.
            inputs = Qobj(features)
            in_o = inputs * Qobj(weights) + bias
            out_o = Qobj(self.activation(in_o))
            loss = out_o - Qobj(target)
            if verbose:
                print(loss)

            # Backward pass.
            derror_douto = loss
            douto_dino = Qobj(self.activation_prime(out_o))
            deriv = derror_douto
            for r, c in enumerate(deriv):  #does this do a hadamard product?
                deriv.data[r] *= douto_dino.data[r]

            inputs = inputs.trans()
            deriv_final = inputs * deriv
            weights -= learning_rate * deriv_final

            for i in deriv:
                bias -= learning_rate * i

            epoch_results = {
                f'weight_{i}': weight[0][0]
                for i, weight in enumerate(weights)
            }

            epoch_results.update({
                'epoch': epoch,
                'bias': bias[0][0],
            })
            # Score the held-out set every epoch.
            predict_test = self.predict(epoch_results, features_test)
            test_scoring = self.score_model(predict_test, target_test)
            epoch_results.update(test_scoring)
            print(
                f'Epoch:{epoch} Test Acc: {test_scoring["accuracy"]:.6f} Test MAE: {test_scoring["mae"]:.6f}',
                end='\r')

            # Early stopping -- fix: guard against brake_callback=None.
            if brake_callback is not None and epoch >= brake_callback.patience:
                all_scores = {
                    'accuracy': [x['accuracy'] for x in results],
                    'mae': [x['mae'] for x in results]
                }  #TODO: throw these in a separate dict to reduce overhead, also make more abstract instead of hc scores
                if not brake_callback.should_continue(all_scores):
                    display(md('***Early Stoppage***'))
                    display(md(f'- Epoch: *{epoch}*'))
                    display(
                        md(f'- Last Value: *{brake_callback.last_val:.6f}*'))
                    display(
                        md(f'- Test Value: *{brake_callback.compare_to:.6f}*'))
                    display(md(f'- Function: *{brake_callback.func}*'))
                    display(md(f'- Score: *{brake_callback.stat}*'))
                    display(
                        md(f'- Criterion: *{brake_callback.compare_type}*'))
                    display(
                        md(f'- Tolerance: *{brake_callback.tolerance:.6f}*'))
                    epoch_results.update({'brake_callback': brake_callback})
                    break
                else:
                    epoch_results.update({'brake_callback': None})

            results.append(epoch_results)
            if plot:
                self.plot(results, feature_names)

        return results
Example #26
0
from load_data import loadall
from json import load
from os.path import join, sep
from sys import path
import matplotlib.pyplot as plt
from IPython.display import Markdown as md
import numpy as np
# Fix: the original leaked the file handle via load(open(...)); close it
# promptly with a context manager.
with open("foldersettings.json") as settings_file:
    settings = load(settings_file)
# Make the from_scratch package importable relative to the project dir.
# (Fix: `f"{sep}"` was a redundant f-string wrapper around `sep`.)
path.append(join(sep.join(settings["projectdir"]), "from_scratch"))
from mystats import avg, describe_matrix

if __name__ == "__main__":
    # Load the MNIST ubyte files: 60k train / 10k test images and labels.
    datadir = sep.join(settings["datadir"])
    data = loadall(datadir, prefix="*ubyte*")
    X_train, X_test = data["i60000"], data["i10000"]
    y_train, y_test = data["l60000"], data["l10000"]
    mdobj = md(describe_matrix(X_train))
    # non-square matrix
    # => no solution to Ax = b
    # What about A^T A x_hat = A^T b ?
    # If there would be a solution, it could be solved with:
    # np.linalg.solve(X_train, y_test)

    print(mdobj._repr_markdown_())
    # 1. Visualise and clean data
    plt.figure()
    plt.imshow(X_train[0, :].reshape(28, 28), cmap="gist_yarg")
    plt.savefig(join("img", "example_digit.png"))
def play_notebook_video(folder, filename):
    """Return an IPython Markdown object embedding a playable HTML5 video.

    Parameters
    ----------
    folder : str
        Directory containing the video file.
    filename : str
        Name of the video file.
    """
    # Use the module-level `join` (imported via `from os.path import join`);
    # the bare `os` module is never imported in this file, so the original
    # `os.path.join` raised NameError when called.
    file_path = join(folder, filename)
    return md(f'<video controls src="{file_path}"/>')
Example #28
0
    def listen(self, revid, stopwords):
        """Display metadata, a Wikipedia diff link and a token grid for the
        revision identified by ``revid``.

        Parameters
        ----------
        revid : int
            Revision id to inspect.
        stopwords : str
            ``'Not included'`` strips stopwords from the token source;
            any other value keeps them.
        """
        # Get source data through ConflictManager, optionally stripped of
        # stopwords for the current language.
        if stopwords == 'Not included':
            self.token_source = remove_stopwords(self.sources["tokens_all"],
                                                 self.lng)
        else:
            self.token_source = self.sources["tokens_all"]

        self.token_source = self.token_source.reset_index(drop=True)

        # Extract editor name and timestamp to display before the table.
        self.rev_id = revid
        self.filtered_df = self.token_source[self.token_source['rev_id'] ==
                                             self.rev_id]
        if len(self.filtered_df) != 0:
            editor_name = self.editors.loc[self.editors['editor_id'] == self.
                                           filtered_df['editor'].values[0],
                                           'name'].values[0]
        else:
            # Nothing to show for this revision; bail out early.
            return display(md("No tokens in this revision!"))
        timestamp = pd.DatetimeIndex(self.token_source[
            self.token_source['rev_id'] == self.rev_id]['rev_time'])[0]
        display(
            md(f"***Selected revision: ID: {self.rev_id}, editor name: {str(editor_name)}, timestamp: {str(timestamp.date())} {str(timestamp.time())}***"
               ))

        # Print URL to the Wikipedia diff page for this revision.
        url = f"https://{self.lng}.wikipedia.org/w/index.php?title={self.page_title}&diff={self.rev_id}"
        display(
            HTML(
                f'<a href="{url}" target="_blank">Click here to see the Wikipedia Text DIFF</a>'
            ))

        if self.rev_id is not None:  # idiomatic None test (was `!= None`)
            # Add necessary columns and process the dataframe.
            self.convert_oadd()
            self.get_editor_names()
            self.get_columns()

            # Sort the dataframe by timestamp and token_id.
            self.token_source.sort_values(['rev_time', 'token_id'],
                                          ascending=True,
                                          inplace=True)

            # Get tokens from the selected revision, following them through
            # previous and future revisions as well.
            rev_tokens = self.token_source.loc[self.token_source['rev_id'] ==
                                               self.rev_id, 'token_id'].values
            tokens_for_grid = self.token_source.loc[
                self.token_source['token_id'].isin(rev_tokens), [
                    'token', 'token_id', 'action', 'rev_id', 'rev_time',
                    'name', 'o_rev_id', 'reverted_editor', 'time_diff'
                ]].rename(columns={
                    'token': 'string',
                    'name': 'editor'
                })

            # Convert column formats for display: ids as strings, time
            # differences in human-readable form.
            tokens_for_grid['rev_id'] = tokens_for_grid['rev_id'].astype(
                int).astype(str)
            tokens_for_grid['time_diff'] = tokens_for_grid['time_diff'].apply(
                lambda x: TokensListener.convert_time_diff(x))
            tokens_for_grid['time_diff'] = tokens_for_grid['time_diff'].astype(
                str)
            tokens_for_grid['token_id'] = tokens_for_grid['token_id'].astype(
                int).astype(str)

            tokens_for_grid.sort_values(["token_id", "rev_time"], inplace=True)
            tokens_for_grid.set_index('token_id', inplace=True)
            self.tokens_for_grid = tokens_for_grid.copy()

            # qgrid widget column layout.
            columns_set = {
                "rev_time": {"width": 180},
                "action": {"width": 65},
                "string": {"width": 100},
                "token_id": {"width": 94}
            }

            qgrid_selected_revision = qgrid.show_grid(
                self.tokens_for_grid, column_definitions=columns_set)
            self.qgrid_selected_revision = qgrid_selected_revision

            display(self.qgrid_selected_revision)
            self.out213 = Output()
            display(self.out213)
            # Re-render the detail view whenever the user selects a row.
            self.qgrid_selected_revision.observe(self.on_selection_change,
                                                 names=['_selected_rows'])
        else:
            display(
                md('**The selected revision does not exist for this page. Try another**'
                   ))
    def listen(self):
        """Build the template/protection timeline for the page and render it.

        Collects captured template revisions and possibly-missing ones via
        ``get_template``, merges in recovered missing templates, then shows
        qgrid tables and a plotly Gantt chart colored per template type.
        """
        missing_revs = []
        df_templates = []
        for tl in self.tl:
            captured, _, diff = self.get_template(tl)

            # For missing revisions.
            missing_revs.append(diff)

            # For captured revs.
            df_templates.append(captured)

        missing_revs = pd.concat(missing_revs).reset_index(
            drop=True).drop_duplicates()
        df_templates = pd.concat(df_templates).reset_index(
            drop=True).drop_duplicates()

        # Try to recover template records for revisions flagged as missing.
        if len(missing_revs) != 0:
            display(md("Checking if there are missing templates..."))
            missing_values = self.get_missing_tl(missing_revs)
            df_templates = pd.concat([missing_values, df_templates
                                      ]).sort_values(["token", "rev_time"
                                                      ]).reset_index(drop=True)
            clear_output()
            display(
                md(f"***Page: {self.page['title']} ({self.lng.upper()})***"))

        # One Gantt dataframe per captured template.
        plot = []
        for tl in df_templates["token"].unique():
            name_idx = self.tl.index(tl)
            cap_one_tl = df_templates[df_templates["token"] == tl]
            plot.append(self.to_plot_df(cap_one_tl, name_idx))

        # Always append the protection timeline; this guarantees `plot` is
        # non-empty, so the concat below cannot fail (the original guarded
        # the concat but then used its result unconditionally anyway).
        plot.append(self.plot_protect)
        plot_merge_task = pd.concat(plot)

        # Collapse the many POV-template spellings into a single task
        # (English Wikipedia only).
        if self.lng == "en":
            plot_merge_task["Task"] = plot_merge_task["Task"].replace([
                "POV", "PoV", "Pov", "Npov", "NPOV", "Neutrality", "Neutral",
                "Point Of View"
            ], "POV*")
        plot_merge_task["Resource"] = plot_merge_task["Task"]

        self.plot = plot_merge_task

        # Gantt bar colors keyed by task name, per language.
        if self.lng == "en":
            templates_color = {
                "Featured Article": '#056ded',
                "Good Article": '#d9331c',
                "Disputed": '#ff0505',
                "POV*": '#5cdb9a',
                "Systemic bias": '#02f77a',
                "Semi-protection": '#939996',
                "Full-protection": '#939996',
                "Unknown protection": '#939996'
            }
        elif self.lng == "de":
            templates_color = {
                "Exzellent": '#056ded',
                "Lesenswert": '#d9331c',
                "Neutralität": '#5cdb9a',
                "Semi-protection": '#939996',
                "Full-protection": '#939996',
                "Unknown protection": '#939996'
            }
        else:
            templates_color = {
                "Semi-protection": '#939996',
                "Full-protection": '#939996',
                "Unknown protection": '#939996'
            }

        if len(missing_revs) != 0:
            display(
                md("**Warning: there are perhaps missing records for template editing!**"
                   ))
            display(md("The following revisions are possibly missing:"))
            display(qgrid.show_grid(missing_revs))

        if len(self.plot) != 0:
            display(md("The following revisions are captured:"))
            display(qgrid.show_grid(df_templates))
            display(
                ff.create_gantt(plot_merge_task,
                                colors=templates_color,
                                showgrid_x=True,
                                showgrid_y=True,
                                bar_width=0.1,
                                group_tasks=True,
                                index_col='Resource',
                                show_colorbar=False))
            if "POV*" in self.plot["Task"].unique():
                display(
                    md("\*Includes the templates [POV/NPOV/Neutrality/Neutral/Point Of View](https://en.wikipedia.org/wiki/Template:POV)"
                       ))
        else:
            display(md("No templates or protection records found!"))
Example #30
0
    def listen(self, _range1, _range2, stopwords, granularity):
        """Plot a stacked time series of surviving tokens per top editor.

        Parameters
        ----------
        _range1, _range2 :
            Start/end dates bounding the revisions considered.  They expose
            ``.year`` and compare against ``rev_time.dt.date``, so they are
            presumably ``datetime.date``-like — TODO confirm at the caller.
        stopwords : str
            ``'Not included'`` strips stopwords from the token source;
            any other value keeps them.
        granularity :
            Sequence whose first character is a pandas period alias
            (e.g. ``'M'``) used to bucket revision times.
        """

        # Get source data through ConflictManager.
        if stopwords == 'Not included':
            link_token = remove_stopwords(self.sources["tokens_all"], self.lng)
            self.token_source = link_token
            del link_token
        else:
            link_token = self.sources["tokens_all"]
            self.token_source = link_token
            del link_token

        self.token_source = self.token_source.reset_index(drop=True)
        # Reject dates whose year has fewer than four digits (partially
        # typed input from the date widget).
        # NOTE(review): bitwise `|` works because both operands are bools,
        # but `or` would be the idiomatic form.
        if (len(str(_range1.year)) < 4) | (len(str(_range2.year)) < 4):
            return display(md("Please enter the correct date!"))
        if _range1 > _range2:
            return display(md("Please enter the correct date!"))
        else:
            # Keep only revisions inside the requested date window.
            self.token_source = self.token_source[
                (self.token_source.rev_time.dt.date >= _range1)
                & (self.token_source.rev_time.dt.date <= _range2)]

        # Drop timezone info so the Period/Timestamp arithmetic below stays
        # naive and comparable.
        self.token_source['rev_time'] = pd.to_datetime(
            self.token_source['rev_time']).dt.tz_localize(None)
        self.get_editor_names()

        days = self.token_source['rev_time'].dt.to_period(
            granularity[0]).unique()  #getting unique days
        today = pd.Period(datetime.today(), freq=granularity[0])
        days = pd.Series(np.append(days, today)).sort_values(
            ascending=False)  #adding today

        if len(days) > 0:
            days = days.dt.to_timestamp(granularity[0]) + pd.DateOffset(
                1
            )  #converting and adding one day for extracting previous dates from dataframe
            self.summ = pd.DataFrame(columns=['name', 'action', 'rev_time'])
            # NOTE(review): `_abs` appears unused in the visible body.
            _abs = []
            df = self.token_source
            # Walk the period boundaries newest-first; each iteration narrows
            # `df` to revisions at or before the boundary, then counts per
            # editor the tokens whose *last* action is not 'out' (i.e. the
            # tokens still surviving at that point in time).
            for rev_time in days:
                df = df[df['rev_time'] <= rev_time]
                last_action = df.groupby(
                    'token_id').last()  #last of group values for each token id
                surv = last_action[last_action['action'] != 'out'].groupby(
                    'name')['action'].agg('count').reset_index()
                surv['rev_time'] = rev_time - pd.DateOffset(1)
                # NOTE(review): DataFrame.append was removed in pandas 2.0;
                # pd.concat([self.summ, surv]) is the forward-compatible form.
                self.summ = self.summ.append(surv)

            #getting top editors among the token owners over all time
            top_editors = self.summ.groupby('name')['action'].agg(
                'sum').sort_values(ascending=False).reset_index()[:15]
            first_date = self.summ.groupby('name').last().reset_index(
            )  #first date of oadd for every editor
            top_editors_merged = pd.merge(
                top_editors, first_date[['name', 'rev_time']], on='name'
            ).sort_values(
                'rev_time'
            )  #adding first date for each editor and sorting by date of first oadd

            # Stacked area plot: one trace per top editor, stacked into a
            # single group so the areas accumulate.
            fig = go.Figure()
            for editor in top_editors_merged['name']:
                x = self.summ.loc[self.summ['name'] == editor, 'rev_time']
                y = self.summ.loc[self.summ['name'] == editor, 'action']
                fig.add_trace(
                    go.Scatter(x=x, y=y, name=editor, stackgroup='one'))
            fig.update_layout(hovermode='x unified',
                              showlegend=True,
                              margin=go.layout.Margin(l=50,
                                                      r=50,
                                                      b=150,
                                                      t=10,
                                                      pad=3))
            fig.show()