Example #1
def test_stop(self):
    with pytest.raises(StopException) as exc_message:
        st.stop()
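For reference, a self-contained version of this test might look as follows. The StopException import path is an assumption: it has moved between Streamlit releases (older versions exposed it as streamlit.script_runner.StopException).

import pytest
import streamlit as st

# Assumed import path, valid for recent Streamlit 1.x releases
from streamlit.runtime.scriptrunner import StopException


def test_stop():
    # st.stop() halts the script by raising StopException internally
    with pytest.raises(StopException):
        st.stop()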
Example #2
def cs_body():
    # Magic commands

    col1, col2, col3 = st.beta_columns(3)

    col1.subheader('Magic commands')
    col1.code('''# Magic commands implicitly `st.write()`
\'\'\' _This_ is some __Markdown__ \'\'\'
a=3
'dataframe:', data
    ''')

    # Display text

    col1.subheader('Display text')
    col1.code('''
st.text('Fixed width text')
st.markdown('_Markdown_') # see *
st.latex(r\'\'\' e^{i\pi} + 1 = 0 \'\'\')
st.write('Most objects') # df, err, func, keras!
st.write(['st', 'is <', 3]) # see *
st.title('My title')
st.header('My header')
st.subheader('My sub')
st.code('for i in range(8): foo()')
* optional kwarg unsafe_allow_html = True
    ''')

    # Display data

    col1.subheader('Display data')
    col1.code('''
st.dataframe(my_dataframe)
st.table(data.iloc[0:10])
st.json({'foo':'bar','fu':'ba'})
    ''')

    # Display charts

    col1.subheader('Display charts')
    col1.code('''
st.line_chart(data)
st.area_chart(data)
st.bar_chart(data)
st.pyplot(fig)
st.altair_chart(data)
st.vega_lite_chart(data)
st.plotly_chart(data)
st.bokeh_chart(data)
st.pydeck_chart(data)
st.deck_gl_chart(data)
st.graphviz_chart(data)
st.map(data)
    ''')

    # Display media

    col1.subheader('Display media')
    col1.code('''
st.image('./header.png')
st.audio(data)
st.video(data)
    ''')

    # Display interactive widgets

    col2.subheader('Display interactive widgets')
    col2.code('''
st.button('Hit me')
st.checkbox('Check me out')
st.radio('Radio', [1,2,3])
st.selectbox('Select', [1,2,3])
st.multiselect('Multiselect', [1,2,3])
st.slider('Slide me', min_value=0, max_value=10)
st.select_slider('Slide to select', options=[1,'2'])
st.text_input('Enter some text')
st.number_input('Enter a number')
st.text_area('Area for textual entry')
st.date_input('Date input')
st.time_input('Time entry')
st.file_uploader('File uploader')
st.beta_color_picker('Pick a color')
    ''')
    col2.write('Use widgets\' returned values in variables:')
    col2.code('''
>>> for i in range(int(st.number_input('Num:'))): foo()
>>> if st.sidebar.selectbox('I:',['f']) == 'f': b()
>>> my_slider_val = st.slider('Quinn Mallory', 1, 88)
>>> st.write(my_slider_val)
    ''')

    # Control flow

    col2.subheader('Control flow')
    col2.code('''
st.stop()
    ''')

    # Lay out your app

    col2.subheader('Lay out your app')
    col2.code('''
st.beta_container()
st.beta_columns(spec)
>>> col1, col2 = st.beta_columns(2)
>>> col1.subheader('Columnisation')
st.beta_expander('Expander')
>>> with st.beta_expander('Expand'):
>>>     st.write('Juicy deets')
    ''')

    # Display code

    col2.subheader('Display code')
    col2.code('''
st.echo()
>>> with st.echo():
>>>     st.write('Code will be executed and printed')
    ''')

    # Display progress and status

    col3.subheader('Display progress and status')
    col3.code('''
st.progress(progress_variable_1_to_100)
st.spinner()
>>> with st.spinner(text='In progress'):
>>>     time.sleep(5)
>>>     st.success('Done')
st.balloons()
st.error('Error message')
st.warning('Warning message')
st.info('Info message')
st.success('Success message')
st.exception(e)
    ''')

    # Placeholders, help, and options

    col3.subheader('Placeholders, help, and options')
    col3.code('''
st.empty()
>>> my_placeholder = st.empty()
>>> my_placeholder.text('Replaced!')
st.help(pandas.DataFrame)
st.get_option(key)
st.set_option(key, value)
st.beta_set_page_config(layout='wide')
    ''')

    # Mutate data

    col3.subheader('Mutate data')
    col3.code('''
DeltaGenerator.add_rows(data)
>>> my_table = st.table(df1)
>>> my_table.add_rows(df2)
>>> my_chart = st.line_chart(df1)
>>> my_chart.add_rows(df2)
    ''')

    # Optimize performance

    col3.subheader('Optimize performance')
    col3.code('''
@st.cache
>>> @st.cache
... def foo(bar):
...     # Mutate bar
...     return data
>>> # Executes foo the first time
>>> d1 = foo(ref1)
>>> # Does not execute foo; returns the cached value, so d1 == d2
>>> d2 = foo(ref1)
>>> # Different arg, so foo executes again
>>> d3 = foo(ref2)
    ''')

    return None
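A plausible way this cheat-sheet body is wired into an app, using the same beta_-era API the example itself lists; the wide-layout call is an assumption:

import streamlit as st

# Wide layout assumed, since the body renders three side-by-side columns
st.beta_set_page_config(layout='wide')
cs_body()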
Example #3
def main():
    # Title
    st.title("Model Experimentation with MLflow")

    # Choose dataset
    df_raw = load_data()
    st.write(df_raw.head())

    st.header("Dendrogram")
    corr = np.round(spearmanr(df_raw.drop('label', axis=1)).correlation, 4)
    corr_condensed = hc.distance.squareform(1 - corr)
    z = hc.linkage(corr_condensed, method='average')
    fig_den = plt.figure(figsize=(16, 10))
    dendrogram = hc.dendrogram(z,
                               labels=df_raw.drop('label', axis=1).columns,
                               orientation='left',
                               leaf_font_size=16)
    st.pyplot(fig_den, clear_figure=True)

    st.header("Correlation Matrix")
    fig_cor = plt.figure(figsize=(16, 10))
    sns.heatmap(df_raw.corr())
    st.pyplot(fig_cor, clear_figure=True)

    st.header("Boxplots")
    fig_box1 = plt.figure(figsize=(20, 5))
    sns.boxplot(data=df_raw)
    st.pyplot(fig_box1, clear_figure=True)

    q = st.slider("", 0.9, 1.0, 0.99, 0.01)
    df_raw = filter_df(df_raw, q)

    fig_box2 = plt.figure(figsize=(20, 5))
    sns.boxplot(data=df_raw)
    st.pyplot(fig_box2, clear_figure=True)

    # Model selection
    models = {
        'Logistic Regression':
        LogisticRegression(max_iter=2000, n_jobs=4, random_state=42),
        'Random Forest':
        RandomForestClassifier(n_jobs=4, random_state=42),
        'SVC':
        SVC(random_state=42),
        'KNNeighbors':
        KNeighborsClassifier(n_jobs=4)
    }

    # Feature selection
    feature_options = df_raw.columns.drop('label').tolist()
    feature_choice = st.multiselect("Choose features to drop", feature_options)
    treatment_options = {
        'None': no_op,
        'StandardScaler': scal_features,
        'PCA': pca_features
    }

    treatment_choice = st.selectbox("Choose feature treatment",
                                    list(treatment_options.keys()))
    clear_mlflow_wanted = st.checkbox("Clear mlflow experiments?")
    # The original assigned the checkbox and the button to the same name,
    # silently discarding the checkbox value; require both instead
    clear_mlflow = clear_mlflow_wanted and st.button("Clear MLFlow")
    if clear_mlflow:
        exp = mlflow.get_experiment_by_name('model_selection')
        if exp is not None and exp.lifecycle_stage != 'deleted':
            st.write('Previous experiment exists')
            mlflow.delete_experiment(exp.experiment_id)

            st.write(f'Archiving experiment with id {exp.experiment_id}')
            subprocess.run([f'ls -la mlruns/.trash/{exp.experiment_id}'],
                           shell=True,
                           check=True)

        try:
            subprocess.run('rm -rf mlruns/.trash/*', shell=True, check=True)
        except subprocess.CalledProcessError as e:
            st.write(e)
            exit(-1)
        experiment_id = mlflow.create_experiment('model_selection')
    else:
        experiment_id = mlflow.set_experiment('model_selection')
    # Mlflow tracking
    track_with_mlflow = st.checkbox("Track with mlflow?")

    # Model training
    start_training = st.button("Start training")
    if not start_training:
        st.stop()

    y = df_raw['label'].copy()
    sub_df = df_raw.drop([*feature_choice, 'label'], axis=1)
    X = sub_df.copy()
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    res = pd.DataFrame({'model': [], 'f1': []})

    sc = make_scorer(f1_score, pos_label='AF')

    if track_with_mlflow and clear_mlflow:
        mlflow.end_run()
        exp = mlflow.get_experiment_by_name('model_selection')
        if exp is not None and exp.lifecycle_stage != 'deleted':
            st.write('Previous experiment exists')
            mlflow.delete_experiment(exp.experiment_id)

            st.write(f'Archiving experiment with id {exp.experiment_id}')
            subprocess.run([f'ls -la mlruns/.trash/{exp.experiment_id}'],
                           shell=True,
                           check=True)

        try:
            subprocess.run('rm -rf mlruns/.trash/*', shell=True, check=True)
        except subprocess.CalledProcessError as e:
            st.write(e)
            exit(-1)
        experiment_id = mlflow.create_experiment('model_selection')
    else:
        experiment_id = mlflow.set_experiment('model_selection')

    # Apply the chosen feature treatment once, before the model loop, so
    # scaling/PCA is not re-fitted on already-transformed data
    X_train, X_test = treatment_options[treatment_choice](X_train, X_test)

    for name, model in models.items():
        if track_with_mlflow:
            # mlflow.set_experiment(experiment_id)
            mlflow.start_run()
            mlflow.log_param('features', list(X.columns))
            mlflow.log_param('model', name)

        st.write(f'Training {name}')
        scores = cross_val_score(model,
                                 X_train,
                                 y_train,
                                 cv=4,
                                 scoring=sc,
                                 n_jobs=4)
        model.fit(X_train, y_train)

        # Model evaluation
        preds_test = model.predict(X_test)
        metric_name = "f1_score"
        metric_test = f1_score(y_test, preds_test, pos_label='AF')

        # st.write(f"{metric_name}_train", round(metric_train, 3))
        # st.write(f"{metric_name}_test", round(metric_test, 3))
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        res = pd.concat(
            [res, pd.DataFrame({'model': [name], 'f1': [scores.mean()]})],
            ignore_index=True)

        if track_with_mlflow:
            mlflow.log_metric(metric_name + "_test", metric_test)
            tracking_url_type_store = urlparse(
                mlflow.get_tracking_uri()).scheme

            # Model registry does not work with file store
            if tracking_url_type_store != "file":

                mlflow.sklearn.log_model(model,
                                         "model",
                                         registered_model_name="AF_Classifier")
            else:
                mlflow.sklearn.log_model(model, "model")
            mlflow.end_run()

    st.write(res.sort_values('f1', ascending=False))
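The treatment helpers no_op, scal_features and pca_features are referenced above but not shown. A minimal sketch under the assumption the call site implies, namely that each takes and returns the (X_train, X_test) pair:

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def no_op(X_train, X_test):
    # Leave the features untouched
    return X_train, X_test


def scal_features(X_train, X_test):
    # Fit on the training split only, to avoid test-set leakage
    scaler = StandardScaler().fit(X_train)
    cols = X_train.columns
    return (pd.DataFrame(scaler.transform(X_train), columns=cols, index=X_train.index),
            pd.DataFrame(scaler.transform(X_test), columns=cols, index=X_test.index))


def pca_features(X_train, X_test, n_components=0.95):
    # Keep enough components to explain ~95% of the variance
    pca = PCA(n_components=n_components).fit(X_train)
    return (pd.DataFrame(pca.transform(X_train), index=X_train.index),
            pd.DataFrame(pca.transform(X_test), index=X_test.index))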
Example #4
def handle_edge_url(url_params: dict, pathSession):
    '''Display tables associated with a link'''

    namespace = url_params.get('namespace', [""])[0]
    hostname = url_params.get('hostname', [""])[0]
    nhip = url_params.get('nhip', [""])[0]
    ipLookup = url_params.get('ipLookup', [""])[0]
    vtepLookup = url_params.get('vtepLookup', [""])[0]
    vrf = url_params.get('vrf', [""])[0]
    ifhost = url_params.get('ifhost', [""])[0]
    macaddr = url_params.get('macaddr', [""])[0]
    oif = url_params.get('oif', [""])[0]

    if not hostname:
        st.error('No hostname found to display information for')
        st.stop()

    st.header(f'Debug Tables for Path from {pathSession.source} to '
              f'{pathSession.dest}')
    hoptype = 'Bridged' if macaddr else 'Routed'
    st.subheader(f'{hoptype} hop between {hostname} and {ifhost}')

    pathobj = getattr(pathSession, 'pathobj', None)
    engobj = pathobj.engine_obj

    if ipLookup:
        if not vtepLookup or (ipLookup != vtepLookup):
            st.info(f'Route Lookup on {hostname}')
            st.dataframe(data=engobj._rdf.query(
                f'hostname=="{hostname}" and vrf=="{vrf}"'))

        if vtepLookup:
            st.info(f'Underlay Lookup on {hostname} for {vtepLookup}')
            vtepdf = engobj._underlay_dfs.get(vtepLookup, pd.DataFrame())
            if not vtepdf.empty:
                st.dataframe(data=vtepdf.query(
                    f'hostname=="{hostname}" and vrf=="default"'))
        if nhip:
            st.info(
                f'ARP/ND Table on {hostname} for nexthop {nhip}, oif={oif}')
            arpdf = engobj._arpnd_df.query(f'hostname=="{hostname}" and '
                                           f'ipAddress=="{nhip}" and '
                                           f'oif=="{oif}"')
            st.dataframe(data=arpdf)

            if not arpdf.empty:
                if ':' in nhip:
                    dropcol = ['ipAddressList']
                else:
                    dropcol = ['ip6AddressList']

                nhmac = arpdf.macaddr.iloc[0]
                if nhmac:
                    if_df = engobj._if_df.query(f'macaddr=="{nhmac}" and '
                                                f'hostname=="{ifhost}"') \
                                         .drop(columns=dropcol)
                    label = (f'matching nexthop {nhip}, macaddr {nhmac} on '
                             f'host {ifhost}')
                else:
                    label = f'matching nexthop {nhip} on host {ifhost}'
                    if_df = engobj._if_df.query(f'hostname=="{ifhost}"') \
                                         .drop(columns=dropcol)

                # label and if_df are only defined when arpdf has rows, so
                # this check belongs inside the arpdf-not-empty branch
                if nhip != '169.254.0.1':
                    st.info(f'Interfaces {label}')
                    s = if_df.ipAddressList.str \
                             .startswith(f'{nhip}/') \
                             .dropna()
                    s = s.loc[s == True]
                    st.dataframe(data=engobj._if_df.iloc[s.index])
                else:
                    st.info(f'Interfaces {label}')
                    st.dataframe(data=if_df)
    if macaddr:
        with st.beta_expander(f'MAC Table for {hostname}, MAC addr {macaddr}',
                              expanded=True):
            st.dataframe(data=pathobj.engine_obj._macsobj.get(
                namespace=namespace, hostname=hostname, macaddr=macaddr))
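For context, a hypothetical invocation: st.experimental_get_query_params() returns every value as a list of strings, which is why the handler indexes [0] throughout. pathSession is assumed to be built elsewhere by the app.

import streamlit as st

url_params = st.experimental_get_query_params()
# e.g. ?namespace=dc1&hostname=leaf01&vrf=default yields
# {'namespace': ['dc1'], 'hostname': ['leaf01'], 'vrf': ['default']}
handle_edge_url(url_params, pathSession)  # pathSession built by the caller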
Example #5
def main():
    logging.info("Main script is refreshed...")

    # Custom functionality to ensure that changing widgets does not cause previous sections to reset
    state = get_state()
    st.title("What Makes a Playlist Successful?")
    st.write(
        "**This application trains & evaluates playlist success classification models, "
        "and generates SHAP visualizations for analyzing feature importance**")
    st.write(
        "[Created By: Alexander Wong](https://www.linkedin.com/in/alexrobwong/)",
        unsafe_allow_html=True,
    )

    if st.checkbox("Click to watch recorded demo"):
        st.video("https://www.youtube.com/watch?v=dPsGxb9lTUY")

    # Sidebar Inputs -------------------------------------------------------------------------------------------------
    experiment_name_input = st.sidebar.text_input("Experiment name:")
    experiment_name = f"{experiment_name_input}_{str(datetime.now())}"

    genre_options = GENRES
    default_ix = GENRES.index("Dance & House")
    selected_genre = st.sidebar.selectbox("Select genre:",
                                          options=genre_options,
                                          index=default_ix)

    # selected genre must be a list
    genre = [selected_genre]

    users_threshold = st.sidebar.number_input(
        "Minimum monthly number of Users:",
        min_value=10,
    )
    success_threshold = (st.sidebar.slider(
        "Streaming-ratio success threshold (%):",
        min_value=1,
        max_value=99,
        value=70,
    ) / 100)
    holdout_fraction = (st.sidebar.slider(
        "Test Size (%):", min_value=1, max_value=30, value=5) / 100)
    model_map = {
        "Extreme Gradient Boosting": "xgboost",
        "Decision Tree Classifier": "dt",
        "Extra Trees Classifier": "et",
        "Light Gradient Boosting Machine": "lightgbm",
        "Random Forest Classifier": "rf",
    }
    model_selection = list(
        st.sidebar.multiselect("Models to train:",
                               options=list(model_map.keys())))
    optionals = st.sidebar.beta_expander(
        "Additional Feature Engineering Parameters", False)
    polynomials_box = optionals.checkbox("Feature Polynomials")
    interactions_box = optionals.checkbox("Feature Interactions")
    ratios_box = optionals.checkbox("Feature Ratios")

    # st.checkbox already returns a bool, so the if/else blocks are unnecessary
    polynomials = polynomials_box
    interactions = interactions_box
    ratios = ratios_box

    # Experiment & Model Training -------------------------------------------------------------------------------------
    train = st.checkbox("Click to train models")
    if train:

        # The application can only run start to finish if xgboost is selected, so add it to the list of options
        exb_added = False
        if "Extreme Gradient Boosting" not in model_selection:
            model_selection.append("Extreme Gradient Boosting")
            exb_added = True

        # Bugfix - must select at least two models to train, otherwise a model object is used instead of an index
        lgb_added = False
        if "Light Gradient Boosting Machine" not in model_selection:
            model_selection.append("Light Gradient Boosting Machine")
            lgb_added = True

        include_models = [model_map[x] for x in list(model_selection)]

        # Guard: if no models were selected, PyCaret would train all models
        # (undesired app behavior)
        if not include_models:
            raise Exception(
                "No models were selected. Please re-start the application")

        base_frame = pd.read_parquet("data/streamlit_data.parquet")
        state.genre_frame = base_frame.loc[lambda f: f["genre_1"].isin(genre)]
        labelled_frame = classify_success(state.genre_frame, users_threshold,
                                          success_threshold)

        train_frame, holdout_frame = create_holdout(
            labelled_frame, holdout_fraction=holdout_fraction)

        # PyCaret setup to train models
        if not state.experiment_complete:
            with st.spinner("Model Training in Progress"):
                if exb_added:
                    st.success(
                        "**Extreme Gradient Boosting Model** automatically added by default into model pipeline"
                    )
                if lgb_added:
                    st.success(
                        "**Light Gradient Boosting Machine Model** automatically added by default into model pipeline"
                    )
                setup(
                    data=train_frame,
                    numeric_features=MODEL_NUMERICAL_FEATURES,
                    categorical_features=MODEL_CATEGORICAL_FEATURES,
                    target="success_streaming_ratio_users",
                    ignore_features=["playlist_uri"],
                    test_data=holdout_frame,
                    session_id=123,
                    ignore_low_variance=True,
                    remove_outliers=True,
                    fix_imbalance=True,
                    remove_multicollinearity=True,
                    log_experiment=True,
                    log_data=True,
                    fold=2,
                    n_jobs=-1,
                    combine_rare_levels=True,
                    experiment_name=experiment_name,
                    silent=True,
                    feature_interaction=interactions,
                    feature_ratio=ratios,
                    polynomial_features=polynomials,
                )
                state.list_models = compare_models(n_select=5,
                                                   round=3,
                                                   cross_validation=False,
                                                   include=include_models)
                state.experiment_complete = True

                state.X_train = get_config(variable="X_train")
                state.y_train = get_config(variable="y_train")
                state.view = pd.merge(state.y_train,
                                      state.X_train,
                                      left_index=True,
                                      right_index=True).reset_index(drop=True)

        # Display model training results
        st.header("Model Training & Testing Results")
        exp = pull()
        st.dataframe(exp)
        st.info("**Models were trained using default parameters**")
        st.info(
            "To improve individual model performance,"
            "please consider offline **hyperparameter tuning** techniques such as **Grid Search**. "
            "To improve overall performance, please consider advanced offline **ensembling** techniques "
            "such as **Bagging**, **Boosting**, **Stacking**")

        # Model Definitions
        models_expander = st.beta_expander("Model Definitions")
        models_expander.write(
            "[**Decision Tree Classifier**](https://en.wikipedia.org/wiki/Decision_tree_learning)"
        )
        models_expander.write(
            "A Decision Tree is a simple representation for "
            "classifying examples, a form of Supervised Machine Learning where the data is "
            "continuously split according to a certain parameter. A decision tree starts with a "
            "single node, which branches into possible outcomes. Each of those outcomes "
            "leads to additional nodes, which branch off into other possibilities"
        )
        models_expander.write("")
        models_expander.write(
            "[**Random Forest Classifier**](https://en.wikipedia.org/wiki/Random_forest)"
        )
        models_expander.write(
            "An ensemble learning method "
            "that operates by constructing a multitude of decision trees at training time, "
            "where each tree is trained on a bootstrap replica of the training data and the final "
            "model classification is decided via majority vote from the constituent trees"
        )
        models_expander.write("")
        models_expander.write(
            "[**Extra Trees Classifier**](https://quantdare.com/what-is-the-difference-between"
            "-extra-trees-and-random-forest/)")
        models_expander.write(
            "Extremely randomized trees is similar to Random Forest, "
            "in that it builds multiple trees and splits nodes using random subsets of features, "
            "but with two key differences: it does not bootstrap observations (meaning it samples "
            "without replacement), and nodes are split on random splits, not best splits"
        )
        models_expander.write("")
        models_expander.write(
            "[**Extreme Gradient Boosting**](https://en.wikipedia.org/wiki/Gradient_boosting)"
        )
        models_expander.write(
            "Boosting is a technique which combines weak learners in series "
            "to build a single strong learner. In the case of the gradient "
            "boosted decision trees algorithm, the weak learners are decision "
            "trees, where each tree attempts to minimize the errors of the "
            "previous tree. Individually the trees are weak learners, but "
            "adding many trees in series, each focusing on the errors of the "
            "previous one, makes boosting a highly efficient and accurate model")
        models_expander.write("")
        models_expander.write(
            "[**Light Gradient Boosting Machine**](https://lightgbm.readthedocs.io/en/latest/)"
        )
        models_expander.write(
            "A gradient boosting framework for machine "
            "learning originally developed by Microsoft. Similar to Extreme Gradient Boosting, "
            "it is based on decision tree algorithms, however unlike Extreme Gradient Boosting, "
            "the algorithm splits the tree leaf wise instead of level wise")
        models_expander.write("")

        # Model Evaluation Metrics
        metrics_expander = st.beta_expander("Model Evaluation Metrics")
        metrics_expander.write("**Accuracy**")
        metrics_expander.write(
            "Accuracy is defined as the percentage of correct predictions for the test data."
            " It can be calculated easily by dividing the number of correct predictions by the "
            "number of total predictions.")
        metrics_expander.write("")
        metrics_expander.write("**AUC**")
        metrics_expander.write(
            "An ROC curve (receiver operating characteristic curve) is a graph showing the "
            "performance of a classification model at all classification thresholds. This curve "
            "plots the True Positive Rate (TP) and False Negative Rate (FP)")
        metrics_expander.write("")
        metrics_expander.write("**Recall**")
        metrics_expander.write(
            "Recall is defined as the fraction of examples which were predicted to belong "
            "to a class with respect to all of the examples that truly belong in the class."
        )
        metrics_expander.write("")
        metrics_expander.write("**Precision**")
        metrics_expander.write(
            "Precision is defined as the fraction of relevant examples (true positives) among "
            "all of the examples which were predicted to belong in a certain class."
        )
        metrics_expander.write("")
        metrics_expander.write("**F1**")
        metrics_expander.write(
            "The traditional F-measure or balanced F-score (F1 score) is the harmonic mean "
            "of precision and recall and is calculated as --> F1 score = 2 * (Precision * Recall) / "
            "(Precision + Recall)")
        metrics_expander.write("")
        metrics_expander.write("**Kappa**")
        metrics_expander.write(
            "The Kappa statistic (or value) is a metric that compares an Observed Accuracy with "
            "an Expected Accuracy (random chance). The kappa statistic is used not only to evaluate "
            "a single classifier, but also to evaluate classifiers amongst themselves. In addition, "
            "it takes into account random chance (agreement with a random classifier), which"
            " generally means it is less misleading than simply using accuracy as a metric "
            "(an Observed Accuracy of 80% is a lot less impressive with an Expected Accuracy of "
            "75% versus an Expected Accuracy of 50%)")
        metrics_expander.write("")
        metrics_expander.write("**MCC**")
        metrics_expander.write(
            "Unlike the other metrics discussed above, MCC takes all the cells of the Confusion"
            " Matrix into consideration in its formula --> MCC = (TP * TN - FP * FN) / "
            "sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Similar to a correlation "
            "coefficient, MCC values lie between -1 and +1. A model with a score of +1 is a "
            "perfect model and -1 is a poor model. This property is one of the key strengths "
            "of MCC, as it leads to easy interpretability.")
        metrics_expander.write("")

        # Additional model data
        opts = st.beta_expander("Additional Model Data", False)
        # Download the training data as an excel file
        if opts.button("Display Link to Download Model Training Data"):
            st.markdown(get_table_download_link(state.view),
                        unsafe_allow_html=True)

        # Prompt to launch MLFlow
        if opts.button("Display Link to Spotify Model Training History"):
            st.info(
                "Note that this application uses MLFlow only when both the application and MLFlow are "
                "deployed locally")

        # Overall importance ------------------------------------------------------------------------------------------
        st.write("")  # Intentional extra blank spaces
        st.write("")
        st.header(f"Success Drives for {selected_genre} Playlists")
        dict_models = {}
        for i, model in enumerate(exp.index):
            dict_models[model] = i

        user_selected_model = st.selectbox(
            "Select model to view feature importance:", exp.index)
        state.importance = st.checkbox("Click to calculate feature importance")
        if state.importance and state.experiment_complete:
            state.new_selected_model = state.list_models[
                dict_models[user_selected_model]]
            st.write("**Model parameters: **")
            st.write(state.new_selected_model)
            st.write("")
            st.write("**Generating Visualizations...**")
            bar = st.progress(0)

            if state.selected_model != state.new_selected_model:
                state.selected_model = state.new_selected_model
                state.explainer = shap.TreeExplainer(state.selected_model)
                state.shap_values = state.explainer.shap_values(
                    state.X_train.to_numpy())
            bar.progress(25)

            # Overall Feature Importance -------------------------------------------------------------------------
            st.subheader("Success Drivers - Average")
            st.pyplot(
                shap.summary_plot(state.shap_values,
                                  state.X_train,
                                  plot_type="bar"))

            # Violin plot and waterfall plot only available at this time for XGBoost model
            if user_selected_model != "xgboost":
                st.warning(
                    "This PoC has only been configured for when **Extreme Gradient Boosting "
                    "(xgboost)** is selected for analysis")
                bar.progress(100)
                st.stop()

            else:
                # Violin Feature Importance --------------------------------------------------------------------------
                st.subheader(
                    f"Success Drivers - All {selected_genre} Playlists")
                st.pyplot(shap.summary_plot(state.shap_values, state.X_train))
                bar.progress(50)

                # Dependence plots for each of the top 3 features ----------------------------------------------------
                st.header(f"Shapley Dependence for {selected_genre} Playlists")
                vals = np.abs(state.shap_values).mean(0)
                feature_importance = pd.DataFrame(
                    list(zip(state.X_train.columns, vals)),
                    columns=["col_name", "feature_importance_vals"],
                )
                feature_importance = (feature_importance.sort_values(
                    by=["feature_importance_vals"],
                    ascending=False).reset_index(drop=True).head(3))

                top_features = list(feature_importance["col_name"])
                for feature in top_features:
                    index = list(state.X_train.columns).index(feature)
                    st.subheader(f"Shapley Value Dependence for {feature}")
                    st.pyplot(
                        shap.dependence_plot(
                            index,
                            state.shap_values,
                            state.X_train,
                            alpha=0.5,
                            interaction_index=None,
                        ))
                bar.progress(70)

                # Individual importance -------------------------------------------------------------------------------
                st.header(
                    f"Explaining {selected_genre} Playlist Success Prediction")

                # Display the data frame for users to visually see the row they want to analyze
                st.subheader("Model Training Data")
                st.dataframe(state.view)
                state.new_row = int(
                    st.number_input(
                        "Row from dataframe to inspect",
                        min_value=0,
                        max_value=len(state.view) - 1,  # rows are 0-indexed
                        value=10,
                    ))
                if state.row != state.new_row:
                    state.row = state.new_row
                    shap_object = ShapObject(
                        base_values=state.explainer.expected_value,
                        values=state.explainer.shap_values(
                            state.X_train)[state.row, :],
                        feature_names=state.X_train.columns,
                        data=state.X_train.iloc[state.row, :],
                    )
                    bar.progress(85)
                    st.subheader(
                        f"Feature Contributions to {selected_genre} Playlist #{state.row}"
                    )
                    st.pyplot(shap.waterfall_plot(shap_object))
                    bar.progress(100)
                    st.stop()
                else:
                    st.stop()
        else:
            st.stop()
    else:
        st.stop()
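get_state() here follows the pre-session_state "SessionState" pattern for preserving widget values across reruns. A hypothetical stand-in using the modern st.session_state API:

import streamlit as st


class _State:
    # Attribute-style view over st.session_state; unset keys read as None
    def __getattr__(self, name):
        return st.session_state.get(name)

    def __setattr__(self, name, value):
        st.session_state[name] = value


def get_state():
    return _State()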
Example #6
def show_plot(df, datefield, title, wdw, what_to_show_, graph_type, centersmooth):
    what_to_show_ = what_to_show_ if isinstance(what_to_show_, list) else [what_to_show_]
    color_list = [
        "#02A6A8",
        "#4E9148",
        "#F05225",
        "#024754",
        "#FBAA27",
        "#302823",
        "#F07826",
        "#ff6666",
    ]
    if len(df) == 1 and datefield == "YYYY":
        st.warning("Selecteer een grotere tijdsperiode")
        st.stop()

    if graph_type=="pyplot"  :
        with _lock:
            fig1x = plt.figure()
            ax = fig1x.add_subplot(111)
            for i, what_to_show in enumerate(what_to_show_):
                sma = df[what_to_show].rolling(window=wdw, center=centersmooth).mean()
                ax = df[what_to_show].plot(
                    label="_nolegend_",
                    linestyle="dotted",
                    color=color_list[i],
                    linewidth=0.5,
                )
                ax = sma.plot(label=what_to_show, color=color_list[i], linewidth=0.75)

            ax.set_xticks(df[datefield].index)
            if datefield == "YYYY":
                ax.set_xticklabels(df[datefield], fontsize=6, rotation=90)
            else:
                ax.set_xticklabels(df[datefield].dt.date, fontsize=6, rotation=90)
            xticks = ax.xaxis.get_major_ticks()
            for i, tick in enumerate(xticks):
                if i % 10 != 0:
                    tick.label1.set_visible(False)

            plt.xticks()
            plt.grid(which="major", axis="y")
            plt.title(title)
            plt.legend()
            st.pyplot(fig1x)
    else:
        fig = go.Figure()
        df["sma"] = df[what_to_show_[0]].rolling(window=wdw, center=centersmooth).mean()

        sma = go.Scatter(
            name=what_to_show_[0],
            x=df[datefield],
            y= df["sma"],
            mode='lines',
            line=dict(width=1,color='rgba(0, 0, 168, 0.8)'),
            )
        points = go.Scatter(
            name="",
            x=df[datefield],
            y= df[what_to_show_[0]],
            mode='markers',
            showlegend=False,marker=dict(
            color='LightSkyBlue',
            size=2))


        data = [sma,points]

        layout = go.Layout(
            yaxis=dict(title=what_to_show_[0]),
            title=title,)
            #, xaxis=dict(tickformat="%d-%m")
        fig = go.Figure(data=data, layout=layout)
        fig.update_layout(xaxis=dict(tickformat="%d-%m"))
        st.plotly_chart(fig, use_container_width=True)
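The _lock guarding the pyplot branch is the Matplotlib render lock commonly used in Streamlit threading examples. A sketch of the setup it assumes; RendererAgg.lock was removed in newer Matplotlib releases, so a plain lock serves as a fallback:

import threading

import matplotlib
from matplotlib.backends.backend_agg import RendererAgg

matplotlib.use("Agg")  # headless backend for server-side rendering
_lock = getattr(RendererAgg, "lock", threading.Lock())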
Example #7
def cs_body():
    # Magic commands

    col1, col2, col3 = st.columns(3)

    col1.subheader('Magic commands')
    col1.code('''# Magic commands implicitly `st.write()`
\'\'\' _This_ is some __Markdown__ \'\'\'
a=3
'dataframe:', data
    ''')

    # Display text

    col1.subheader('Display text')
    col1.code('''
st.text('Fixed width text')
st.markdown('_Markdown_') # see *
st.caption('Balloons. Hundreds of them...')
st.latex(r\'\'\' e^{i\pi} + 1 = 0 \'\'\')
st.write('Most objects') # df, err, func, keras!
st.write(['st', 'is <', 3]) # see *
st.title('My title')
st.header('My header')
st.subheader('My sub')
st.code('for i in range(8): foo()')

* optional kwarg unsafe_allow_html = True

    ''')

    # Display data

    col1.subheader('Display data')
    col1.code('''
st.dataframe(my_dataframe)
st.table(data.iloc[0:10])
st.json({'foo':'bar','fu':'ba'})
st.metric(label="Temp", value="273 K", delta="1.2 K")
    ''')

    # Display charts

    col1.subheader('Display charts')
    col1.code('''
st.line_chart(data)
st.area_chart(data)
st.bar_chart(data)
st.pyplot(fig)
st.altair_chart(data)
st.vega_lite_chart(data)
st.plotly_chart(data)
st.bokeh_chart(data)
st.pydeck_chart(data)
st.deck_gl_chart(data)
st.graphviz_chart(data)
st.map(data)
    ''')

    # Display media

    col1.subheader('Display media')
    col1.code('''
st.image('./header.png')
st.audio(data)
st.video(data)
    ''')

    # Display interactive widgets

    col2.subheader('Display interactive widgets')
    col2.code('''
st.button('Hit me')
st.download_button('On the dl', data)
st.checkbox('Check me out')
st.radio('Radio', [1,2,3])
st.selectbox('Select', [1,2,3])
st.multiselect('Multiselect', [1,2,3])
st.slider('Slide me', min_value=0, max_value=10)
st.select_slider('Slide to select', options=[1,'2'])
st.text_input('Enter some text')
st.number_input('Enter a number')
st.text_area('Area for textual entry')
st.date_input('Date input')
st.time_input('Time entry')
st.file_uploader('File uploader')
st.color_picker('Pick a color')
    ''')
    col2.write('Use widgets\' returned values in variables:')
    col2.code('''
>>> for i in range(int(st.number_input('Num:'))): foo()
>>> if st.sidebar.selectbox('I:',['f']) == 'f': b()
>>> my_slider_val = st.slider('Quinn Mallory', 1, 88)
>>> st.write(my_slider_val)
    ''')

    # Control flow

    col2.subheader('Control flow')
    col2.code('''
st.stop()
    ''')

    # Lay out your app

    col2.subheader('Lay out your app')
    col2.code('''
st.form('my_form_identifier')
st.form_submit_button('Submit to me')
st.container()
st.columns(spec)
>>> col1, col2 = st.columns(2)
>>> col1.subheader('Columnisation')
st.expander('Expander')
>>> with st.expander('Expand'):
>>>     st.write('Juicy deets')
    ''')

    col2.write('Batch widgets together in a form:')
    col2.code('''
>>> with st.form(key='my_form'):
>>> 	text_input = st.text_input(label='Enter some text')
>>> 	submit_button = st.form_submit_button(label='Submit')
    ''')

    # Display code

    col2.subheader('Display code')
    col2.code('''
st.echo()
>>> with st.echo():
>>>     st.write('Code will be executed and printed')
    ''')

    # Display progress and status

    col3.subheader('Display progress and status')
    col3.code('''
st.progress(progress_variable_1_to_100)
st.spinner()
>>> with st.spinner(text='In progress'):
>>>     time.sleep(5)
>>>     st.success('Done')
st.balloons()
st.error('Error message')
st.warning('Warning message')
st.info('Info message')
st.success('Success message')
st.exception(e)
    ''')

    # Placeholders, help, and options

    col3.subheader('Placeholders, help, and options')
    col3.code('''
st.empty()
>>> my_placeholder = st.empty()
>>> my_placeholder.text('Replaced!')
st.help(pandas.DataFrame)
st.get_option(key)
st.set_option(key, value)
st.set_page_config(layout='wide')
    ''')

    # Mutate data

    col3.subheader('Mutate data')
    col3.code('''
DeltaGenerator.add_rows(data)
>>> my_table = st.table(df1)
>>> my_table.add_rows(df2)
>>> my_chart = st.line_chart(df1)
>>> my_chart.add_rows(df2)
    ''')

    # Optimize performance

    col3.subheader('Optimize performance')
    col3.code('''
@st.cache
>>> @st.cache
... def fetch_and_clean_data(url):
...     # Mutate data at url
...     return data
>>> # Executes fetch_and_clean_data the first time
>>> d1 = fetch_and_clean_data(ref1)
>>> # Does not execute the function; returns the cached value, so d1 == d2
>>> d2 = fetch_and_clean_data(ref1)
>>> # Different arg, so fetch_and_clean_data executes again
>>> d3 = fetch_and_clean_data(ref2)

    ''')

    col3.subheader('Other key parts of the API')
    col3.markdown('''
<small>[State API](https://docs.streamlit.io/en/stable/session_state_api.html)</small><br>
<small>[Theme option reference](https://docs.streamlit.io/en/stable/theme_options.html)</small><br>
<small>[Components API reference](https://docs.streamlit.io/en/stable/develop_streamlit_components.html)</small><br>
<small>[API cheat sheet](https://share.streamlit.io/daniellewisdl/streamlit-cheat-sheet/app.py)</small><br>
''',
                  unsafe_allow_html=True)

    return None
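Example #7 is the post-1.0 revision of Example #2; the beta_-prefixed calls graduated to stable names:

# Streamlit 1.0 renames reflected in this revision of the cheat sheet
#   st.beta_columns         -> st.columns
#   st.beta_container       -> st.container
#   st.beta_expander        -> st.expander
#   st.beta_color_picker    -> st.color_picker
#   st.beta_set_page_config -> st.set_page_config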
Example #8
def page_work(state_container, page_flip: bool):
    '''The main workhorse routine for the Xplore page'''

    if not state_container.xploreSessionState:
        state_container.xploreSessionState = XploreSessionState()
        state = state_container.xploreSessionState
        state.columns = ['default']
    else:
        state = state_container.xploreSessionState

    url_params = st.experimental_get_query_params()
    page = url_params.pop('page', '')
    if get_title() in page:
        if any(url_params.values()):
            for key in url_params:
                if key == 'columns':
                    # This needs to be a list
                    continue
                val = url_params.get(key, '')
                if isinstance(val, list):
                    val = val[0]
                    url_params[key] = val
                if key == '':
                    if val == 'True':
                        url_params[key] = True
                    else:
                        url_params[key] = False
            state.__init__(**url_params)

    sqobjs = state_container.sqobjs
    # All the user input is preserved in the state vars
    xplore_sidebar(state, sqobjs)

    if state.table != "tables":
        df = gui_get_df(sqobjs[state.table],
                        _table=state.table,
                        namespace=state.namespace.split(),
                        hostname=state.hostname.split(),
                        start_time=state.start_time,
                        end_time=state.end_time,
                        view=state.view,
                        columns=state.columns)
        if state.table == "device" and 'uptime' in df.columns:
            df.drop(columns=['uptime'], inplace=True)
    else:
        df = gui_get_df(sqobjs[state.table],
                        _table=state.table,
                        namespace=state.namespace.split(),
                        hostname=state.hostname.split(),
                        start_time=state.start_time,
                        end_time=state.end_time,
                        view=state.view)

    query_str = ''
    if not df.empty:
        if 'error' in df.columns:
            st.error(df.iloc[0].error)
            st.experimental_set_query_params(**asdict(state))
            st.stop()
        if state.query:
            try:
                show_df = df.query(state.query)
                query_str = state.query
            except Exception:
                st.warning('Query string throws an exception, ignoring')
                show_df = df
                query_str = ''
        else:
            show_df = df
    else:
        show_df = df

    if state.table != "tables":
        summ_df = xplore_run_summarize(sqobjs[state.table],
                                       namespace=state.namespace.split(),
                                       hostname=state.hostname.split(),
                                       start_time=state.start_time,
                                       end_time=state.end_time,
                                       query_str=query_str)
    else:
        summ_df = pd.DataFrame()

    if not show_df.empty:
        dfcols = show_df.columns.tolist()
        if (state.table == 'routes' and 'prefix' in dfcols
                and 'prefixlen' not in dfcols):
            dfcols.append('prefixlen')

        dfcols = sorted((filter(lambda x: x not in ['index', 'sqvers'],
                                dfcols)))

        grid1 = st.beta_container()
        headercol, uniq_col = st.beta_columns(2)
        with grid1:
            with headercol:
                st.write(
                    f'<h2 style="color: darkblue; font-weight: bold;">{state.table} View</h2>',
                    unsafe_allow_html=True)
                if show_df.shape[0] > 256:
                    st.write(
                        f'Showing first 256 of {show_df.shape[0]} rows, use query to filter'
                    )
            with uniq_col:
                if state.table != "tables":
                    if (not state.uniq_clicked
                            or state.uniq_clicked not in dfcols):
                        if 'hostname' in dfcols:
                            selindex = dfcols.index('hostname') + 1
                        else:
                            selindex = 1
                    elif state.uniq_clicked in dfcols:
                        selindex = dfcols.index(state.uniq_clicked) + 1

                    state.uniq_clicked = st.selectbox('Distribution Count of',
                                                      options=['-'] + dfcols,
                                                      index=selindex,
                                                      key='distcount')

        scol1, scol2 = st.beta_columns(2)

        if state.table != "tables" and state.uniq_clicked != '-':
            uniq_df = xplore_run_unique(show_df, columns=state.uniq_clicked)
        else:
            uniq_df = pd.DataFrame()

        if state.assert_clicked:
            assert_df = xplore_run_assert(sqobjs[state.table],
                                          start_time=state.start_time,
                                          end_time=state.end_time,
                                          namespace=state.namespace.split())
        else:
            assert_df = pd.DataFrame()

        if not summ_df.empty:
            with scol1:
                st.subheader('Summary Information')
                st.dataframe(data=summ_df)

        if not uniq_df.empty:
            with scol2:
                if uniq_df.shape[0] > 32:
                    st.warning(
                        f'{state.uniq_clicked} has cardinality > 32. Displaying top 32'
                    )
                    chart = alt.Chart(
                        uniq_df.head(32),
                        title=f'{state.uniq_clicked} Distribution') \
                        .mark_bar(color='purple', tooltip=True) \
                        .encode(y=alt.Y(f'{state.uniq_clicked}:N',
                                        sort='-x'),
                                x='count')
                else:

                    chart = alt.Chart(
                        uniq_df, title=f'{state.uniq_clicked} Distribution') \
                        .mark_bar(color='purple', tooltip=True) \
                        .encode(y=alt.Y(f'{state.uniq_clicked}:N',
                                        sort='-x'),
                                x='count')
                st.altair_chart(chart)

        if state.table in ['interfaces', 'ospf', 'bgp', 'evpnVni']:
            expand_assert = not assert_df.empty
            validate_expander = st.beta_expander('Assert',
                                                 expanded=expand_assert)
            with validate_expander:
                if not assert_df.empty:
                    st.dataframe(data=assert_df)
                elif state.assert_clicked:
                    st.write('Assert passed')
                else:
                    st.write('Assert not run')

    expander = st.beta_expander('Table', expanded=True)
    with expander:
        if not show_df.empty:
            convert_dict = {
                x: 'str'
                for x in df.select_dtypes('category').columns
            }
            st.dataframe(data=sq_gui_style(
                show_df.head(256).astype(convert_dict), state.table),
                         height=600,
                         width=2500)
        else:
            st.warning('No Data from query')

        st.experimental_set_query_params(**asdict(state))
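The asdict(state) calls imply XploreSessionState is a dataclass whose fields serialize cleanly into query parameters. A hypothetical shape, inferred from the fields this page reads and writes:

from dataclasses import dataclass, field


@dataclass
class XploreSessionState:
    # Field names and defaults are assumptions based on usage above
    table: str = ''
    namespace: str = ''
    hostname: str = ''
    start_time: str = ''
    end_time: str = ''
    view: str = 'latest'
    columns: list = field(default_factory=lambda: ['default'])
    query: str = ''
    uniq_clicked: str = '-'
    assert_clicked: bool = False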
Example #9
def xplore_sidebar(state, sqobjs: dict):
    '''Draw appropriate sidebar for the page'''

    stime = state.start_time
    etime = state.end_time

    table_vals = sorted(list(sqobjs.keys()))

    if state.table:
        if isinstance(state.table, list):
            tblidx = table_vals.index(state.table[0])
        else:
            tblidx = table_vals.index(state.table)
    else:
        tblidx = table_vals.index('device')  # Default starting table
    assert_val = state.assert_clicked
    view_idx = 1 if state.view == 'all' else 0

    devdf = gui_get_df(sqobjs['device'], columns=['namespace', 'hostname'])
    if devdf.empty:
        st.error('Unable to retrieve any namespace info')
        st.stop()

    namespaces = [""]
    namespaces.extend(sorted(devdf.namespace.unique().tolist()))
    if state.namespace:
        nsidx = namespaces.index(state.namespace)
    else:
        nsidx = 0
    namespace = st.sidebar.selectbox('Namespace', namespaces, index=nsidx)

    if namespace != state.namespace:
        state.hostname = None
        state.namespace = namespace

    hostnames = [""]
    if state.namespace:
        hostlist = devdf.query(f'namespace=="{state.namespace}"') \
                        .hostname.unique().tolist()
    else:
        hostlist = devdf.hostname.unique().tolist()
    hostnames.extend(sorted(hostlist))
    if state.hostname:
        hostidx = hostnames.index(state.hostname)
    else:
        hostidx = 0
    state.hostname = st.sidebar.selectbox('Hostname', hostnames, index=hostidx)

    state.start_time = st.sidebar.text_input('Start time',
                                             value=stime,
                                             key='stime')
    state.end_time = st.sidebar.text_input('End time',
                                           value=etime,
                                           key='etime')
    table = st.sidebar.selectbox('Select Table to View',
                                 tuple(table_vals),
                                 index=tblidx)

    if table != state.table:
        # We need to reset the specific variables
        state.query = ''
        state.assert_clicked = False
        state.uniq_clicked = 0
        state.table = table
        state.columns = 'default'

    view_vals = ('latest', 'all')
    if state.start_time and state.end_time:
        # We show everything that's happened when both times are specified
        view_idx = 1
    state.view = st.sidebar.radio("View of Data", view_vals, index=view_idx)
    fields = TablesObj().describe(table=state.table)
    if state.table != 'tables':
        colist = sorted((filter(lambda x: x not in ['index', 'sqvers'],
                                fields.name.tolist())))
        columns = st.sidebar.multiselect('Pick columns',
                                         ['default', 'all'] + colist,
                                         default=state.columns)
        col_sel_val = (('default' in columns or 'all' in columns)
                       and len(columns) == 1)

        col_ok = st.sidebar.checkbox('Column Selection Done',
                                     value=col_sel_val)
        if not col_ok:
            columns = ['default']
    else:
        col_ok = True
        columns = ['default']

    if not columns:
        columns = ['default']

    state.columns = columns
    if state.table in ['interfaces', 'ospf', 'bgp', 'evpnVni']:
        state.assert_clicked = st.sidebar.checkbox('Run Assert',
                                                   value=assert_val)
    else:
        state.assert_clicked = False

    if not col_ok:
        st.experimental_set_query_params(**asdict(state))
        st.stop()
    if ('default' in columns or 'all' in columns) and len(columns) != 1:
        st.error('Cannot select default/all with any other columns')
        st.experimental_set_query_params(**asdict(state))
        st.stop()
    elif not columns:
        st.error('Columns cannot be empty')
        st.experimental_set_query_params(**asdict(state))
        st.stop()

    state.query = st.sidebar.text_input('Filter results with pandas query',
                                        value=state.query,
                                        key=state.table)
    st.sidebar.markdown(
        "[query syntax help](https://suzieq.readthedocs.io/en/latest/pandas-query-examples/)"
    )

    if columns == ['all']:
        columns = ['*']
    if state.table != "tables":
        col_expander = st.sidebar.beta_expander('Column Names', expanded=False)
        with col_expander:
            st.subheader(f'{state.table} column names')
            st.table(TablesObj().describe(
                table=state.table).query('name != "sqvers"').reset_index(
                    drop=True).style)
Example #10
def checkpoint():
    prep_data = st.button('Prepare data')
    if not prep_data:
        st.warning('Please prepare the data before proceeding.')
        st.stop()
    st.success('Thank you for preparing the data')
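st.button returns True only on the script run in which it is clicked, so this gate halts every rerun until the user acts. A minimal usage sketch:

import streamlit as st

checkpoint()  # script stops here until 'Prepare data' is clicked
st.write("Downstream steps that should only run after data preparation")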
Example #11
def graph_day(df, what_to_show_l, title):
    """  _ _ _ """
    #st.write(f"t = {t}")
    df_temp = pd.DataFrame(columns=["date"])
    if what_to_show_l is None:
        st.warning("Choose something")
        st.stop()

    if isinstance(what_to_show_l, list):
        what_to_show_l_ = what_to_show_l
    else:
        what_to_show_l_ = [what_to_show_l]
    aantal = len(what_to_show_l_)
    # SHOW A GRAPH IN TIME / DAY

    with _lock:
        fig1x = plt.figure()
        ax = fig1x.add_subplot(111)

        color_list = [
            "#02A6A8",
            "#4E9148",
            "#F05225",
        ]

        # walk through the colors list alongside the columns to plot
        for n, b in enumerate(what_to_show_l_):
            df_temp = df

            df_temp[b].plot(
                label="_nolegend_",
                color=color_list[n],
                linestyle="--",
                alpha=0.9,
                linewidth=0.8,
            )

        plt.title(title, fontsize=10)

        # show every 10th date on x axis
        a__ = (max(df_temp["date"].tolist())).date() - (min(
            df_temp["date"].tolist())).date()
        freq = max(1, int(a__.days / 10))  # guard against a zero tick interval
        ax.xaxis.set_major_locator(MultipleLocator(freq))
        ax.set_xticks(df_temp["date"].index)
        ax.set_xticklabels(df_temp["date"].dt.date, fontsize=6, rotation=90)
        xticks = ax.xaxis.get_major_ticks()

        # for i, tick in enumerate(xticks):
        #     if i % 10 != 0:
        #         tick.label1.set_visible(False)
        plt.xticks()

        # layout of the x-axis
        ax.xaxis.grid(True, which="major", alpha=0.4, linestyle="--")
        ax.yaxis.grid(True, which="major", alpha=0.4, linestyle="--")

        left, right = ax.get_xlim()
        ax.set_xlim(left, right)
        fontP = FontProperties()
        fontP.set_size("xx-small")

        plt.xlabel("date")
        # everything in legend
        # https://stackoverflow.com/questions/33611803/pyplot-single-legend-when-plotting-on-secondary-y-axis
        handles, labels = [], []
        for ax in fig1x.axes:
            for h, l in zip(*ax.get_legend_handles_labels()):
                handles.append(h)
                labels.append(l)
        # plt.legend(handles,labels)
        # https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot/43439132#43439132
        plt.legend(handles,
                   labels,
                   bbox_to_anchor=(0, -0.5),
                   loc="lower left",
                   ncol=2)
        ax.text(
            1,
            1.1,
            "Created by Rene Smit — @rcsmit",
            transform=ax.transAxes,
            fontsize="xx-small",
            va="top",
            ha="right",
        )
        st.pyplot(fig1x)
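This snippet appears to assume the following imports, plus the _lock render guard sketched after Example #6:

import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import MultipleLocator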
Example #12
def error(msg):
    # ERROR is presumably a module-level placeholder (e.g. st.empty());
    # `global` is only needed for assignment, not for calling a method
    ERROR.markdown(f"<p style='font-size:18px'><span style='color:{DFT.RED}'><b>{msg}</b></span></p>", unsafe_allow_html=True)
    st.stop()
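A usage sketch, assuming ERROR is a placeholder reserved near the top of the page and DFT holds color constants; both names are hypothetical stand-ins for whatever the source app defines:

import streamlit as st


class DFT:
    RED = "#FF0000"  # assumed color constant


ERROR = st.empty()  # message always renders in this reserved slot

name = st.text_input("Name")
if not name:
    error("Name is required")  # renders the message, then st.stop()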
Example #13
def app():
    """
    Covid world webgui
    """
    options = [
        "New cases",
        "New deaths",
        "Total cases",
        "Total deaths",
        "Hosp patients per mill",
        "Positivity rate",
    ]
    plot_selected = st.sidebar.selectbox("Select a plot", options, index=0)
    date_selected = st.sidebar.date_input("Change the dates?",
                                          value=(dt.datetime(2020, 3, 1),
                                                 dt.datetime.now()))
    if len(date_selected) != 2:
        st.info("Select a beginning and end date")
        st.stop()
    ##### Retrieve #####

    columns = [
        "location",
        "continent",
        "date",
        "hosp_patients_per_million",
        "new_cases_smoothed_per_million",
        "new_deaths_smoothed_per_million",
        "total_cases_per_million",
        "total_deaths_per_million",
        "rolling_pos_per_tests",
    ]

    my_df = pd.DataFrame(h.sql_orm_requester(columns, table, session))
    my_df.columns = columns

    my_df["date"] = pd.to_datetime(my_df["date"])

    # st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
    region_options = ["Default", "Continents", "World"]
    regions = [
        "North America", "West Europe", "East Europe", "Nordics", "Asia"
    ]
    regions.sort()
    region_options = region_options + regions

    region = st.sidebar.radio("Preset locations",
                              options=region_options,
                              index=0)

    if region == "North America":
        default = ["Canada", "United States", "Mexico"]

    elif region == "Nordics":
        default = ["Sweden", "Finland", "Norway"]

    elif region == "West Europe":
        default = [
            "Austria",
            "Belgium",
            "France",
            "Germany",
            "Spain",
            "United Kingdom",
            "Portugal",
            "Netherlands",
            "Switzerland",
        ]

    elif region == "East Europe":
        default = [
            "Hungary",
            "Slovakia",
            "Austria",
            "Slovenia",
            "Croatia",
            "Serbia",
            "Romania",
            "Ukraine",
        ]

    elif region == "Continents":
        default = [
            "Europe",
            "North America",
            "South America",
            "Africa",
            "Asia",
            "Australia",
        ]

    elif region == "Asia":
        default = ["Japan", "South Korea", "Thailand", "India"]
        st.sidebar.info(
            "__Note__: China not included by default due to low reported numbers"
        )
        china_wanted = st.sidebar.checkbox("Add China")
        if china_wanted:
            default.append("China")
    elif region == "World":
        default = list(my_df['location'].unique())
    else:
        default = ["Canada", "Hungary", "United States"]
    default.sort()

    premade_df = h.dataset_filterer(my_df,
                                    "location",
                                    default_selected=default)

    plot_selected = plot_selected.lower()
    if "positivity rate" in plot_selected:
        # can't take the log of this data, so the log checkbox isn't needed
        st.info(
            "W.H.O. guidelines recommend a positivity rate of at most __0.05__ for two weeks before nations reopen."
        )

        ylabel = "rolling_pos_per_tests"
        title = "Positivity rate by location"
        graph_caller(ylabel,
                     date_selected,
                     premade_df,
                     title,
                     ylog=False,
                     yrange=(0, 0.5))
        st.write(
            "The positivity rate is calculated as 'number of positive tests' / 'positive + negative tests'"
        )

    else:
        placeholder = st.empty()
        placeholder.info(
            "__Instructions:__ Move mouse into plot to interact. Drag and select to zoom. Double click to reset. Click the camera to save."
        )
        col_annot, col_ylog = st.columns(2)
        with col_annot:
            show_annot = st.checkbox("Show annotations", value=True)
        with col_ylog:
            ylog = st.checkbox("log(y axis)")

        if ylog:
            placeholder.info(
                "The log of these values indicates the speed of transmission, making the flattening of curves more apparent."
            )

        if "new deaths" in plot_selected:
            ylabel = "new_deaths_smoothed_per_million"
            title = "New deaths per million by location"
        elif "new cases" in plot_selected:
            ylabel = "new_cases_smoothed_per_million"
            title = "New cases per million by location"
        elif "total cases" in plot_selected:
            ylabel = "total_cases_per_million"
            title = "Total cases per million by location"
        elif "total deaths" in plot_selected:
            ylabel = "total_deaths_per_million"
            title = "Total deaths per million by location"
        elif "hosp patients per mill" in plot_selected:
            ylabel = "hosp_patients_per_million"
            title = "Hospital patients per million by location"
            placeholder.warning(
                "The graph may be blank as not all countries publish hospital data"
            )
        graph_caller(ylabel,
                     date_selected,
                     premade_df,
                     title,
                     ylog=ylog,
                     show_annot=show_annot)
    # leaderboard
    yesterday = dt.datetime.now() - dt.timedelta(days=2)  # two days back, presumably to skip the not-yet-complete most recent day
    fig = h.overview_plotter(yesterday.date(),
                             premade_df,
                             x='location',
                             y=ylabel,
                             sortby='continent',
                             title=f"Top {title.lower()} in the past week")
    st.plotly_chart(fig)
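
h.dataset_filterer and graph_caller are external helpers that this snippet does not show; for the former, a plausible minimal equivalent, offered only as a sketch under that assumption:

import streamlit as st

def dataset_filterer(df, column, default_selected):
    # Hypothetical re-implementation: let the user adjust the location list
    # and return only the matching rows.
    chosen = st.multiselect("Locations",
                            sorted(df[column].unique()),
                            default=default_selected)
    return df[df[column].isin(chosen)]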
Example No. 14
def ttest_upload_data_ui(): 
    '''The Two-sample Student's t-test - Continuous variables (upload data) section. '''
    
    # Render the header. 
    with st.beta_container(): 
        st.title('Two-sample Student\'s t-test')
        st.header('Continuous variables')
    
    # Render file dropbox
    with st.beta_expander('Upload data', expanded=True): 
        how_to_load = st.selectbox('How to access raw data? ', ('Upload', 'URL', 'Sample data'))
        if how_to_load == 'Upload': 
            uploaded_file = st.file_uploader("Choose a CSV file", type='.csv')
        elif how_to_load == 'URL': 
            uploaded_file = st.text_input('File URL: ')
            if uploaded_file == '': 
                uploaded_file = None
        elif how_to_load == 'Sample data': 
            uploaded_file = 'https://raw.githubusercontent.com/luxin-tian/mosco_ab_test/main/sample_data/cookie_cats.csv'
        if uploaded_file is not None: 
            with st.spinner('Loading data...'): 
                df = _load_data(uploaded_file)
    
    if uploaded_file is not None: 
        with st.beta_expander('Data preview', expanded=True): 
            with st.spinner('Loading data...'): 
                st.dataframe(df)
                st.write('`{}` rows, `{}` columns'.format(df.shape[0],df.shape[1]))
    
    if uploaded_file is not None: 
        with st.beta_expander('Configurations', expanded=True): 
            # pandas removed .iteritems() in 2.0; .items() is the portable spelling
            df_columns_types = [ind + ' (' + val.name + ')' for ind, val in df.dtypes.items()]
            df_columns_dict = {(ind + ' (' + val.name + ')'): ind for ind, val in df.dtypes.items()}
            var_group_label = df_columns_dict[st.selectbox('Group label', df_columns_types)]
            col1, col2 = st.beta_columns(2) 
            with col1:
                var_group_name_1 = st.selectbox('Group name A', df[var_group_label].unique())
            with col2:
                var_group_name_2 = st.selectbox('Group name B', df[var_group_label].unique())
            var_outcome = [df_columns_dict[var] for var in st.multiselect('Outcome variable: ', df_columns_types)]
            col1, col2 = st.beta_columns([1, 1])
            with col1: 
                conf_level = st.select_slider('Confidence level: ', ('0.90', '0.95', '0.99'))
            with col2: 
                hypo_type = st.radio('Hypothesis type: ', ('One-sided', 'Two-sided'))
            if_dropna = st.checkbox('Drop null values', value=True)
            if_remove_outliers = st.checkbox('Remove outliers', value=False)
            if if_remove_outliers: 
                outlier_lower_qtl, outlier_upper_qtl = st.slider('Quantiles (observations falling into the tails will be removed): ', min_value=0.0, max_value=1.0, step=0.01, value=(0.0, 0.95))
                # col1, col2 = st.beta_columns(2) 
                # with col1: 
                #     outlier_lower_qtl = st.slider('Lower quantile: ', min_value=0.0, max_value=0.25, step=0.01, value=0.0)
                # with col2: 
                #     outlier_upper_qtl = st.slider('Upper quantile: ', min_value=0.75, max_value=1.00, step=0.01, value=0.99)
            else: 
                outlier_lower_qtl, outlier_upper_qtl = None, None
            if_data_description = st.checkbox('Show descriptive statistics', value=False)
            if_apply = st.button('Confirm')
    
    if uploaded_file is not None: 
        if if_apply: 
            if var_group_name_1 == var_group_name_2: 
                st.error('The names of Group A and Group B cannot be identical. ')
                st.stop()
            for col in var_outcome: 
                df = _process_data(df=df, col=col, if_dropna=if_dropna, if_remove_outliers=if_remove_outliers, outlier_lower_qtl=outlier_lower_qtl, outlier_upper_qtl=outlier_upper_qtl)
            # Render hypothesis testing
            with st.beta_expander('Hypothesis testing', expanded=True): 
                with st.spinner('Calculating...'): 
                    df_group_1 = df[df[var_group_label] == var_group_name_1]
                    df_group_2 = df[df[var_group_label] == var_group_name_2]
                    for var in var_outcome: 
                        st.markdown(f'`{var}`: {df[var].dtype}')
                        mu_1 = np.mean(df_group_1[var])
                        mu_2 = np.mean(df_group_2[var])
                        sigma_1 = np.std(df_group_1[var], ddof=1)
                        sigma_2 = np.std(df_group_2[var], ddof=1)
                        n_1 = len(df_group_1[var])
                        n_2 = len(df_group_2[var])

                        tstat, p_value, tstat_denom, pooled_sd, effect_size = scipy_ttest_ind_from_stats(
                            mu_1, mu_2, sigma_1, sigma_2, n_1, n_2)
                        observed_power = sm_tt_ind_solve_power(effect_size=effect_size, n1=n_1, n2=n_2, alpha=1-float(conf_level), power=None, hypo_type=hypo_type, if_plot=False)

                        # Render the results
                        ttest_plot(mu_1, mu_2, sigma_1, sigma_2, conf_level, tstat, p_value, tstat_denom, hypo_type, observed_power)

            # Render descriptive statistics
            if if_data_description: 
                with st.beta_expander('Data descriptions', expanded=True): 
                    with st.spinner('Processing data...'): 
                        # if if_factorize:  
                        #     df[var_hot_encoding] = df[var_hot_encoding].astype('category')
                        df = df[(df[var_group_label] == var_group_name_1) | (df[var_group_label] == var_group_name_2)]
                        df_summary = df.groupby(by=var_group_label).describe(include='all')

                        # Plot distribution
                        for var in var_outcome: 
                            st.markdown(f'`{var}`: {df[var].dtype}')
                            st.table(df_summary[var].T.dropna())
                            fig_1 = sns.displot(data=df, x=var, col=var_group_label, kde=True)
                            fig_2 = sns.displot(data=df, kind="ecdf", x=var, hue=var_group_label, rug=True)
                            fig_3, ax = plt.subplots()
                            ax = sns.boxplot(data=df, y=var, hue=var_group_label)
                            st.pyplot(fig_1)
                            col1, col2 = st.beta_columns([1, 1.1])
                            with col1: 
                                st.pyplot(fig_2)
                            with col2: 
                                st.pyplot(fig_3)
    return 
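
_load_data is not shown in this example; since both uploaded files and URLs reach it, a minimal sketch (an assumption, not the original helper) would be:

import pandas as pd

def _load_data(path_or_buffer):
    # pandas.read_csv accepts uploaded file objects and URLs alike
    return pd.read_csv(path_or_buffer)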
Example No. 15
def plot_percentiles(df, gekozen_weerstation, what_to_show, wdw, centersmooth):
    if len(what_to_show) != 1:
        st.warning("Choose exactly one variable to show")
        st.stop()

    df_quantile = pd.DataFrame(
        {"date": [],  "q10": [], "q25": [], "q50":[] ,"avg": [], "q75": [], "q90": []}    )
    year_to_show = st.sidebar.number_input("Year to show (2100 for nothing)", 1900, 2100, 2021)

    (month_from,month_until) = st.sidebar.slider("Months (from/until (incl.))", 1, 12, (1,12))
    if month_from > month_until:
        st.warning("Make sure that the end month is not before the start month")
        st.stop()
    df = df[
        (df["YYYYMMDD"].dt.month >= month_from) & (df["YYYYMMDD"].dt.month <= month_until)
    ]

    for month in list(range(1,13)):
        for day in list(range(1,32)):
            if month==2 and day==29:
                pass
            else:
                df_ = df[
                        (df["YYYYMMDD"].dt.month == month) & (df["YYYYMMDD"].dt.day == day)
                    ]

                df__ = df[
                        (df["YYYYMMDD"].dt.year == year_to_show) & (df["YYYYMMDD"].dt.month == month) & (df["YYYYMMDD"].dt.day == day)
                    ]

                if len(df__)>0:
                    value_in_year_ = df__[what_to_show].iloc[0]
                    value_in_year = value_in_year_[0]
                else:
                    value_in_year = None
                if len(df_)>0:
                    data = df_[what_to_show] #.tolist()
                    #st.write(data)

                    date_ = "1900-" +  str(month).zfill(2) + '-' + str(day).zfill(2)

                    q10 = np.percentile(data, 10)
                    q25 = np.percentile(data, 25)
                    q50 = np.percentile(data, 50)
                    q75 = np.percentile(data, 75)
                    q90 = np.percentile(data, 90)
                    avg = data.mean()


                    # DataFrame.append was removed in pandas 2.0; build the
                    # row as a dict and concat instead
                    row = {
                        "date_": date_,
                        "q10": q10,
                        "q25": q25,
                        "q50": q50,
                        "avg": avg,
                        "q75": q75,
                        "q90": q90,
                        "value_in_year": value_in_year,
                    }
                    df_quantile = pd.concat(
                        [df_quantile, pd.DataFrame([row])], ignore_index=True)

    df_quantile['date'] = pd.to_datetime(df_quantile.date_, format='%Y-%m-%d',  errors='coerce')

    columns = ["q10", "q25", "avg", "q50", "q75", "q90", "value_in_year"]
    for c in columns:
        df_quantile[c] = df_quantile[c].rolling(window=wdw, center=centersmooth).mean()
        df_quantile[c] = round(df_quantile[c],1)
    colors = ["red", "blue", ["yellow"]]
    title = (f" {what_to_show[0]} in {gekozen_weerstation} (percentiles (10/25/avg/75/90/))")
    graph_type = "plotly"
    if graph_type == "pyplot":

        with _lock:
            fig1x = plt.figure()
            ax = fig1x.add_subplot(111)
            idx = 0
            df_quantile.plot(x='date',y='avg', ax=ax, linewidth=0.75,
                            color=colors[idx],
                            label="avg")
            # df_quantile.plot(x='date',y='q50', ax=ax, linewidth=0.75,
            #                 color="yellow",
            #                 label="mediaan",  alpha=0.75)
            df_quantile.plot(x='date',y='value_in_year', ax=ax,
                            color="black",  linewidth=0.75,
                            label=f"value in {year_to_show}")
            ax.fill_between(df_quantile['date'],
                            y1=df_quantile['q25'],
                            y2=df_quantile['q75'],
                            alpha=0.30, facecolor=colors[idx])
            ax.fill_between(df_quantile['date'],
                            y1=df_quantile['q10'],
                            y2=df_quantile['q90'],
                            alpha=0.15, facecolor=colors[idx])


            ax.set_xticks(df_quantile["date"].index)
            # if datefield == "YYYY":
            #     ax.set_xticklabels(df[datefield], fontsize=6, rotation=90)
            # else:
            ax.set_xticklabels(df_quantile["date"], fontsize=6, rotation=90)
            xticks = ax.xaxis.get_major_ticks()
            for i, tick in enumerate(xticks):
                if i % 10 != 0:
                    tick.label1.set_visible(False)

            # plt.xticks()
            plt.grid(which="major", axis="y")
            plt.title(title)
            plt.legend()
            st.pyplot(fig1x)
    else:
        fig = go.Figure()
        q10 = go.Scatter(
            name='q10',
            x=df_quantile["date"],
            y=df_quantile['q10'] ,
            mode='lines',
            line=dict(width=0.5,
                    color="rgba(255, 188, 0, 0.5)"),
            fillcolor='rgba(68, 68, 68, 0.1)', fill='tonexty')

        q25 = go.Scatter(
            name='q25',
            x=df_quantile["date"],
            y=df_quantile['q25'] ,
            mode='lines',
            line=dict(width=0.5,
                    color="rgba(255, 188, 0, 0.5)"),
            fillcolor='rgba(68, 68, 68, 0.2)',
            fill='tonexty')

        avg = go.Scatter(
            name=what_to_show[0],
            x=df_quantile["date"],
            y=df_quantile["avg"],
            mode='lines',
            line=dict(width=0.75,color='rgba(68, 68, 68, 0.8)'),
            )

        value_in_year__ = go.Scatter(
            name=f"value in {year_to_show}",
            x=df_quantile["date"],
            y=df_quantile["value_in_year"],
            mode='lines',
            line=dict(width=0.75,color='rgba(255, 0, 0, 0.8)'),
            )

        q75 = go.Scatter(
            name='q75',
            x=df_quantile["date"],
            y=df_quantile['q75'] ,
            mode='lines',
            line=dict(width=0.5,
                    color="rgba(255, 188, 0, 0.5)"),
            fillcolor='rgba(68, 68, 68, 0.1)',
            fill='tonexty')


        q90 = go.Scatter(
            name='q90',
            x=df_quantile["date"],
            y=df_quantile['q90'],
            mode='lines',
            line=dict(width=0.5,
                    color="rgba(255, 188, 0, 0.5)"),
            fillcolor='rgba(68, 68, 68, 0.1)'
        )

        # With fill='tonexty', each band fills down to the previously added
        # trace, so the order q90 -> q75 -> q25 -> q10 is deliberate
        data = [q90, q75, q25, q10, avg, value_in_year__]

        layout = go.Layout(
            yaxis=dict(title=what_to_show[0]),
            title=title,)
            #, xaxis=dict(tickformat="%d-%m")
        fig = go.Figure(data=data, layout=layout)
        fig.update_layout(xaxis=dict(tickformat="%d-%m"))
        st.plotly_chart(fig, use_container_width=True)
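
Because the loop above concatenates one row per day, building the frame once from a list of dicts is noticeably faster; a self-contained sketch of that pattern, with dummy percentiles standing in for the real computation:

import numpy as np
import pandas as pd

rows = []
for month in range(1, 13):
    for day in range(1, 32):
        if month == 2 and day == 29:
            continue
        # dummy values stand in for the per-day percentile computation above
        q10, q25, q50, q75, q90 = np.percentile(np.random.rand(30),
                                                [10, 25, 50, 75, 90])
        rows.append({"date_": f"1900-{month:02d}-{day:02d}",
                     "q10": q10, "q25": q25, "q50": q50,
                     "q75": q75, "q90": q90})
df_quantile = pd.DataFrame(rows)  # one construction instead of ~365 concats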
Example No. 16
def main():
    """
    Main is responsible for the visualisation of everything connected with streamlit.
    It is the web application itself.
    """

    # # Radiobuttons in one row
    # st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)

    # Sets sidebar's header and logo
    sidebar.sidebar_head()

    #
    # # Spectrometer type - BWTek / Renishaw / Witec / Wasatch / Teledyne
    #

    spectra_types = [
        'EMPTY', 'BWTEK', 'RENI', 'WITEC', 'WASATCH', 'TELEDYNE', 'JOBIN'
    ]
    spectrometer = st.sidebar.selectbox("Choose spectra type",
                                        spectra_types,
                                        format_func=LABELS.get,
                                        index=0)

    # sidebar separating line
    sidebar.print_widgets_separator()

    # User data loader
    # sidebar.print_widget_labels('Upload your data or try with ours', 10, 0)

    files = st.sidebar.file_uploader(label='Upload your data or try with ours',
                                     accept_multiple_files=True,
                                     type=['txt', 'csv'])

    # Allow example data loading when no custom data are loaded
    if not files:
        if st.sidebar.checkbox("Load example data"):
            if spectrometer == "EMPTY":
                st.sidebar.error('First Choose Spectra type')
            else:
                files = utils.load_example_files(spectrometer)

    # Check if data loaded, if yes, perform actions
    delim = None
    if files:
        st.spinner('Uploading data in progress')  # note: st.spinner only renders when used as a context manager
        # sidebar separating line
        sidebar.print_widgets_separator()

        from detect_delimiter import detect
        new_files = []
        for file in files:
            file.seek(0)
            lines = file.readlines()

            try:
                lines = [line.decode('utf-8') for line in lines]
            except AttributeError:
                pass

            # lines = str.splitlines(str(text))  # .split('\n')
            first_lines = '\n'.join(lines[:20])

            delim = detect(first_lines)
            colnum = lines[-2].count(delim)

            lines = [i for i in lines if i.count(delim) == colnum]
            text = '\n'.join(lines)
            buffer = io.StringIO(text)
            buffer.name = file.name
            new_files.append(buffer)

        try:
            df = save_read.read_files(spectrometer, new_files, delim)
        except (TypeError, ValueError):
            st.error('Try choosing another type of spectra')
            st.stop()

        main_expander = st.beta_expander("Customize your chart")
        # Choose plot colors and templates
        with main_expander:
            plots_color, template = vis_utils.get_chart_vis_properties()

        # Select chart type
        chart_type = vis_opt.vis_options()

        # sidebar separating line
        sidebar.print_widgets_separator()

        # Select data conversion type
        spectra_conversion_type = vis_opt.convertion_opt()

        # TODO need improvements
        # getting rid of duplicated columns
        df = df.loc[:, ~df.columns.duplicated()]

        #
        # # data manipulation - raw / optimization / normalization
        #

        # TODO delete if not needed
        # Normalization
        # if spectra_conversion_type == LABELS["NORM"]:
        #     df = (df - df.min()) / (df.max() - df.min())

        # Mean Spectra
        if chart_type == 'MS':
            df = df.mean(axis=1).rename('Average').to_frame()

        # columns in main view. Chart, expanders
        # TODO: solve this more cleverly somehow
        normalized = False
        col_left, col_right = st.beta_columns([5, 2])
        if spectra_conversion_type != "RAW":
            col_right = col_right.beta_expander("Customize spectra",
                                                expanded=False)
            with col_right:
                vals = data_customisation.get_deg_win(chart_type,
                                                      spectra_conversion_type,
                                                      df.columns)
                if st.checkbox("Data Normalization"):
                    normalized = True
                    df = (df - df.min()) / (df.max() - df.min())
                else:
                    normalized = False

        # For grouped spectra sometimes we want to shift the spectra from each other, here it is:
        with main_expander:
            # TODO the code below needed?
            # trick to better fit sliders in expander
            # _, main_expander_column, _ = st.beta_columns([1, 38, 1])
            # with main_expander_column:

            shift_col, _, trim_col = st.beta_columns([5, 1, 5])
            with shift_col:
                if chart_type == 'GS':
                    shift = data_customisation.separate_spectra(normalized)
                elif chart_type == 'SINGLE':
                    col = st.selectbox('spectrum to plot', df.columns)
                    df = df[[col]]
                else:
                    shift = None
            with trim_col:
                df = vis_utils.trim_spectra(df)

        # data conversion end
        if spectra_conversion_type in {'OPT'}:
            baselines = pd.DataFrame(index=df.index)
            baselined = pd.DataFrame(index=df.index)
            flattened = pd.DataFrame(index=df.index)
            for col in df.columns:
                baselines[col] = peakutils.baseline(df[col], vals[col][0])
                baselined[col] = df[col] - baselines[col]
                flattened[col] = baselined[col].rolling(window=vals[col][1],
                                                        min_periods=1,
                                                        center=True).mean()

        #
        # # Plotting
        #

        # Groupped spectra
        if chart_type == 'GS':
            shifters = [(i + 1) * shift for i in range(len(df.columns))]
            plot_df = df if spectra_conversion_type == 'RAW' else flattened
            plot_df = plot_df + shifters

            figs = [
                px.line(plot_df,
                        x=plot_df.index,
                        y=plot_df.columns,
                        color_discrete_sequence=plots_color)
            ]

        # Mean spectra
        elif chart_type == 'MS':
            if spectra_conversion_type == 'RAW':
                plot_df = df
                figs = [
                    px.line(plot_df,
                            x=plot_df.index,
                            y=plot_df.columns,
                            color_discrete_sequence=plots_color)
                ]

            elif spectra_conversion_type in {'OPT'}:
                columns = [
                    'Average', 'Baseline', 'BL-Corrected',
                    'Flattened + BL-Corrected'
                ]
                plot_df = pd.concat([df, baselines, baselined, flattened],
                                    axis=1)
                plot_df.columns = columns

                fig1 = px.line(plot_df,
                               x=plot_df.index,
                               y=columns[-1],
                               color_discrete_sequence=plots_color[3:])
                fig2 = px.line(plot_df,
                               x=plot_df.index,
                               y=plot_df.columns,
                               color_discrete_sequence=plots_color)
                figs = [(fig1, fig2)]
            else:
                raise ValueError(
                    'Unknown conversion type for Mean spectrum chart')
        # 3D spectra
        elif chart_type == 'P3D':
            plot_df = flattened if spectra_conversion_type in {"OPT"} else df

            plot_df = plot_df.reset_index().melt('Raman Shift',
                                                 plot_df.columns)
            fig = px.line_3d(plot_df,
                             x='variable',
                             y='Raman Shift',
                             z='value',
                             color='variable')

            camera = dict(eye=dict(x=1.9, y=0.15, z=0.2))
            fig.update_layout(
                scene_camera=camera,
                width=1200,
                height=1200,
                margin=dict(l=1, r=1, t=30, b=1),
            )
            figs = [fig]

        # Single spectra
        elif chart_type == 'SINGLE':
            if spectra_conversion_type == 'RAW':
                plot_df = df
                figs = [
                    px.line(plot_df[col], color_discrete_sequence=plots_color)
                    for col in plot_df.columns
                ]
            else:
                columns = [
                    'Average', 'Baseline', 'BL-Corrected',
                    'Flattened + BL-Corrected'
                ]
                figs = []

                plot_df = pd.concat([df, baselines, baselined, flattened],
                                    axis=1)
                plot_df.columns = columns

                fig1 = px.line(plot_df,
                               x=plot_df.index,
                               y=columns[-1],
                               color_discrete_sequence=plots_color[3:]
                               )  # trick for color consistency
                fig2 = px.line(plot_df,
                               x=plot_df.index,
                               y=plot_df.columns,
                               color_discrete_sequence=plots_color)
                fig_tup = (fig1, fig2)
                figs.append(fig_tup)
        else:
            raise ValueError("Something unbelievable has been chosen")

        with col_left:
            charts.show_charts(figs, plots_color, template)

        with col_left:
            st.markdown('')
            link = utils.download_button(plot_df.reset_index(),
                                         'spectrum.csv',
                                         button_text='Download CSV')
            st.markdown(link, unsafe_allow_html=True)

    else:
        manual.show_manual()

    authors.show_developers()
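
A standalone sketch of the 'OPT' pipeline above (baseline estimate, correction, smoothing) on synthetic data, assuming peakutils is installed; the degree and window values stand in for the per-column vals[col] entries:

import numpy as np
import pandas as pd
import peakutils

x = np.linspace(100, 3000, 500)
raw = np.exp(-((x - 1500) / 40) ** 2) + 0.0003 * x  # one peak on a sloping baseline
spec = pd.Series(raw, index=pd.Index(x, name="Raman Shift"), name="spec")

deg, window = 2, 5  # stand-ins for vals[col][0] and vals[col][1]
baseline = pd.Series(peakutils.baseline(spec, deg), index=spec.index)
baselined = spec - baseline
flattened = baselined.rolling(window=window, min_periods=1, center=True).mean()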
Example No. 17
def getdata(stn, fromx, until):
    with st.spinner(f"GETTING ALL DATA ..."):
        # url =  "https://www.daggegevens.knmi.nl/klimatologie/daggegevens?stns=251&vars=TEMP&start=18210301&end=20210310"
        # https://www.knmi.nl/kennis-en-datacentrum/achtergrond/data-ophalen-vanuit-een-script
        # url = f"https://www.daggegevens.knmi.nl/klimatologie/daggegevens?stns={stn}&vars=ALL&start={fromx}&end={until}"
        url = f"https://www.daggegevens.knmi.nl/klimatologie/daggegevens?stns={stn}&vars=TEMP:SQ:SP:Q:DR:RH&start={fromx}&end={until}"
        try:
            df = pd.read_csv(
                url,
                delimiter=",",
                header=None,
                comment="#",
                low_memory=False,
            )

        except Exception:
            st.write("Error while loading the data.")
            st.stop()
        # TG        : Etmaalgemiddelde temperatuur (in 0.1 graden Celsius) / Daily mean temperature in (0.1 degrees Celsius)
        # TN        : Minimum temperatuur (in 0.1 graden Celsius) / Minimum temperature (in 0.1 degrees Celsius)
        # TNH       : Uurvak waarin TN is gemeten / Hourly division in which TN was measured
        # TX        : Maximum temperatuur (in 0.1 graden Celsius) / Maximum temperature (in 0.1 degrees Celsius)
        # TXH       : Uurvak waarin TX is gemeten / Hourly division in which TX was measured
        # T10N      : Minimum temperatuur op 10 cm hoogte (in 0.1 graden Celsius) / Minimum temperature at 10 cm above surface (in 0.1 degrees Celsius)
        # T10NH     : 6-uurs tijdvak waarin T10N is gemeten / 6-hourly division in which T10N was measured; 6=0-6 UT; 12=6-12 UT; 18=12-18 UT; 24=18-24 UT
        # SQ        : Zonneschijnduur (in 0.1 uur) berekend uit de globale straling (-1 voor <0.05 uur) / Sunshine duration (in 0.1 hour) calculated from global radiation (-1 for <0.05 hour)
        # SP        : Percentage van de langst mogelijke zonneschijnduur / Percentage of maximum potential sunshine duration
        # Q         : Globale straling (in J/cm2) / Global radiation (in J/cm2)
        # DR        : Duur van de neerslag (in 0.1 uur) / Precipitation duration (in 0.1 hour)
        # RH        : Etmaalsom van de neerslag (in 0.1 mm) (-1 voor <0.05 mm) / Daily precipitation amount (in 0.1 mm) (-1 for <0.05 mm)

        column_replacements = [
            [0, "STN"],
            [1, "YYYYMMDD"],
            [2, "temp_avg"],
            [3, "temp_min"],
            [4, "temp_max"],
            [5, "T10N"],
            [6, "zonneschijnduur"],
            [7, "perc_max_zonneschijnduur"],
            [8, "glob_straling"],
            [9, "neerslag_duur"],
            [10, "neerslag_etmaalsom"],
        ]

        for c in column_replacements:
            df = df.rename(columns={c[0]: c[1]})

        df["YYYYMMDD"] = pd.to_datetime(df["YYYYMMDD"], format="%Y%m%d")
        df["YYYY"] = df["YYYYMMDD"].dt.year
        df["MM"] = df["YYYYMMDD"].dt.month
        df["DD"] = df["YYYYMMDD"].dt.day
        df["dayofyear"] = df["YYYYMMDD"].dt.dayofyear
        df["count"] = 1
        month_long_to_short = {
            "January": "Jan",
            "February": "Feb",
            "March": "Mar",
            "April": "Apr",
            "May": "May",
            "June": "Jun",
            "July": "Jul",
            "August": "Aug",
            "September": "Sep",
            "October": "Oct",
            "November": "Nov",
            "December": "Dec",
        }
        month_number_to_short = {
            "1": "Jan",
            "2": "Feb",
            "3": "Mar",
            "4": "Apr",
            "5": "May",
            "6": "Jun",
            "7": "Jul",
            "8": "Aug",
            "9": "Sep",
            "10": "Oct",
            "11": "Nov",
            "12": "Dec",
        }
        df["month"] = df["MM"].astype(str).map(month_number_to_short)
        df["year"] = df["YYYY"].astype(str)
        df["month"] = df["month"].astype(str)
        df["day"] = df["DD"].astype(str)
        df["month_year"] = df["month"] + " - " + df["year"]
        df["month_day"] = df["month"] + " - " + df["day"]

        to_divide_by_10 = [
            "temp_avg",
            "temp_min",
            "temp_max",
            "zonneschijnduur",
            "neerslag_duur",
            "neerslag_etmaalsom",
        ]
        for d in to_divide_by_10:
            try:
                df[d] = df[d] / 10
            except (KeyError, TypeError):  # column missing or non-numeric
                df[d] = None

    return df, url
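
Usage would look like the sketch below; station 260 is De Bilt, and the dates use KNMI's yyyymmdd format:

# Hypothetical call matching the signature above
df, url = getdata(260, "20210101", "20211231")
st.write(df[["YYYYMMDD", "temp_avg", "temp_max"]].tail())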
Example No. 18

"""
# DICOM Header Viewer with Filter

This is a small example of the power of the `streamlit` library.

## Uploading a DICOM file

Begin by uploading a DICOM file
"""

dicom_bytes = st.file_uploader("Upload DICOM file", encoding=None)

if not dicom_bytes:
    st.stop()  # st.stop() raises internally; there is nothing to re-raise

try:
    dicom_header = pydicom.dcmread(dicom_bytes,  # dcmread supersedes the deprecated read_file
                                   force=True,
                                   stop_before_pixels=True)
except Exception:
    st.write(WrongFileType("Does not appear to be a DICOM file"))
    st.stop()
"""
## Filtering and Viewing the DICOM header
"""

filter_string = st.text_input("Filter headers by typing here")

view = repr(dicom_header).split("\n")
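
The example breaks off here; a plausible continuation (a sketch, not the original code) would filter the header lines on the search string and display them:

filtered_view = [line for line in view if filter_string.lower() in line.lower()]
st.text("\n".join(filtered_view))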
Example No. 19
    def write(self):
        st.title("Create")

        obj_type = st.selectbox("Select object to create",
                                [CreatePage.COLL_OPT, CreatePage.DOC_OPT])

        if obj_type == CreatePage.COLL_OPT:
            db_name = st.selectbox(
                "Select Database",
                self.db_client.get_database_names() +
                [CreatePage.CREATE_NEW_DB_OPT],
            )
            if db_name == CreatePage.CREATE_NEW_DB_OPT:
                db_name = st.text_input("Database Name ?")
            if (db_name == ""
                    and not any(char in db_name
                                for char in {"$", "\\", "/", ".", " ", '"'})
                    and len(db_name) < 64):
                st.info(
                    "Database Names cant be empty. cant have '$', '\\', '/', '.', 'space', '\"' (quotes)' and length must be less than 64"
                )
                st.stop()
            coll_name = st.text_input("Collection Name ?")
            if coll_name == "" and not any(char in coll_name
                                           for char in {"$", "system."}):
                st.info(
                    "Collection Names cant be empty. cant have '$' and cant have 'system.'"
                )
                st.stop()

            if st.button("Create"):
                self.db_client.create_collection(db_name=db_name,
                                                 coll_name=coll_name)

        elif obj_type == CreatePage.DOC_OPT:
            db_name = st.selectbox(
                "Select Database",
                self.db_client.get_database_names() +
                [CreatePage.CREATE_NEW_DB_OPT],
            )
            coll_name = None
            if db_name == CreatePage.CREATE_NEW_DB_OPT:
                db_name = st.text_input("Database Name ?")
                if (db_name == "" and
                        not any(char in db_name
                                for char in {"$", "\\", "/", ".", " ", '"'})
                        and len(db_name) < 64):
                    st.info(
                        "Database Names cant be empty. cant have '$', '\\', '/', '.', 'space', '\"' (quotes)' and length must be less than 64"
                    )
                    st.stop()
                coll_name = st.text_input("Collection Name ?")
                if coll_name == "" and not any(char in coll_name
                                               for char in {"$", "system."}):
                    st.info(
                        "Collection Names cant be empty. cant have '$' and cant have 'system.'"
                    )
                    st.stop()
            else:
                coll_name = st.selectbox(
                    "Select Collection",
                    self.db_client.get_collection_names(db_name=db_name) +
                    [CreatePage.CREATE_NEW_COLL_OPT],
                )

            if coll_name == CreatePage.CREATE_NEW_COLL_OPT:
                coll_name = st.text_input("Collection Name ?")
                if coll_name == "" and not any(char in coll_name
                                               for char in {"$", "system."}):
                    st.info(
                        "Collection Names cant be empty. cant have '$' and cant have 'system.'"
                    )
                    st.stop()
            document = st.text_area("Document(s) data ?")

            try:
                document = json.loads(document)
            except JSONDecodeError:
                st.warning("Must be a valid JSON.")
                st.stop()
            if st.button("Create"):
                self.db_client.insert_docs(db_name, coll_name, document)

            see_doc = st.checkbox("See Document(s) ?")
            if see_doc:
                st.write(document)
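
The same name validation appears several times above; a small pair of helpers (a sketch, not part of the original class) would centralise it:

def _valid_db_name(name: str) -> bool:
    # Mirrors the database-name rules checked above
    forbidden = {"$", "\\", "/", ".", " ", '"'}
    return bool(name) and not any(ch in name for ch in forbidden) and len(name) < 64

def _valid_coll_name(name: str) -> bool:
    # Mirrors the collection-name rules checked above, treating 'system.'
    # as a forbidden prefix
    return bool(name) and "$" not in name and not name.startswith("system.")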
Example No. 20
def make_age_graph(df, d, columns_original, legendanames, titel):
    if d is None:
        st.warning("Choose ages to show")
        st.stop()
    with _lock:
        color_list = [
            "#3e5c76",  # blue 6,
            "#ff6666",  # reddish 0
            "#ac80a0",  # purple 1
            "#3fa34d",  # green 2
            "#EAD94C",  # yellow 3
            "#EFA00B",  # orange 4
            "#7b2d26",  # red 5
            "#e49273",  # dark salmon 7
            "#1D2D44",  # 8
            "#02A6A8",
            "#4E9148",
            "#F05225",
            "#024754",
            "#FBAA27",
            "#302823",
            "#F07826",
        ]

        # df = agg_ages(df)
        fig1y, ax = plt.subplots()
        for i, d_ in enumerate(d):

            #if d_ == "TOTAAL_index":
            if d_[:6] == "TOTAAL":
                ax.plot(df["Date_of_statistics_week_start"],
                        df[d_],
                        color=color_list[0],
                        label=columns_original[i],
                        linestyle="--",
                        linewidth=2)
                ax.plot(df["Date_of_statistics_week_start"],
                        df[columns_original[i]],
                        color=color_list[0],
                        alpha=0.5,
                        linestyle="dotted",
                        label='_nolegend_',
                        linewidth=2)
            else:
                ax.plot(df["Date_of_statistics_week_start"],
                        df[d_],
                        color=color_list[i + 1],
                        label=columns_original[i])
                ax.plot(df["Date_of_statistics_week_start"],
                        df[columns_original[i]],
                        color=color_list[i + 1],
                        alpha=0.5,
                        linestyle="dotted",
                        label='_nolegend_')
        plt.legend()
        if y_zero:  # module-level toggle set elsewhere
            ax.set_ylim(bottom=0)
        titel_ = titel + " (weekcijfers)"
        plt.title(titel_)
        plt.xticks(rotation=270)

        ax.text(
            1,
            1.1,
            "Created by Rene Smit — @rcsmit",
            transform=ax.transAxes,
            fontsize="xx-small",
            va="top",
            ha="right",
        )
        # plt.tight_layout()
        # plt.show()
        st.pyplot(fig1y)
Example No. 21
def main():
    if platform.processor() != "":
        arr = os.listdir(
            "C:\\Users\\rcxsm\\Documents\\phyton_scripts\\streamlit_scripts")
    else:
        arr = os.listdir()

    counter = 1
    options = [["0. welcome", "welcome"],
               ["1. newagebullshitgenerator", "newagebullshitgenerator"],
               ["2. KNMI grafieken", "show_knmi"],
               ["3. Text generator", "txt_generator_streamlit"],
               ["4. YT transcriber", "YoutubeTranscriber_streamlit"],
               ["5. Schoonmaaktijden", "schoonmaaktijden"],
               ["6. Show sportactivities", "show_sportactivities"],
               ["7. YFinance info", "yfinance_info"],
               ["8. Crypto portfolio", "crypto_portfolio"],
               ["9. strftime_test", "strftime_test"]]

    # for file in arr:
    #     if file[-2:] =="py" and ( file != "welcome.py" and file !="menu_streamlit.py"):
    #         menutext = f"{counter}. {file}"
    #         menutext = menutext.replace("_"," ") # I was too lazy to change it in the list
    #         menutext = menutext.replace(".py","") # I was too lazy to change it in the list
    #         file_ = file.replace(".py","") # I was too lazy to change it in the list

    #         options.append([menutext, file_])
    #         counter +=1

    query_params = st.experimental_get_query_params(
    )  # reading  the choice from the URL..

    choice = int(
        query_params["choice"][0]
    ) if "choice" in query_params else 0  # .. and make it the default value

    menuchoicelist = [option[0] for option in options]

    with st.sidebar.expander(
            'MENU: Choose a script | scroll down for options/parameters',
            expanded=True):
        menu_choice = st.radio("", menuchoicelist, index=choice)

    st.sidebar.markdown("<h1>- - - - - - - - - - - - - - - - - - </h1>",
                        unsafe_allow_html=True)
    st.experimental_set_query_params(choice=menuchoicelist.index(
        menu_choice))  # setting the choice in the URL

    for n, l in enumerate(options):
        if menu_choice == options[n][0]:
            if platform.processor() != "":
                m = "C:\\Users\\rcxsm\\Documents\\phyton_scripts\\streamlit_scripts\\" + options[
                    n][1].replace(
                        " ", "_")  # I was too lazy to change it in the list
                st.write(f"{m }")
            else:
                m = options[n][1].replace(
                    " ", "_")  # I was too lazy to change it in the list
            try:
                module = dynamic_import(m)
            except Exception as e:
                st.error(f"Module '{m}' not found or error in the script\n")
                st.warning(f"{e}")
                st.warning(traceback.format_exc())

                st.stop()
            try:
                module.main()
            except Exception as e:
                st.error(
                    f"Function 'main()' in module '{m}' not found or error in the script"
                )
                st.warning(f"{e}")

                st.warning(traceback.format_exc())

                st.stop()
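
dynamic_import is assumed to be a thin wrapper around importlib; a minimal sketch that matches the non-Windows usage above:

import importlib

def dynamic_import(module_name):
    # Hypothetical helper: import a script by module name so its main()
    # can be called afterwards
    return importlib.import_module(module_name)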
Example No. 22
def main():

    lijst = [
        "0-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49",
        "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85-89",
        "90+", "Unknown", "0-29", "30-49", "50-69", "70-89", "90+", "30-69",
        "0-39", "40-59", "60-79", "80+", "0-19", "20-29", "30-39", "40-49",
        "50-59", "60-69", "70-79", "80-89", "90+", "TOTAAL"
    ]
    population = [
        2707000,
        1029000,
        1111000,
        1134000,
        1124000,
        1052000,
        1033000,
        1131000,
        1285000,
        1263000,
        1138000,
        1003000,
        971000,
        644000,
        450000,
        259000,
        130000,
        10,
        5981000,
        4340000,
        4689000,
        2324000,
        130000,
        9029000,
        8157000,
        4712000,
        3756000,
        839000,

        #              0-9     10-19     20-29    30-39   40-49
        1756000,
        1980000,
        2245000,
        2176000,
        2164000,
        #50-59   60-69      70-79    80-89    90+
        2548000,
        2141000,
        1615000,
        709000,
        130000,
        17464000
    ]  # tot 17 464 000

    st.header("Hospital / ICU admissions in the Netherlands")
    st.subheader("Please send feedback to @rcsmit")

    # DAILY STATISTICS ################

    start_ = "2020-01-01"
    today = datetime.today().strftime("%Y-%m-%d")
    global from_, FROM, UNTIL
    from_ = st.sidebar.text_input("startdate (yyyy-mm-dd)", start_)

    try:
        FROM = dt.datetime.strptime(from_, "%Y-%m-%d").date()
    except ValueError:
        st.error(
            "Please make sure that the startdate is valid and/or in format yyyy-mm-dd"
        )
        st.stop()

    until_ = st.sidebar.text_input("enddate (yyyy-mm-dd)", today)

    try:
        UNTIL = dt.datetime.strptime(until_, "%Y-%m-%d").date()
    except ValueError:
        st.error("Please make sure that the enddate is in format yyyy-mm-dd")
        st.stop()

    if FROM >= UNTIL:
        st.warning("Make sure that the end date is not before the start date")
        st.stop()

    if until_ == "2023-08-23":
        st.sidebar.error("Do you really, really, wanna do this?")
        if st.sidebar.button("Yes I'm ready to rumble"):
            caching.clear_cache()
            st.success("Cache is cleared, please reload to scrape new values")
    global WDW2
    WDW2 = st.sidebar.slider("Window smoothing curves (weeks)", 1, 8, 1)
    global delete_last_row
    delete_last_row = st.sidebar.selectbox(
        "Delete last week/row of complete dataset", [True, False], index=0)

    df_pivot_hospital, df_pivot_ic = prepare_data()

    df_pivot_hospital = select_period(df_pivot_hospital,
                                      "Date_of_statistics_week_start", FROM,
                                      UNTIL)
    df_pivot_ic = select_period(df_pivot_ic, "Date_of_statistics_week_start",
                                FROM, UNTIL)

    df_pivot_hospital_basic = df_pivot_hospital.copy(deep=False)
    df_pivot_ic_basic = df_pivot_ic.copy(deep=False)

    df_pivot_hospital = agg_ages(df_pivot_hospital)
    df_pivot_ic = agg_ages(df_pivot_ic)

    save_df(df_pivot_hospital, "hospital_voor_maarten")
    save_df(df_pivot_ic, "ic_voor_maarten")

    df_pivot_casus_landelijk_per_week = make_pivot_casus_landelijk_per_week()
    save_df(df_pivot_casus_landelijk_per_week,
            "casus_per_age_per_week_voor_maarten")

    hospital_or_ic = st.sidebar.selectbox("Hospital or IC",
                                          ["hospital", "icu"],
                                          index=0)
    what_to_do = st.sidebar.selectbox("What type of graph", ["stack", "line"],
                                      index=1)

    default_age_groups = ["0-29", "30-49", "50-69", "70-89", "90+"]
    default_age_groups_perc = [
        "0-29_perc", "30-49_perc", "50-69_perc", "70-89_perc", "90+_perc"
    ]
    default_age_groups_cumm_all = [
        "0-29_cumm_all", "30-49_cumm_all", "50-69_cumm_all", "70-89_cumm_all",
        "90+_cumm_all"
    ]
    default_age_groups_cumm_period = [
        "0-29_cumm_period", "30-49_cumm_period", "50-69_cumm_period",
        "70-89_cumm_period", "90+_cumm_period"
    ]
    default_age_groups_per_capita = [
        "0-29_per_capita", "30-49_per_capita", "50-69_per_capita",
        "70-89_per_capita", "90+_per_capita"
    ]
    if what_to_do == "line":

        age_groups = ["0-29", "30-49", "50-69", "70-89", "90+", "TOTAAL"]
        absolute_or_index = st.sidebar.selectbox(
            f"Absolute | percentages of TOTAAL |\n index (start = 100) | per capita | cummulatief from 2020-1-1 | cummulatief from {FROM}",
            [
                "absolute", "percentages", "index", "per_capita",
                "cummulatief_all", "cummulatief_period"
            ],
            index=0)

        normed = absolute_or_index == "index"
        if absolute_or_index == "percentages":
            ages_to_show = st.sidebar.multiselect(
                "Ages to show (multiple possible)", lijst_perc,
                default_age_groups_perc)
        elif absolute_or_index == "cummulatief_all":
            ages_to_show = st.sidebar.multiselect(
                "Ages to show (multiple possible)", lijst_cumm_all,
                default_age_groups_cumm_all)
        elif absolute_or_index == "cummulatief_period":
            ages_to_show = st.sidebar.multiselect(
                "Ages to show (multiple possible)", lijst_cumm_period,
                default_age_groups_cumm_period)
        elif absolute_or_index == "per_capita":
            ages_to_show = st.sidebar.multiselect(
                "Ages to show (multiple possible)", lijst_per_capita,
                default_age_groups_per_capita)
        else:
            # absolute
            ages_to_show = st.sidebar.multiselect(
                "Ages to show (multiple possible)", lijst, default_age_groups)
    else:
        #stackplot
        absolute_or_relative = st.sidebar.selectbox(
            "Absolute or relative (total = 100%)", ["absolute", "relative"],
            index=0)
        ages_to_show = st.sidebar.multiselect(
            "Ages to show (multiple possible)", lijst, default_age_groups)

    if len(ages_to_show) == 0:
        st.warning("Choose ages to show")
        st.stop()
    global y_zero
    y_zero = st.sidebar.selectbox("Y-ax starts at 0", [True, False], index=1)

    if what_to_do == "stack":

        #  SHOW STACKGRAPHS
        if hospital_or_ic == "hospital":

            to_do_stack = [[
                df_pivot_hospital, ages_to_show,
                "ziekenhuisopname naar leeftijd"
            ]]
        else:
            to_do_stack = [[
                df_pivot_ic, ages_to_show, "IC opname naar leeftijd"
            ]]

        for d in to_do_stack:
            show_stack(d[0], d[1], d[2], absolute_or_relative)

    elif what_to_do == "line":
        # SHOW LINEGRAPHS
        if normed:
            df_pivot_hospital, d = normeren(df_pivot_hospital, ages_to_show)
            df_pivot_ic, d = normeren(df_pivot_ic, ages_to_show)
        else:
            d = ages_to_show
        if hospital_or_ic == "hospital":
            show_age_graph(df_pivot_hospital, d, "ziekenhuisopnames")
        else:
            show_age_graph(df_pivot_ic, d, "IC opnames")
    else:
        st.error("ERROR")
        st.stop()

    if hospital_or_ic == "hospital":
        st.subheader("Ziekenhuisopnames (aantallen)")

        st.write(df_pivot_hospital_basic)

        df_new = do_the_rudi(df_pivot_hospital_basic)

        st.write(
            df_new.style.format(
                None, na_rep="-").applymap(color_value).set_precision(2))

        #st.dataframe(df_new.style.applymap(color_value))
    else:
        st.subheader("Ziekenhuisopnames (aantallen)")
        st.write(df_pivot_ic_basic)
        df_new = do_the_rudi(df_pivot_ic_basic)
        st.dataframe(df_new.style.applymap(color_value))

    tekst = (
        "<style> .infobox {  background-color: lightblue; padding: 5px;}</style>"
        "<hr><div class='infobox'>Made by Rene Smit. (<a href='http://www.twitter.com/rcsmit' target=\"_blank\">@rcsmit</a>) <br>"
        'Data source :  <a href="https://data.rivm.nl/covid-19/COVID-19_ziekenhuis_ic_opnames_per_leeftijdsgroep.csv" target="_blank">RIVM</a> (daily retrieved)<br>'
        'Sourcecode : <a href="https://github.com/rcsmit/COVIDcases/edit/main/plot_hosp_ic_streamlit.py" target="_blank">github.com/rcsmit</a><br>'
        'How-to tutorial : <a href="https://rcsmit.medium.com/making-interactive-webbased-graphs-with-python-and-streamlit-a9fecf58dd4d" target="_blank">rcsmit.medium.com</a><br>'
    )

    st.sidebar.markdown(tekst, unsafe_allow_html=True)

    st.markdown("<hr>", unsafe_allow_html=True)

    st.image(
        "https://raw.githubusercontent.com/rcsmit/COVIDcases/main/buymeacoffee.png"
    )

    st.markdown(
        '<a href="https://www.buymeacoffee.com/rcsmit" target="_blank">If you are happy with this dashboard, you can buy me a coffee</a>',
        unsafe_allow_html=True,
    )
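
color_value is used above with Styler.applymap, which expects a CSS declaration string per cell; a sketch of what it might look like (the actual colouring rule is an assumption):

def color_value(val):
    # Styler.applymap callbacks must return a CSS string such as 'color: red'
    try:
        v = float(val)
    except (TypeError, ValueError):
        return ""
    return "color: red" if v > 0 else "color: green"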
Example No. 23
def handle_hop_url(url_params, pathSession):
    '''Handle table display associated with hop'''

    namespace = url_params.get('namespace', [""])[0]
    hostname = url_params.get('hostname', [""])[0]

    if not hostname:
        st.error('No hostname found to display information for')
        st.stop()

    st.header(f'Debug Tables for Path from {pathSession.source} to '
              f'{pathSession.dest}')

    pathobj = getattr(pathSession, 'pathobj', None)
    df = getattr(pathSession, 'path_df', None)
    engobj = pathobj.engine_obj

    if df.empty:
        st.warning('Empty path dataframe')
        st.stop()

    host_dfg = df.query(f'hostname == "{hostname}"') \
                 .groupby(by=['hopCount'])

    df2 = host_dfg.agg({
        'vrf': ['unique'],
        'ipLookup': ['unique'],
        'nexthopIp': ['unique'],
        'oif': ['unique'],
        'macLookup': ['unique'],
        'vtepLookup': ['unique']
    }).reset_index()
    df2.columns = [
        'hopCount', 'vrf', 'ipLookup', 'nexthopIp', 'oif', 'macaddr',
        'vtepLookup'
    ]
    df2 = df2.explode('hopCount').explode('vrf').explode('ipLookup') \
                                                .explode('macaddr') \
                                                .explode('vtepLookup')
    df2.drop_duplicates(subset=['vrf', 'ipLookup'], inplace=True)

    for row in df2.itertuples():
        with st.beta_expander(
                f'Lookups on {hostname}, for hopcount: '
                f'{row.hopCount}',
                expanded=True):
            if row.macaddr:
                st.info(f'MAC Table on {hostname}, MAC addr {row.macaddr}')
                st.dataframe(data=engobj._macsobj.get(namespace=namespace,
                                                      hostname=hostname,
                                                      macaddr=row.macaddr))
                continue

            if (row.ipLookup != row.vtepLookup):
                st.info(f'Route Lookup on {hostname}')
                st.dataframe(data=engobj._rdf.query(
                    f'hostname=="{hostname}" and vrf=="{row.vrf}"'))

            if row.vtepLookup:
                st.info(f'Underlay Lookup on {hostname} for {row.vtepLookup}')
                vtepdf = engobj._underlay_dfs.get(row.vtepLookup,
                                                  pd.DataFrame())
                if not vtepdf.empty:
                    st.dataframe(data=vtepdf.query(
                        f'hostname=="{hostname}" and vrf=="default"'))

            oifs = row.oif.tolist()
            nhops = row.nexthopIp.tolist()
            prev_nhop = ''
            for oif, nhop in zip_longest(oifs, nhops):
                blank1, arpcol = st.beta_columns([1, 40])
                blank2, ifcol = st.beta_columns([2, 40])
                # the agg above keeps only unique nexthops, so a repeated
                # nexthop comes back empty here; carry the previous one forward
                if not nhop and prev_nhop:
                    nhop = prev_nhop
                else:
                    prev_nhop = nhop
                arpdf = engobj._arpnd_df.query(f'hostname=="{hostname}" and '
                                               f'ipAddress=="{nhop}" and '
                                               f'oif=="{oif}"')
                with arpcol:
                    st.info(f'ARP/ND Lookup on {hostname} for {nhop}')
                    st.dataframe(data=arpdf, height=100)

                if not arpdf.empty:
                    if ':' in nhop:
                        dropcol = ['ipAddressList']
                    else:
                        dropcol = ['ip6AddressList']
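                    # 169.254.0.1 is the link-local nexthop convention used by
                    # BGP unnumbered, so the interface is resolved via the ARP macaddr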
                    if nhop == '169.254.0.1':
                        macaddr = arpdf.macaddr.iloc[0]
                        if_df = engobj._if_df.query(f'macaddr=="{macaddr}"') \
                                             .drop(columns=dropcol)
                        label = f'matching nexthop {nhop}, macaddr {macaddr}'
                    else:
                        if_df = engobj._if_df.drop(columns=dropcol)
                        label = f'matching nexthop {nhop}'
                else:
                    label = f'matching nexthop {nhop}'
                    if_df = engobj._if_df
                if ':' in nhop:
                    s = if_df.ip6AddressList \
                             .explode() \
                             .str.startswith(f'{nhop}/').dropna()
                    s = s.loc[s == True]
                    if_df = if_df.iloc[s.index]
                elif nhop != '169.254.0.1':
                    s = if_df.ipAddressList \
                             .explode() \
                             .str.startswith(f'{nhop}/').dropna()
                    s = s.loc[s == True]
                    if_df = if_df.iloc[s.index]
                with ifcol:
                    st.info(f'Interfaces {label}')
                    st.dataframe(data=if_df, height=600)
        st.markdown("<hr>", unsafe_allow_html=True)
Ejemplo n.º 24
0
def main():
    df = read()

    acco_codes = ["all", "w", "sa", "se", "k", "b"]
    acco_names = ["All", "Waikiki", "Sahara", "Serengeti", "Kalahari", "Bali"]

    # distributions = ["weibull_min",  "exponweib"]
    # distribution_to_use =  st.sidebar.selectbox(
    #         "Which distribution to use",
    #         distributions,
    #         index=0)
    # exponweib doesn't work properly

    distribution_to_use = "weibull_min"
    # distribution_to_use = "exponweib"

    st.title(f"Schoonmaaktijden gefit aan Weibull verdeling")
    menu_choice = st.sidebar.radio(
        "",
        ["ALL", "animated", "never cleaned", "edit sheet", "show formulas"],
        index=0)
    binwidth = st.sidebar.slider("Binwidth", 1, 20, 6)
    st.sidebar.write(
        "Attention: guests are supposed to leave the accommodation as clean as they found it, so these cleaning times are in fact 'make perfect' times!"
    )
    st.sidebar.write(
        "Google sheet : https://docs.google.com/spreadsheets/d/1Lqddg3Rsq0jhFgL5U-HwvDdo0473QBZtjbAp9ol8kcg/edit#gid=0"
    )
    st.sidebar.write(
        "Source code : https://github.com/rcsmit/streamlit_scripts/schoonmaaktijden.py"
    )

    if menu_choice == "ALL":
        show_various_plots(df, acco_codes, acco_names, distribution_to_use,
                           binwidth)
    elif menu_choice == "edit sheet":
        edit_sheet()
    elif menu_choice == "never cleaned":
        check_accos_never_cleaned(df)
    elif menu_choice == "animated":
        show_animation(df, acco_codes, acco_names, distribution_to_use,
                       binwidth)

    elif menu_choice == "show formulas":

        st.header("Formulas")

        #st.write ("distribution: y =  (shape/scale) * ((x/scale)**(shape - 1)) * np.exp(-1*((x/scale)**shape)) ")

        st.write(
            "PDF - probability density function: y = (shape/scale) * ((x/scale)**(shape - 1)) * np.exp(-1*((x/scale)**shape))"
        )

        st.write(
            "CDF - cumulative distribution function: y = 1 - (np.exp(- (x/scale)**shape))"
        )
        st.subheader(
            "From percentage to time (x% of the cleans take under y minutes)")
        st.write(
            "PPF - percent point function: q = 1-p | y = scale * (-1 * np.log(q))**(1/shape)"
        )
        st.subheader("Discrete / steps")
        st.write(
            "PMF - probability mass function: a = np.exp(-1*(x/scale)**shape) | b = np.exp(-1*((x+step)/scale)**shape) | y = a-b"
        )
        st.write(
            "CDF - cumulative distribution function: b = np.exp(-1*((x+1)/scale)**shape) | y = (1-b)"
        )
        st.subheader("Various")
        st.write("cumm_hazard : y = (x/scale)**shape")
        st.write(
            "mean : n = (1+ (1/shape)) |  gamma = math.gamma(n) | y = scale*gamma"
        )
        st.write(
            "pdf_not_used :    x_min_1 = 1-np.exp(-1*((x-1/scale)**shape)) |     xx = 1-np.exp(-1*((x/scale)**shape))| y = (x_min_1 - xx)"
        )
        st.subheader("Extra info")
        st.write(
            "The shape parameter describes the shape of your data's distribution. Statisticians also refer to it as the Weibull slope because its value equals the slope of the line on a probability plot. A shape value of 2 gives a Rayleigh distribution, which is equivalent to a chi-square distribution with two degrees of freedom. A shape value near 3 approximates the normal distribution."
        )

        st.write(
            "The scale parameter represents the variability present in the distribution. It equals the 63.2nd percentile of the distribution: 63.2% of the values are less than the scale value."
        )
        st.write(
            "https://statisticsbyjim.com/probability/weibull-distribution/")
        st.subheader("Links")
        st.write(
            "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.weibull_min.html"
        )
        st.write(
            "https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Weibull.html"
        )
        st.write(
            "https://www.sciencedirect.com/topics/computer-science/weibull-distribution"
        )
        st.write(
            "https://www.itl.nist.gov/div898/handbook/eda/section3/eda3668.htm"
        )

    else:
        st.write("ËRROR")
        st.stop()
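The formulas printed by the "show formulas" branch can be sanity-checked against SciPy; a minimal sketch, assuming scipy is installed and using illustrative shape/scale values rather than values fitted by the app:

import numpy as np
from scipy.stats import weibull_min

shape, scale, x, p = 1.5, 10.0, 7.0, 0.9

# PDF, CDF and PPF written out exactly as in the strings above
pdf_manual = (shape / scale) * ((x / scale) ** (shape - 1)) * np.exp(-((x / scale) ** shape))
cdf_manual = 1 - np.exp(-((x / scale) ** shape))
ppf_manual = scale * (-np.log(1 - p)) ** (1 / shape)

assert np.isclose(pdf_manual, weibull_min.pdf(x, shape, scale=scale))
assert np.isclose(cdf_manual, weibull_min.cdf(x, shape, scale=scale))
assert np.isclose(ppf_manual, weibull_min.ppf(p, shape, scale=scale))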
Ejemplo n.º 25
0
def discovery_page(state):
    logger.info({"message": "Loading Intents Discovery page."})
    st.title("Intents Discovery")

    st.markdown("""
    When you start to develop your chatbot or even you are developing more intents, you will need to analyze user messages to map the new intents.

    This activity is a hard work task for each chatbot developer, but here we have a method to help you.

    We'll use an unsupervised learning algorithm that will try to cluster user messages into topics.

    The process is done in two stages:

    - Find the best number of clusters (n_clusters);
    - Run the unsupervised algorithm with the best number of clusters.
    """)
    st.image('images/clusters.gif')
    st.write(
        "Your job will be to analyze the examples in each topic and decide whether they should become a new intent or not."
    )

    # Initialize unlabeled_examples
    unlabeled_examples = None

    sim_option = st.radio('Where do we get unlabeled messages?',
                          options=["Watson Assistant", "Import file"])

    if sim_option == "Import file":
        st.markdown("""
        File format

        ```
        I want to make a request
        how to cancel an order
        I need to schedule a visit
        ...
        ```
        """)
        uploaded_file = st.file_uploader("Attach file", type=["csv", "xlsx"])
        if uploaded_file is not None:
            df = read_df(uploaded_file, cols_names=["examples"])
            unlabeled_examples = df["examples"].tolist()
    elif sim_option == "Watson Assistant":
        if st.button("Get logs"):
            # Getting Watson logs
            st.write("Loading Watson Assistant logs.")
            from src.connectors.watson_assistant import WatsonAssistant
            wa = WatsonAssistant(
                apikey=state.watson_args["apikey"],
                service_endpoint=state.watson_args["endpoint"],
                default_skill_id=state.watson_args["skill_id"])

            logs = wa.get_logs()
            if len(logs) > 0:
                state.discovery_data = pd.DataFrame(prepare_logs(logs))
            else:
                logger.error({
                    "message":
                    "It seems that this skill has no logs available."
                })
                st.error("It seems that this skill has no logs available.")
                st.stop()

    if isinstance(state.discovery_data, pd.DataFrame):
        if len(state.discovery_data) > 0:
            data = state.discovery_data
            max_words = data["input_words"].max()
            min_words = data["input_words"].min()

            sliders = {
                "confidence":
                st.slider('Confidence',
                          min_value=0.0,
                          max_value=1.0,
                          value=(0.3, 0.6),
                          step=0.01),
                "input_words":
                st.slider('Input words',
                          min_value=min_words,
                          max_value=max_words,
                          value=(min_words, max_words),
                          step=1)
            }

            data = data[(data["confidence"] >= sliders["confidence"][0])
                        & (data["confidence"] <= sliders["confidence"][1]) &
                        (data["input_words"] >= sliders["input_words"][0]) &
                        (data["input_words"] <= sliders["input_words"][1])]

            st.write("Selected messages: {}".format(len(data)))
            st.write(data)

            unlabeled_examples = data["input"].tolist()

    if unlabeled_examples is not None:
        if st.button("Run analysis"):
            st.write("## Working on the data")
            st.write("We are preparing the data, this may take some time.")

            # imports
            import plotly.express as px
            from src.intents.discovery import IntentsDiscovery

            # Instantiate an object of IntentsDiscovery class
            intents_discovery = IntentsDiscovery(data=unlabeled_examples,
                                                 spacy_model=state.spacy_model)

            # Apply preprocessing on dataset
            if isinstance(state.stopwords, list):
                intents_discovery.text_processing(stopwords=state.stopwords,
                                                  inplace=True)

            # Find best n_clusters
            st.write("Starting tests to find the best `n_clusters`.")
            intents_discovery.search_n_clusters()

            clustering_data = intents_discovery.clustering(
                n_clusters=intents_discovery.n_clusters)

            df = pd.DataFrame({
                "examples": clustering_data["data"],
                "labels": clustering_data["labels"]
            })

            st.markdown("""
            ## Silhouette score
            To evaluate how the unsupervised model is performing, we’ll use [Silhouette](https://en.wikipedia.org/wiki/Silhouette_(clustering)) score.
            """)

            df_score = pd.DataFrame(intents_discovery.search_data)

            st.plotly_chart(px.line(df_score,
                                    x="n_clusters",
                                    y="silhouette_score",
                                    title="Silhouette score"),
                            use_container_width=True)

            st.markdown("""
            ## Clustered messages
            See below the clustered messages or download it as csv file.
            """)

            link = download_link(df, "clustered_examples.csv",
                                 "Download CSV file")

            st.markdown(link, unsafe_allow_html=True)

            st.dataframe(df)

            st.markdown("""
            ## Topics

            Below we can see the topics that were found.
            """)

            df_topics = df.groupby("labels").count()
            df_topics.sort_values("examples", inplace=True, ascending=True)
            df_topics.reset_index(inplace=True)

            fig_title = "{} topics for {} messages.".format(
                len(df_topics), df_topics["examples"].sum())
            fig = px.bar(df_topics,
                         x="examples",
                         y="labels",
                         orientation="h",
                         hover_name="labels",
                         hover_data=["labels"],
                         title=fig_title)
            fig.layout.update(showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

    state.sync()
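The IntentsDiscovery class is project-specific, but the two-stage process described above, searching for the best n_clusters by silhouette score and then clustering with it, can be sketched with scikit-learn; the TF-IDF vectorizer, KMeans, and the toy messages are illustrative stand-ins for the project's actual pipeline:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

texts = ["i want to make a request", "how to cancel an order",
         "cancel my order please", "i need to schedule a visit",
         "can i book a visit"]
X = TfidfVectorizer().fit_transform(texts)

# stage 1: find the best number of clusters by silhouette score
best_n, best_score = 2, -1.0
for n in range(2, len(texts)):
    labels = KMeans(n_clusters=n, random_state=0, n_init=10).fit_predict(X)
    score = silhouette_score(X, labels)
    if score > best_score:
        best_n, best_score = n, score

# stage 2: run the clustering with the best number of clusters
final_labels = KMeans(n_clusters=best_n, random_state=0, n_init=10).fit_predict(X)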
Ejemplo n.º 26
0
def main():
    # online version : https://data.rivm.nl/covid-19/COVID-19_casus_landelijk.csv
    df_getdata = load_data()
    df = df_getdata.copy(
        deep=False
    )  # prevent an error [Return value of `prepare_data()` was mutated between runs.]

    start_ = "2021-05-01"
    today = datetime.today().strftime("%Y-%m-%d")
    global from_, FROM, UNTIL
    from_ = st.sidebar.text_input("startdate (yyyy-mm-dd)", start_)

    try:
        FROM = dt.datetime.strptime(from_, "%Y-%m-%d").date()
    except ValueError:
        st.error(
            "Please make sure that the start date is valid and in the format yyyy-mm-dd"
        )
        st.stop()

    until_ = st.sidebar.text_input("enddate (yyyy-mm-dd)", today)

    try:
        UNTIL = dt.datetime.strptime(until_, "%Y-%m-%d").date()
    except ValueError:
        st.error("Please make sure that the end date is in the format yyyy-mm-dd")
        st.stop()

    if FROM >= UNTIL:
        st.warning("Make sure that the end date is not before the start date")
        st.stop()

    if until_ == "2023-08-23":
        st.sidebar.error("Do you really, really, wanna do this?")
        if st.sidebar.button("Yes I'm ready to rumble"):
            caching.clear_cache()
            st.success("Cache is cleared, please reload to scrape new values")

    df.rename(
        columns={
            "Date_file": "count",
        },
        inplace=True,
    )
    # df_hospital = df[df["Hospital_admission"] == "Yes"].copy(deep=False)
    # df_deceased = df[df["Deceased"] == "Yes"].copy(deep=False)
    df = select_period(df, "Date_statistics", FROM, UNTIL)
    df_pivot = (pd.pivot_table(
        df,
        values="count",
        index=["Date_statistics"],
        columns=["Agegroup"],
        aggfunc=np.sum,
    ).reset_index().copy(deep=False))
    df_pivot["TOTAAL"] = df_pivot["0-9"] + df_pivot["10-19"] + df_pivot[
        "20-29"] + df_pivot["30-39"] + df_pivot["40-49"] + df_pivot[
            "50-59"] + df_pivot["60-69"] + df_pivot["70-79"] + df_pivot[
                "80-89"] + df_pivot["90+"]

    # df_pivot_hospital = (
    #     pd.pivot_table(
    #         df_hospital,
    #         values="count",
    #         index=["Date_statistics"],
    #         columns=["Agegroup"],
    #         aggfunc=np.sum,
    #     )
    #     .reset_index()
    #     .copy(deep=False)
    # )

    # df_pivot_deceased = (
    #     pd.pivot_table(
    #         df_deceased,
    #         values="count",
    #         index=["Date_statistics"],
    #         columns=["Agegroup"],
    #         aggfunc=np.sum,
    #     )
    #     .reset_index()
    #     .copy(deep=False)
    # )

    #df_pivot = df_pivot.add_prefix("pos_test_")
    # df_pivot_hospital = df_pivot_hospital.add_prefix("hosp_")
    # save_df(df_pivot_hospital, "df_hospital_per_dag")
    # df_pivot_deceased = df_pivot_deceased.add_prefix("deceased_")
    # print(df_pivot_deceased.dtypes)
    todrop = [
        "Date_statistics_type",
        "Sex",
        "Province",
        "Hospital_admission",
        "Deceased",
        "Week_of_death",
        "Municipal_health_service",
    ]
    df = drop_columns(df, todrop)

    # save_df(df, "landelijk_leeftijd_2")

    # save_df(df_pivot, "landelijk_leeftijd_pivot")
    #save_df(df_pivot_hospital, "landelijk_leeftijd_pivot_hospital")
    #save_df(df_pivot_deceased, "landelijk_leeftijd_pivot_deceased")

    # df_temp = pd.merge(
    #     df_pivot,
    #     df_pivot_hospital,
    #     how="outer",
    #     left_on="pos_test_Date_statistics",
    #     right_on="hosp_Date_statistics",
    # )
    # df_temp = pd.merge(
    #     df_temp,
    #     df_pivot_deceased,
    #     how="outer",
    #     left_on="pos_test_Date_statistics",
    #     right_on="deceased_Date_statistics",
    # )

    #df_temp_per_week = df_temp.groupby(pd.Grouper(key='pos_test_Date_statistics', freq='W')).sum()
    #df_temp_per_week.index -= pd.Timedelta(days=6)
    #print(df_temp_per_week)
    #df_temp_per_week["weekstart"]= df_temp_per_week.index
    #save_df(df_temp, "final_result")
    #save_df(df_temp_per_week, "final_result_per_week")

    lijst = [
        "0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79",
        "80-89", "90+", "TOTAAL"
    ]
    ages_to_show = st.sidebar.multiselect("Ages to show (multiple possible)",
                                          lijst, lijst)
    global WDW2
    df = df_pivot.copy(deep=False)
    t = "SMA"
    tg = st.sidebar.slider("Generation time", 1, 7, 4)
    d = st.sidebar.slider("Look back how many days", 1, 14, 7)
    WDW2 = st.sidebar.slider("Window smoothing curves (days)", 1, 45, 7)
    centersmooth = st.sidebar.selectbox("Smooth in center", [True, False],
                                        index=1)
    df, smoothed_columns = smooth_columnlist(df, ages_to_show, t, WDW2,
                                             centersmooth)
    df, column_list_r_smoothened = add_walking_r(df, smoothed_columns,
                                                 "Date_statistics", t, WDW2,
                                                 tg, d)
    make_age_graph(df, column_list_r_smoothened, lijst,
                   "R number by age")

    st.write(
        "Attention: this is the R number based on the moment of reporting. RIVM calculates the R number based on the moment of infection or of first symptoms."
    )
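The copy(deep=False) near the top of this example works around a Streamlit caching rule: st.cache returns the same object on every rerun, so mutating it in place raises the "Return value of `prepare_data()` was mutated between runs" error quoted in the comment. A minimal sketch of the pattern, with load_data standing in for the real CSV download:

import pandas as pd
import streamlit as st

@st.cache
def load_data():
    # stand-in for downloading COVID-19_casus_landelijk.csv
    return pd.DataFrame({"Date_file": ["2021-05-01"], "Agegroup": ["0-9"]})

df = load_data().copy(deep=False)  # work on a copy so the cached frame stays untouched
df.rename(columns={"Date_file": "count"}, inplace=True)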
Ejemplo n.º 27
0
def main():
    state.re_init(current_mod_name)
    if st.checkbox("显示说明", value=True):
        st.markdown(show_describtion())
    state.func2_app['answer_dir'] = st.text_input(
        "输入标准答案样本的目录", state.func2_app['answer_dir'])
    state.func2_app['pending_dir'] = st.text_input(
        "输入待查答案样本的目录", state.func2_app['pending_dir'])
    show_widget_1 = st.empty()
    if not (os.path.exists(state.func2_app['answer_dir'])
            and os.path.exists(state.func2_app['pending_dir'])):
        show_widget_1.warning('xml directory does not exist')
    else:
        small_thr = None
        ignore_small = False
        if st.checkbox('Enable lenient small-object mode'):
            if st.checkbox('Ignore small objects'):
                ignore_small = True
            st.markdown('''
            In lenient small-object mode, the IoU threshold used for objects smaller than the preset size is lowered, \n
            specifically to the maximum of (current threshold - 0.3) and 0.2. \n
            If ignoring small objects is enabled, objects below the size threshold are not considered at all.
            ''')
            small_thr = st.number_input('Minimum area ratio (%)',
                                        min_value=1.0,
                                        max_value=100.0,
                                        value=5.0,
                                        step=0.1)

        iou_thr = show_widget_1.slider('Select the IoU threshold:',
                                       min_value=0.0,
                                       max_value=1.0,
                                       value=0.5,
                                       step=0.1)

        show_widget_2 = st.info('Computing results!')
        print('current count dir is {}'.format(state.func2_app['pending_dir']))
        sub_pend_dirs = utils.get_sub_dir(state.func2_app['pending_dir'])
        print('sub_dir get done!', sub_pend_dirs)
        if not sub_pend_dirs:
            show_widget_2.text('The directory to check has no subdirectories; '
                               'please organize it into the correct structure')
            st.stop()
        show_widget_2.text('Fetched subdirectories')
        result_dict = {}

        ans_xml_dir = utils.get_son_dir(state.func2_app['answer_dir'])

        for sub_dir in sub_pend_dirs:
            show_widget_2.info('Computing results for {}'.format(sub_dir))
            result = deal_one_sub_pend_dir(sub_dir,
                                           ans_xml_dir,
                                           iou_thr,
                                           small_thr,
                                           ignore_small,
                                           placeholder_widget=show_widget_2)
            result_dict[sub_dir] = result
        result_str = show_result(result_dict)
        show_widget_2.markdown(result_str)
        # Display images
        if st.checkbox('Show images with problematic results', value=False):
            persons = {x.split(os.path.sep)[-1]: x for x in result_dict.keys()}
            current_person = st.radio("Folder whose results are shown",
                                      tuple(persons.keys()))
            current_person = persons[current_person]
            _, wrong_label_result, _, inaccurate_result, _, _, _, _ = result_dict[
                current_person]
            wl_show_dict = deal_show_result(wrong_label_result)
            ua_show_dict = deal_show_result(inaccurate_result)
            st.header('Images with wrong labels')
            show_pair_result(wl_show_dict)
            if repr(wl_show_dict) == repr(ua_show_dict):
                st.text('The wrongly labeled and inaccurately boxed images are '
                        'the same, so the latter are not shown again')
            else:
                st.header('Images with inaccurate boxes')
                show_pair_result(ua_show_dict)

        if st.checkbox('Export the results', value=True):
            save_dir = st.text_input('Directory to save the results')
            if os.path.exists(save_dir):
                export_result(result_dict, save_dir)
                st.write('Export complete')
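The lenient small-object rule described in the markdown above, lowering the IoU threshold for small objects to max(threshold - 0.3, 0.2) or skipping them entirely, is simple enough to sketch on its own; the helper name and the percentage convention are illustrative, not taken from the example:

def effective_iou_threshold(iou_thr, area_ratio_pct, small_thr_pct, ignore_small):
    """Return the IoU threshold to use for one object, or None to skip it."""
    if small_thr_pct is None or area_ratio_pct >= small_thr_pct:
        return iou_thr                 # normal-sized object: threshold unchanged
    if ignore_small:
        return None                    # small object is ignored entirely
    return max(iou_thr - 0.3, 0.2)     # lenient threshold for small objects

assert effective_iou_threshold(0.5, 2.0, 5.0, ignore_small=False) == 0.2
assert effective_iou_threshold(0.5, 2.0, 5.0, ignore_small=True) is None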
Ejemplo n.º 28
0
def app():

    # st.title("Home")
    st.title('Analyze End User License Agreement (EULA) Clauses using Interpretable Machine Learning')
    st.subheader("Developed by Andrew Mendez")
    

    @st.cache
    def get_pdf(filepath):
        pages_pdf = extract_clauses_from_pdf(filepath)
        clauses_pdf = preprocess_clauses_pdf(pages_pdf)
        return clauses_pdf
    @st.cache
    def get_docx(filepath):
        return get_text_from_docx(filepath)
    
    @st.cache
    def get_predictions(clauses):
        with st.spinner("Loading Model and predicting clause acceptabilty..."):
            device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
            PATH_TO_MODEL = '/Users/andrewmendez1/Documents/ai-ml-challenge-2020/data/Finetune BERT oversampling 8_16_2020/Model_1_4_0'
            best_model,tokenizer = load_model_and_tokenizer(PATH_TO_MODEL,device)
            ref_token_id = tokenizer.pad_token_id  # token used for generating the token reference
            sep_token_id = tokenizer.sep_token_id  # separator token, also added to the end of the text
            cls_token_id = tokenizer.cls_token_id  # token prepended to the word sequence
            print(device)

            acceptable_eulas = []
            unacceptable_eulas = []
            num_pos = 0
            for clause in clauses:
                pred,confidence = get_prediction_and_confidence(best_model,clause,tokenizer,device,ref_token_id,sep_token_id,cls_token_id)
                if int(pred)==0:
                    num_pos+=1
                    acceptable_eulas.append([clause,int(pred),float(confidence)])
                else:
                    unacceptable_eulas.append([clause,int(pred),float(confidence)])


        

        return acceptable_eulas,unacceptable_eulas, num_pos
    
    st.header("Welcome to the EULA Analyzer. This website allows you analyze EULA documents to determine whether terms and conditions are acceptable to the government.")
    st.write(" ")
    st.subheader("To get started, enter the file path for a PDF or Word document. Then, press Enter to upload.")

    filename = st.text_input(r'Example on Mac - /Users/amendez/MyDocuments/EULAS/testEULA.docx ; Example on Windows - C:\Users\amendez\MyDocuments\EULAS\testEULA.docx')
    if not filename:
        st.warning('Please upload file')
        st.stop()
    
    ext = os.path.splitext(filename)[1]
    name = filename.split("/")[-1]
    if ext == '.pdf':
        with st.spinner("Extracting clauses from .pdf..."):
            clauses = get_pdf(filename)
    elif ext == '.docx':
        with st.spinner("Extracting Clauses from .docx..."):
            clauses = get_docx(filename)
    else:
        st.error("Unsupported file type: {}".format(ext))
        st.stop()

    st.write(" ")
    st.subheader("Upload is complete. The system has identified {} clauses.".format(len(clauses)))
    
    if st.checkbox("(Optional) Select to show all clauses extracted from the EULA"):
        st.subheader("Clauses from the {} EULA Document".format(name))
        index = st.slider('Click the slider and press left and right arrow keys to explore data.', 0, len(clauses) - 1, 1)
        st.write(HTML_WRAPPER.format(clauses[index]),unsafe_allow_html=True)

    st.subheader("Next, select checkbox to analyze EULA terms and conditions for acceptability.")
    
    if st.checkbox("Run Model"):
        st.subheader("Model Results")
        acceptable_eulas,unacceptable_eulas,num_pos = get_predictions(clauses)
        plt.bar(['Acceptable','Unacceptable'],np.array([num_pos,len(clauses)-num_pos]))
        plt.ylabel("Number of Clauses")
        plt.title("Overview of clauses predicted Acceptable/Unacceptable")
        plt.show()
        st.pyplot()
        st.write("The model has identified {} clauses as Acceptable, and {} clauses as Unacceptable.".format(num_pos,len(clauses)-num_pos))
        st.write(" ")
        st.subheader("Explore Acceptable clauses:")

        index1 = st.slider('Click the slider and press left and right arrow keys to explore data.', 0, len(acceptable_eulas) - 1, 1)
        label1 = ''
        if acceptable_eulas[index1][1] == 0:
            label1 = 'Acceptable'
        else:
            label1 = 'Unacceptable'
        st.subheader("The model has identified this clause as {} with {:.1f} % confidence.".format(label1,acceptable_eulas[index1][2]*100 ))
        st.write(HTML_WRAPPER.format(acceptable_eulas[index1][0]),unsafe_allow_html=True)
        st.write(" ")
        st.subheader("Explore Unacceptable clauses:")
        index2 = st.slider('Click the slider and press left and right arrow keys to explore data.', 0, len(unacceptable_eulas) - 1, 1)
        label2 = ''
        if unacceptable_eulas[index2][1] == 0:
            label2 = 'Acceptable'
        else:
            label2 = 'Unacceptable'
        st.subheader("The model has identified this clause as {} with {:.1f} % confidence.".format(label2,unacceptable_eulas[index2][2]*100 ))
        st.write(HTML_WRAPPER.format(unacceptable_eulas[index2][0]),unsafe_allow_html=True)
        st.subheader("Explore Model Interpretation")
        if st.checkbox("Select to see why individual clauses were identified as unacceptable."):
            # st.write(" Here we are leveraging the IntegratedGradients to interpret model predictions and show specific words that have highest attribution to the model output.")
            # st.write("Integrated gradients is an axiomatic attribution method that assigns an attribution(i.e. factor) score to each word/token in the clause.")
            # st.write(" To run, select the predicted unacceptable clause and press Interpret.")
            st.subheader("Explore why clauses were identified as Unacceptable:")
            index3 = st.slider('Click the slider and press left and right arrow keys to explore unacceptable clauses.', 0, len(unacceptable_eulas) - 1, 1)
            st.write(HTML_WRAPPER.format(unacceptable_eulas[index3][0]),unsafe_allow_html=True)
            text = unacceptable_eulas[index3][0]
            if st.button("Interpret Prediction"):
                with st.spinner("Running Model Interpretation Analysis..."):
                    interpret_main(text,"?")
Ejemplo n.º 29
0
        if times == 0:
            break

    #---- Presenting the selection
    select = (dados.consumo == 1)
    colunas = ['nome', 'preco', 'pais', 'tipo', 'descricao', 'uvas', 'rating']
    df_temp = dados[select][colunas].sort_values(by=['uvas'], ascending=False)

    #--- Need to check whether there is anything to present !!
    tam_temp = df_temp.shape[0]

    if tam_temp == 0:
        st.write(
            'We are sorry to report that your selection returned no results. Please try again!'
        )
        st.stop()
    # else:
    # st.write('Your consumption selection is shown below:')
    # st.table(df_temp)

    #############################################################################################################
    ## Recommendation function based on the Vector Space Model concept, in which a text
    ## document is transformed into a vector representation in a multi-dimensional space.
    ## After the transformation, we apply the cosine similarity measure to determine
    ## which observations in the catalog are closest to the user's set of preferences.
    #############################################################################################################

    # TfidfVectorizer has no built-in set of Portuguese stop words, but it accepts a list
    # of words as a parameter, so we generate the Portuguese stop words using NLTK.

    # Store the Portuguese stop words in a variable
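A minimal sketch of the vector-space recommendation that the comment block describes, assuming NLTK's stopwords corpus can be downloaded; the toy catalog and user profile are illustrative:

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download("stopwords", quiet=True)
stopwords_pt = nltk.corpus.stopwords.words("portuguese")

catalog = ["vinho tinto seco frutado", "vinho branco leve citrico",
           "tinto encorpado de taninos firmes"]
user_profile = ["tinto seco encorpado"]

vectorizer = TfidfVectorizer(stop_words=stopwords_pt)
doc_vectors = vectorizer.fit_transform(catalog)      # catalog in vector space
profile_vector = vectorizer.transform(user_profile)  # preferences in the same space

# rank catalog items by cosine similarity to the user's preferences
scores = cosine_similarity(profile_vector, doc_vectors).ravel()
ranking = scores.argsort()[::-1]                     # closest items first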
Ejemplo n.º 30
0
def url_data():
    about()
    st.info("This feature has limited functionality")
    url = st.text_input("Webpage URL", help="Enter the URL where your data is located")
    if url == "":
        st.info("Please enter a valid input to get started")
        st.stop()
    
    # getting the data column names as user input
    column_name = st.text_input("Enter candidate column name", key="value")
    value_list = column_name.split(",")

    # getting example data values for reference
    candidate = st.text_input("Candidate example value", key="candidates",
                              help="use ; as separator to enter another value")
    items_list = candidate.split(";")
    # st.write(items)
    
    # create the scraper object
    scraper = AutoScraper()
    # feed it the url and the example values to build scraping rules
    final_result = scraper.build(url, items_list)

    # collect similar results, grouped by rule, and drop duplicate groups
    results = scraper.get_result_similar(url, grouped=True, keep_order=True)
    result = {}
    for key, value in results.items():
        if value not in result.values():
            result[key] = value

    orient_df = pd.DataFrame.from_dict(result, orient="index")
    df = orient_df.transpose()

    df.columns = value_list
    df.fillna(value=np.nan, inplace=True)  # pd.np was removed in pandas >= 2.0; assumes `import numpy as np`
    st.write(df)
    
    cols = df.columns.tolist()
    col1, col2 = st.beta_columns(2)

    target = col1.selectbox("Select Target", cols, key="target")

    typelist = ['binary', 'multiclass', 'regression', 'time series regression',
                'time series multiclass', 'time series binary']
    p_type = col2.selectbox("Select problem type", typelist, key="p_type")
    x = df.drop(columns=target)
    y=df[target]
    x_train,x_test,y_train,y_test=evalml.preprocessing.split_data(x,y,problem_type=p_type)

    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()


    rank=automl.rankings

    # check the best pipeline found by the search
    best_pipeline = automl.best_pipeline
    description = automl.describe_pipeline(automl.rankings.iloc[0]["id"])

    # TODO: optimize the code

    # evaluate on hold-out data
    problem_list=['binary','time series binary']
    problem_list2=['multiclass','time series multiclass']

    cola,col_b,colc=st.beta_columns(3)
    
    if p_type in problem_list:
        objective=col_b.selectbox("select objective",objectives().binary_obj,key="objective selector")  
        best_pipeline.score(x_test, y_test, objectives=["auc","f1","Precision","Recall"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['f1', 'precision'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict_proba(x_test).to_dataframe()


    # for multiclass-type problems
    elif p_type in problem_list2:
        objective=col_b.selectbox("select objective",objectives().multiclass_obj,key="objective selector") 
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass","MCC multiclass","accuracy multiclass"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict(x_test).to_series()

    
    # for regression-type problems
    else:
        objective = col_b.selectbox("select objective", objectives().regression_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2", "MSE", "MAE", "Root Mean Squared Error"])
        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                     problem_type=p_type,
                                     objective=objective,
                                     additional_objectives=['Root Mean Squared Error', 'MSE', 'MAE'],
                                     max_batches=1,
                                     optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings = automl_tunned.rankings

        tunned_description = automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"], return_dict=True)

        tunned_pipeline = automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test, objectives=[objective])

        tunned_pipeline.fit(x_train, y_train)

        pred = tunned_pipeline.predict(x_test).to_series()
                
    with open("model_details.txt", "w") as file:
        file.write(repr(tunned_description))

    def get_binary_file_downloader_html(bin_file, file_label='File'):
        # read the file and embed it in a base64 data-URI download link
        with open(bin_file, 'rb') as f:
            data = f.read()
        bin_str = base64.b64encode(data).decode()
        href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
        return href
    col1, col2, col3 = st.beta_columns([1, 1, 1])
    if col2.button("Predict Results", key="output", help="shows results"):
        with st.spinner(text='In progress'):
            st.info("Wait while we select the best algorithm for your problem. Hold your breath.")
            time.sleep(20)
        st.info("Done. Here you go.")
        st.write(pred)

    col11, col12 = st.beta_columns([3, 1])
    with col11:
        with st.beta_expander("Compare Models"):
            st.write(tunned_rankings)

    with col12:
        with st.beta_expander("Best Pipeline"):
            st.success(tunned_pipeline)
            st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'), unsafe_allow_html=True)