parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))

# Build an uncategorized corpus from the pre-parsed 'parse' column, reduce it
# to its unigram corpus, and filter infrequent words with a minimum term
# count of 6.
corpus = (
    st.CorpusWithoutCategoriesFromParsedDocuments(df, parsed_col='parse')
    .build()
    .get_unigram_corpus()
    .remove_infrequent_words(minimum_term_count=6)
)

# Dispersion statistics are derived once from the whole corpus.
dispersion = st.Dispersion(corpus)

# Plot coordinates: X is the Frequency column (log-scaled into Xpos) and Y is
# the Rosengren's dispersion metric (scaled into Ypos).
dispersion_df = dispersion.get_df().assign(
    X=lambda frame: frame.Frequency,
    Xpos=lambda frame: st.Scalers.log_scale(frame.X),
    # The metric comes straight from `dispersion`; the frame is not consulted.
    Y=lambda _: dispersion.rosengrens(),
    Ypos=lambda frame: st.Scalers.scale(frame.Y),
)

# Render the frequency-vs-dispersion scatterplot. Each document's metadata
# label has the form "<speaker> (<PARTY>)".
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    ignore_categories=True,
    metadata=(
        corpus.get_df()['speaker']
        + ' ('
        + corpus.get_df()['party'].str.upper()
        + ')'
    ),
    x_label='Log Frequency',
    y_label="Rosengren's S",
    y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
)

# Write the rendered HTML to disk. The context manager closes the file handle
# deterministically, so the content is fully written before the user is told
# to open the file.
fn = 'demo_dispersion_basic.html'
with open(fn, 'w') as of:
    of.write(html)
print('open ./%s in Chrome' % fn)
    ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual))

# Coordinates of the line built from the Expected column, ordered by x.
line_df = pd.DataFrame(
    dict(
        x=dispersion_df.Xpos.values,
        y=dispersion_df.Expected.values,
    )
).sort_values('x')

# Render the dispersion plot, coloured and ranked by the Residual-based
# columns, with the line from `line_df` drawn on top.
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    ignore_categories=True,
    # Per-document label: "<speaker> (<PARTY>)".
    metadata=(
        corpus.get_df()['speaker']
        + ' ('
        + corpus.get_df()['party'].str.upper()
        + ')'
    ),
    x_label='Log Frequency',
    y_label='DA',
    y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
    tooltip_columns=['Frequency', 'DA'],
    color_score_column='ColorScore',
    left_list_column='Residual',
    header_names=dict(upper='Lower than Expected', lower='More than Expected'),
    background_color='#e5e5e3',
    line_coordinates=line_df.to_dict('records'),
)

# Write the rendered HTML to disk. The context manager closes the file handle
# deterministically, so the content is fully written before the user is told
# to open the file.
fn = 'demo_dispersion.html'
with open(fn, 'w') as of:
    of.write(html)
print('open ./%s in Chrome' % fn)

# NOTE(review): keyword order matters here. `Expected` is listed first, so it
# is fitted on the X/Y columns already present in `dispersion_df`; Xpos, Y and
# Ypos are recomputed afterwards. Confirm this ordering is intended.
residual_dispersion_df = dispersion_df.assign(
    Expected=lambda frame: Lowess().fit_predict(
        frame.X.values, frame.Y.values
    ),
    Xpos=lambda frame: st.Scalers.log_scale(frame.X),
    # The metric comes straight from `dispersion`; the frame is not consulted.
    Y=lambda _: dispersion.rosengrens(),
    Ypos=lambda frame: st.Scalers.scale(frame.Y),
)

# Fit a 10-nearest-neighbour regression of Y on X, take the residual from that
# fit, and derive the colour score from the residual.
dispersion_df = dispersion_df.assign(
    Expected=lambda frame: (
        KNeighborsRegressor(n_neighbors=10)
        # The regressor is given X as a single-column 2-D array.
        .fit(frame.X.values.reshape(-1, 1), frame.Y)
        .predict(frame.X.values.reshape(-1, 1))
    ),
    Residual=lambda frame: frame.Y - frame.Expected,
    ColorScore=lambda frame: st.Scalers.scale_center_zero_abs(frame.Residual),
)

# Render the dispersion plot, coloured and ranked by the Residual-based
# columns computed on `dispersion_df`.
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    ignore_categories=True,
    # Per-document label: "<speaker> (<PARTY>)".
    metadata=(
        corpus.get_df()['speaker']
        + ' ('
        + corpus.get_df()['party'].str.upper()
        + ')'
    ),
    x_label='Log Frequency',
    y_label="Rosengren's S",
    y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
    color_score_column='ColorScore',
    left_list_column='Residual',
    header_names=dict(upper='Lower than Expected', lower='More than Expected'),
    background_color='#e5e5e3',
)

# Write the rendered HTML to disk. The context manager closes the file handle
# deterministically, so the content is fully written before the user is told
# to open the file.
fn = 'demo_dispersion.html'
with open(fn, 'w') as of:
    of.write(html)
print('open ./%s in Chrome' % fn)
Exemple #4
0
    Ypos=lambda df: st.Scalers.dense_rank(df.Y),
    Xpos=lambda df: st.Scalers.dense_rank(df.X),
    SuppressDisplay=False,
    ColorScore=lambda df: st.Scalers.scale_center_zero(df.Ypos - df.Xpos),
)

# Render the Democratic-vs-Republican scatterplot from `plot_df`.
# NOTE(review): `suppress_text_column` names a 'Display' column, while the
# visible part of the `plot_df` construction assigns 'SuppressDisplay';
# confirm that a 'Display' column exists in `plot_df`.
html = st.dataframe_scattertext(
    corpus,
    plot_df=plot_df,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    suppress_text_column='Display',
    metadata=corpus.get_df()['speaker'],
    use_non_text_features=True,
    ignore_categories=False,
    use_offsets=True,
    unified_context=False,
    color_score_column='ColorScore',
    left_list_column='ColorScore',
    # Axis label spelling corrected (was 'Democarats').
    y_label='Democrats',
    x_label='Republicans',
    header_names={
        'upper': 'Top Democratic',
        'lower': 'Top Republican',
        'right': 'Most Frequent',
    },
    subword_encoding='RoBERTa',
)

# Persist the rendered page, then tell the user where to find it.
fn = 'roberta_sentence_piece.html'
with open(fn, mode='w') as of:
    of.write(html)

print(f"Open ./{fn} in Chrome.")
Exemple #5
0
    ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.AdjustedDA))

# Coordinates of the line built from the Expected column, ordered by x.
line_df = pd.DataFrame(
    dict(
        x=dispersion_df.Xpos.values,
        y=dispersion_df.Expected.values,
    )
).sort_values('x')

# Render the dispersion plot, coloured by ColorScore and ranked by AdjustedDA,
# with the line from `line_df` drawn on top.
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    ignore_categories=True,
    # Per-document label: "<speaker> (<PARTY>)".
    metadata=(
        corpus.get_df()['speaker']
        + ' ('
        + corpus.get_df()['party'].str.upper()
        + ')'
    ),
    x_label='Log Frequency',
    y_label='DA',
    y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
    tooltip_columns=['Frequency', 'DA'],
    color_score_column='ColorScore',
    left_list_column='AdjustedDA',
    header_names=dict(upper='Lower than Expected', lower='More than Expected'),
    background_color='#e5e5e3',
    line_coordinates=line_df.to_dict('records'),
)

# Write the rendered HTML to disk. The context manager closes the file handle
# deterministically, so the content is fully written before the user is told
# to open the file.
fn = 'demo_dispersion.html'
with open(fn, 'w') as of:
    of.write(html)
print('open ./%s in Chrome' % fn)

residual_dispersion_df = dispersion_df.assign(
    Expected=lambda df: Lowess().fit_predict(df.X.values, df.Y.values),