Esempio n. 1
0
    def gitc(self, dataframe):
        """Build a General Inquirer frequency explorer and open it in a browser.

        A scattertext corpus is built from ``dataframe`` using General
        Inquirer features, rendered to HTML with
        ``st.produce_frequency_explorer``, written to ``self.gitc_file`` and
        then opened with ``webbrowser``.

        Args:
            dataframe: pandas DataFrame providing the ``'Document Type'``
                (category), ``'Text'`` (document text) and ``'Document'``
                (per-document metadata) columns read below.
        """
        general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

        corpus = st.CorpusFromPandas(
            dataframe,
            category_col='Document Type',
            text_col='Text',
            nlp=st.whitespace_nlp_with_sentences,
            feats_from_spacy_doc=general_inquirer_feature_builder).build()

        # NOTE(review): 'submission' is presumably one of the values found in
        # the 'Document Type' column -- confirm against the caller's data.
        html = st.produce_frequency_explorer(
            corpus,
            category='submission',
            category_name='Submission',
            not_category_name='Standard',
            use_non_text_features=True,
            use_full_doc=True,
            term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
            grey_threshold=1.96,
            width_in_pixels=1000,
            metadata=dataframe['Document'],
            topic_model_term_lists=general_inquirer_feature_builder.
            get_top_model_term_lists())

        logger.getLogger().info("Opening GITC-Visual")
        # Close the file deterministically so the HTML is fully flushed to
        # disk before the browser is asked to load it.
        with open(self.gitc_file, 'wb') as html_file:
            html_file.write(html.encode('utf-8'))
        webbrowser.open("file://" + self.gitc_file)
Esempio n. 2
0
def logodds_ratio_with_uninformative_dirichlet_prior(df, alpha_w=0.0001):
    """Score every row of ``df`` against the sum of all the other rows.

    For each row, the row itself is passed to the scorer as the positive
    counts and the column totals minus that row as the negative counts.

    Args:
        df: pandas DataFrame of counts; each row is scored in turn.
        alpha_w: forwarded to
            ``scattertext.LogOddsRatioUninformativeDirichletPrior``.

    Returns:
        A DataFrame with one row per index label of ``df`` holding the
        scores returned by the scorer for that row.
    """
    prior_scorer = scattertext.LogOddsRatioUninformativeDirichletPrior(
        alpha_w=alpha_w)

    # Column-wise totals over all rows; subtracting a row gives "the rest".
    column_totals = df.sum(axis=0)

    scores_by_label = {
        label: prior_scorer.get_scores(counts, column_totals - counts)
        for label, counts in df.iterrows()
    }

    # The dict is keyed by row label, so transpose to get labels as rows.
    return pd.DataFrame(scores_by_label).T
Esempio n. 3
0
    def scatter_viz(self):
        """Render the scattertext explorer to an HTML file and return its path.

        The corpus comes from ``self.create_corpus()`` and the per-document
        metadata from the ``'position'`` column of ``self.clean_texts()``.
        The HTML is written to ``key_path + '/templates/scattertext.html'``.

        Returns:
            The path of the HTML file that was written.
        """
        # Load corpus.
        corpus = self.create_corpus()

        # Load cleaned dataframe.
        convention_df = self.clean_texts()

        html = produce_scattertext_explorer(
            corpus,
            category='left',
            category_name='Democratic',
            not_category_name='Republican',
            width_in_pixels=1000,
            minimum_term_frequency=1,
            metadata=convention_df['position'],
            term_significance=st.LogOddsRatioUninformativeDirichletPrior())

        file_name = key_path + '/templates/scattertext.html'
        # Use a context manager so the handle is closed (and the content
        # flushed) before the path is handed back to the caller.
        with open(file_name, 'wb') as html_file:
            html_file.write(html.encode('utf-8'))

        return file_name
Esempio n. 4
0
import scattertext as st

# Demo: write a General Inquirer frequency explorer for the
# ConventionData2012 sample data to a standalone HTML file.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

# Categories come from the 'party' column, document text from 'text'.
corpus_factory = st.CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder,
)
corpus = corpus_factory.build()

# Keyword arguments for the explorer, gathered in one place.
explorer_options = dict(
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
    grey_threshold=1.96,
    width_in_pixels=1000,
    topic_model_term_lists=(
        general_inquirer_feature_builder.get_top_model_term_lists()
    ),
    metadata_descriptions=general_inquirer_feature_builder.get_definitions(),
)
html = st.produce_frequency_explorer(corpus, **explorer_options)

# Write the page as UTF-8 bytes and tell the user where to find it.
fn = 'demo_general_inquirer_frequency_plot.html'
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (fn))