Example #1
0
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Build a Fightin' Words explorer for the Rotten Tomatoes sample corpus
# ('fresh' vs. 'rotten') and save it as a standalone HTML file.
fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences).build())

# Priors handed to the log-odds scorer below. The factory starts from a
# count of 1 and is then configured with general term frequencies and all
# corpus categories (per the method names; see the scattertext docs for the
# exact weighting).
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1).use_general_term_frequencies().
          use_all_categories().get_priors())

html = st.produce_fightin_words_explorer(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    metadata=df['movie_name'],
    term_scorer=LogOddsRatioInformativeDirichletPrior(priors, alpha_w=10),
)

# Write through a context manager so the file is flushed and closed before
# its path is reported, instead of leaving the handle to the garbage collector.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print(fn)
    st.CorpusFromParsedDocuments(
        reviews_df,
        category_col='category',
        parsed_col='parse',
        #feats_from_spacy_doc=st.PhraseMachinePhrases()
    ).build())

# Ranker class (kept uninstantiated) used by later steps of the script.
term_ranker = st.OncePerDocFrequencyRanker

# Keep only the four decision/sentiment categories, take the unigram corpus,
# then compact it with a ClassPercentageCompactor (term_count=5).
review_categories = [
    'Accept, Positive',
    'Accept, Negative',
    'Reject, Positive',
    'Reject, Negative',
]
unigram_corpus = full_corpus.keep_only_these_categories(
    review_categories, False).get_unigram_corpus()
corpus = unigram_corpus.compact(st.ClassPercentageCompactor(term_count=5))

# Progress messages print the seconds elapsed since t0, which is defined
# outside this excerpt.
print('finding priors', time.time() - t0, 's')
prior_factory = st.PriorFactory(full_corpus, starting_count=0.01)
priors = prior_factory.use_all_categories().get_priors()
print('building four square', time.time() - t0, 's')

four_square = st.FourSquare(corpus,
                            category_a_list=['Accept, Positive'],
                            not_category_a_list=['Reject, Negative'],
                            category_b_list=['Accept, Negative'],
                            not_category_b_list=['Reject, Positive'],
                            term_ranker=term_ranker,
                            scorer=st.LogOddsRatioInformativeDirichletPrior(
                                priors, 500, 'word'),
                            labels={
                                'a': 'Positive Reviews of Accepted Papers',
                                'b': 'Negative Reviews of Accepted Papers',
                                'not_a_and_not_b': 'Rejections',
                                'a_and_b': 'Acceptances',