import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()

# Build a fresh-vs-rotten corpus from the Rotten Tomatoes sample data,
# tokenized with the lightweight whitespace NLP.
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())

# Construct informative Dirichlet priors from general term frequencies
# and term counts in all categories.
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          .use_general_term_frequencies()
          .use_all_categories()
          .get_priors())

# Write the fightin' words explorer to an HTML file.
open(fn, 'wb').write(
    st.produce_fightin_words_explorer(
        corpus,
        category='fresh',
        not_categories=['rotten'],
        metadata=df['movie_name'],
        term_scorer=LogOddsRatioInformativeDirichletPrior(priors, alpha_w=10),
    ).encode('utf-8'))
print(fn)
st.CorpusFromParsedDocuments(reviews_df,
                             category_col='category',
                             parsed_col='parse',
                             #feats_from_spacy_doc=st.PhraseMachinePhrases()
                             ).build())

term_ranker = st.OncePerDocFrequencyRanker

# Keep only the four accept/reject x positive/negative review categories,
# reduce the corpus to unigrams, and compact the term list.
corpus = (full_corpus
          .keep_only_these_categories(['Accept, Positive', 'Accept, Negative',
                                       'Reject, Positive', 'Reject, Negative'],
                                      False)
          .get_unigram_corpus()
          .compact(st.ClassPercentageCompactor(term_count=5)))

print('finding priors', time.time() - t0, 's')

# Use term counts from every category in the full corpus as the prior.
priors = (st.PriorFactory(full_corpus, starting_count=0.01)
          .use_all_categories()
          .get_priors())

print('building four square', time.time() - t0, 's')

four_square = st.FourSquare(
    corpus,
    category_a_list=['Accept, Positive'],
    not_category_a_list=['Reject, Negative'],
    category_b_list=['Accept, Negative'],
    not_category_b_list=['Reject, Positive'],
    term_ranker=term_ranker,
    scorer=st.LogOddsRatioInformativeDirichletPrior(priors, 500, 'word'),
    labels={'a': 'Positive Reviews of Accepted Papers',
            'b': 'Negative Reviews of Accepted Papers',
            'not_a_and_not_b': 'Rejections',
            'a_and_b': 'Acceptances',