def _read_leading_words(path, max_words):
    """Return the first `max_words` whitespace-separated tokens of `path`, joined by single spaces."""
    # The context manager guarantees the handle is closed even if the read
    # raises; the original left every opened file for the garbage collector.
    with open(path) as handle:
        return ' '.join(handle.read().split()[:max_words])


def plot_distinctive_words(x_label='', x_files=None, y_label='', y_files=None,
                           max_words=10000, max_files=100):
    """Create a scatterplot of the distinctive words among two groups of files.

    Each file contributes one document: its text is split on whitespace,
    truncated to the first `max_words` tokens and re-joined with single
    spaces. Only the first `max_files` files of each group are read.

    Args:
        x_label: Group name for the documents read from `x_files`; passed to
            scattertext as `not_category_name`.
        x_files: Paths of the files in the x group. `None` (the default) is
            treated as an empty list.
        y_label: Group name for the documents read from `y_files`; passed to
            scattertext as both `category` and `category_name`.
        y_files: Paths of the files in the y group. `None` (the default) is
            treated as an empty list.
        max_words: Maximum number of whitespace-separated tokens kept per file.
        max_files: Maximum number of files read per group.

    Returns:
        The value returned by `scattertext.produce_scattertext_html`: HTML
        content that can be rendered to show the distinctive words.
    """
    # None sentinels replace the original mutable `[]` defaults; callers that
    # omit the arguments still get an empty group.
    x_paths = [] if x_files is None else x_files
    y_paths = [] if y_files is None else y_files

    # x-group rows come first, then y-group rows, matching the original order.
    rows = [[x_label, _read_leading_words(path, max_words)]
            for path in x_paths[:max_files]]
    rows.extend([y_label, _read_leading_words(path, max_words)]
                for path in y_paths[:max_files])
    df = pandas.DataFrame(rows, columns=['Group', 'Text'])

    nlp = spacy.load('en')
    # NOTE(review): presumably lifts spaCy's per-document length limit so long
    # documents are not rejected -- confirm against the installed spaCy version.
    nlp.max_length = 2**64

    corpus = scattertext.CorpusFromPandas(df,
                                          category_col='Group',
                                          text_col='Text',
                                          nlp=nlp).build()
    html = scattertext.produce_scattertext_html(corpus,
                                                category=y_label,
                                                category_name=y_label,
                                                not_category_name=x_label,
                                                minimum_term_frequency=5,
                                                width_in_pixels=1000)
    return html
import spacy
from scattertext import SampleCorpora
from scattertext import produce_scattertext_explorer, produce_scattertext_html
from scattertext.CorpusFromPandas import CorpusFromPandas

# Demo script: build a scattertext corpus from the bundled 2012 convention
# sample data, grouped by the 'party' column, and write the resulting HTML
# visualization to ./simple.html.
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

# NOTE(review): the keyword spelling `pmi_filter_thresold` is reproduced
# exactly from the original call; confirm it matches the installed
# scattertext version before "correcting" it.
html = produce_scattertext_html(corpus,
                                category='democrat',
                                category_name='Democratic',
                                not_category_name='Republican',
                                minimum_term_frequency=5,
                                pmi_filter_thresold=4,
                                width_in_pixels=1000)

# Write the page as UTF-8 bytes; the context manager closes (and flushes) the
# file deterministically instead of relying on garbage collection.
with open('./simple.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./simple.html in Chrome or Firefox.')