Example #1
0
def _read_leading_words(path, max_words):
    '''
    Return the first max_words whitespace-delimited tokens of the file at
    path, joined by single spaces.
    '''
    # A context manager closes the handle as soon as the text is read, so a
    # long list of input files does not accumulate open file descriptors.
    with open(path) as handle:
        return ' '.join(handle.read().split()[:max_words])


def plot_distinctive_words(x_label='',
                           x_files=None,
                           y_label='',
                           y_files=None,
                           max_words=10000,
                           max_files=100):
    '''
    Create a scatterplot that shows the distinctive words among x_files and
    y_files.

    Use x_label as the x axis label and y_label as the y axis label.

    Arguments:
      x_label:   group name given to every text read from x_files.
      x_files:   sequence of file paths for the x group; None means no files.
      y_label:   group name given to every text read from y_files.
      y_files:   sequence of file paths for the y group; None means no files.
      max_words: only the first max_words whitespace-delimited tokens of each
                 file are kept.
      max_files: only the first max_files paths of each sequence are read.

    Return HTML content that can be rendered to show the distinctive words.
    The y_label group is passed to scattertext as the category and the
    x_label group as the not-category.
    '''
    rows = []
    # x files first, then y files, so the row order is the same as when each
    # group was read by its own loop.
    for label, files in ((x_label, x_files), (y_label, y_files)):
        if files is None:
            continue
        for path in files[:max_files]:
            rows.append([label, _read_leading_words(path, max_words)])
    df = pandas.DataFrame(rows, columns=['Group', 'Text'])
    nlp = spacy.load('en')
    # NOTE(review): presumably raised so spaCy does not reject long documents
    # (its default per-text character limit is far smaller) -- confirm.
    nlp.max_length = 2**64
    corpus = scattertext.CorpusFromPandas(df,
                                          category_col='Group',
                                          text_col='Text',
                                          nlp=nlp).build()
    html = scattertext.produce_scattertext_html(corpus,
                                                category=y_label,
                                                category_name=y_label,
                                                not_category_name=x_label,
                                                minimum_term_frequency=5,
                                                width_in_pixels=1000)
    return html
Example #2
0
import spacy

from scattertext import SampleCorpora
from scattertext import produce_scattertext_explorer, produce_scattertext_html
from scattertext.CorpusFromPandas import CorpusFromPandas

# Build a corpus from the bundled 2012 convention data, using the 'party'
# column as the category and the 'text' column as the document body.
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

# NOTE(review): 'pmi_filter_thresold' is kept exactly as spelled in this call;
# confirm it matches the keyword name of the installed scattertext version.
html = produce_scattertext_html(corpus,
                                category='democrat',
                                category_name='Democratic',
                                not_category_name='Republican',
                                minimum_term_frequency=5,
                                pmi_filter_thresold=4,
                                width_in_pixels=1000)
# Write through a context manager so the file is flushed and closed before
# the user is told to open it.
with open('./simple.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./simple.html in Chrome or Firefox.')
Example #3
0
import spacy

from scattertext import SampleCorpora
from scattertext import produce_scattertext_explorer, produce_scattertext_html
from scattertext.CorpusFromPandas import CorpusFromPandas

# Build a corpus from the bundled 2012 convention data, using the 'party'
# column as the category and the 'text' column as the document body.
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

# NOTE(review): 'pmi_filter_thresold' is kept exactly as spelled in this call;
# confirm it matches the keyword name of the installed scattertext version.
html = produce_scattertext_html(corpus,
                                category='democrat',
                                category_name='Democratic',
                                not_category_name='Republican',
                                minimum_term_frequency=5,
                                pmi_filter_thresold=4,
                                width_in_pixels=1000)
# Write through a context manager so the file is flushed and closed before
# the user is told to open it.
with open('./simple.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./simple.html in Chrome or Firefox.')