Ejemplo n.º 1
0
def create_scatter_text(writers, names, messages, nonames=False):
    """Build a two-author Scattertext explorer and save it as HTML.

    The messages are tokenized with NLTK's ``TweetTokenizer``, turned into a
    unigram corpus categorized by author, compacted with
    ``st.AssociationCompactor(2000)`` and rendered to
    ``./demo_compact.html``.

    :param writers: sequence whose first item is the category to plot; its
        second item labels the other side unless ``nonames`` is set.
    :param names: author of each message, aligned with ``messages``.
    :param messages: the message texts.
    :param nonames: when true, label the two sides ``"Author_0"`` and
        ``"Author_1"`` instead of using the entries of ``writers``.
    :return: None; the HTML file is written as a side effect.
    """
    my_df = pd.DataFrame({"author": names, "message": messages})
    nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
    my_df['parse'] = my_df['message'].apply(nlp)

    corpus = st.CorpusFromParsedDocuments(
        my_df, category_col='author',
        parsed_col='parse').build().get_unigram_corpus().compact(
            st.AssociationCompactor(2000))

    # The two variants differ only in the axis labels, so pick the labels
    # first and issue a single explorer call.
    if nonames:
        category_name, not_category_name = "Author_0", "Author_1"
    else:
        category_name, not_category_name = writers[0], writers[1]

    html = st.produce_scattertext_explorer(corpus,
                                           category=writers[0],
                                           category_name=category_name,
                                           not_category_name=not_category_name,
                                           minimum_term_frequency=0,
                                           pmi_threshold_coefficient=0,
                                           width_in_pixels=1000,
                                           transform=st.Scalers.dense_rank)

    # Write UTF-8 bytes explicitly so the output does not depend on the
    # platform's default text encoding; the ``with`` block closes the file.
    with open('./demo_compact.html', 'wb') as f:
        f.write(html.encode('utf-8'))
Ejemplo n.º 2
0
def main():
    """Compare two Japanese Project Gutenberg texts in a Scattertext explorer.

    Both texts are fetched with ``_parse_geutenberg``, parsed with
    ``st.japanese_nlp`` and plotted with ``asian_mode=True``. The result is
    written to ``./demo_japanese.html``.
    """
    shisei = _parse_geutenberg(
        'http://www.gutenberg.org/files/31617/31617-0.txt')
    horadanshaku = _parse_geutenberg(
        'http://www.gutenberg.org/files/34084/34084-0.txt')
    df = pd.DataFrame({
        'text': [shisei, horadanshaku],
        'title': ['Shisei', 'Horadanshaku tabimiyage'],
        'author': ['Akutagawa Ryunosuke', 'Kuni Sasaki']
    })

    df['text'] = df['text'].apply(st.japanese_nlp)
    corpus = st.CorpusFromParsedDocuments(df,
                                          category_col='title',
                                          parsed_col='text').build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='Shisei',
        category_name='Shisei',
        not_category_name='Horadanshaku tabimiyage',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        metadata=df['title'] + ' by ' + df['author'],
        asian_mode=True)
    # The page contains Japanese text: encode to UTF-8 explicitly instead of
    # relying on the locale's default encoding, and close the file promptly.
    with open('./demo_japanese.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./demo_japanese.html in Chrome or Firefox.')
Ejemplo n.º 3
0
def print_graph(corpus, speeches_df, category, type, not_type):
    """Write a Scattertext explorer for one category of ``corpus``.

    :param corpus: Scattertext corpus to visualize.
    :param speeches_df: data frame indexed with ``category`` to obtain the
        per-document metadata.
    :param category: column of ``speeches_df`` used as metadata.
    :param type: category to plot. The shorthands ``"1"`` and ``"2"`` stand
        for ``"Dictatorship"`` and ``"Democracy"`` respectively and also
        override ``not_type`` with the opposite value.
    :param not_type: label for the other category.
    :return: None; produces
        ``./visualization/visualization_<type>.html``.
    """
    # Shorthand selectors: (category to plot, label of the other category).
    shorthands = {
        "1": ("Dictatorship", "Democracy"),
        "2": ("Democracy", "Dictatorship"),
    }
    if type in shorthands:
        type, not_type = shorthands[type]

    html = st.produce_scattertext_explorer(corpus,
                                           category=type,
                                           category_name=type,
                                           not_category_name=not_type,
                                           width_in_pixels=1000,
                                           metadata=speeches_df[category])
    # ``with`` guarantees the handle is flushed and closed.
    with open("./visualization/visualization_" + type + ".html",
              'wb') as out_file:
        out_file.write(html.encode('utf-8'))
Ejemplo n.º 4
0
def create_scattertext_plot(df, category_col:str, text_col:str, nlp, filename:str, label_match:str, label_name:str, label_other_name:str, metadata_col:str, **kwargs):
    """Create an HTML file with an interactive Scattertext plot.

    An existing ``'index'`` column is dropped from ``df`` in place before the
    corpus is built (the original note states that Scattertext needs to
    create that column itself).

    :param df: data frame holding the documents; modified in place if it has
        an ``'index'`` column.
    :param category_col: name of the column holding the category labels.
    :param text_col: name of the column holding the document text.
    :param nlp: parser handed to ``st.CorpusFromPandas``.
    :param filename: path of the HTML file to write.
    :param label_match: category value to plot; must be one of the two
        entries in ``category_col``.
    :param label_name: user-friendly name for ``label_match``, e.g.
        ``'A good week'`` for a raw value of ``'Yes'``.
    :param label_other_name: user-friendly name for the other entry, e.g.
        ``'A bad week'``.
    :param metadata_col: column of ``corpus.get_df()`` used as metadata.
    :param kwargs: forwarded to ``st.produce_scattertext_explorer``, e.g.
        ``minimum_term_frequency=8``.
    :returns: nothing, but creates an HTML file.
    """
    if 'index' in df.columns:
        df.drop('index',axis=1,inplace=True)
    corpus = st.CorpusFromPandas(df,category_col=category_col,text_col=text_col, nlp=nlp).build()
    html = st.produce_scattertext_explorer(corpus,
                                           category=label_match,
                                           category_name=label_name,
                                           not_category_name=label_other_name,
                                           metadata=corpus.get_df()[metadata_col],
                                           save_svg_button=True,
                                           **kwargs)

    # The context manager closes the file even if the write fails.
    with open(filename, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
    def scattertext_function(self):
        """Render the classified NY tweets as a Scattertext explorer.

        Reads ``After_Classification/After_Classification_NY_6.csv``, parses
        the ``tweet`` column with spaCy, builds a corpus categorized by the
        ``Classification`` column and writes the explorer to
        ``After_Classification_NY_6.html``.
        """
        nlp = spacy.load('en_core_web_sm')
        convention_df = pd.read_csv(
            "After_Classification/After_Classification_NY_6.csv")
        convention_df['parsed'] = convention_df.tweet.apply(nlp)

        # Convert the data frame into a Scattertext corpus.
        corpus = st.CorpusFromParsedDocuments(convention_df,
                                              category_col='Classification',
                                              parsed_col='parsed').build()
        print(type(st.Scalers.log_scale_standardize))
        # The original also computed the top scaled F-scores against the
        # background corpus here and discarded the result; that dead
        # computation has been dropped.
        html = st.produce_scattertext_explorer(
            corpus,
            category='pos',
            category_name='POS',
            not_category_name='NEG',
            minimum_term_frequency=5,
            width_in_pixels=1000,
            transform=st.Scalers.log_scale_standardize)

        file_name_1 = 'After_Classification_NY_6.html'
        # Close the file before anything tries to display it.
        with open(file_name_1, 'wb') as out_file:
            out_file.write(html.encode('utf-8'))
        print(IFrame(src=file_name_1, width=1200, height=700))
Ejemplo n.º 6
0
 def create_html_with_two_categories(self, category_one, category_two,
                                     name_of_file):
     """Write an explorer comparing two categories and remember its path.

     :param category_one: category to plot; also used as its display name.
     :param category_two: display name for the other category.
     :param name_of_file: path of the HTML file to write; stored on
         ``self.name_of_file`` afterwards.
     """
     html = st.produce_scattertext_explorer(self.term_cat_freq,
                                            category=category_one,
                                            category_name=category_one,
                                            not_category_name=category_two)
     # ``with`` makes sure the handle is closed once the page is written.
     with open(name_of_file, 'wb') as out_file:
         out_file.write(html.encode('utf-8'))
     self.name_of_file = name_of_file
Ejemplo n.º 7
0
def main():
    """Plot Empath topics of negative vs. positive review comments.

    The rows returned by
    ``utils_data.get_all_satisfaction_score_comment_in_all_conds()`` are
    loaded into a data frame with the columns ``senti_on_Metfor_oral``,
    ``feature`` and ``review``. A corpus is built with
    ``FeatsFromOnlyEmpath`` features and the explorer is written to a fixed
    absolute path, which is then printed.
    """
    feat_builder = FeatsFromOnlyEmpath()

    # Each row is [sentiment, feature, review text]; the original notes
    # record 1402 rows for the author's data set.
    all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds(
    )

    columns = ['senti_on_Metfor_oral', 'feature', 'review']
    all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
        all_satisfaction_score_comment_in_all_conds,
        index=None,
        columns=columns)

    corpus = CorpusFromParsedDocuments(
        all_satisfaction_score_comment_in_all_conds_df,
        category_col='senti_on_Metfor_oral',
        parsed_col='review',
        feats_from_spacy_doc=feat_builder).build()

    html = produce_scattertext_explorer(
        corpus,
        category='negative',
        category_name='Negative',
        not_category_name='Positive',
        width_in_pixels=1000,
        metadata=all_satisfaction_score_comment_in_all_conds_df['feature'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists())

    output_path = '/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/Convention-Visualization-Empath.html'
    with open(output_path, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    # Report the path that was actually written; the old message pointed at
    # a file in the current directory that this function never creates.
    print('Open ' + output_path + ' in Chrome or Firefox.')
Ejemplo n.º 8
0
def vis():
    """Render a subjective-vs-objective Scattertext explorer.

    Downloads the Cornell ``rotten_imdb`` archive, labels every line of
    ``quote.tok.gt9.5000`` (review quotes) as ``'subjective'`` and every line
    of ``plot.tok.gt9.5000`` (plot summaries) as ``'objective'``, builds a
    corpus from them and writes the explorer to
    ``Convention-Visualization.html``.
    """
    # ``urlopen`` lives in ``urllib.request`` on Python 3 and directly in
    # ``urllib`` on Python 2; support both.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'
    response = urlopen(SUBJECTIVITY_URL)
    try:
        data = io.BytesIO(response.read())
    finally:
        response.close()
    with tarfile.open(fileobj=data, mode='r:gz') as tarball:
        quote = tarball.extractfile('quote.tok.gt9.5000').read()
        plot = tarball.extractfile('plot.tok.gt9.5000').read()

    # Construct the subjective vs. objective data frame, treating review
    # quotes as subjective and plot points as objective.
    df = pd.DataFrame(
        [{
            'text': text.strip(),
            'label': 'subjective'
        } for text in quote.decode('utf-8', errors='ignore').split('\n')] +
        [{
            'text': text.strip(),
            'label': 'objective'
        } for text in plot.decode('utf-8', errors='ignore').split('\n')])

    # NOTE(review): the 'en' shortcut may not resolve on newer spaCy
    # releases; other snippets in this file load 'en_core_web_sm' - confirm.
    nlp = spacy.load('en')

    # The category column is "label" and the text column is "text".
    corpus = ST.CorpusFromPandas(
        data_frame=df,
        category_col='label',
        text_col='text',
        nlp=nlp).build()

    # ``category`` must be one of the label values, not the name of the
    # label column, so plot the 'subjective' documents against the rest.
    html = ST.produce_scattertext_explorer(corpus,
                                           category='subjective',
                                           category_name='subjective',
                                           not_category_name='objective',
                                           width_in_pixels=1000)
    with open("Convention-Visualization.html", 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
Ejemplo n.º 9
0
def print_graph(corpus, speeches_df, category, type, not_type):
    """Write a Scattertext explorer for one category of ``corpus``.

    :param corpus: Scattertext corpus to visualize.
    :param speeches_df: data frame indexed with ``category`` to obtain the
        per-document metadata.
    :param category: column of ``speeches_df`` used as metadata.
    :param type: category to plot; also used as its display name and in the
        output file name.
    :param not_type: display name for the other category.
    :return: None; produces
        ``./visualization/visualization_<type>.html``.
    """
    html = st.produce_scattertext_explorer(corpus,
                                           category=type,
                                           category_name=type,
                                           not_category_name=not_type,
                                           width_in_pixels=1000,
                                           metadata=speeches_df[category])
    # ``with`` guarantees the handle is flushed and closed.
    with open("./visualization/visualization_" + type + ".html",
              'wb') as out_file:
        out_file.write(html.encode('utf-8'))
Ejemplo n.º 10
0
def get_sct_html(rest_name, city_name):
    """Write a Scattertext explorer of one restaurant's reviews.

    The reviews come from ``get_rest_reviews(rest_name, city_name)``; the
    ``'good'`` class is plotted as "Positive" against "Negative" and the page
    is saved to ``rest_reviews-Vis.html``.

    :param rest_name: restaurant name passed to ``get_rest_reviews``.
    :param city_name: city name passed to ``get_rest_reviews``.
    :return: whatever the file object's ``write`` returns (the number of
        bytes written on Python 3), as before.
    """
    rest_reviews = get_rest_reviews(rest_name, city_name)
    nlp = spacy.load('en_core_web_sm')
    corpus = sct.CorpusFromPandas(rest_reviews,
                                  category_col='class',
                                  text_col='text',
                                  nlp=nlp).build()
    html = sct.produce_scattertext_explorer(corpus,
                                            category='good',
                                            category_name='Positive',
                                            not_category_name='Negative',
                                            width_in_pixels=900,
                                            metadata=rest_reviews['class'])
    # Same return value as before, but the handle is now closed on exit.
    with open("rest_reviews-Vis.html", 'wb') as out_file:
        return out_file.write(html.encode('utf-8'))
Ejemplo n.º 11
0
def create_scatterplot(df, return_corpus=False):
    """Create an HTML file to visualize differences in corpora.

    :param df: data frame with an ``author`` column (categories) and a
        ``text`` column (documents).
    :param return_corpus: when true, return the built corpus and skip
        writing the HTML file.
    :return: the corpus if ``return_corpus`` is true, otherwise None after
        writing ``Author-Visualization.html``.
    """
    corpus = st.CorpusFromPandas(df,
                                 category_col='author',
                                 text_col='text',
                                 nlp=nlp).build()
    if return_corpus:
        return corpus
    # The displayed label previously misspelled the author's name.
    html = st.produce_scattertext_explorer(corpus,
                                           category='EAP',
                                           category_name='Edgar Allan Poe',
                                           not_category_name='HPL/MWS',
                                           width_in_pixels=1000,
                                           metadata=df['author'])
    with open("Author-Visualization.html", 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
Ejemplo n.º 12
0
    def standard(self, dataframe):
        """Write the standard explorer for two documents to ``self.std_file``.

        :param dataframe: data frame with a ``'Document Type'`` column
            (categories) and a ``'Text'`` column (documents).
        """
        corpus = st.CorpusFromPandas(dataframe, category_col='Document Type',
                                     text_col='Text', nlp=self.nlp).build()

        html = st.produce_scattertext_explorer(corpus, category='1st Document',
                                               category_name='1st Document',
                                               not_category_name='2nd Document',
                                               width_in_pixels=1000)

        logger.getLogger().info("Opening Standard Visual")

        # Close the file before checking for it on disk.
        with open(self.std_file, 'wb') as out_file:
            out_file.write(html.encode('utf-8'))

        if os.path.isfile(self.std_file):
            logger.getLogger().info("Graph file created")
Ejemplo n.º 13
0
def main():
    """Compare two Chinese-language novels in a Scattertext explorer.

    Loads the sample ``chinese.csv`` data set, parses the ``text`` column
    with ``chinese_nlp`` and writes the explorer to ``./demo_chinese.html``.
    """
    df = pd.read_csv('https://cdn.rawgit.com/JasonKessler/scattertext/e508bf32/scattertext/data/chinese.csv')
    df['text'] = df['text'].apply(chinese_nlp)
    corpus = CorpusFromParsedDocuments(df,
                                       category_col='novel',
                                       parsed_col='text').build()
    html = produce_scattertext_explorer(corpus,
                                        category='Tale of Two Cities',
                                        category_name='Tale of Two Cities',
                                        not_category_name='Ulysses',
                                        width_in_pixels=1000,
                                        metadata=df['novel'],
                                        asian_mode=True)
    # The page contains Chinese text: encode to UTF-8 explicitly instead of
    # relying on the locale's default encoding, and close the file promptly.
    with open('./demo_chinese.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./demo_chinese.html in Chrome or Firefox.')
Ejemplo n.º 14
0
def main():
    """Compare two Chinese-language novels in a Scattertext explorer.

    Loads the sample ``chinese.csv`` data set, parses the ``text`` column
    with ``chinese_nlp`` and writes the explorer to ``./demo_chinese.html``.
    """
    df = pd.read_csv('https://cdn.rawgit.com/JasonKessler/scattertext/e508bf32/scattertext/data/chinese.csv')
    df['text'] = df['text'].apply(chinese_nlp)
    corpus = CorpusFromParsedDocuments(df,
                                       category_col='novel',
                                       parsed_col='text').build()
    # NOTE(review): this call used the ``chinese_mode`` keyword; it now uses
    # ``asian_mode`` to match the other CJK examples in this file. Confirm
    # against the installed scattertext version.
    html = produce_scattertext_explorer(corpus,
                                        category='Tale of Two Cities',
                                        category_name='Tale of Two Cities',
                                        not_category_name='Ulysses',
                                        width_in_pixels=1000,
                                        metadata=df['novel'],
                                        asian_mode=True)
    # The page contains Chinese text: encode to UTF-8 explicitly instead of
    # relying on the locale's default encoding, and close the file promptly.
    with open('./demo_chinese.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./demo_chinese.html in Chrome or Firefox.')
Ejemplo n.º 15
0
    def standard(self, dataframe):
        """Write the submission-vs-standard explorer and open it in a browser.

        :param dataframe: data frame with ``'Document Type'`` (categories),
            ``'Text'`` (documents) and ``'Document'`` (metadata) columns.
        """
        corpus = st.CorpusFromPandas(dataframe,
                                     category_col='Document Type',
                                     text_col='Text',
                                     nlp=self.nlp).build()

        html = st.produce_scattertext_explorer(corpus,
                                               category='submission',
                                               category_name='Submission',
                                               not_category_name='Standard',
                                               width_in_pixels=1000,
                                               metadata=dataframe['Document'])

        logger.getLogger().info("Opening Standard Visual")
        # Flush and close the file before handing it to the browser.
        with open(self.std_file, 'wb') as out_file:
            out_file.write(html.encode('utf-8'))
        webbrowser.open("file://" + self.std_file)
Ejemplo n.º 16
0
def main():
    """Plot Empath topics of the 2012 convention speeches by party.

    Builds a corpus with ``FeatsFromOnlyEmpath`` features and writes the
    explorer, including the per-topic term lists, to
    ``./Convention-Visualization-Empath.html``.
    """
    convention_df = SampleCorpora.ConventionData2012.get_data()
    feat_builder = FeatsFromOnlyEmpath()
    corpus = CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='text',
                                       feats_from_spacy_doc=feat_builder).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True,
                                        topic_model_term_lists=feat_builder.get_top_model_term_lists())
    # ``with`` guarantees the handle is flushed and closed.
    with open('./Convention-Visualization-Empath.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
Ejemplo n.º 17
0
def main():
    """Plot Empath topics of the 2012 convention speeches by party.

    Builds a corpus with ``FeatsFromOnlyEmpath`` features and writes the
    explorer to ``./Convention-Visualization-Empath.html``.
    """
    convention_df = SampleCorpora.ConventionData2012.get_data()

    corpus = CorpusFromParsedDocuments(
        convention_df,
        category_col='party',
        parsed_col='text',
        feats_from_spacy_doc=FeatsFromOnlyEmpath()).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True)
    # ``with`` guarantees the handle is flushed and closed.
    with open('./Convention-Visualization-Empath.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
Ejemplo n.º 18
0
def main():
    """Plot General Inquirer features of the 2012 convention speeches.

    Builds a corpus with ``FeatsFromGeneralInquirer`` features, using the
    whitespace sentence tokenizer, and writes the explorer to
    ``./demo_general_inquirer.html``.
    """
    convention_df = SampleCorpora.ConventionData2012.get_data()

    corpus = CorpusFromPandas(
        convention_df,
        category_col='party',
        text_col='text',
        nlp=whitespace_nlp_with_sentences,
        feats_from_spacy_doc=FeatsFromGeneralInquirer()).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True)
    # ``with`` guarantees the handle is flushed and closed.
    with open('./demo_general_inquirer.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./demo_general_inquirer.html in Chrome or Firefox.')
Ejemplo n.º 19
0
def main():
    """Compare two Japanese Project Gutenberg texts in a Scattertext explorer.

    Both texts are fetched with ``_parse_geutenberg``, parsed with
    ``st.japanese_nlp`` and plotted with ``asian_mode=True``. The result is
    written to ``./demo_japanese.html``.
    """
    shisei = _parse_geutenberg('http://www.gutenberg.org/files/31617/31617-0.txt')
    horadanshaku = _parse_geutenberg('http://www.gutenberg.org/files/34084/34084-0.txt')
    df = pd.DataFrame({'text': [shisei, horadanshaku],
                       'title': ['Shisei', 'Horadanshaku tabimiyage'],
                       'author': ['Akutagawa Ryunosuke', 'Kuni Sasaki']})

    df['text'] = df['text'].apply(st.japanese_nlp)
    corpus = st.CorpusFromParsedDocuments(df,
                                          category_col='title',
                                          parsed_col='text').build()
    html = st.produce_scattertext_explorer(corpus,
                                           category='Shisei',
                                           category_name='Shisei',
                                           not_category_name='Horadanshaku tabimiyage',
                                           minimum_term_frequency=5,
                                           width_in_pixels=1000,
                                           metadata=df['title'] + ' by ' + df['author'],
                                           asian_mode=True)
    # The page contains Japanese text: encode to UTF-8 explicitly instead of
    # relying on the locale's default encoding, and close the file promptly.
    with open('./demo_japanese.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./demo_japanese.html in Chrome or Firefox.')
def main():
    """Plot General Inquirer features of the 2012 convention speeches.

    Builds a corpus with ``FeatsFromGeneralInquirer`` features and writes
    the explorer, with the feature builder's term lists and definitions, to
    ``./demo_general_inquirer.html``.
    """
    convention_df = SampleCorpora.ConventionData2012.get_data()
    feat_builder = FeatsFromGeneralInquirer()
    corpus = CorpusFromPandas(convention_df,
                              category_col='party',
                              text_col='text',
                              nlp=whitespace_nlp_with_sentences,
                              feats_from_spacy_doc=feat_builder).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True,
                                        topic_model_term_lists=feat_builder.get_top_model_term_lists(),
                                        metadata_descriptions=feat_builder.get_definitions())
    # ``with`` guarantees the handle is flushed and closed.
    with open('./demo_general_inquirer.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./demo_general_inquirer.html in Chrome or Firefox.')
Ejemplo n.º 21
0
    def empath(self, dataframe):
        """Write the Empath-topic explorer and open it in a browser.

        :param dataframe: data frame with ``'Document Type'`` (categories),
            ``'Text'`` (parsed documents) and ``'Document'`` (metadata)
            columns.
        """
        feat_builder = st.FeatsFromOnlyEmpath()
        empath_corpus = st.CorpusFromParsedDocuments(
            dataframe,
            category_col='Document Type',
            feats_from_spacy_doc=feat_builder,
            parsed_col='Text').build()

        html = st.produce_scattertext_explorer(
            empath_corpus,
            category='submission',
            category_name='Submission',
            not_category_name='Standard',
            width_in_pixels=1000,
            metadata=dataframe['Document'],
            use_non_text_features=True,
            use_full_doc=True,
            topic_model_term_lists=feat_builder.get_top_model_term_lists())

        logger.getLogger().info("Opening Empath Visual")
        # Flush and close the file before handing it to the browser.
        with open(self.empath_file, 'wb') as out_file:
            out_file.write(html.encode('utf-8'))
        webbrowser.open("file://" + self.empath_file)
Ejemplo n.º 22
0
def scatterplot(df):
    """Write a Scattertext explorer comparing two CEO eras.

    :param df: data frame with ``text``, ``ceo`` and ``quarter`` columns.
    :return: None; the explorer is written to
        ``../Charts/scattertext_demo.html``.
    """
    corpus = st.CorpusFromPandas(df,
                                 category_col='ceo',
                                 text_col='text',
                                 nlp=st.whitespace_nlp_with_sentences).build()

    html = st.produce_scattertext_explorer(
        corpus,
        category='Ballmer',
        category_name='Steve Ballmer Era',
        not_category_name='Satya Nadella Era',
        minimum_term_frequency=10,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000,
        metadata=df['quarter'],
    )

    # ``with`` guarantees the handle is flushed and closed.
    with open('../Charts/scattertext_demo.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
Ejemplo n.º 23
0
    def scatter_viz(self):
        """Write the left-vs-right explorer into the templates folder.

        The corpus comes from ``self.create_corpus()`` and the metadata from
        the ``position`` column of ``self.clean_texts()``.

        :return: path of the generated HTML file.
        """
        # Load the corpus.
        corpus = self.create_corpus()

        # Load the cleaned data frame.
        convention_df = self.clean_texts()

        html = produce_scattertext_explorer(
            corpus,
            category='left',
            category_name='Democratic',
            not_category_name='Republican',
            width_in_pixels=1000,
            minimum_term_frequency=1,
            metadata=convention_df['position'],
            term_significance=st.LogOddsRatioUninformativeDirichletPrior())

        file_name = key_path + '/templates/scattertext.html'
        # Close the file before returning its path to the caller.
        with open(file_name, 'wb') as out_file:
            out_file.write(html.encode('utf-8'))

        return file_name
Ejemplo n.º 24
0
def generate_visual(data,
                    category,
                    category_name,
                    not_category_name,
                    filename='index.html'):
    """Return the Scattertext explorer HTML for a data frame of abstracts.

    :param data: data frame with ``label`` (categories), ``abstract``
        (documents) and ``journal`` (metadata) columns.
    :param category: label value to plot.
    :param category_name: display name for ``category``.
    :param not_category_name: display name for the other label.
    :param filename: accepted but not used by this function; nothing is
        written to disk.
    :return: the explorer page as an HTML string.
    """
    import spacy
    import scattertext as st

    parser = spacy.load('en_core_web_sm')

    corpus_builder = st.CorpusFromPandas(data,
                                         category_col='label',
                                         text_col='abstract',
                                         nlp=parser)
    abstract_corpus = corpus_builder.build()

    # Gather the explorer settings once the corpus exists, then render.
    explorer_settings = {
        'category': category,
        'category_name': category_name,
        'not_category_name': not_category_name,
        'width_in_pixels': 1000,
        'metadata': data['journal'],
    }
    return st.produce_scattertext_explorer(abstract_corpus,
                                           **explorer_settings)
Ejemplo n.º 25
0
from scattertext import SampleCorpora, whitespace_nlp_with_sentences
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresets

# Build a corpus from the 2012 convention speeches, categorized by party and
# tokenized with the whitespace sentence tokenizer.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=whitespace_nlp_with_sentences).build()

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=8,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    # Optional settings left disabled by the original author:
    #term_scorer=ScaledFScorePresets(one_to_neg_one=True, beta=1),
    #d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    #d3_url='scattertext/data/viz/scripts/d3.min.js',
)

# ``with`` guarantees the handle is flushed and closed.
with open('./demo.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
Ejemplo n.º 26
0
import spacy

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.LogOddsUniformativePriorScore import LogOddsUninformativePriorScore

# Build a spaCy-parsed corpus of the 2012 convention speeches by party.
nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()
term_freq_df = corpus.get_term_freq_df()
# Thresholded log-odds scores, negated before being handed to the explorer;
# ``gray_zero_scores=True`` below greys out the terms scored zero.
scores = -(LogOddsUninformativePriorScore.get_thresholded_score(
    term_freq_df['democrat freq'],
    term_freq_df['republican freq'],
    alpha_w=2.,
    threshold=0.1))
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    scores=scores,
                                    sort_by_dist=False,
                                    gray_zero_scores=True,
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])
# ``with`` guarantees the handle is flushed and closed.
with open('./demo_insignificant_greyed_out.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo_insignificant_greyed_out.html in Chrome or Firefox.')
# Join the original rows with the cleaned texts column-wise.
original_data = df.reset_index(drop=True)

df = pd.concat([original_data, cleaned_texts], axis=1)

df['parsed_text'] = df['parsed_text'].apply(chinese_nlp)

# Strip ``pattern`` from every raw text by assigning the whole column at
# once. The previous per-row ``df['text'][i] = ...`` was a chained
# assignment, which pandas may apply to a temporary copy instead of ``df``.
df['text'] = df['text'].apply(lambda raw_text: re.sub(pattern, '', raw_text))

df['text'] = df['text'].apply(chinese_nlp)

corpus = CorpusFromParsedDocuments(df,
                                   category_col='file_name',
                                   parsed_col='parsed_text').build()

html = produce_scattertext_explorer(corpus,
                                    category='安利蛋白粉评论.txt',
                                    category_name='安利蛋白粉评论.txt',
                                    not_category_name='汤臣倍健蛋白粉评论.txt',
                                    width_in_pixels=1000,
                                    metadata=df['file_name'],
                                    asian_mode=True,
                                    alternative_text_field="text")
output_path = 'C:/Users/CNU074VP/Desktop/Chinese Topic Model/protein_review_compare.html'
with open(output_path, 'w', encoding='utf-8') as out_file:
    out_file.write(html)
# Point the user at the generated file rather than its parent directory.
print('Open {} in Chrome or Firefox.'.format(output_path))
    # Scattertext attack vs support (only responses)
    # https://kanoki.org/2019/03/17/text-data-visualization-in-python/
    # https://github.com/JasonKessler/scattertext
    nlp = spacy.load('en_core_web_sm')
    for data_set in ['debate_test', 'debate_train', 'procon', 'political']:
        df_plot = df.loc[(df['org_dataset'] == data_set)
                         & (df['label'].isin(['attack', 'support']))]
        df_plot['parsed'] = df_plot['response'].apply(nlp)
        corpus = st.CorpusFromParsedDocuments(df_plot,
                                              category_col='label',
                                              parsed_col='parsed').build()
        html = st.produce_scattertext_explorer(
            corpus,
            category='attack',
            not_category_name='support',
            width_in_pixels=1000,
            minimum_term_frequency=5,
            transform=st.Scalers.log_scale_standardize,
            use_full_doc=True)
        file_name = './plots/scattertext_attack_support' + data_set + '.html'
        with open(file_name, 'wb') as file:
            file.write(html.encode('utf-8'))

    # Scattertext Nixon vs Kennedy
    df_plot = data_nix_ken
    df_plot['parsed'] = df_plot['text'].apply(nlp)
    corpus = st.CorpusFromParsedDocuments(df_plot,
                                          category_col='author',
                                          parsed_col='parsed').build()
    html = st.produce_scattertext_explorer(
        corpus,
Ejemplo n.º 29
0
from scattertext import SampleCorpora
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp

# Use the whitespace-based tokenizer imported above instead of a spaCy model.
nlp = whitespace_nlp

# Build a corpus from the 2012 convention sample data, categorized by party.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])

# Write the page as UTF-8 bytes; the context manager guarantees the handle is
# closed (and the data flushed) once the write completes.
with open('./demo_without_spacy.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo_without_spacy.html in Chrome or Firefox.')
        [
            'obama', 'romney', 'democrats', 'republicans', 'health',
            'military', 'taxes', 'education', 'olympics', 'auto', 'iraq',
            'iran', 'israel'
        ],
        scorer=RankDifference(),
        num_terms_per_topic=20))

# Build a feature extractor from the topic model and use it in place of the
# default per-document feature extraction when constructing the corpus.
topic_feature_builder = st.FeatsFromTopicModel(topic_model)

topic_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=topic_feature_builder).build()

html = st.produce_scattertext_explorer(
    topic_corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    topic_model_term_lists=topic_feature_builder.get_top_model_term_lists())

# Write the page as UTF-8 bytes and close the handle deterministically.
with open('./demo_word_list_topic_model.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo_word_list_topic_model.html in Chrome or Firefox.')
Ejemplo n.º 31
0
# Basic corpus statistics.
print("Número de documentos: " + str(corpus.get_num_docs()))
print("Tamanho de documentos: " + str(corpus.get_doc_lengths()))
print("Número de termos: " + str(corpus.get_num_terms()))

# Terms ranked by scaled F-score against the background corpus.
print("Palavras que diferem dos corpus comuns: ")
x = corpus.get_scaled_f_scores_vs_background()
print(list(x.index[0:10]))

# Per-class term frequencies, plus one scaled F-score column per class.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['positivo'] = corpus.get_scaled_f_scores('positivo')
term_freq_df['negativo'] = corpus.get_scaled_f_scores('negativo')

# Sort by each class's score column to obtain that class's top-ranked terms.
termosPositivos = term_freq_df.sort_values(by='positivo', ascending=False)
termosNegativos = term_freq_df.sort_values(by='negativo', ascending=False)
print("Palavras mais frequentes entre as classes positivas: ")
print(list(termosPositivos.index[0:10]))

print("Palavras mais frequentes entre as classes negativas: ")
print(list(termosNegativos.index[0:10]))

# Generate the interactive chart. It is written to an HTML file that must be
# opened in a browser and may take a while to load.
html = scattertext.produce_scattertext_explorer(
    corpus,
    category='positivo',  # class placed on the y-axis
    category_name='Positivo',  # display name of that class in the chart
    not_category_name='Negativo',  # display name of the x-axis class
    width_in_pixels=1000)  # chart width
# Write UTF-8 bytes and close the handle deterministically.
with open("graficos.html", 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
Ejemplo n.º 32
0
	# Label each row by thresholding 'prob': above 0.9 -> 'm', below 0.1 -> 'f',
	# anything in between -> '?'.
	df_aug['gender'] = df_aug['prob'].apply(lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
	# Keep only the rows labelled 'm' or 'f' and save them to emoji_data.csv.
	df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
	df_mf.to_csv('emoji_data.csv', index=False)

# Tokenize tweets with NLTK's TweetTokenizer wrapped by scattertext's factory.
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)

# Build the corpus using the emoji-only feature extractor, categorized by the
# 'gender' column.
corpus = st.CorpusFromParsedDocuments(
    df_mf,
    parsed_col='parse',
    category_col='gender',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()

# Per-document label shown in the explorer: "User Name (@Nickname) Date".
tweet_labels = (df_mf['User Name']
                + ' (@' + df_mf['Nickname'] + ') '
                + df_mf['Date'].astype(str))

html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=tweet_labels,
    width_in_pixels=1000
)

print('writing EmojiGender.html')
# Write UTF-8 bytes and close the handle deterministically.
with open("EmojiGender.html", 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
# Hand-written topic model: topic name -> list of terms belonging to the topic.
topic_model = {
    'money': ['money', 'bank', 'banks', 'finances', 'financial', 'loan', 'dollars', 'income'],
    'jobs': ['jobs', 'workers', 'labor', 'employment', 'worker', 'employee', 'job'],
    'patriotic': ['america', 'country', 'flag', 'americans', 'patriotism', 'patriotic'],
    'family': ['mother', 'father', 'mom', 'dad', 'sister', 'brother', 'grandfather', 'grandmother', 'son', 'daughter'],
}
# Build a feature extractor from the topic model and use it in place of the
# default per-document feature extraction when constructing the corpus.
topic_feature_builder = st.FeatsFromTopicModel(topic_model)

topic_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=topic_feature_builder
).build()

html = st.produce_scattertext_explorer(
    topic_corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    topic_model_term_lists=topic_feature_builder.get_top_model_term_lists()
)

# Write UTF-8 bytes and close the handle deterministically.
with open('./demo_custom_topic_model.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo_custom_topic_model.html in Chrome or Firefox.')
# Join the two dataframes side by side (along the column axis)
convention_df = pd.concat([df1, df2], axis=1)

# Place all text in the same column ('value') and tag each row with the name
# of the column it came from ('variable', i.e. CNN or Fox)
convention_df = pd.melt(convention_df)
convention_df = convention_df.dropna(axis=0, how='any')

# Build NLP parsing for corpus
English = st.whitespace_nlp_with_sentences

# Whitespace-separated word count per source. The result is not stored; this
# line only has a visible effect when run interactively (e.g. in a notebook).
convention_df.groupby('variable').apply(
    lambda group: group.value.apply(lambda text: len(text.split())).sum())
# Parse the text and create a new column with the parsed values
convention_df['parsed'] = convention_df.value.apply(English)

# Preview of the first three rows (interactive display only)
convention_df.iloc[:3]

# Generate corpus of language from pandas dataframe
corpus = st.CorpusFromPandas(convention_df, category_col='variable', text_col='value', nlp=English).build()

# Output html doc for visualization
html = st.produce_scattertext_explorer(corpus,
                                       category='CNN',
                                       category_name='CNN',
                                       not_category_name='Fox',
                                       width_in_pixels=1000)

# The 'output' directory must already exist; the HTML file itself is created
# (or overwritten) by open(). The context manager closes the handle before the
# file is handed to IFrame for display.
file_name = 'output/Trump.html'
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Ejemplo n.º 35
0
from scattertext import SampleCorpora, whitespace_nlp_with_sentences
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresets

# Build a corpus from the 2012 convention sample data, categorized by party and
# tokenized with the whitespace/sentence tokenizer.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=whitespace_nlp_with_sentences).build()

# The d3 script URLs point at relative paths rather than a CDN.
html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=8,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    # Optional alternative scorer, left disabled:
    # term_scorer=ScaledFScorePresets(one_to_neg_one=True, beta=1),
    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    d3_url='scattertext/data/viz/scripts/d3.min.js',
)

# Write UTF-8 bytes and close the handle deterministically.
with open('./demo.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
Ejemplo n.º 36
0
def scale(ar):
    """Min-max scale an array so its values span [0, 1].

    Args:
        ar: numeric array supporting ``size``, ``min()``, ``max()`` and
            element-wise arithmetic (e.g. a NumPy array).

    Returns:
        A new array with ``(ar - min) / (max - min)`` applied element-wise.
        An empty array is returned as-is, and an array whose values are all
        equal maps to all zeros (instead of raising on the empty reduction or
        producing NaN from a 0/0 division).
    """
    if ar.size == 0:
        # min()/max() of an empty array raise ValueError; nothing to scale.
        return ar
    lowest = ar.min()
    span = ar.max() - lowest
    if span == 0:
        # Constant input: (ar - lowest) is already all zeros; multiply by 0.0
        # only to return floats, matching the dtype of the normal path.
        return (ar - lowest) * 0.0
    return (ar - lowest) / span

def zero_centered_scale(ar):
    """Rescale positive and negative entries separately, then map to [0, 1].

    Positive entries are replaced by ``scale`` of the positives and negative
    entries by the negated ``scale`` of their magnitudes; zeros are left
    untouched. NOTE: ``ar`` is modified in place by these two assignments.

    Args:
        ar: numeric array supporting boolean-mask indexing and assignment.

    Returns:
        A new array equal to ``(ar + 1) / 2`` computed from the modified ``ar``.
    """
    positive = ar > 0
    ar[positive] = scale(ar[positive])
    # Computed after the first assignment, exactly as in the original order.
    negative = ar < 0
    ar[negative] = -scale(-ar[negative])
    shifted = ar + 1
    return shifted / 2.


# x-axis: log of each term's total frequency, scaled with scale().
frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
# y-axis: L2-penalized logistic regression coefficients for the 'democrat'
# category, passed through zero_centered_scale().
scores = corpus.get_logreg_coefs('democrat',
                                 LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1))
# NOTE: zero_centered_scale() assigns into its argument, so `scores` passed to
# the explorer below holds the rescaled values, not the raw coefficients.
scores_scaled = zero_centered_scale(scores)

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    x_coords=frequencies_scaled,
                                    y_coords=scores_scaled,
                                    scores=scores,
                                    sort_by_dist=False,
                                    metadata=convention_df['speaker'],
                                    x_label='Log frequency',
                                    y_label='L2-penalized logistic regression coef')
fn = 'demo_custom_coordinates.html'
# Write UTF-8 bytes and close the handle deterministically.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % fn)
Ejemplo n.º 37
0
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, RankDifference
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

# Build a corpus from the 2012 convention sample data, categorized by party and
# tokenized with the whitespace/sentence tokenizer.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=whitespace_nlp_with_sentences).build()

# Terms are scored with RankDifference; the d3 script URLs point at relative
# paths rather than a CDN.
html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=8,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    term_scorer=RankDifference(),
    d3_scale_chromatic_url=
    'scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    d3_url='scattertext/data/viz/scripts/d3.min.js',
)

# Write UTF-8 bytes and close the handle deterministically.
with open('./demo.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
Ejemplo n.º 38
0
def main():
    """Run the command-line interface to Scattertext.

    Parses the command-line arguments, reads the CSV named by ``--datafile``,
    checks that the requested columns and the positive category exist, builds
    a corpus from the text and category columns, and writes the explorer HTML
    to ``--outputfile`` (or prints it to stdout when that is ``-``).

    Raises:
        Exception: if the category, text or metadata column is not a column
            of the CSV, or if the positive category does not occur in the
            category column.
    """
    parser = argparse.ArgumentParser(
        description="A primitive, incomplete commandline interface to Scattertext.")
    parser.add_argument('--datafile', action='store', dest='datafile', required=True,
                        help="Path (or URL) of a CSV file with at least two columns. "
                             "Text and category column names are indicated by the --text_column "
                             "and --category_column arguments.  By default, they are 'text', and 'category'. "
                             "Optionally, a metadata "
                             "column (named in the --metadata_column argument) can be present. ")
    parser.add_argument('--outputfile', action='store', dest='outputfile', default="-",
                        help="Path of HTML file on which to store visualization. Pass in - (default) for stdout.")
    parser.add_argument('--text_column', action='store', dest='text_column', default="text",
                        help="Name of the text column.")
    parser.add_argument('--category_column', action='store', dest='category_column', default="category",
                        help="Name of the category column.")
    parser.add_argument('--metadata_column', action='store', dest='metadata_column', default=None,
                        help="Name of the metadata column.")
    parser.add_argument('--positive_category', action='store', required=True,
                        dest='positive_category',
                        help="Positive category.  A value in category_column to be considered the positive class. "
                             "All others will be considered negative.")
    parser.add_argument('--category_display_name', action='store',
                        dest='category_display_name', default=None,
                        help="Positive category name which will "
                             "be used on the visualization. By default, it will just be the "
                             "positive category value.")
    parser.add_argument('--not_category_display_name', action='store', default=None,
                        dest='not_category_display_name',
                        help="Negative category name which will "
                             "be used on the visualization. By default, it will just be the word 'Not' "
                             "in front of the positive category display name.")
    # NOTE(review): the help text advertises a default of 6, but no default is
    # set here, so None is forwarded when the flag is omitted -- confirm how
    # produce_scattertext_explorer treats None before changing either side.
    parser.add_argument('--pmi_threshold', action='store',
                        dest='pmi_threshold', type=int,
                        help="2 * minimum allowable PMI value.  Default 6.")
    parser.add_argument('--width_in_pixels', action='store',
                        dest='width_in_pixels', type=int, default=1000,
                        help="Width of the visualization in pixels.")
    parser.add_argument('--minimum_term_frequency', action='store',
                        dest='minimum_term_frequency', type=int, default=3,
                        help="Minimum number of times a term needs to appear. Default 3")
    parser.add_argument('--regex_parser', action='store_true',
                        dest='regex_parser', default=False,
                        help="If present, don't use spaCy for preprocessing.  Instead, "
                             "use a simple, dumb, regex.")
    parser.add_argument('--spacy_language_model', action='store',
                        dest='spacy_language_model', default='en',
                        help="If present, pick the spaCy language model to use. Default is 'en'. "
                             "Other valid values include 'de' and 'fr'. --regex_parser will override. "
                             "Please see https://spacy.io/docs/api/language-models for more details")
    parser.add_argument('--one_use_per_doc', action='store_true',
                        dest='one_use_per_doc', default=False,
                        help="Only count one use per document.")
    args = parser.parse_args()
    df = pd.read_csv(args.datafile)

    # Validate user-supplied names up front so failures name the bad argument.
    _require_column(df, 'category_column', args.category_column)
    _require_column(df, 'text_column', args.text_column)
    if args.metadata_column is not None:
        _require_column(df, 'metadata_column', args.metadata_column)
    if args.positive_category not in df[args.category_column].unique():
        raise Exception('positive_category (%s) must be in the column "%s", with a case-sensitive match.' %
                        (args.positive_category, args.category_column))

    if args.regex_parser:
        nlp = whitespace_nlp_with_sentences
    else:
        # Imported lazily so --regex_parser works without spaCy installed.
        import spacy
        nlp = spacy.load(args.spacy_language_model)

    term_ranker = OncePerDocFrequencyRanker if args.one_use_per_doc else None

    # Display names fall back to the positive category value and 'Not <name>'.
    category_display_name = args.category_display_name
    if category_display_name is None:
        category_display_name = args.positive_category
    not_category_display_name = args.not_category_display_name
    if not_category_display_name is None:
        not_category_display_name = 'Not ' + category_display_name

    metadata = None if args.metadata_column is None else df[args.metadata_column]

    corpus = CorpusFromPandas(df,
                              category_col=args.category_column,
                              text_col=args.text_column,
                              nlp=nlp).build()
    html = produce_scattertext_explorer(corpus,
                                        category=args.positive_category,
                                        category_name=category_display_name,
                                        not_category_name=not_category_display_name,
                                        minimum_term_frequency=args.minimum_term_frequency,
                                        pmi_filter_thresold=args.pmi_threshold,
                                        width_in_pixels=args.width_in_pixels,
                                        term_ranker=term_ranker,
                                        metadata=metadata)
    if args.outputfile == '-':
        print(html)
    else:
        with open(args.outputfile, 'wb') as o:
            o.write(html.encode('utf-8'))


def _require_column(df, arg_name, column):
    """Raise if ``column`` (given via the ``arg_name`` argument) is not in ``df``."""
    if column not in df.columns:
        raise Exception("%s (%s) must be a column name in csv. Must be one of %s"
                        % (arg_name, column, ', '.join(df.columns)))
Ejemplo n.º 39
0
import scattertext as st

# Load the 2012 convention sample data and add a 'parse' column produced by
# the whitespace/sentence tokenizer.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))

# Build a unigram-only corpus and compact it with AssociationCompactor(2000).
corpus = st.CorpusFromParsedDocuments(
    df, category_col='party',
    parsed_col='parse').build().get_unigram_corpus().compact(
        st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       show_diagonal=False,
                                       max_overlapping=3,
                                       vertical_lines=0.5)
# Write with an explicit UTF-8 encoding: without it, text mode uses the
# platform default encoding, which may be unable to represent non-ASCII
# characters in the page. The context manager also closes the handle.
with open('./demo_vertical_lines.html', 'w', encoding='utf-8') as out_file:
    out_file.write(html)
print('open ./demo_vertical_lines.html in Chrome')
import scattertext as st

# Build a unigram-only corpus from the 2012 convention sample data, using the
# speaker (rather than the party) as the category.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(convention_df,
                              category_col='speaker',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build().get_unigram_corpus())

# Per-document label shown in the explorer: "<party>: <speaker>".
document_labels = convention_df['party'] + ': ' + convention_df['speaker']

html = st.produce_scattertext_explorer(
    corpus,
    category='BARACK OBAMA',
    sort_by_dist=False,
    metadata=document_labels,
    term_scorer=st.RankDifference(),
    transform=st.Scalers.dense_rank
)
file_name = 'demo_dense_rank_difference.html'
# Write UTF-8 bytes and close the handle deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))