Example 1
    def gitc(self, dataframe):
        general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

        corpus = st.CorpusFromPandas(
            dataframe,
            category_col='Document Type',
            text_col='Text',
            nlp=st.whitespace_nlp_with_sentences,
            feats_from_spacy_doc=general_inquirer_feature_builder).build()

        html = st.produce_frequency_explorer(
            corpus,
            category='submission',
            category_name='Submission',
            not_category_name='Standard',
            use_non_text_features=True,
            use_full_doc=True,
            term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
            grey_threshold=1.96,
            width_in_pixels=1000,
            metadata=dataframe['Document'],
            topic_model_term_lists=(
                general_inquirer_feature_builder.get_top_model_term_lists()))

        logger.getLogger().info("Opening GITC-Visual")
        open(self.gitc_file, 'wb').write(html.encode('utf-8'))
        webbrowser.open("file://" + self.gitc_file)
Example 2
def plot_distinctive_words(x_label='',
                           x_files=[],
                           y_label='',
                           y_files=[],
                           max_words=10000,
                           max_files=100):
    '''
    Create a scatterplot that shows the distinctive words among x_files and y_files.
    Use x_label as the x axis label and y_label as the y_axis label.
    Return HTML content that can be rendered to show the distinctive words.
    '''
    rows = []
    for i in x_files[:max_files]:
        rows.append([x_label, ' '.join(open(i).read().split()[:max_words])])
    for i in y_files[:max_files]:
        rows.append([y_label, ' '.join(open(i).read().split()[:max_words])])
    df = pandas.DataFrame(rows, columns=['Group', 'Text'])
    nlp = spacy.load('en_core_web_sm')  # the bare 'en' shortcut was removed in spaCy v3
    nlp.max_length = 2**64
    corpus = scattertext.CorpusFromPandas(df,
                                          category_col='Group',
                                          text_col='Text',
                                          nlp=nlp).build()
    html = scattertext.produce_scattertext_html(corpus,
                                                category=y_label,
                                                category_name=y_label,
                                                not_category_name=x_label,
                                                minimum_term_frequency=5,
                                                width_in_pixels=1000)
    return html
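A minimal usage sketch for the function above; the folder paths and output filename are hypothetical.

# usage sketch (hypothetical paths): compare two folders of plain-text files
import glob

html = plot_distinctive_words(x_label='Austen',
                              x_files=glob.glob('austen/*.txt'),
                              y_label='Dickens',
                              y_files=glob.glob('dickens/*.txt'))
with open('distinctive_words.html', 'wb') as f:
    f.write(html.encode('utf-8'))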
Example 3
def create_scattertext_plot(df, category_col:str, text_col:str, nlp, filename:str, label_match:str, label_name:str, label_other_name:str, metadata_col:str, **kwargs):
    """ creates a html file with an interactive scattertext plot

    Will delete an 'index' column if there is one as the scattertext function needs to create it
    label_match must be one of 2 entries in the category_col
    label_name is the user-friendly name given to a match, e.g. if label_match is 'Yes', you might want a more meaningful label such as 'A good week'
    label_other_name is the label for the other entry - e.g. 'A bad week'
    **kwargs goes into scattertext.produce_scattertext_explorer, e.g. minimum_term_frequency=8,

    :returns: nothing, but creates a HTML file"""
    if 'index' in df.columns:
        df.drop('index',axis=1,inplace=True)
    corpus = st.CorpusFromPandas(df, category_col=category_col, text_col=text_col, nlp=nlp).build()
    html = st.produce_scattertext_explorer(corpus,
                                           category=label_match,
                                           category_name=label_name,
                                           not_category_name=label_other_name,
                                           metadata=corpus.get_df()[metadata_col],
                                           save_svg_button=True,
                                           **kwargs)

    with open(filename, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
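A hedged usage sketch for create_scattertext_plot; the dataframe, labels, and output filename below are hypothetical stand-ins.

# usage sketch (hypothetical data): 'good_week' holds the 'Yes'/'No' categories
import pandas as pd
import spacy

df = pd.DataFrame({'good_week': ['Yes', 'No'],
                   'entry': ['Slept well and exercised a lot.',
                             'Deadlines and bad coffee everywhere.'],
                   'week': ['2020-01', '2020-02']})
create_scattertext_plot(df, category_col='good_week', text_col='entry',
                        nlp=spacy.load('en_core_web_sm'), filename='weeks.html',
                        label_match='Yes', label_name='A good week',
                        label_other_name='A bad week', metadata_col='week',
                        minimum_term_frequency=1)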
Example 4
def getReviewPosNegPhrases(yelpScraperResult):
    if yelpScraperResult.empty:
        # return an empty pair to match the (positive, negative) tuple below
        return pd.DataFrame(), pd.DataFrame()

    df = yelpScraperResult.copy()

    nlp.Defaults.stop_words |= {
        'will', 'because', 'not', 'friends', 'amazing', 'awesome', 'first',
        'he', 'check-in', '=', '= =', 'male', 'u', 'want', 'u want', 'cuz',
        'him', "i've", 'deaf', 'on', 'her', 'told', 'told him', 'ins',
        'check-ins', 'check-in', 'check', 'I', 'i"m', 'i', ' ', 'it', "it's",
        'it.', 'they', 'coffee', 'place', 'they', 'the', 'this', 'its', 'l',
        '-', 'they', 'this', 'don"t', 'the ', ' the', 'it', 'i"ve', 'i"m', '!',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '/', '.', ','
    }

    corpus = st.CorpusFromPandas(df, category_col=2, text_col=1,
                                 nlp=nlp).build()

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')
    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')
    dh = term_freq_df.sort_values(by='highratingscore', ascending=False)
    dh = dh[['highratingscore', 'poorratingscore']]
    dh = dh.reset_index(drop=False)
    dh = dh.rename(columns={'highratingscore': 'score'})
    dh = dh.drop(columns='poorratingscore')

    # positive dataframe, negative dataframe
    return dh.head(10), dh.tail(10)
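The function returns a (positive, negative) pair, so callers unpack two dataframes; a sketch, assuming yelpScraperResult is the scraper output whose column 1 holds review text and column 2 the star rating (as the positional category_col/text_col above imply).

# usage sketch: unpack the (positive, negative) term dataframes
positive_terms, negative_terms = getReviewPosNegPhrases(yelpScraperResult)
print(positive_terms)
print(negative_terms)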
Example 5
def vis():
    '''
    text1 = open("/home/jxgu/github/unparied_im2text_jxgu/tmp/aic_nmt_val_5k_zh.en.txt", "r").read()
    text2 = open("/home/jxgu/github/unparied_im2text_jxgu/tmp/aic_nmt_val_5k_zh_online.en.txt", "r").read()
    df = pd.DataFrame( [{'text': text.strip(), 'label': 'text1'} for text in text1.decode('utf-8', errors='ignore').split('\n')] + [{'text': text.strip(), 'label': 'text2'} for text in text2.decode('utf-8', errors='ignore').split('\n')]
    )
    term_doc_mat = ST.TermDocMatrixFromPandas(data_frame = df, category_col = 'label', text_col = 'text', nlp = ST.whitespace_nlp ).build()
    filtered_term_doc_mat = (ST.TermDocMatrixFilter(pmi_threshold_coef = 1, minimum_term_freq = 1).filter(term_doc_mat))
    scatter_chart_data = (ST.ScatterChart(filtered_term_doc_mat).to_dict('text1', category_name='text1', not_category_name='text2'))
    viz_data_adapter = ST.viz.VizDataAdapter(scatter_chart_data)
    html = ST.viz.HTMLVisualizationAssembly(viz_data_adapter).to_html()
    open('subj_obj_scatter.html', 'wb').write(html.encode('utf-8'))
    IFrame(src='subj_obj_scatter.html', width = 1000, height=1000)
    '''

    SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'
    data = io.BytesIO(urllib.request.urlopen(SUBJECTIVITY_URL).read())  # Python 3: urllib.request
    tarball = tarfile.open(fileobj=data, mode='r:gz')
    readme = tarball.extractfile('subjdata.README.1.0').read()
    quote = tarball.extractfile('quote.tok.gt9.5000').read()
    plot = tarball.extractfile('plot.tok.gt9.5000').read()

    text1 = open("tmp/flickr_test_1k_zh.en.txt", "r").read()
    text2 = open("tmp/flickr_test_1k_zh.en.txt", "r").read()
    # Examples of subjective sentences in corpus
    #quote.decode('utf-8', errors='ignore').split('\n')[:3]
    '''Construct subjective vs. objective pandas dataframe, 
    treating review quotes as subjective, and plot points as objective.
    '''
    df = pd.DataFrame(
        [{
            'text': text.strip(),
            'label': 'subjective'
        } for text in quote.decode('utf-8', errors='ignore').split('\n')] +
        [{
            'text': text.strip(),
            'label': 'objective'
        } for text in plot.decode('utf-8', errors='ignore').split('\n')])
    '''Convert Pandas dataframe to a term-document matrix, indicating
    the category column is "label" and the text column name is "text".'''
    nlp = spacy.load('en_core_web_sm')  # the bare 'en' shortcut was removed in spaCy v3

    corpus = ST.CorpusFromPandas(
        data_frame=df,
        category_col='label',
        text_col='text',
        # Note: ST.whitespace_nlp suffices for text that is already tokenized
        nlp=nlp).build()
    term_freq_df = corpus.get_term_freq_df()

    html = ST.produce_scattertext_explorer(corpus,
                                           category='subjective',  # a category value, not the column name
                                           category_name='subjective',
                                           not_category_name='objective',
                                           width_in_pixels=1000)
    open("Convention-Visualization.html", 'wb').write(html.encode('utf-8'))
Example 6
def get_sct_html(rest_name, city_name):
    rest_reviews = get_rest_reviews(rest_name, city_name)
    nlp = spacy.load('en_core_web_sm')
    corpus = sct.CorpusFromPandas(rest_reviews,
                                  category_col='class',
                                  text_col='text',
                                  nlp=nlp).build()
    html = sct.produce_scattertext_explorer(corpus,
                                            category='good',
                                            category_name='Positive',
                                            not_category_name='Negative',
                                            width_in_pixels=900,
                                            metadata=rest_reviews['class'])
    return open("rest_reviews-Vis.html", 'wb').write(html.encode('utf-8'))
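A one-line usage sketch; the restaurant and city are hypothetical, and get_rest_reviews is assumed to be defined elsewhere in the module.

# usage sketch (hypothetical arguments)
get_sct_html('Shake Shack', 'New York')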
Example 7
def getReviewPosNegPhrases(df_reviews, topk=10):

    if df_reviews.empty:
        return pd.DataFrame(), pd.DataFrame()

    df = df_reviews.copy()
    df['stars'] = df['stars'].astype(str)

    nlp = spacy.load("en_core_web_sm")
    nlp.Defaults.stop_words |= {
        'will', 'because', 'not', 'friends', 'amazing', 'awesome', 'first',
        'he', 'check-in', 'and', 'some', '=', '= =', 'male', 'u', 'want',
        'u want', 'cuz', 'also', 'find', 'him', "i've", 'deaf', 'on', 'her',
        'told', 'told him', 'ins', 'check-ins', 'check-in', 'check', 'I',
        'i"m', 'i', ' ', 'it', "it's", 'it.', 'they', 'coffee', 'place',
        "it 's", "'s", 'they', 'the', 'this', 'its', 'l', '-', 'they', 'this',
        'don"t', 'the ', ' the', 'it', 'i"ve', 'i"m', '!', '&', '1', '2', '3',
        '4', '5', '6', '7', '8', '9', '0', '/', '.', ','
    }

    corpus = st.CorpusFromPandas(df,
                                 category_col='stars',
                                 text_col='text',
                                 nlp=nlp).build()
    term_freq_df = corpus.get_term_freq_df()

    categories = df['stars'].unique()
    high, poor = np.array([]), np.array([])
    if '5' in categories:
        high = corpus.get_scaled_f_scores('5')
    elif '4' in categories:
        high = corpus.get_scaled_f_scores('4')
    if '1' in categories:
        poor = corpus.get_scaled_f_scores('1')
    elif '2' in categories:
        poor = corpus.get_scaled_f_scores('2')

    df_high, df_poor = pd.DataFrame(), pd.DataFrame()
    columns = ['term', 'score']
    if high.shape[0] > 0:
        df_high = pd.DataFrame([term_freq_df.index.tolist(), high]).T
        df_high = df_high.sort_values(1, ascending=False).head(topk)
        df_high.columns = columns
    if poor.shape[0] > 0:
        df_poor = pd.DataFrame([term_freq_df.index.tolist(), poor]).T
        df_poor = df_poor.sort_values(1, ascending=False).head(topk)
        df_poor.columns = columns

    # positive dataframe, negative dataframe (each already limited to topk rows)
    return df_high, df_poor
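A usage sketch with a tiny hypothetical reviews dataframe; real data would have many more rows per star rating.

# usage sketch (hypothetical reviews with 'stars' and 'text' columns)
import pandas as pd

df_reviews = pd.DataFrame({'stars': [5, 5, 1, 1],
                           'text': ['Great food and friendly staff',
                                    'Lovely patio, great drinks',
                                    'Cold food and a long wait',
                                    'Rude staff, reheated fries']})
df_high, df_poor = getReviewPosNegPhrases(df_reviews, topk=5)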
Example 8
def create_corpus(category, speeches_df):
    """
    creates scattertext corpus from speeches dictionary
    :param category:
    :param speeches_df:
    :return:
    """
    corpus = st.CorpusFromPandas(speeches_df, category_col=category, text_col='text', nlp=nlp).build()
    update_stop = []
    for term in STOP_WORDS:
        if term in corpus._term_idx_store:
            update_stop.append(term)
    corpus = corpus.remove_terms(update_stop)
    return corpus
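The returned corpus is typically handed to one of scattertext's produce_* functions; a sketch, assuming speeches_df has a 'party' category column whose values include 'Democrat' and 'Republican'.

# sketch: visualize the corpus returned above (category values are assumptions)
corpus = create_corpus('party', speeches_df)
html = st.produce_scattertext_explorer(corpus,
                                       category='Democrat',
                                       category_name='Democrat',
                                       not_category_name='Republican',
                                       width_in_pixels=1000)
open('speeches.html', 'wb').write(html.encode('utf-8'))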
Example 9
def create_scatterplot(df, return_corpus=False):
    '''Creates an HTML file to visualize differences in corpora.'''
    corpus = st.CorpusFromPandas(df,
                                 category_col='author',
                                 text_col='text',
                                 nlp=nlp).build()
    if return_corpus:
        return corpus
    html = st.produce_scattertext_explorer(corpus,
                                           category='EAP',
                                           category_name='Edgar Allan Poe',
                                           not_category_name='HPL/MWS',
                                           width_in_pixels=1000,
                                           metadata=df['author'])
    open("Author-Visualization.html", 'wb').write(html.encode('utf-8'))
Example 10
    def standard(self, dataframe):
        corpus = st.CorpusFromPandas(dataframe, category_col='Document Type',
                                     text_col='Text', nlp=self.nlp).build()

        html = st.produce_scattertext_explorer(corpus, category='1st Document',
                                               category_name='1st Document',
                                               not_category_name='2nd Document',
                                               width_in_pixels=1000)

        logger.getLogger().info("Opening Standard Visual")

        open(self.std_file, 'wb').write(html.encode('utf-8'))

        if os.path.isfile(self.std_file):
            logger.getLogger().info("Graph file created")
Example 11
    def standard(self, dataframe):
        corpus = st.CorpusFromPandas(dataframe,
                                     category_col='Document Type',
                                     text_col='Text',
                                     nlp=self.nlp).build()

        html = st.produce_scattertext_explorer(corpus,
                                               category='submission',
                                               category_name='Submission',
                                               not_category_name='Standard',
                                               width_in_pixels=1000,
                                               metadata=dataframe['Document'])

        logger.getLogger().info("Opening Standard Visual")
        open(self.std_file, 'wb').write(html.encode('utf-8'))
        webbrowser.open("file://" + self.std_file)
Example 12
    def __init__(self,
                 list_directory,
                 list_author,
                 language: str = 'fr',
                 encoding='utf-8'):
        self.list_text = self.read_directory(list_directory, encoding)
        self.list_author = list_author
        self.df = pd.DataFrame()
        self.df["text"] = self.list_text
        self.df["author"] = self.list_author
        self.language = language
        self.nlp = spacy.load(language)
        self.corpus = st.CorpusFromPandas(self.df,
                                          category_col='author',
                                          text_col='text',
                                          nlp=self.nlp).build()
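Instantiation might look like the sketch below; AuthorCorpus is a hypothetical stand-in for the class this __init__ belongs to, and the directory and author lists are illustrative (one author per text read).

# hypothetical usage; AuthorCorpus stands in for the real class name
analyzer = AuthorCorpus(list_directory=['corpus/hugo', 'corpus/zola'],
                        list_author=['Hugo', 'Zola'],
                        language='fr')
print(analyzer.corpus.get_num_docs())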
Example 13
    def word_similarity_graph(self, dataframe, word):
        corpus = st.CorpusFromPandas(dataframe, category_col='Document Type',
                                     text_col='Text', nlp=self.nlp).build()

        html = word_similarity_explorer(corpus,
                                        category='1st Document',
                                        category_name='1st Document',
                                        not_category_name='2nd Document',
                                        target_term=word,
                                        minimum_term_frequency=5,
                                        pmi_threshold_coefficient=4,
                                        width_in_pixels=1000,
                                        alpha=0.01,
                                        max_p_val=0.05,
                                        save_svg_button=True)
        logger.getLogger().info("Opening Word Similarity Visual")
        open(self.term_file, 'wb').write(html.encode('utf-8'))
Example 14
def processor(df_reviews):
    
    if len(df_reviews)==0:
        return None
    
    nlp = spacy.load("en_core_web_sm-2.1.0/en_core_web_sm/en_core_web_sm-2.1.0")

    # add stop words
    with open('stopwords.txt', 'r') as f:
        contents = f.read()  # avoid shadowing the built-in str
        set_stopwords = set(contents.split('\n'))
    nlp.Defaults.stop_words |= set_stopwords
 
    corpus = (scattertext.CorpusFromPandas(df_reviews, 
                                          category_col='rating', 
                                          text_col='text',
                                          nlp=nlp)
              .build()
              .remove_terms(nlp.Defaults.stop_words, ignore_absences=True)
             )

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores('5.0 star rating')
    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores('1.0 star rating')

    df_high = term_freq_df.sort_values(by='highratingscore', 
                                       ascending = False)
    df_poor = term_freq_df.sort_values(by='poorratingscore', 
                                       ascending=False)

    df_high = df_high[['highratingscore', 'poorratingscore']]
    df_high['highratingscore'] = round(df_high['highratingscore'], 2)
    df_high['poorratingscore'] = round(df_high['poorratingscore'], 2)
    df_high = df_high.reset_index(drop=False)
    df_high = df_high.head(5)

    df_poor = df_poor[['highratingscore', 'poorratingscore']]
    df_poor['highratingscore'] = round(df_poor['highratingscore'], 2)
    df_poor['poorratingscore'] = round(df_poor['poorratingscore'], 2)
    df_poor = df_poor.reset_index(drop=False)
    df_poor = df_poor.head(5)

    df_terms = pd.concat([df_high, df_poor],
                         ignore_index=True)
    return df_terms
Example 15
def getYelpWords(yelpScraperResult):
    df = yelpScraperResult

    nlp.Defaults.stop_words |= {
        'will', 'because', 'not', 'friends', 'amazing', 'awesome', 'first',
        'he', 'check-in', '=', '= =', 'male', 'u', 'want', 'u want', 'cuz',
        'him', "i've", 'deaf', 'on', 'her', 'told', 'told him', 'ins',
        'check-ins', 'check-in', 'check', 'I', 'i"m', 'i', ' ', 'it', "it's",
        'it.', 'they', 'coffee', 'place', 'they', 'the', 'this', 'its', 'l',
        '-', 'they', 'this', 'don"t', 'the ', ' the', 'it', 'i"ve', 'i"m', '!',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '/', '.', ','
    }

    corpus = st.CorpusFromPandas(df, category_col=2, text_col=1,
                                 nlp=nlp).build()

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')

    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')
    dh = term_freq_df.sort_values(by='highratingscore', ascending=False)
    dh = dh[['highratingscore', 'poorratingscore']]
    dh = dh.reset_index(drop=False)
    dh = dh.rename(columns={'highratingscore': 'score'})
    dh = dh.drop(columns='poorratingscore')
    positive_df = dh.head(10)
    negative_df = dh.tail(10)
    results = {
        'positive': [{
            'term': pos_term,
            'score': pos_score
        } for pos_term, pos_score in zip(positive_df['term'],
                                         positive_df['score'])],
        'negative': [{
            'term': neg_term,
            'score': neg_score
        } for neg_term, neg_score in zip(negative_df['term'],
                                         negative_df['score'])]
    }
    return results
Example 16
def wordfreqdf(df):
    corpus = st.CorpusFromPandas(df,
                                 category_col='party',
                                 text_col='work_for',
                                 nlp=nlp).build()
    term_freq_df = corpus.get_term_freq_df()
    result_df = pd.DataFrame(columns=["1", "2", "3", "4", "5", "6"])
    parties = df['party'].sort_values().unique()

    index = 0
    for party in parties:
        party_score_name = party + "_Score"
        term_freq_df[party_score_name] = corpus.get_scaled_f_scores(party)
        result_df.loc[index] = list(
            term_freq_df.sort_values(by=party_score_name,
                                     ascending=False).index[:6])
        index = index + 1
    result_df["Party"] = parties
    result_df.set_index("Party", inplace=True)
    return result_df
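A usage sketch; the two-party dataframe below is hypothetical, and the module is assumed to define the global nlp that wordfreqdf uses.

# usage sketch (hypothetical 'party' and 'work_for' columns)
import pandas as pd

df = pd.DataFrame({'party': ['A', 'A', 'B', 'B'],
                   'work_for': ['jobs and economic growth now',
                                'lower taxes and small government',
                                'health care and public schools',
                                'clean energy and public transit']})
top_terms_by_party = wordfreqdf(df)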
Example 17
    def chrctrstc(self, dataframe):
        corpus = (st.CorpusFromPandas(dataframe,
                                      category_col='Document Type',
                                      text_col='Text',
                                      nlp=st.whitespace_nlp_with_sentences)
                  .build()
                  .get_unigram_corpus()
                  .compact(st.ClassPercentageCompactor(
                      term_count=2,
                      term_ranker=st.OncePerDocFrequencyRanker)))

        html = st.produce_characteristic_explorer(
            corpus,
            category='submission',
            category_name='Submission',
            not_category_name='Standard',
            metadata=dataframe['Document'])

        logger.getLogger().info("Opening Characteristic Visual")
        open(self.chr_file, 'wb').write(html.encode('utf-8'))
        webbrowser.open("file://" + self.chr_file)
Example 18
def create_visual_corpus(category, speeches_df):
    """
    creates scattertext corpus from speeches dictionary
    :param category:
    :param speeches_df:
    :return:
    """
    corpus = st.CorpusFromPandas(speeches_df,
                                 category_col=category,
                                 text_col='text',
                                 nlp=nlp).build()
    update_stop = []
    STOP_WORDS.update([
        "»", "—", "«", "cuyas", "cuyos", "100", "fué", "ido", "hubieran",
        "hagan", "–", "hubiera", "tuve"
    ])
    for term in STOP_WORDS:
        if term in corpus._term_idx_store:
            update_stop.append(term)
    corpus = corpus.remove_terms(update_stop)
    return corpus
Example 19
def scatterplot(df):
    '''
    input: a dataframe with text, CEO, and quarter 
    output: a scatterplot
    '''
    corpus = st.CorpusFromPandas(df,
                                 category_col='ceo',
                                 text_col='text',
                                 nlp=st.whitespace_nlp_with_sentences).build()

    html = st.produce_scattertext_explorer(
        corpus,
        category='Ballmer',
        category_name='Steve Ballmer Era',
        not_category_name='Satya Nadella Era',
        minimum_term_frequency=10,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000,
        metadata=df['quarter'],
    )

    open('../Charts/scattertext_demo.html', 'wb').write(html.encode('utf-8'))
Example 20
def generate_visual(data,
                    category,
                    category_name,
                    not_category_name,
                    filename='index.html'):
    
    import spacy
    import scattertext as st

    nlp = spacy.load('en_core_web_sm')

    corpus = st.CorpusFromPandas(data, 
                                 category_col='label', 
                                 text_col='abstract',
                                 nlp=nlp).build()

    html = st.produce_scattertext_explorer(corpus,
                                           category=category,
                                           category_name=category_name,
                                           not_category_name=not_category_name,
                                           width_in_pixels=1000,
                                           metadata=data['journal'])

    return html
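Note that the filename parameter above is accepted but never used, so the caller has to write the returned HTML itself. A sketch, assuming data is a dataframe with 'label', 'abstract', and 'journal' columns and that 'cs' is one of the label values:

# usage sketch: write the returned html yourself
html = generate_visual(data,
                       category='cs',
                       category_name='Computer Science',
                       not_category_name='Other fields',
                       filename='index.html')
with open('index.html', 'wb') as f:
    f.write(html.encode('utf-8'))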
Example 21
def ValuePredictor(yelp_url, from_isbn=False):
    '''Takes a url, scrape site for reviews
    and calculates the term frequencies
    sorts and returns the top 10 as a json object
    containing term, highratingscore, poorratingscore.'''

    base_url = "https://www.yelp.com/biz/"  # add business id
    api_url = "/review_feed?sort_by=date_desc&start="
    bid = yelp_url.replace('https://www.yelp.com/biz/', '')
    if '?' in bid:  # delete everything after "?" in the business id
        bid = bid.split('?')[0]

    class Scraper():
        def __init__(self):
            self.data = pd.DataFrame()

        def get_data(self, n, bid=bid):
            with Session() as s:
                with s.get(
                        base_url + bid + api_url + str(n * 20)
                ) as resp:  #makes an http get request to given url and returns response as json
                    r = dict(
                        resp.json())  #converts json response into a dictionary
                    _html = html.fromstring(
                        r['review_list'])  #loads from dictionary

                    dates = _html.xpath(
                        "//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()"
                    )
                    reviews = [
                        el.text for el in _html.xpath(
                            "//div[@class='review-content']/p")
                    ]
                    ratings = _html.xpath(
                        "//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title"
                    )

                    df = pd.DataFrame([dates, reviews, ratings]).T

                    self.data = pd.concat([self.data, df])

        def scrape(self):
            # fetch 10 review pages concurrently (multithreaded)
            with Executor(max_workers=40) as e:
                list(e.map(self.get_data, range(10)))

    s = Scraper()
    s.scrape()
    df = s.data  # scraped data as a DataFrame
    df.columns = ['date', 'review', 'rating']

    # one row per sentence: split each review on '.' and stack
    df = df.set_index(df.columns.drop('review').tolist()).review.str.split(
        '.', expand=True).stack().reset_index().rename(columns={
            0: 'review'
        }).loc[:, df.columns]

    # strip punctuation from the review text; a whole-cell df.replace(',', '')
    # would only replace cells that are exactly ','
    for ch in (',', '!', '#', '.'):
        df['review'] = df['review'].str.replace(ch, '', regex=False)
    tokenizer = Tokenizer(nlp.vocab)
    STOP_WORDS = nlp.Defaults.stop_words.union([
        'gets', 'incredible', 'disappoint', 'from', 'perfection', 'loved',
        'definitely', 'happy', 'find', 'found', 'simply', 'fantastic',
        'recommend', 'feel', 'little', 'i', 'wow', 'absolute', 'favorite',
        'excellent', 'delicious', 'great', 'maybe', 'very', 'enjoy', 'list',
        'gave', 'date', 'went', 'disappointed', 'nyc', 'got', '#', 'crazy',
        'other', 'fairness', 'fair', 'mid', 'from', 'highly', 'perfect',
        'perfectly', 'come', 'lovely', 'visit', 'ny', 'nyc', 'best', 'amazing',
        'love', 'absolutely', 'like', 'good', 'other', 'from', 'ny',
        'restaurant', 'we', 'will', 'because', 'not', 'friends', 'amazing',
        'awesome', 'first', 'he', 'check-in', '=', '= =', 'male', 'u', 'want',
        'u want', 'cuz', 'him', "i've", 'deaf', 'on', 'her', 'told',
        'told him', 'ins', 'check-ins', 'check-in', 'check', 'I', 'i"m', 'i',
        'it', "it's", 'it.', 'they', 'coffee', 'place', 'they', 'the', 'this',
        'its', 'l', '-', 'they', 'this', 'don"t', 'the ', ' the', 'it', 'i"ve',
        'i"m', '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '(', ')',
        '/', '.', ',', '!'
    ])
    # STOP_WORDS
    df = df[df['review'].notna()]  # '!= None' does not filter missing values
    tokens = []

    for doc in tokenizer.pipe(df['review'], batch_size=500):

        doc_tokens = []

        for token in doc:
            if token.text not in STOP_WORDS and not token.is_punct:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)

    df['review'] = tokens
    df['review'] = df['review'].apply(lambda toks: ' '.join(str(t) for t in toks))
    df['review'].replace(' ', np.nan, inplace=True)
    df = df.dropna()

    corpus = (st.CorpusFromPandas(df,
                                  category_col='rating',
                                  text_col='review',
                                  nlp=nlp).build().remove_terms(
                                      STOP_WORDS, ignore_absences=True))

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')

    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')
    # term_freq_df = term_freq_df[term_freq_df['1.0 star rating freq'] > 3]
    dp = term_freq_df.sort_values(by='poorratingscore', ascending=False)
    dp = dp[~dp.index.str.contains('-')]
    dp = dp[~dp.index.str.contains("'")]
    dp = dp[~dp.index.str.contains('/')]
    dh = term_freq_df.sort_values(by='highratingscore', ascending=False)
    dh = dh[~dh.index.str.contains('-')]
    dh = dh[~dh.index.str.contains("'")]
    dh = dh[~dh.index.str.contains('/')]
    dhi = dh.head(75)
    dpo = dp.head(75)  # top poor-rating terms come from dp; dh.tail was a bug
    dfinal = pd.concat([dhi, dpo])
    # dh = dh.reset_index(drop=False)

    # return dh.to_dict('index')
    return dfinal.to_dict('index')
Example 22
from sklearn.decomposition import KernelPCA, NMF
from sklearn.preprocessing import RobustScaler
from statsmodels.multivariate.pca import PCA

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder,
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=(
        general_inquirer_feature_builder.get_top_model_term_lists()),
    topic_model_preview_size=100,
    metadata_descriptions=general_inquirer_feature_builder.get_definitions(),
    metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot_geninq.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
Example 23
import pandas as pd
import scattertext as st
from IPython.display import IFrame

# Join the two dataframes along the column axis
# (df1 and df2 are assumed to hold the CNN and Fox transcripts)
convention_df = pd.concat([df1, df2], axis=1)

# Place all text in same column and create tag for CNN or Fox
convention_df = pd.melt(convention_df)
convention_df = convention_df.dropna(axis=0, how='any')

# Build NLP parsing for corpus
English = st.whitespace_nlp_with_sentences

# Parse the text and create new column with parsed values
# (the groupby line is exploratory: it computes a word count per source and discards it)
convention_df.groupby('variable').apply(lambda x: x.value.apply(lambda s: len(s.split())).sum())
convention_df['parsed'] = convention_df.value.apply(English)

convention_df.iloc[:3]

# Generate corpus of language from pandas dataframe
corpus = st.CorpusFromPandas(convention_df, category_col='variable', text_col='value', nlp = English).build()

# Output html doc for visualization
# (the output/ directory must already exist; the file itself is created on write)
html = st.produce_scattertext_explorer(corpus,
                                       category='CNN',
                                       category_name='CNN',
                                       not_category_name='Fox',
                                       width_in_pixels=1000)

file_name = 'output/Trump.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)
Example 24
import spacy  # used by scattertext itself for preprocessing
import pandas  # used to format the dataset for building the scattertext corpus
import pickle  # used here only to load the dataset; load yours however you like
import scattertext

# Loading the dataset
with open("data1", 'rb') as data_file:
    documentos, classes = pickle.load(data_file)

# Put your dataset into a pandas DataFrame; this table will be used to build the
# scattertext corpus. The texts should not be preprocessed, since scattertext
# will use spacy for that
data = pandas.DataFrame({"texto": documentos, "classes": classes})

# Building the corpus with scattertext; from it you get access to a lot of
# useful information about your dataset
nlp = spacy.load('en_core_web_sm')  # the bare 'en' shortcut was removed in spaCy v3
corpus = scattertext.CorpusFromPandas(data,
                                      category_col='classes',
                                      text_col='texto',
                                      nlp=nlp).build()

# Examples
print("Number of documents: " + str(corpus.get_num_docs()))
print("Document lengths: " + str(corpus.get_doc_lengths()))
print("Number of terms: " + str(corpus.get_num_terms()))

print("Words that differ most from the background corpus: ")
x = corpus.get_scaled_f_scores_vs_background()
print(list(x.index[0:10]))

# Word frequency per class
term_freq_df = corpus.get_term_freq_df()
term_freq_df['positivo'] = corpus.get_scaled_f_scores('positivo')
term_freq_df['negativo'] = corpus.get_scaled_f_scores('negativo')
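From the corpus built above you can also render the interactive scatterplot; a minimal sketch, assuming the 'classes' column holds 'positivo' and 'negativo' (as the f-score lines above suggest):

# sketch: render the interactive scatterplot from the corpus built above
html = scattertext.produce_scattertext_explorer(corpus,
                                                category='positivo',
                                                category_name='Positivo',
                                                not_category_name='Negativo',
                                                width_in_pixels=1000)
with open('scatter.html', 'wb') as f:
    f.write(html.encode('utf-8'))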
Example 25
#%%
import scattertext as st
import spacy
from pprint import pprint
import feather as fea


#%%

df = fea.read_dataframe("C:/Users/au615270/Dropbox/CROW_FAR/First_Repository_CROW_FAR/full model files/GloVe Model/dataframe.feather")

df.iloc[0]

# %%

nlp = spacy.load('en_core_web_sm')
corpus = st.CorpusFromPandas(df, 
                              category_col='RU', 
                              text_col='referat',
                              nlp=nlp).build()

#%%
print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))
Example 26
def getPosNegLongPhrases(df_reviews, topk=10):
    nlp = spacy.load("en_core_web_sm")
    if df_reviews.empty:
        return pd.DataFrame()
    df = df_reviews.copy()
    df['stars'] = df['stars'].astype(str)
    df = df.dropna()
    df['only_alphabets'] = df['text'].apply(
        lambda x: ' '.join(re.findall("[a-zA-Z]+", x)))

    # lowercase the text; the original looped over replace_dict_phrase_count,
    # which is not defined until the except branch below, so a single pass is used
    df['only_alphabets'] = df['only_alphabets'].str.lower()

    stopwords = [
        'maybe', 'from', 'first', 'here', 'only', 'put', 'where', 'got',
        'sure', 'their', 'us', 'definitely', 'food', 'yet', 'our', 'go',
        'since', 'really', 'very', 'two', "don t", 'with', 'if', "hers",
        'which', 'came', 'all', 'me', 'makes', 'make', 'were', 'immediately',
        'get', 'been', 'ahead', 'also', 'that', 'were', 'one', 'have', 'see',
        'what', 'to', 'we', 'had', 'the', "re", 'it', 'or', 'he', 'she', 'we',
        'us', 'how', 'went', 'no', 'of', 'has', 'by', 'bit', 'thing', 'place',
        'so', 'ok', 'and', 'they', 'none', 'was', 'you', "ve", 'was', 'did',
        'be', 'and', 'but', 'is', 'as', 'you', 'has', 'and', 'had', 'was',
        'him', 'so', 'my', 'did', 'our', 'there', 'would', 'her', 'him', 'it',
        'is', 'by', 'bit', 'thing', 'place', 'while', 'check in', 'they',
        'them', 'want', 'good', 'husband', 'want', 'love', 'something', 'your',
        'they', 'your', 'cuz', 'him', "i ll", 'her', 'told', 'check', 'im',
        "his", 'they', 'this', 'it s', 'they', 'this', "won t", 'the', 'it',
        'i ve'
    ]

    def filter_stopwords(text):
        for i in str(text):
            if i not in stopwords:
                return str(text)

    # .isin() only drops rows whose entire text exactly matches a stop word;
    # it does not delete stop words from inside individual reviews
    df = df[~df['only_alphabets'].isin(stopwords)]
    try:
        corpus = st.CorpusFromPandas(df,
                                     category_col='stars',
                                     text_col='only_alphabets',
                                     nlp=nlp).build()
        term_freq_df = corpus.get_term_freq_df()
        term_freq_df = pd.DataFrame(term_freq_df.to_records(
        ))  # flatten multi-level index to rename columns
        term_freq_df = term_freq_df.rename(columns={
            '5 freq': '5.0',
            '4 freq': '4.0',
            '2 freq': '2.0',
            '1 freq': '1.0'
        })

        categories = df['stars'].unique()
        freq_word_list = np.array([])
        if '5' in categories:
            freq_word_list = corpus.get_scaled_f_scores('5')
        elif '4' in categories:
            freq_word_list = corpus.get_scaled_f_scores('4')
        if '1' in categories:
            freq_word_list = corpus.get_scaled_f_scores('1')
        elif '2' in categories:
            freq_word_list = corpus.get_scaled_f_scores('2')

        df_wordFreq = pd.DataFrame()
        columns = ['term', 'score']
        if freq_word_list.shape[0] > 0:
            df_wordFreq = pd.DataFrame(
                [term_freq_df.term.tolist(), freq_word_list]).T
            df_wordFreq = df_wordFreq.sort_values(1,
                                                  ascending=True)  #.head(topk)
            df_wordFreq.columns = columns
    except Exception:  # corpus build failed; fall back to simple word splitting
        df['word_list'] = df['only_alphabets'].apply(
            lambda x: x[1:-1].split(' '))
        df['word_list'] = df['word_list'].astype(str)

        df['word_list'] = df['word_list'].apply(
            lambda x: ''.join([str(i) for i in x]))
        df['word_list'] = df['word_list'].str.lower()

        df_wordFreq = df[['word_list', 'stars']]

        s = df_wordFreq.apply(lambda x: pd.Series(x['word_list']),
                              axis=1).stack().reset_index(level=1, drop=True)
        s.name = 'word_list'

        df_wordFreq = pd.DataFrame(df['word_list'].str.split(',').tolist(),
                                   index=df['stars']).stack()

        df_wordFreq = df_wordFreq.reset_index()[[
            0, 'stars'
        ]]  # var1 variable is currently labeled 0
        df_wordFreq.columns = ['term', 'score']  # renaming var1
        df_wordFreq = df_wordFreq.reset_index(drop=False)

        replace_dict_phrase_count = {
            '[': '',
            ']': '',
            '-': '',
            '!': '',
            '.': '',
            "'": '',
            ' ': ''
        }
        for key in replace_dict_phrase_count.keys():
            df_wordFreq['term'] = df_wordFreq['term'].str.replace(
                key, replace_dict_phrase_count[key], regex=False)  # '[' and '.' are regex metacharacters
            df_wordFreq['term'] = df_wordFreq['term'].str.lower()
    x, y = df_wordFreq.shape
    if x > 100:
        df_wordFreq = pd.concat([df_wordFreq.head(50), df_wordFreq.tail(50)])
    x, y = df_wordFreq.shape  # updated size
    top_terms_list = []
    for i in range(math.ceil(x / 2)):
        try:
            new_df = df[df['only_alphabets'].str.contains(
                df_wordFreq['term'].iloc[i])]  #if word appears
            # in review, create a dataframe with each row being the word occurring in a different review
            pos_first_df = new_df.sort_values(
                by='stars', ascending=False)  #rank the dataframe with most
            # positive reviews first
            if pos_first_df['text'].iloc[
                    0] not in top_terms_list:  #get the highest star rating review
                top_terms_list.append(pos_first_df['text'].iloc[0])
        except IndexError as e:
            pass
    worst_terms_list = []
    for i in reversed(range(math.ceil(x / 2), x)):
        try:
            new_df = df[df['only_alphabets'].str.contains(
                df_wordFreq['term'].iloc[i])]  #if word appears
            # in review, create a dataframe with each row being the word occurring in a different review
            neg_first_df = new_df.sort_values(
                by='stars', ascending=True)  #rank the dataframe with worst
            # reviews first
            if neg_first_df['text'].iloc[
                    0] not in worst_terms_list:  #get the lowest star rating review
                worst_terms_list.append(
                    neg_first_df['text'].iloc[0])  #prevent duplicates
        except IndexError as e:
            pass
    del [df, new_df, neg_first_df, pos_first_df]
    negative_list = []
    for i in reversed(range(math.ceil(x / 2), x)):
        for list_of_words in worst_terms_list:
            word_list = list_of_words.split(' ')
            for word in word_list:
                try:
                    if df_wordFreq['term'].iloc[
                            i] == word:  # find word occurrence in original
                        # comma separated word list of reviews
                        try:
                            index = word_list.index(word)
                            string_from_phrases = ' '.join(
                                word_list[max(0, index -
                                              2):min(index +
                                                     4, len(word_list))])
                            negative_list.append(string_from_phrases)
                        except ValueError as e:
                            pass
                except IndexError as e:  # if fewer than half of the
                    # df_wordFreq words remain after stopword filtering, just get the first word and
                    # its occurrence in the original review
                    if df_wordFreq['term'].iloc[0] == word:
                        try:
                            index = word_list.index(word)
                            string_from_phrases = ' '.join(
                                word_list[max(0, index -
                                              2):min(index +
                                                     4, len(word_list))])
                            negative_list.append(string_from_phrases)
                        except ValueError as e:
                            pass
    negative_df = pd.DataFrame(negative_list)
    negative_df = negative_df.reset_index(drop=False)
    negative_df = negative_df.rename(columns={'index': 'score', 0: 'term'})
    neg_no_dup = negative_df.drop_duplicates(subset='term')
    negative_phrase_list, y = neg_no_dup.shape
    if negative_phrase_list <= 10:
        num_time_append = 10 - negative_phrase_list
        for i in range(num_time_append):
            if 'term' not in list(negative_df):
                negative_df = pd.concat(  # DataFrame.append was removed in pandas 2.0
                    [negative_df, pd.DataFrame([.5], columns=['score'])])
                negative_df['term'] = ''
            negative_df = pd.concat(
                [negative_df, pd.DataFrame([[.5, '']], columns=['score', 'term'])])
    else:
        negative_df = neg_no_dup
        del [neg_no_dup]
    replace_dict_phrase = {
        ',': ' ',
        '\u00a0': '',
        '\n': '',
        '!': '',
        '.': '',
        "'": ''
    }
    for key in replace_dict_phrase.keys():
        negative_df['term'] = negative_df['term'].str.replace(
            key, replace_dict_phrase[key], regex=False)  # '.' alone would match any character as a regex
    #normalize score for positive connotation words going from 0 to 0.5
    negative_df['score'] = negative_df['score'].div(
        (negative_df['score'].max()) * 2, axis=0)
    negative_df = negative_df.sort_values(by=['score'], ascending=False)
    negative_df['score'] = negative_df['score'].round(decimals=4)

    positive_list = []
    for i in range(math.ceil(x / 2)):
        for list_of_words in top_terms_list:
            word_list = list_of_words.split(' ')
            for word in word_list:
                try:
                    if df_wordFreq['term'].iloc[
                            i] == word:  # find word occurrence in original
                        # comma separated word list of reviews
                        try:
                            index = word_list.index(word)
                            string_from_phrases = ','.join(
                                word_list[max(0, index -
                                              2):min(index +
                                                     4, len(word_list))])
                            positive_list.append(string_from_phrases)
                        except ValueError as e:
                            pass
                except IndexError as e:
                    if df_wordFreq['term'].iloc[
                            0] == word:  # if fewer than half of the
                        # df_wordFreq words remain after stopword filtering, just get the first word and
                        # its occurrence in the original review
                        try:
                            index = word_list.index(word)
                            string_from_phrases = ','.join(
                                word_list[max(0, index -
                                              2):min(index +
                                                     4, len(word_list))])
                            positive_list.append(string_from_phrases)
                        except ValueError as e:
                            pass
    positive_df = pd.DataFrame(positive_list)
    positive_df = positive_df.reset_index(drop=False)
    positive_df = positive_df.rename(columns={'index': 'score', 0: 'term'})
    pos_no_dup = positive_df.drop_duplicates(subset='term')
    positive_phrase_list, y = pos_no_dup.shape
    if positive_phrase_list <= 10:
        num_time_append = 10 - positive_phrase_list
        for i in range(num_time_append):
            if 'term' not in list(positive_df):
                positive_df = pd.concat(  # DataFrame.append was removed in pandas 2.0
                    [positive_df, pd.DataFrame([.5], columns=['score'])])
                positive_df['term'] = ''
            positive_df = pd.concat(
                [positive_df, pd.DataFrame([[.5, '']], columns=['score', 'term'])])
    else:
        positive_df = pos_no_dup
        del [pos_no_dup]
    for key in replace_dict_phrase.keys():
        positive_df['term'] = positive_df['term'].str.replace(
            key, replace_dict_phrase[key], regex=False)
    #normalize score for positive connotation words going from 0.5 to 1
    positive_df['score'] = positive_df['score'].div(
        ((positive_df['score'].max()) * 2), axis=0) + 0.5
    positive_df = positive_df.sort_values(by=['score'], ascending=False)
    positive_df['score'] = positive_df['score'].round(decimals=4)

    return positive_df.head(topk), negative_df.tail(topk)
Example 27
from sklearn.decomposition import PCA, FastICA, SparsePCA

import scattertext as st
from scattertext import CategoryProjector, RankDifference, ScaledFScorePresetsNeg1To1
from scattertext.categoryprojector.OptimalProjection import get_optimal_category_projection
from scattertext.termcompaction.AssociationCompactor import ScorePercentileCompactor, AssociationCompactor
from scattertext.termscoring import ScaledFScore

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()
'''
category_projection = get_optimal_category_projection(
    corpus,
    n_dims=2,
    n_steps=20,
    projector=lambda n_terms, n_dims: CategoryProjector(AssociationCompactor(n_terms, scorer=RankDifference),
                                                        projector=PCA(n_dims)))
'''

html = st.produce_pairplot(corpus,
                           #category_projection=category_projection,
                           metadata=movie_df['category'] + ': ' + movie_df['movie_name'])
Example 28
import scattertext as st
import scattertext.categoryprojector.pairplot

convention_df = st.SampleCorpora.ConventionData2012.get_data()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build().get_unigram_corpus()
html = scattertext.categoryprojector.pairplot.produce_pairplot(
    corpus, metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
Example 29
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences).build())
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          .use_general_term_frequencies()
          .use_all_categories()
          .get_priors())
(open(fn, 'wb').write(
    st.produce_fightin_words_explorer(
        corpus,
        category='fresh',
        not_categories=['rotten'],
        metadata=df['movie_name'],
        term_scorer=LogOddsRatioInformativeDirichletPrior(priors, alpha_w=10),
    ).encode('utf-8')))
print(fn)
Example 30
def ValuePredictor(yelp_url, from_isbn=False):
    '''Takes a url, scrape site for reviews
    and calculates the term frequencies 
    sorts and returns the top 10 as a json object
    containing term, highratingscore, poorratingscore.'''

    base_url = "https://www.yelp.com/biz/"  # add business id
    api_url = "/review_feed?sort_by=date_desc&start="
    bid = "flower-child-addison-2"  # business id

    class Scraper():
        def __init__(self):
            self.data = pd.DataFrame()

        def get_data(self, n, bid=bid):
            with Session() as s:
                with s.get(
                        base_url + bid + api_url + str(n * 20)
                ) as resp:  #makes an http get request to given url and returns response as json
                    r = loads(resp.content
                              )  #converts json response into a dictionary
                    _html = html.fromstring(
                        r['review_list'])  #loads from dictionary

                    dates = _html.xpath(
                        "//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()"
                    )
                    reviews = [
                        el.text for el in _html.xpath(
                            "//div[@class='review-content']/p")
                    ]
                    ratings = _html.xpath(
                        "//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title"
                    )

                    df = pd.DataFrame([dates, reviews, ratings]).T

                    self.data = pd.concat([self.data, df])

        def scrape(self):
            # fetch 10 review pages concurrently (multithreaded)
            with Executor(max_workers=40) as e:
                list(e.map(self.get_data, range(10)))

    s = Scraper()
    s.scrape()
    df = s.data
    df = df.sample(100)

    nlp.Defaults.stop_words |= {
        'he', 'check-in', '=', '= =', 'male', 'u', 'want', 'u want', 'cuz',
        'him', "i've", 'deaf', 'on', 'her', 'told', 'told him', 'ins',
        '1 check', 'I', 'i"m', 'i', ' ', 'it', "it's", 'it.', 'they', 'coffee',
        'place', 'they', 'the', 'this', 'its', 'l', '-', 'they', 'this',
        'don"t', 'the ', ' the', 'it', 'i"ve', 'i"m', '!', '1', '2', '3', '4',
        '5', '6', '7', '8', '9', '0', '/', '.', ','
    }

    corpus = st.CorpusFromPandas(df, category_col=2, text_col=1,
                                 nlp=nlp).build()

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')

    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')

    df = term_freq_df.sort_values(by='poorratingscore', ascending=False)

    df['highratingscore'] = round(df['highratingscore'], 2)
    df['poorratingscore'] = round(df['poorratingscore'], 2)

    # top 10 terms ranked by poor-rating score
    list1 = list(df.index[:10])

    return json.dumps(list1)