# Font used when the caller does not pass one. This is the location the
# function used to hard-code, kept as the default so existing callers behave
# exactly as before.
_DEFAULT_WORDCLOUD_FONT = (
    "/Users/jamesashford/Documents/Projects/Hackathons/Oxford Hack 2020/"
    "OxHack-2020/TCARS/rsc/swiss_911_ultra_compressed_bt.ttf")


def wordcloud_plot(search_term,
                   tweets_dataframe,
                   save_path,
                   font_path=_DEFAULT_WORDCLOUD_FONT):
    """Render a word cloud from a tweet table and save it as a PNG.

    Args:
        search_term: Used as the prefix of the output file name.
        tweets_dataframe: Table with a ``"Tweet"`` column; missing entries
            are dropped before processing.
        save_path: Directory the image is written to.
        font_path: Font file handed to ``WordCloud``. Defaults to the path
            that was previously hard-coded; pass your own to run on another
            machine.

    Returns:
        ``True`` after the image has been written to
        ``<save_path>/<search_term>_wordcloud.png``.
    """
    tweets = tweets_dataframe["Tweet"].dropna().values

    # Upper-case the combined text, tokenise it and lemmatise every token
    all_words = TextBlob(" ".join(tweets).upper()).words.lemmatize()

    # A set gives O(1) membership tests in the filter below; 'thi' is an
    # extra token filtered out on top of the NLTK English stop-words.
    stop_words = set(stopwords.words('english')) | {'thi'}

    # Drop stop-words and anything of two characters or fewer
    words = [
        w for w in all_words if len(w) > 2 and w.lower() not in stop_words
    ]

    # WordCloud.generate expects one long string
    tweet_str = " ".join(words)

    word_cloud = WordCloud(font_path=font_path,
                           mode="RGBA",
                           background_color=None,
                           colormap="Blues",
                           width=1000,
                           height=600,
                           max_words=2000)
    word_cloud.generate(tweet_str)

    file_name = f"{save_path}/{search_term}_wordcloud.png"
    word_cloud.to_file(file_name)

    return True
def __create__(text, pic_path):
    """Draw ``text`` as a masked word cloud and write it to ``pic_path``.

    The font file is picked according to the running platform, and the mask
    image is ``back.jpg`` under ``jiebaSource``.
    """
    on_windows = str(platform.system()).lower() == 'windows'
    font_path = ('C:/Windows/Fonts/STFANGSO.ttf'
                 if on_windows else '/usr/share/fonts/win/msyh.ttf')

    # Read the background picture that is used as the mask
    mask = imread(jiebaSource + '/back.jpg')

    cloud = WordCloud(mask=mask,
                      background_color='white',
                      max_font_size=240,
                      random_state=180,
                      font_path=font_path)
    cloud.generate(text)
    cloud.to_file(pic_path)
Exemple #3
0
def wordcloud(topics, k):
    """Save one tag-cloud image per topic.

    ``topics`` is iterated as ``(label, freqs)`` pairs. Each ``freqs`` is
    passed to ``WordCloud.fit_words`` and the result is written to the
    ``<k>tp`` folder under ``./intermediate_data/figures/BTM_wordcould/``,
    named after the topic label.
    """
    from wordcloud.wordcloud import WordCloud

    for label, freqs in topics:
        cloud = WordCloud(color_func=grey_color_func,
                          random_state=1,
                          margin=10,
                          background_color='white')
        cloud.fit_words(freqs)

        file_name = "hpv.%s.tagcloud.png" % (label)
        cloud.to_file("./intermediate_data/figures/BTM_wordcould/" + str(k) +
                      "tp/" + file_name)
Exemple #4
0
def wordcloud(topics=None):
    """Save one tag-cloud image per topic under the ``35tp`` tweet folder.

    Args:
        topics: Iterable of ``(label, freqs)`` pairs. Each ``freqs`` is
            passed to ``WordCloud.fit_words``. ``None`` (the default) or an
            empty iterable means there is nothing to draw.

    Returns:
        None. Images are written to
        ``./intermediate_data/hpv_tweets/35tp/hpv.<label>.tagcloud.png``.
    """
    # ``None`` replaces the former mutable ``[]`` default. Materialising the
    # iterable lets us detect "nothing to do" for any input type.
    topics = [] if topics is None else list(topics)
    if not topics:
        # Return before importing the (heavy) wordcloud package.
        return

    from wordcloud.wordcloud import WordCloud

    for label, freqs in topics:
        cloud = WordCloud(color_func=grey_color_func,
                          random_state=1,
                          margin=10,
                          background_color='white').fit_words(freqs)
        cloud.to_file("./intermediate_data/hpv_tweets/35tp/hpv.%s.tagcloud.png" % (label))
Exemple #5
0
def generate_word_cloud(world_file):
    """Build a mask-shaped, image-coloured word cloud from a UTF-8 text file.

    The text is segmented with jieba in full mode, laid out inside the shape
    of ``love.jpg`` on a black background, recoloured from that same image
    and saved next to the input file with a ``.png`` extension.

    Args:
        world_file: Path of the UTF-8 encoded text file to visualise.
    """
    import os

    # Close the file deterministically instead of leaking the handle
    with codecs.open(world_file, encoding='utf-8') as source:
        text = source.read()

    # jieba.cut yields tokens lazily and can only be consumed once, so join
    # it directly. The old Counter built from the already-exhausted
    # generator was always empty and never used.
    wl_space_split = " ".join(jieba.cut(text, cut_all=True))

    background = np.array(Image.open("love.jpg"))
    my_wordcloud = WordCloud(mask=background, background_color="black")
    my_wordcloud.generate(wl_space_split)

    # Take each word's colour from the mask image
    image_colors = ImageColorGenerator(background)
    my_wordcloud.recolor(color_func=image_colors)

    # splitext only strips the final extension, so paths such as
    # "./data/in.txt" map to "./data/in.png" (splitting on the first "."
    # produced ".png" for them).
    my_wordcloud.to_file(os.path.splitext(world_file)[0] + ".png")
def make_cloud():
    """Render rainbow and standard word clouds for message text and authors.

    Loads ``archive.pkl`` through ``split_individual`` and writes four
    2000x2000 images to the working directory:
    ``wordcloud-rainbow.png``, ``wordcloud-standard.png``,
    ``wordcloud-rainbow-authors.png`` and ``wordcloud-standard-authors.png``.
    """
    print("Generating word cloud...")
    authors, dates, contents, bigstring = split_individual("archive.pkl")

    def rainbow_color_func(word,
                           font_size,
                           position,
                           orientation,
                           random_state=None,
                           **kwargs):
        # Hue is derived from the second coordinate of the word position.
        # NOTE(review): the divisor is 1000 while the canvas is 2000 wide,
        # so the hue can exceed 360 - confirm this is intended.
        hue = int(position[1] / 1000 * 360)
        return f"hsl({hue},100%,50%)"

    def rainbow_cloud():
        # Fresh cloud coloured by position
        return WordCloud(width=2000,
                         height=2000,
                         color_func=rainbow_color_func,
                         stopwords=None,
                         collocations=False)

    def standard_cloud():
        # Fresh cloud with the library's default colouring
        return WordCloud(width=2000, height=2000, stopwords=None)

    rainbow_cloud().generate(bigstring).to_file("wordcloud-rainbow.png")
    standard_cloud().generate(bigstring).to_file("wordcloud-standard.png")

    # Count authors once and reuse the frequencies for both images
    author_counts = Counter(authors)

    # The rainbow author image used to be drawn with the standard cloud,
    # which made it indistinguishable from the standard author image.
    rainbow_cloud().generate_from_frequencies(author_counts).to_file(
        "wordcloud-rainbow-authors.png")
    standard_cloud().generate_from_frequencies(author_counts).to_file(
        "wordcloud-standard-authors.png")
Exemple #7
0
class PyCloudWords:
    """Small wrapper that draws, shows and saves a jieba-based word cloud."""

    def __init__(self,
                 font_path='C:/Windows/Fonts/simkai.ttf',
                 background_color="white",
                 max_font_size=200,
                 mask=None):
        """Configure the underlying ``WordCloud``.

        Args:
            font_path: Font file used to draw the words.
            background_color: Canvas background colour.
            max_font_size: Largest font size used in the layout.
            mask: Optional path of an image used as layout mask and, after
                generation, as the colour source.
        """
        self.mask = mask
        self.graph = None
        if mask:
            self.graph = np.array(Image.open(mask))
        # Always build the cloud: previously it was only created when a mask
        # was supplied, so generate() failed with AttributeError otherwise.
        self.wc = WordCloud(font_path=font_path,
                            background_color=background_color,
                            max_font_size=max_font_size,
                            mask=self.graph,
                            collocations=False)

    def generate(
        self,
        raw_text=None,
        frequent_dict=None,
    ):
        """Generate, display and save the cloud as ``my_wordcloud.png``.

        Args:
            raw_text: Non-empty string to segment with jieba. Takes
                precedence over ``frequent_dict``.
            frequent_dict: Non-empty ``dict`` of word frequencies, used when
                ``raw_text`` is not a non-empty string.

        Returns:
            None. Nothing is drawn when neither argument is usable.
        """
        if raw_text and isinstance(raw_text, str):
            self.wc.generate(" ".join(jieba.cut(raw_text)))
        elif frequent_dict and isinstance(frequent_dict, dict):
            self.wc.generate_from_frequencies(frequent_dict)
        else:
            return

        # Recolour from the mask image before showing/saving. This step used
        # to sit after both early returns and never ran on a generated cloud.
        if self.mask:
            self.wc.recolor(color_func=ImageColorGenerator(self.graph))

        plt.imshow(self.wc)
        plt.axis("off")
        plt.show()
        self.wc.to_file("my_wordcloud.png")
Exemple #8
0
def generate_word_cloud(world_file):
    """Build a mask-shaped, image-coloured word cloud and save ``love.png``.

    The UTF-8 text in ``world_file`` is segmented with jieba in full mode,
    laid out inside the shape of ``love.jpg`` on a black background and
    recoloured from that same image.

    Args:
        world_file: Path of the UTF-8 encoded text file to visualise.
    """
    # Windows system font able to render CJK text
    font = r'C:\Windows\Fonts\Microsoft YaHei UI\MSYHBD.TTC'

    # Close the file deterministically instead of leaking the handle
    with codecs.open(world_file, encoding='utf-8') as source:
        text = source.read()

    wl_space_split = " ".join(jieba.cut(text, cut_all=True))

    backgroup_mask = np.array(Image.open("love.jpg"))
    my_wordcloud = WordCloud(font_path=font,
                             mask=backgroup_mask,
                             background_color="black",
                             max_font_size=100,
                             random_state=42)
    my_wordcloud.generate(wl_space_split)

    # Take each word's colour from the mask image
    image_colors = ImageColorGenerator(backgroup_mask)
    my_wordcloud.recolor(color_func=image_colors)

    my_wordcloud.to_file("love.png")
Exemple #9
0
import jieba
from wordcloud.wordcloud import WordCloud

# Load the whole comment dump as UTF-8 text
with open('comments/AllComments_shanqiu.txt', encoding='utf-8') as r:
    datas = r.read()

# 1000x5000 canvas on a pink background, drawn with the STXINWEI font
word_c = WordCloud(
    background_color='pink',
    margin=10,
    height=5000,
    width=1000,
    font_path='STXINWEI.TTF',
)
word_c.generate(datas)
word_c.to_file('ciyun.jpg')
Exemple #10
0
from wordcloud.wordcloud import WordCloud

# Load the source text as UTF-8
with open('shenteng.txt', encoding='utf-8') as r:
    datas = r.read()

# Square 1000x1000 canvas on a pink background, drawn with the STXINWEI font
word_c = WordCloud(
    background_color='pink',
    margin=10,
    height=1000,
    width=1000,
    font_path='STXINWEI.TTF',
)
word_c.generate(datas)
word_c.to_file('shenten_ciyun.jpg')
Exemple #11
0
def on_buttonpress():
    """Rebuild every dashboard panel from the current tweet data.

    Does nothing when ``text_input.value`` is empty. Otherwise it fills, in
    order: the emoji ranking bars (``p``), the top-word bars (``p1``), the
    word-cloud image (``fig``), the sentiment bars (``sent``), the emotion
    wedge chart (``plot_events``) and an HTML block with the most
    positive/negative tweets and LDA topics (appended to ``column2``).

    All plot objects, ``t``, ``topic_num`` and the hover tools are read from
    the enclosing scope; nothing is returned.

    NOTE(review): ``tweets`` is assigned from ``get_tweets()`` but never read
    again here, while ``t`` is read without being assigned in this function -
    presumably ``get_tweets()`` populates ``t`` as a side effect; confirm.
    """
    if text_input.value != "":
        tweets = get_tweets()
        allemojis = []
        ef = []
        # ---- Emoji ranking -------------------------------------------------
        # t is indexed as t[i][j][...]: a sequence of sequences of tweet
        # records that expose 'emoji' and 'text' keys.
        for i in range(len(t)):
            rr = len(t[i])
            for j in range(rr):
                allemojis = "".join(t[i][j]['emoji'])
                emoji_list = emoji.emoji_lis(allemojis)
                if emoji_list != []:
                    ef.append(emoji_list)
                # The ranking table below is rebuilt from all of `ef` on
                # every inner iteration; only the last build is used later.
                em = []
                for k in range(len(ef)):
                    for l in range(len(ef[k])):
                        em.append(ef[k][l]['emoji'])
                emoji_series = pd.Series(em)
                emojis = pd.DataFrame(
                    emoji_series.value_counts()).reset_index().rename(
                        columns={
                            'index': 'emoji',
                            0: 'Count'
                        })
                # NOTE(review): range(1, len(emojis)) is one value shorter
                # than the frame, so the last row gets no rank - confirm
                # whether range(1, len(emojis) + 1) was intended.
                emojis['Rank'] = pd.Series(range(1, len(emojis)))
        # Keep the ten most frequent emojis and push them to the plot source
        emojis = emojis.head(10)
        emojis['Rank'] = emojis['Rank'].apply(lambda x: int(x))
        source_emoji.data = dict(emoji=emojis['emoji'],
                                 Count=emojis['Count'],
                                 Rank=emojis['Rank'],
                                 color=Paired[10])
        # Draw each emoji as a text label positioned at its bar
        labels = LabelSet(x="Rank",
                          y="Count",
                          text="emoji",
                          level='glyph',
                          render_mode='canvas',
                          source=source_emoji,
                          x_offset=-16,
                          y_offset=-14,
                          text_font_size="23pt")
        p.vbar(x="Rank",
               top="Count",
               width=0.95,
               source=source_emoji,
               color="color")
        p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
        p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
        p.y_range.start = 0
        p.x_range.start = 0
        p.xaxis[0].ticker.desired_num_ticks = 10
        p.add_layout(labels)
        p.xaxis.minor_tick_line_color = None
        p.xgrid.visible = False
        p.ygrid.visible = False
        p.xaxis.major_tick_line_color = None
        p.add_tools(hover)

        # ---- Top-word bar chart --------------------------------------------
        alltweets = []
        for j in range(len(t)):
            for i in range(len(t[j])):
                alltweets.append(t[j][i]['text'])
        # Tweets are concatenated with no separator between them
        combined_tweets = "".join(alltweets)
        # presumably text_clean returns a sequence of tokens - verify against
        # its definition
        cleaned_tweets = text_clean(combined_tweets)
        ss = pd.DataFrame(
            pd.Series(cleaned_tweets).value_counts()).reset_index().rename(
                columns={
                    'index': 'Word',
                    0: 'Count'
                })
        # NOTE(review): same one-short range as the emoji ranking above
        ss['Rank'] = pd.Series(range(1, len(ss)))
        ss = ss.head(10)
        ss['Rank'] = ss['Rank'].apply(lambda x: int(x))

        data_text = dict(Word=list(ss.Word),
                         Count=list(ss.Count),
                         Rank=list(ss.Rank),
                         color=Category20[10])
        #labels_text = LabelSet(x="Rank", y="Count", text="Word", level='glyph', render_mode='css', source = ColumnDataSource(data_text),
        #x_offset = -7, text_font_size="10pt", y_offset = 7, angle = 45)
        #p1.hbar(y = 'Rank', height = 0.9, right = 'Count', source = ColumnDataSource(data_text), color = "maroon")
        p1.xaxis[0].ticker.desired_num_ticks = 10
        # Replace tick labels 1..10 with the words themselves; indexing up to
        # [9] requires at least ten distinct words.
        p1.xaxis.major_label_overrides = {
            1: list(ss.Word)[0],
            2: list(ss.Word)[1],
            3: list(ss.Word)[2],
            4: list(ss.Word)[3],
            5: list(ss.Word)[4],
            6: list(ss.Word)[5],
            7: list(ss.Word)[6],
            8: list(ss.Word)[7],
            9: list(ss.Word)[8],
            10: list(ss.Word)[9]
        }
        p1.vbar(x="Rank",
                top="Count",
                width=0.95,
                source=ColumnDataSource(data_text),
                color="color")
        #p1.add_layout(labels_text)
        p1.xaxis.minor_tick_line_color = None
        p1.xaxis.major_tick_line_color = None
        p1.xgrid.visible = False
        p1.ygrid.visible = False
        p1.add_tools(hover1)

        # ---- Word-cloud image ----------------------------------------------
        # The imported name is rebound on the next line, so the NLTK
        # stop-word corpus itself is not used here.
        from nltk.corpus import stopwords
        stopwords = set(STOPWORDS)
        stopwords.add('one')
        stopwords.add('also')
        stopwords.add('twitter')
        stopwords.add('pic')
        stopwords.add('https')
        stopwords.add('bit')
        stopwords.add('ly')
        stopwords.add('via')
        stopwords.add('buff')
        wordcloud_good = WordCloud(
            colormap="Dark2",
            width=750,
            height=500,
            stopwords=stopwords,
            scale=4,
            max_words=300,
            background_color='white').generate(combined_tweets)
        # Round-trip through temp.png to obtain an RGBA image
        wordcloud_good.to_file('temp.png')
        word_img = Image.open('temp.png').convert('RGBA')
        xdim, ydim = word_img.size
        # Pack the four 8-bit channels of each pixel into one uint32 and flip
        # the rows vertically before handing the array to image_rgba
        img = np.empty((ydim, xdim), dtype=np.uint32)
        view = img.view(dtype=np.uint8).reshape((ydim, xdim, 4))
        view[:, :, :] = np.flipud(np.asarray(word_img))

        # `dim` is computed but not read again in this function
        dim = max(xdim, ydim)
        fig.image_rgba(image=[img], x=0, y=0, dw=500, dh=750)

        # ---- Sentiment bars ------------------------------------------------
        def loadLexicon(fname):
            """Return the set of stripped lines read from ``fname``.

            Not called anywhere inside ``on_buttonpress``.
            """
            newLex = set()
            lex_conn = open(fname)
            #add every word in the file to the set
            for line in lex_conn:
                newLex.add(line.strip(
                ))  # strip to remove the trailing line-break character
            lex_conn.close()
            return newLex

        # Score every tweet; the 'compound' entry of each score drives the
        # ranking and the sign-based polarity below
        sia = SIA()
        results = []
        for line in alltweets:
            pol_score = sia.polarity_scores(line)
            pol_score['Tweet'] = line
            results.append(pol_score)
        polarity = pd.DataFrame(results)
        most_neg = polarity.sort_values('compound', ascending=True)[0:5]
        most_pos = polarity.sort_values('compound', ascending=False)[0:5]
        # Polarity: 1 for compound > 0, -1 for compound < 0, otherwise 0
        polarity['Polarity'] = 0
        polarity.loc[polarity['compound'] > 0, 'Polarity'] = 1
        polarity.loc[polarity['compound'] < 0, 'Polarity'] = -1
        # Share of each polarity class, as a percentage
        perc = pd.DataFrame(
            polarity.Polarity.value_counts(normalize=True) *
            100).reset_index().rename(columns={
                'index': 'Sentiment',
                'Polarity': 'Percentage'
            })
        perc.Percentage = perc.Percentage.apply(lambda x: round(x, 2))
        perc["Percentage_Text"] = perc.Percentage.apply(
            lambda x: str(round(x, 2)) + "%")
        # data_text and labels are rebound here for the sentiment chart
        data_text = dict(Sentiment=list(perc.Sentiment),
                         Percentage=list(perc.Percentage),
                         color=Category20[3],
                         Percentage_Text=list(perc.Percentage_Text))
        labels = LabelSet(x="Sentiment",
                          y="Percentage",
                          text="Percentage_Text",
                          level='glyph',
                          render_mode='css',
                          source=ColumnDataSource(data_text),
                          x_offset=-10,
                          text_font_size="10pt")
        sent.xaxis[0].ticker.desired_num_ticks = 3
        sent.xaxis.major_label_overrides = {
            1: 'Positive',
            0: 'Neutral',
            -1: 'Negative'
        }
        sent.xaxis.minor_tick_line_color = None
        sent.xaxis.major_tick_line_color = None
        sent.vbar(x="Sentiment",
                  top="Percentage",
                  width=0.80,
                  source=ColumnDataSource(data_text),
                  color="color")
        sent.xgrid.visible = False
        sent.ygrid.visible = False
        sent.add_layout(labels)

        # ---- Emotion breakdown ---------------------------------------------
        # Tab-separated lexicon read as (term, category, flag) rows
        nrc = pd.read_csv("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
                          sep="\t",
                          header=None,
                          names=["term", "category", "flag"])
        emotions = []
        # `final` is created but not read again in this function
        final = pd.DataFrame()
        term = list(nrc.term)
        # For every cleaned token found in the lexicon, collect the
        # categories whose flag is 1
        for i in range(len(cleaned_tweets)):
            if cleaned_tweets[i] in term:
                sub = nrc[nrc.term == cleaned_tweets[i]]
                s = sub[sub.flag == 1]
                if list(s.category) != []:
                    emotions.append(list(s.category))
        # Flatten, folding 'positive' into 'joy' and 'negative' into 'sadness'
        emotions_clubed = []
        for i in emotions:
            for j in range(len(i)):
                if i[j] == 'positive':
                    i[j] = 'joy'
                elif i[j] == 'negative':
                    i[j] = 'sadness'
                emotions_clubed.append(i[j])
        # Percentage share per emotion, then the wedge angle for each share
        radar = pd.DataFrame(
            pd.Series(emotions_clubed))[0].value_counts(normalize=True)
        radar = round(radar * 100, 2)
        radar_df = pd.DataFrame(radar).reset_index()
        radar_df['angle'] = radar_df[0] / radar_df[0].sum() * 2 * pi
        radar_df['color'] = Category20c[radar_df.shape[0]]
        radar_df = radar_df.rename(
            columns={
                'index': 'Emotion',
                0: 'Percentage',
                'angle': 'Angle',
                'color': 'Color'
            })
        source_events = ColumnDataSource(
            data=dict(Emotion=radar_df['Emotion'],
                      Percentage=radar_df['Percentage'],
                      Angle=radar_df['Angle'],
                      Color=radar_df['Color']))
        # Each wedge starts where the cumulative angle of the previous ends
        plot_events.wedge(x=0,
                          y=1,
                          radius=0.47,
                          start_angle=cumsum('Angle', include_zero=True),
                          end_angle=cumsum('Angle'),
                          line_color="white",
                          fill_color='Color',
                          source=source_events,
                          legend="Emotion")

        plot_events.add_tools(hover2)

        # ---- LDA topics ----------------------------------------------------
        # Same stop-word set as for the word cloud, rebuilt from scratch
        stopwords = set(STOPWORDS)
        stopwords.add('one')
        stopwords.add('also')
        stopwords.add('twitter')
        stopwords.add('pic')
        stopwords.add('https')
        stopwords.add('bit')
        stopwords.add('ly')
        stopwords.add('via')
        stopwords.add('buff')
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        stop_words=stopwords)
        matrix = tf_vectorizer.fit_transform(alltweets)
        vocab = tf_vectorizer.get_feature_names()
        # topic_num is not defined in this function; it comes from the
        # enclosing scope
        model = lda.LDA(n_topics=topic_num, n_iter=900)
        model.fit(matrix)
        topics = []
        # NOTE(review): the heading hard-codes "12" while the loop below
        # emits topic_num topics - confirm the two agree.
        topics.append(
            "<b>Most Frequent Words used in 12 different Topics found in the search</b><br><br>"
        )
        top_words_num = 20
        topic_mixes = model.topic_word_
        for i in range(topic_num):  #for each topic
            # Indices of the top_words_num highest-weighted words, best first
            top_indexes = np.argsort(topic_mixes[i])[::-1][:top_words_num]
            my_top = ''
            for ind in top_indexes:
                my_top += vocab[ind] + ' '
            topics.append('TOPIC:' + str(i + 1) + ' --> ' + str(my_top) +
                          '<br><br>')
        # pos_nes alternates heading strings and lists of tweet lines; each
        # entry is flattened to one string in pos_neg
        pos_nes = []
        pos_neg = []
        pos_nes.append("<b> Top 5 most Positive Tweets </b><br><br>")
        pos = (["> " + i + '<br><br>' for i in most_pos['Tweet']])
        pos_nes.append(pos)
        pos_nes.append("<b> Top 5 most Negative Tweets </b><br><br>")
        neg = (["> " + i + '<br><br>' for i in most_neg['Tweet']])
        pos_nes.append(neg)
        for i in range(len(pos_nes)):
            pos_neg.append("".join(pos_nes[i]))
        # Two side-by-side HTML columns: tweets on the left, topics on the right
        d = Div(
            text=
            """<div style="width: 49%; text-align: justify; float: left">""" +
            "".join(pos_neg) + "</div>" + """
        <div style="width: 49%; text-align: justify; float: right">""" +
            "".join(topics) + """
        </div><div style="width: 2%;text-align:justify;float:center">""" +
            " " + "</div>",
            width=1500,
            height=500)
        column2.children.append(d)

    else:
        # Empty search box: nothing to update
        pass
Exemple #12
0
# Cloud for the "good" text, laid out inside the shape of images.png.
# `stopwords` is read here before the assignment further down in this
# fragment, so it has to exist already from earlier code.
mask_image = np.array(Image.open("images.png"))
wordcloud_good = WordCloud(
    stopwords=stopwords,
    max_words=1000,
    scale=2,
    height=20,
    width=30,
    font_path=None,
    mask=mask_image,
    colormap="Paired",
).generate(good_para)

# Show the cloud, then save it. The second figure() call opens an extra,
# empty figure before show().
plt.figure(figsize=(7, 10))
plt.imshow(wordcloud_good, interpolation="bilinear", cmap=plt.cm.autumn)
plt.axis('off')
plt.figure(figsize=(10, 6))
plt.show()
wordcloud_good.to_file("good.png")

# In[27]:

# Cloud for the neutral text: no mask, plasma colour map
stopwords = set(STOPWORDS)
wordcloud_neu = WordCloud(
    stopwords=stopwords,
    max_words=1000,
    scale=2,
    height=700,
    width=1100,
    font_path=None,
    colormap="plasma",
)
wordcloud_neu.generate(new_para)

plt.figure(figsize=(7, 10))
plt.imshow(wordcloud_neu, cmap=plt.cm.autumn)
plt.axis('off')
plt.show()
Exemple #13
0
# Keep only words longer than two characters that are not stop-words
words = list(
    filter(lambda w: len(w) > 2 and w.lower() not in stop_words, all_words))

# Collapse the surviving words into one space-separated string
tweet_str = " ".join(words)

# Lay the words out on a 1000x1000 RGBA canvas with no background colour
word_cloud = WordCloud(
    max_words=2000,
    height=1000,
    width=1000,
    colormap="Blues",
    background_color=None,
    mode="RGBA",
    font_path=f"{PROJ_PATH}rsc/swiss_911_ultra_compressed_bt.ttf",
).generate(tweet_str)

# Write the image into the project's output folder
save_name = f"{PROJ_PATH}output/{search_term}_wordcloud.png"
word_cloud.to_file(save_name)

# Optional on-screen preview
if PLOT:
    plt.figure(figsize=(15, 10))
    plt.imshow(word_cloud)  #, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Get counts of each word
# counts = dict(Counter(words))
# ord_counts = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))
# print(ord_counts)