Exemple #1
0
def wordcloud(topics, k):
    """Write one tag-cloud PNG per topic.

    ``topics`` is iterated as ``(label, freqs)`` pairs.  Each ``freqs`` is
    handed to ``WordCloud.fit_words`` and the resulting image is saved under
    ``./intermediate_data/figures/BTM_wordcould/<k>tp/``, with the label
    embedded in the file name.
    """
    from wordcloud.wordcloud import WordCloud

    # The "<k>tp" directory prefix is the same for every topic.
    path_prefix = "./intermediate_data/figures/BTM_wordcould/" + str(k)

    for label, freqs in topics:
        cloud = WordCloud(
            color_func=grey_color_func,
            random_state=1,
            margin=10,
            background_color='white',
        )
        cloud.fit_words(freqs)
        cloud.to_file(path_prefix + "tp/hpv.%s.tagcloud.png" % (label))
def __create__(text, pic_path):
    """Generate a masked word cloud from ``text`` and save it to ``pic_path``.

    The font file is chosen by operating system and the mask image is read
    from ``back.jpg`` inside ``jiebaSource``.
    """
    on_windows = str(platform.system()).lower() == 'windows'
    font_path = ('C:/Windows/Fonts/STFANGSO.ttf'
                 if on_windows else '/usr/share/fonts/win/msyh.ttf')

    # Read the background picture that shapes the cloud.
    mask = imread(jiebaSource + '/back.jpg')

    cloud = WordCloud(
        mask=mask,
        background_color='white',
        max_font_size=240,
        random_state=180,
        font_path=font_path,
    )
    cloud.generate(text)
    cloud.to_file(pic_path)
Exemple #3
0
def wordcloud(topics=None):
    """Write one tag-cloud PNG per topic into the hpv_tweets/35tp folder.

    :param topics: iterable of ``(label, freqs)`` pairs; ``freqs`` is passed
        to ``WordCloud.fit_words``.  ``None`` (the default) or an empty
        iterable means there is nothing to render.
    :return: None
    """
    # A ``None`` default replaces the former shared mutable ``[]`` default.
    topics = list(topics) if topics is not None else []
    if not topics:
        return

    from wordcloud.wordcloud import WordCloud

    for label, freqs in topics:
        cloud = WordCloud(color_func=grey_color_func,
                          random_state=1,
                          margin=10,
                          background_color='white').fit_words(freqs)
        cloud.to_file(
            "./intermediate_data/hpv_tweets/35tp/hpv.%s.tagcloud.png" % (label))
Exemple #4
0
 def __init__(self,
              font_path='C:/Windows/Fonts/simkai.ttf',
              background_color="white",
              max_font_size=200,
              mask=None):
     """Configure the word cloud renderer.

     :param font_path: font file used to draw the words
     :param background_color: canvas background colour
     :param max_font_size: upper bound for the font size
     :param mask: optional path of an image used as the cloud's mask
     """
     self.mask = mask
     self.graph = None
     if mask:
         with Image.open(mask) as image:
             self.graph = np.array(image)
     # Build the renderer unconditionally: previously ``self.wc`` only
     # existed when a mask was supplied, so mask-less instances failed with
     # AttributeError on first use.  ``mask=None`` means "no mask".
     self.wc = WordCloud(font_path=font_path,
                         background_color=background_color,
                         max_font_size=max_font_size,
                         mask=self.graph,
                         collocations=False)
Exemple #5
0
def gen_wordcloud(dataframe: pd.DataFrame, stopwords=None):
    """
    Show a word cloud built from the text messages of a dataframe.

    Only rows whose ``category`` column equals ``'TEXT'`` are used; their
    ``message`` values are joined with single spaces and rendered.

    :param dataframe: input dataframe with ``category`` and ``message`` columns
    :param stopwords: iterable of user stopwords, merged with NLTK's English
        stopwords. ``None`` uses the NLTK list alone. If you want to include
        all words use stopwords = 'off'
    :return: None
    """
    from wordcloud.wordcloud import WordCloud

    if isinstance(stopwords, str) and stopwords == 'off':
        # An empty set (not a dict) disables filtering; it is not None, so
        # WordCloud does not fall back to its built-in list.
        stopwords = set()
    else:
        import nltk

        # Only fetch the NLTK corpus when it is actually needed.
        nltk.download('stopwords')
        english = set(nltk.corpus.stopwords.words('english'))
        if stopwords is None:
            stopwords = english
        elif isinstance(stopwords, str):
            # A lone string is one word, not an iterable of characters.
            stopwords = {stopwords} | english
        else:
            # Accept any iterable (list, tuple, set), not just sets.
            stopwords = set(stopwords) | english

    dataframe = dataframe[dataframe['category'] == 'TEXT']
    text = dataframe['message'].str.cat(sep=' ')
    cloud = WordCloud(stopwords=stopwords).generate(text)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
Exemple #6
0
class PyCloudWords:
    """Small wrapper around ``WordCloud`` with an optional image mask."""

    def __init__(self,
                 font_path='C:/Windows/Fonts/simkai.ttf',
                 background_color="white",
                 max_font_size=200,
                 mask=None):
        """Configure the renderer.

        :param font_path: font file used to draw the words
        :param background_color: canvas background colour
        :param max_font_size: upper bound for the font size
        :param mask: optional path of an image used as the cloud's mask
        """
        self.mask = mask
        self.graph = None
        if mask:
            with Image.open(mask) as image:
                self.graph = np.array(image)
        # Always create the renderer; it used to exist only when a mask was
        # given, which made mask-less instances unusable.
        self.wc = WordCloud(font_path=font_path,
                            background_color=background_color,
                            max_font_size=max_font_size,
                            mask=self.graph,
                            collocations=False)

    def generate(
        self,
        raw_text=None,
        frequent_dict=None,
    ):
        """Build, display and save the cloud as ``my_wordcloud.png``.

        :param raw_text: non-empty string; segmented with ``jieba.cut``
            before generation. Takes precedence over ``frequent_dict``.
        :param frequent_dict: non-empty dict of word -> frequency.
        :return: None. Nothing happens when neither argument is usable.
        """
        if raw_text and isinstance(raw_text, str):
            self.wc.generate(" ".join(jieba.cut(raw_text)))
        elif frequent_dict and isinstance(frequent_dict, dict):
            self.wc.generate_from_frequencies(frequent_dict)
        else:
            return

        # Recolour from the mask only after a layout exists.  Previously this
        # step sat behind the early returns and could never run on a
        # generated cloud.  Colour lookup needs a colour (3-D) image array.
        if self.graph is not None and self.graph.ndim == 3:
            self.wc.recolor(color_func=ImageColorGenerator(self.graph))

        plt.imshow(self.wc)
        plt.axis("off")
        plt.show()
        self.wc.to_file("my_wordcloud.png")
def plot_words_cloud():
    '''
    Display a word cloud of the ``tomato_com`` comments.

    The comments are segmented with jieba, counted, laid out inside a mask
    image and recoloured with that image's colours.

    :return: None
    '''
    tomato_str = ' '.join(tomato_com['comment'])

    # Built-in stopwords plus one extra word.  ``set.add`` returns None and
    # would mutate the shared STOPWORDS set, so build a new set instead.
    stop_words = set(STOPWORDS) | {'苟利国'}

    # generate_from_frequencies() ignores the ``stopwords`` option, so the
    # filtering has to happen here, before counting.  Single-character
    # tokens are dropped as well.
    words_list = [
        word for word in jieba.cut_for_search(tomato_str)
        if len(word) > 1 and word.lower() not in stop_words
    ]

    back_color = imread('/Users/afa/myFiles/tmp/灰姑娘.png')  # mask picture
    wc = WordCloud(
        background_color='white',  # background colour
        max_words=200,  # maximum number of words
        mask=back_color,  # cloud shape; width and height are ignored when set
        max_font_size=300,  # largest font size
        stopwords=stop_words,
        font_path="/Library/Fonts/Songti.ttc",
        random_state=42,  # fixed seed
    )

    tomato_count = Counter(words_list)

    wc.generate_from_frequencies(tomato_count)
    # Take word colours from the mask picture.
    image_colors = ImageColorGenerator(back_color)

    # Draw the cloud.
    plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis('off')
    plt.show()
Exemple #8
0
def generate_word_cloud(world_file):
    """Render a word cloud for a UTF-8 text file, shaped by ``love.jpg``.

    The text is segmented with jieba (full mode), laid out inside the mask
    image, recoloured with the mask's colours and saved next to the input
    as ``<input path without extension>.png``.

    :param world_file: path of the UTF-8 text file to visualise
    :return: None
    """
    import os

    # Close the input file deterministically.
    with codecs.open(world_file, encoding='utf-8') as source:
        text = source.read()

    wl_space_split = " ".join(jieba.cut(text, cut_all=True))

    background = np.array(Image.open("love.jpg"))
    my_wordcloud = WordCloud(mask=background, background_color="black")
    my_wordcloud.generate(wl_space_split)

    image_colors = ImageColorGenerator(background)
    my_wordcloud.recolor(color_func=image_colors)

    # splitext strips only the final extension; split(".")[0] returned an
    # empty stem for paths such as "./data/words.txt".
    my_wordcloud.to_file(os.path.splitext(world_file)[0] + ".png")
def wordcloud_plot(
        search_term,
        tweets_dataframe,
        save_path,
        font_path="/Users/jamesashford/Documents/Projects/Hackathons/Oxford Hack 2020/OxHack-2020/TCARS/rsc/swiss_911_ultra_compressed_bt.ttf"
):
    """Save a word cloud of the tweets as ``<save_path>/<search_term>_wordcloud.png``.

    :param search_term: used in the output file name
    :param tweets_dataframe: dataframe with a ``Tweet`` column; missing
        values are dropped
    :param save_path: directory the PNG is written to
    :param font_path: font file used to draw the words. The default keeps
        the previously hard-coded location; pass another path to run on a
        different machine.
    :return: True
    """
    tweets = tweets_dataframe["Tweet"].dropna().values

    # Extract and clean words
    all_words = TextBlob(" ".join(tweets).upper()).words.lemmatize()
    # Stop-words as a set for constant-time membership tests
    stop_words = set(stopwords.words('english')) | {'thi'}
    # Remove stop and short words
    words = [
        w for w in all_words if len(w) > 2 and w.lower() not in stop_words
    ]

    # Convert into one long string
    tweet_str = " ".join(words)

    # Create word-cloud
    word_cloud = WordCloud(font_path=font_path,
                           mode="RGBA",
                           background_color=None,
                           colormap="Blues",
                           width=1000,
                           height=600,
                           max_words=2000)
    word_cloud.generate(tweet_str)
    # Save
    file_name = f"{save_path}/{search_term}_wordcloud.png"
    word_cloud.to_file(file_name)

    return True
Exemple #10
0
def generate_word_cloud(world_file):
    """Render a word cloud for a UTF-8 text file and save it as ``love.png``.

    The text is segmented with jieba (full mode), laid out inside the
    ``love.jpg`` mask and recoloured with the mask's colours.

    :param world_file: path of the UTF-8 text file to visualise
    :return: None
    """
    font = r'C:\Windows\Fonts\Microsoft YaHei UI\MSYHBD.TTC'

    # Close the input file deterministically.
    with codecs.open(world_file, encoding='utf-8') as source:
        text = source.read()

    wl_space_split = " ".join(jieba.cut(text, cut_all=True))

    backgroup_mask = np.array(Image.open("love.jpg"))
    my_wordcloud = WordCloud(font_path=font,
                             mask=backgroup_mask,
                             background_color="black",
                             max_font_size=100,
                             random_state=42)
    my_wordcloud.generate(wl_space_split)

    image_colors = ImageColorGenerator(backgroup_mask)
    my_wordcloud.recolor(color_func=image_colors)

    my_wordcloud.to_file("love.png")
Exemple #11
0
def gen_wordcloud(file_name):
    """Plot word frequencies and a word cloud for a text file.

    The file is lower-cased and split on newlines and single spaces.  Words
    of length <= 1, NLTK English stopwords and the extra tokens in
    ``stopwords2`` are dropped.  The 35 most frequent remaining words are
    plotted, the filtered text is printed, and a word cloud is shown.

    :param file_name: path of the text file to read
    :return: list of the remaining words, in file order
    """
    with open(file_name, 'r') as file:
        data = file.read().lower().split("\n")

    words = [word for line in data for word in line.split(" ")]

    stopwords2=['http','https',"the","say","\'trump",'co',"trump","u00eda","uf0339","udd25","u201d","u26a1","u00e9","u2019re",'realdonaldtrump','ud','nhttps','ude','potus',"udc47","n27","n19",'udd','president','amp','ryan',"'gop", "u2019s","Trump","ndr","donald","'s","'n","gop","'w","u2026","u2019t","'u2019s","u2019m","ude02","ud83c","ufe0f","ud83d","u0627","u2764","ude0d","u0644"
        ]
    stopwords = nltk.corpus.stopwords.words('english')
    # One set gives constant-time lookups instead of two list scans per word.
    ignored = set(stopwords) | set(stopwords2)
    words2 = [w for w in words if len(w) > 1 and w not in ignored]

    freq = nltk.FreqDist(words2)
    freq.plot(35)

    text2 = ' '.join(words2)
    print(text2)

    wc = WordCloud(max_font_size=40).generate(text2)
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    # Indented with spaces: the former tab here is a TabError on Python 3.
    return words2
Exemple #12
0
def on_buttonpress():
    """Rebuild every dashboard element when the search box is non-empty.

    When ``text_input.value`` is not an empty string this function, in order:
    fetches tweets, draws a top-10 emoji bar chart on ``p``, a top-10 word
    bar chart on ``p1``, renders a word cloud image into ``fig``, draws
    sentiment percentages on ``sent``, an emotion wedge chart on
    ``plot_events``, and appends a ``Div`` with the most positive/negative
    tweets and LDA topics to ``column2``.  With an empty search box it does
    nothing.

    All plot objects, data sources, hover tools, ``text_clean``,
    ``get_tweets`` and ``topic_num`` are read from outside this function.

    NOTE(review): the result of ``get_tweets()`` is bound to ``tweets`` but
    the loops below read ``t``, which is not assigned here -- presumably a
    module-level variable; confirm.
    """
    if text_input.value != "":
        tweets = get_tweets()
        allemojis = []
        ef = []
        # Collect the emoji of every tweet and rebuild the count table.
        # The table is recomputed on every inner iteration; only the value
        # from the last iteration is used after the loops.
        for i in range(len(t)):
            rr = len(t[i])
            for j in range(rr):
                allemojis = "".join(t[i][j]['emoji'])
                emoji_list = emoji.emoji_lis(allemojis)
                if emoji_list != []:
                    ef.append(emoji_list)
                em = []
                for k in range(len(ef)):
                    for l in range(len(ef[k])):
                        em.append(ef[k][l]['emoji'])
                emoji_series = pd.Series(em)
                emojis = pd.DataFrame(
                    emoji_series.value_counts()).reset_index().rename(
                        columns={
                            'index': 'emoji',
                            0: 'Count'
                        })
                # NOTE(review): range(1, len(emojis)) yields one value fewer
                # than there are rows, so the last row's Rank looks like it
                # ends up missing -- confirm.
                emojis['Rank'] = pd.Series(range(1, len(emojis)))
        emojis = emojis.head(10)
        emojis['Rank'] = emojis['Rank'].apply(lambda x: int(x))
        source_emoji.data = dict(emoji=emojis['emoji'],
                                 Count=emojis['Count'],
                                 Rank=emojis['Rank'],
                                 color=Paired[10])
        labels = LabelSet(x="Rank",
                          y="Count",
                          text="emoji",
                          level='glyph',
                          render_mode='canvas',
                          source=source_emoji,
                          x_offset=-16,
                          y_offset=-14,
                          text_font_size="23pt")
        p.vbar(x="Rank",
               top="Count",
               width=0.95,
               source=source_emoji,
               color="color")
        p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
        p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks
        p.y_range.start = 0
        p.x_range.start = 0
        p.xaxis[0].ticker.desired_num_ticks = 10
        p.add_layout(labels)
        p.xaxis.minor_tick_line_color = None
        p.xgrid.visible = False
        p.ygrid.visible = False
        p.xaxis.major_tick_line_color = None
        p.add_tools(hover)

        ####wordcloud
        # Top-10 word counts, drawn as a bar chart on p1.
        alltweets = []
        for j in range(len(t)):
            for i in range(len(t[j])):
                alltweets.append(t[j][i]['text'])
        # Tweets are concatenated without any separator.
        combined_tweets = "".join(alltweets)
        cleaned_tweets = text_clean(combined_tweets)
        ss = pd.DataFrame(
            pd.Series(cleaned_tweets).value_counts()).reset_index().rename(
                columns={
                    'index': 'Word',
                    0: 'Count'
                })
        # NOTE(review): same one-short range as the emoji Rank above.
        ss['Rank'] = pd.Series(range(1, len(ss)))
        ss = ss.head(10)
        ss['Rank'] = ss['Rank'].apply(lambda x: int(x))

        data_text = dict(Word=list(ss.Word),
                         Count=list(ss.Count),
                         Rank=list(ss.Rank),
                         color=Category20[10])
        #labels_text = LabelSet(x="Rank", y="Count", text="Word", level='glyph', render_mode='css', source = ColumnDataSource(data_text),
        #x_offset = -7, text_font_size="10pt", y_offset = 7, angle = 45)
        #p1.hbar(y = 'Rank', height = 0.9, right = 'Count', source = ColumnDataSource(data_text), color = "maroon")
        p1.xaxis[0].ticker.desired_num_ticks = 10
        # Replace the numeric ticks 1..10 with the words themselves; this
        # indexes ten entries, so it needs at least ten distinct words.
        p1.xaxis.major_label_overrides = {
            1: list(ss.Word)[0],
            2: list(ss.Word)[1],
            3: list(ss.Word)[2],
            4: list(ss.Word)[3],
            5: list(ss.Word)[4],
            6: list(ss.Word)[5],
            7: list(ss.Word)[6],
            8: list(ss.Word)[7],
            9: list(ss.Word)[8],
            10: list(ss.Word)[9]
        }
        p1.vbar(x="Rank",
                top="Count",
                width=0.95,
                source=ColumnDataSource(data_text),
                color="color")
        #p1.add_layout(labels_text)
        p1.xaxis.minor_tick_line_color = None
        p1.xaxis.major_tick_line_color = None
        p1.xgrid.visible = False
        p1.ygrid.visible = False
        p1.add_tools(hover1)

        ###Actual wordcloud
        # The imported name is rebound on the next line, so the NLTK
        # stopwords object is not used here.
        from nltk.corpus import stopwords
        stopwords = set(STOPWORDS)
        stopwords.add('one')
        stopwords.add('also')
        stopwords.add('twitter')
        stopwords.add('pic')
        stopwords.add('https')
        stopwords.add('bit')
        stopwords.add('ly')
        stopwords.add('via')
        stopwords.add('buff')
        wordcloud_good = WordCloud(
            colormap="Dark2",
            width=750,
            height=500,
            stopwords=stopwords,
            scale=4,
            max_words=300,
            background_color='white').generate(combined_tweets)
        # Round-trip through temp.png, then pack the RGBA pixels into a
        # uint32 array (flipped vertically) for fig.image_rgba.
        wordcloud_good.to_file('temp.png')
        word_img = Image.open('temp.png').convert('RGBA')
        xdim, ydim = word_img.size
        img = np.empty((ydim, xdim), dtype=np.uint32)
        view = img.view(dtype=np.uint8).reshape((ydim, xdim, 4))
        view[:, :, :] = np.flipud(np.asarray(word_img))

        # ``dim`` is not read again within this function.
        dim = max(xdim, ydim)
        fig.image_rgba(image=[img], x=0, y=0, dw=500, dh=750)

        ##Sentiments graph
        # Helper that reads a file into a set of stripped lines; it is not
        # called anywhere within this function.
        def loadLexicon(fname):
            newLex = set()
            lex_conn = open(fname)
            #add every word in the file to the set
            for line in lex_conn:
                newLex.add(line.strip(
                ))  # remember to strip to remove the lin-change character
            lex_conn.close()
            return newLex

        # Score every tweet, keep the five extremes at each end, and bucket
        # the compound score into -1 / 0 / 1.
        sia = SIA()
        results = []
        for line in alltweets:
            pol_score = sia.polarity_scores(line)
            pol_score['Tweet'] = line
            results.append(pol_score)
        polarity = pd.DataFrame(results)
        most_neg = polarity.sort_values('compound', ascending=True)[0:5]
        most_pos = polarity.sort_values('compound', ascending=False)[0:5]
        polarity['Polarity'] = 0
        polarity.loc[polarity['compound'] > 0, 'Polarity'] = 1
        polarity.loc[polarity['compound'] < 0, 'Polarity'] = -1
        perc = pd.DataFrame(
            polarity.Polarity.value_counts(normalize=True) *
            100).reset_index().rename(columns={
                'index': 'Sentiment',
                'Polarity': 'Percentage'
            })
        perc.Percentage = perc.Percentage.apply(lambda x: round(x, 2))
        perc["Percentage_Text"] = perc.Percentage.apply(
            lambda x: str(round(x, 2)) + "%")
        data_text = dict(Sentiment=list(perc.Sentiment),
                         Percentage=list(perc.Percentage),
                         color=Category20[3],
                         Percentage_Text=list(perc.Percentage_Text))
        labels = LabelSet(x="Sentiment",
                          y="Percentage",
                          text="Percentage_Text",
                          level='glyph',
                          render_mode='css',
                          source=ColumnDataSource(data_text),
                          x_offset=-10,
                          text_font_size="10pt")
        sent.xaxis[0].ticker.desired_num_ticks = 3
        sent.xaxis.major_label_overrides = {
            1: 'Positive',
            0: 'Neutral',
            -1: 'Negative'
        }
        sent.xaxis.minor_tick_line_color = None
        sent.xaxis.major_tick_line_color = None
        sent.vbar(x="Sentiment",
                  top="Percentage",
                  width=0.80,
                  source=ColumnDataSource(data_text),
                  color="color")
        sent.xgrid.visible = False
        sent.ygrid.visible = False
        sent.add_layout(labels)

        ##Emotion break down
        # Look up each cleaned word in the lexicon file and keep the
        # categories whose flag is 1.
        nrc = pd.read_csv("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
                          sep="\t",
                          header=None,
                          names=["term", "category", "flag"])
        emotions = []
        final = pd.DataFrame()
        term = list(nrc.term)
        for i in range(len(cleaned_tweets)):
            if cleaned_tweets[i] in term:
                sub = nrc[nrc.term == cleaned_tweets[i]]
                s = sub[sub.flag == 1]
                if list(s.category) != []:
                    emotions.append(list(s.category))
        # Flatten, folding 'positive' into 'joy' and 'negative' into 'sadness'.
        emotions_clubed = []
        for i in emotions:
            for j in range(len(i)):
                if i[j] == 'positive':
                    i[j] = 'joy'
                elif i[j] == 'negative':
                    i[j] = 'sadness'
                emotions_clubed.append(i[j])
        radar = pd.DataFrame(
            pd.Series(emotions_clubed))[0].value_counts(normalize=True)
        radar = round(radar * 100, 2)
        radar_df = pd.DataFrame(radar).reset_index()
        radar_df['angle'] = radar_df[0] / radar_df[0].sum() * 2 * pi
        radar_df['color'] = Category20c[radar_df.shape[0]]
        radar_df = radar_df.rename(
            columns={
                'index': 'Emotion',
                0: 'Percentage',
                'angle': 'Angle',
                'color': 'Color'
            })
        source_events = ColumnDataSource(
            data=dict(Emotion=radar_df['Emotion'],
                      Percentage=radar_df['Percentage'],
                      Angle=radar_df['Angle'],
                      Color=radar_df['Color']))
        plot_events.wedge(x=0,
                          y=1,
                          radius=0.47,
                          start_angle=cumsum('Angle', include_zero=True),
                          end_angle=cumsum('Angle'),
                          line_color="white",
                          fill_color='Color',
                          source=source_events,
                          legend="Emotion")

        plot_events.add_tools(hover2)

        ##LDA
        # Same stopword set as for the word cloud above, rebuilt from scratch.
        stopwords = set(STOPWORDS)
        stopwords.add('one')
        stopwords.add('also')
        stopwords.add('twitter')
        stopwords.add('pic')
        stopwords.add('https')
        stopwords.add('bit')
        stopwords.add('ly')
        stopwords.add('via')
        stopwords.add('buff')
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        stop_words=stopwords)
        matrix = tf_vectorizer.fit_transform(alltweets)
        vocab = tf_vectorizer.get_feature_names()
        model = lda.LDA(n_topics=topic_num, n_iter=900)
        model.fit(matrix)
        topics = []
        # NOTE(review): the heading hard-codes "12" while the loop below uses
        # topic_num -- confirm they agree.
        topics.append(
            "<b>Most Frequent Words used in 12 different Topics found in the search</b><br><br>"
        )
        top_words_num = 20
        topic_mixes = model.topic_word_
        for i in range(topic_num):  #for each topic
            # Indices of the top_words_num highest-weighted words, descending.
            top_indexes = np.argsort(topic_mixes[i])[::-1][:top_words_num]
            my_top = ''
            for ind in top_indexes:
                my_top += vocab[ind] + ' '
            topics.append('TOPIC:' + str(i + 1) + ' --> ' + str(my_top) +
                          '<br><br>')
        # Build the HTML for the two text columns: extreme tweets on the
        # left, LDA topics on the right.
        pos_nes = []
        pos_neg = []
        pos_nes.append("<b> Top 5 most Positive Tweets </b><br><br>")
        pos = (["> " + i + '<br><br>' for i in most_pos['Tweet']])
        pos_nes.append(pos)
        pos_nes.append("<b> Top 5 most Negative Tweets </b><br><br>")
        neg = (["> " + i + '<br><br>' for i in most_neg['Tweet']])
        pos_nes.append(neg)
        for i in range(len(pos_nes)):
            pos_neg.append("".join(pos_nes[i]))
        d = Div(
            text=
            """<div style="width: 49%; text-align: justify; float: left">""" +
            "".join(pos_neg) + "</div>" + """
        <div style="width: 49%; text-align: justify; float: right">""" +
            "".join(topics) + """
        </div><div style="width: 2%;text-align:justify;float:center">""" +
            " " + "</div>",
            width=1500,
            height=500)
        column2.children.append(d)

    else:
        pass
Exemple #13
0
    wcData = nouns.vocab().most_common(ntags)
    wcDict = dict(wcData)
    return wcDict
#     except Exception as e:
#         print(e)
#         print(text)
#         break
    
     
     
     
# Frequencies of the 100 most common tags in ``text``.
wcInput = get_tags(text, 100)

# jpype._jexception.OutOfMemoryErrorPyRaisable: java.lang.OutOfMemoryError: Java heap space
# jpype._jexception.NullPointerExceptionPyRaisable: java.lang.NullPointerException

print(wcInput)
print(sorted(wcInput.items(), key=lambda x:x[1], reverse=True))

wordcloud = WordCloud(font_path='c:/Windows/fonts/malgun.ttf',
                    relative_scaling=0.2,
                    background_color='black').generate_from_frequencies(wcInput)

plt.figure(figsize=(30, 50))
plt.imshow(wordcloud)
plt.title('Top keyword')
plt.axis('off')
# Save before show(): once show() returns the figure has been released, so
# saving afterwards writes an empty image.
plt.savefig('cmtWordcloud.png', dpi=400, bbox_inches='tight')
plt.show()
# #          
# # print('finished')
def make_cloud():
    """Write four word-cloud PNGs from the archive in ``archive.pkl``.

    Two clouds are built from the combined message text and two from
    per-author counts, each in a "rainbow" and a "standard" colouring:
    ``wordcloud-rainbow.png``, ``wordcloud-standard.png``,
    ``wordcloud-rainbow-authors.png`` and ``wordcloud-standard-authors.png``.

    :return: None
    """
    print("Generating word cloud...")
    authors, dates, contents, bigstring = split_individual("archive.pkl")

    def rainbow_color_func(word,
                           font_size,
                           position,
                           orientation,
                           random_state=None,
                           **kwargs):
        # Hue is derived from the second position coordinate.
        return "hsl(hue,100%,50%)".replace("hue",
                                           str(int(position[1] / 1000 * 360)))

    def rainbow_cloud():
        # Fresh cloud coloured by position, without collocations.
        return WordCloud(width=2000,
                         height=2000,
                         color_func=rainbow_color_func,
                         stopwords=None,
                         collocations=False)

    def standard_cloud():
        # Fresh cloud with the library's default colouring.
        return WordCloud(width=2000, height=2000, stopwords=None)

    author_counts = Counter(authors)

    rainbow_cloud().generate(bigstring).to_file("wordcloud-rainbow.png")
    standard_cloud().generate(bigstring).to_file("wordcloud-standard.png")

    # The rainbow authors image used to be rendered with the standard cloud,
    # because the variable holding the rainbow one had been rebound.
    rainbow_cloud().generate_from_frequencies(author_counts).to_file(
        "wordcloud-rainbow-authors.png")
    standard_cloud().generate_from_frequencies(author_counts).to_file(
        "wordcloud-standard-authors.png")
Exemple #15
0
api = tweepy.API(auth)

# Cache the timeline in a local file.  Each tweet is followed by a newline
# so the last word of one tweet does not fuse with the first word of the next.
with open('tweets', 'w') as f:
    for status in api.user_timeline():
        f.write(api.get_status(status.id).text + '\n')
        #print(api.get_status(status.id).text.encode('utf8'))

count = 0
# Read the cache back; the context manager closes the file (the former bare
# ``f.close`` without parentheses never did).
with open('tweets', 'r') as f:
    words = ' ' + f.read()

stopwords = {'will', 'youtube', 'YouTube'}
logomask = imread('twitter_mask.png')

wordcloud = WordCloud(font_path='Voyager.ttf',
                      stopwords=STOPWORDS.union(stopwords),
                      background_color='black',
                      mask=logomask,
                      max_words=2000,
                      width=2400,
                      height=1400).generate(words)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('./tweetcloud3.png', dpi=300)
plt.show()
Exemple #16
0
    for reg in entrada:
        input_data += ''.join(reg)

# Uses regex to filter the data. Mostly just removes URLs.
input_data = re.sub(r'^https?:\/\/.*[\r\n]*',
                    '',
                    input_data,
                    flags=re.MULTILINE)  # Removes lines that start with a URL
input_data = re.sub(r"http\S+", "", input_data)  # removes remaining URLs
input_data = re.sub(r"pic\S+", "", input_data)  # removes tokens starting at "pic"
input_data = re.sub(r"https\S+", "", input_data)  # kept for parity with the original

# Wordcloud generator

# Generates wordcloud
wordcloud = WordCloud(max_font_size=600, width=3200,
                      height=1600).generate(input_data)

plt.figure(figsize=(40, 20), facecolor='k')  # Sets size for plt
plt.imshow(wordcloud)  # Embed image into canvas
plt.axis("off")
plt.tight_layout(pad=0)
# Save first: after show() returns the figure is gone and savefig would
# write an empty image.
plt.savefig('wordcloud.png', facecolor='k',
            bbox_inches='tight')  # Saves to the current working directory
plt.show()

# ALL THIS CODE DOWN HERE WAS FOR TESTING AND MAY HAVE VALUE IF YOU NEED MORE FEATURES (I guess)
'''
with open('output.json', 'r') as f:
	for row in csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE):
		input_data += ''.join(row)
'''
Exemple #17
0
# In[25]:

from wordcloud.wordcloud import WordCloud, STOPWORDS
from PIL import Image

# In[26]:

# Built-in stopwords plus two extra words.
stopwords = set(STOPWORDS)
stopwords.add('one')
stopwords.add('also')
mask_image = np.array(Image.open("images.png"))
# With a mask, width and height are ignored and the mask's shape is used.
wordcloud_good = WordCloud(colormap="Paired",
                           mask=mask_image,
                           font_path=None,
                           width=30,
                           height=20,
                           scale=2,
                           max_words=1000,
                           stopwords=stopwords)
wordcloud_good.generate(good_para)
plt.figure(figsize=(7, 10))
plt.imshow(wordcloud_good, interpolation="bilinear", cmap=plt.cm.autumn)
plt.axis('off')
# The extra plt.figure(figsize=(10, 6)) that used to sit here only opened an
# empty second figure before show().
plt.show()
wordcloud_good.to_file("good.png")

# In[27]:

stopwords = set(STOPWORDS)
wordcloud_neu = WordCloud(colormap="plasma",
Exemple #18
0
# Tokenise the upper-cased tweets, then singularise and lemmatise each word
all_words = TextBlob(" ".join(tweets).upper()).words.singularize().lemmatize()
# English stop-words plus the stray stem 'thi'
stop_words = list(set(stopwords.words('english'))) + ['thi']
# Keep only words longer than two characters that are not stop-words
words = list(
    filter(lambda w: not (len(w) <= 2 or w.lower() in stop_words), all_words))

# Single space-separated string for the generator
tweet_str = " ".join(words)

# Build and lay out the word-cloud in one step (generate() returns the cloud)
word_cloud = WordCloud(
    font_path=f"{PROJ_PATH}rsc/swiss_911_ultra_compressed_bt.ttf",
    mode="RGBA",
    background_color=None,
    colormap="Blues",
    width=1000,
    height=1000,
    max_words=2000,
).generate(tweet_str)

# Write the PNG into the project's output folder
save_name = f"{PROJ_PATH}output/{search_term}_wordcloud.png"
word_cloud.to_file(save_name)

# Optional on-screen preview
if PLOT:
    plt.figure(figsize=(15, 10))
    plt.imshow(word_cloud)  #, interpolation='bilinear')
    plt.axis('off')
    plt.show()
Exemple #19
0
import jieba
from wordcloud.wordcloud import WordCloud

with open('comments/AllComments_shanqiu.txt', 'r', encoding='utf-8') as r:
    datas = r.read()

# WordCloud splits text on whitespace and punctuation only, so an unbroken
# run of Chinese characters would be counted as one "word".  Segment with
# jieba first (it is imported above but was never used).
segmented = " ".join(jieba.cut(datas))

word_c = WordCloud(font_path='STXINWEI.TTF',
                   width=1000,
                   height=5000,
                   margin=10,
                   background_color='pink')
word_c.generate(segmented)
word_c.to_file('ciyun.jpg')