Example #1
0
def main():
    """Download the page given by ``--url`` and save a word cloud of its text.

    The page's ``<script>`` and ``<style>`` elements are removed, the text
    nodes under ``<body>`` are segmented with jieba, and the resulting cloud
    is written to ``wordcloud.png`` next to this file.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--url', metavar='URL', required=True, help='input the url')
    url = arg_parser.parse_args().url
    output_file = path.join(path.dirname(__file__), 'wordcloud.png')

    page_source = requests.get(url).text
    # Drop inline scripts and stylesheets so their source is not counted as text.
    strip_flags = re.I | re.M | re.DOTALL
    for pattern in (r'<script.*?>.*?</script>', r'<style.*?>.*?</style>'):
        page_source = re.sub(pattern, '', page_source, flags=strip_flags)

    document = html.fromstring(page_source)
    fragments = (node.strip() for node in document.xpath('//body//text()'))
    text = ' '.join(fragment for fragment in fragments if fragment)

    # Keep only non-empty, non-numeric tokens that are not stop words.
    tokens = []
    for raw_token in jieba.cut(text):
        token = raw_token.strip()
        if not token or token.isdigit() or token in stopwords:
            continue
        tokens.append(token)
    seg = ' '.join(tokens)

    cloud = WordCloud(font_path='simhei.ttf', background_color='black', margin=5, width=1800, height=800)
    image = cloud.generate(seg).to_image()

    with open(output_file, 'wb') as f:
        image.save(f, format='png')
Example #2
0
def main():
    """Build one word cloud from every file under ``data2`` and display it.

    Each file is passed to ``WordReader.word_reader``; the returned strings
    are concatenated and drawn inside the shape of ``./sky.png``.
    """
    reader = WordReader()

    pieces = []
    for root, _dirs, files in os.walk('data2'):
        for file_name in files:
            pieces.append(reader.word_reader(os.path.join(root, file_name)))
    wcount = ''.join(pieces)

    back_coloring = np.array(Image.open("./sky.png"))
    cloud_options = dict(
        background_color="white",  # background colour
        max_words=1000,            # maximum number of words drawn
        mask=back_coloring,        # image used as the mask
        max_font_size=150,         # largest font size
        random_state=42,
    )
    wc = WordCloud(**cloud_options)
    wc.generate(wcount)

    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
def create_word_cloud(filename, font_path='C:\\Windows\\Fonts\\simfang.ttf',
                      output_path='signature.png'):
    """Render ``<filename>.txt`` as a word cloud, display it and save it.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 input file *without* its ``.txt`` extension.
    font_path : str, optional
        Font file handed to ``WordCloud``. The default is the Windows path
        the function always used; pass another font on other systems
        (e.g. /System/Library/Fonts/PingFang.ttc on macOS).
    output_path : str, optional
        Where the rendered image is written. Defaults to ``signature.png``.
    """
    # Use a context manager so the input file is closed deterministically.
    with open("{}.txt".format(filename), encoding='utf-8') as source:
        text = source.read()

    # To segment with jieba first, build " ".join(jieba.cut(text, cut_all=True))
    # and pass that to generate() instead of the raw text.
    wc = WordCloud(
        background_color="white",
        max_words=2000,       # maximum number of words drawn
        font_path=font_path,
        height=500,
        width=500,
        max_font_size=60,
        random_state=30,      # seed for the random layout/colouring
    )

    myword = wc.generate(text)

    # Show the cloud, then persist it.
    plt.imshow(myword)
    plt.axis("off")
    plt.show()
    wc.to_file(output_path)
Example #4
0
def genwordcloud(texts,mask=None,font_path=None,background_color='white'):
    '''Generate a word cloud image.

    parameter
    ----------
    texts: text passed to ``WordCloud.generate``
    mask: path of a mask image; when None a 900*1200 RGBA array is built
        whose area outside a centred ellipse is white (255)
    font_path: font to use; the Android default font DroidSansFallback.ttf
        is the recommended choice
    background_color: background colour of the cloud

    return
    -------
    img: image object that can be saved directly with img.save('test.png'),
        or None when the wordcloud package is not installed
    '''
    try:
        from wordcloud import WordCloud
    except ImportError:
        # Only a missing package is a soft failure; other errors propagate.
        print('wordcloud need install wordcloud package.')
        return None
    if mask is None:
        # Vectorised ellipse test: pixels outside the ellipse become 255.
        rows, cols = np.ogrid[:900, :1200]
        outside = ((rows - 449.5) ** 2 / (430.0 ** 2)
                   + (cols - 599.5) ** 2 / (580.0 ** 2)) > 1
        tmp = np.where(outside, 255, 0).astype(np.uint8)
        mask = np.zeros((900, 1200, 4), dtype=np.uint8)
        mask[:, :, :3] = tmp[:, :, np.newaxis]   # same value in R, G and B
        mask[:, :, 3] = 255                      # fully opaque alpha
    else:
        # PIL is only needed to read a mask file, so import it lazily.
        from PIL import Image
        mask = np.array(Image.open(mask))
    wordcloud = WordCloud(background_color=background_color, font_path=font_path, mask=mask)
    wordcloud.generate(texts)
    return wordcloud.to_image()
Example #5
0
def generate_image(words, image):
    """Build a masked word cloud from frequencies plus a matching colour generator.

    ``image`` is converted to an array that serves both as the cloud's mask
    and as the source for ``ImageColorGenerator``. Returns the tuple
    ``(word_cloud, image_color_generator)``.
    """
    mask_array = np.array(image)
    font_file = os.path.join(CUR_DIR, 'fonts/simhei.ttf')
    cloud = WordCloud(font_path=font_file, background_color='white',
                      max_words=MAX_WORDS, mask=mask_array)
    cloud.generate_from_frequencies(words)
    return cloud, ImageColorGenerator(mask_array)
Example #6
0
def create_word_cloud(ballots, chart_directory, image_name, mask_file,
                      stop_words, word_counts=None):
    """
    Generates word cloud images from the feedback of the given ballots.

    One PNG named ``<image_name>-<word_count>.png`` is written into
    ``chart_directory`` for every entry of ``word_counts``
    (default ``[25, 50, 100, 1000]``). When ``mask_file`` is truthy the image
    is used both as the mask and as the colour source.
    """
    if word_counts is None:
        word_counts = [25, 50, 100, 1000]

    # Separate the feedback entries with a space so the last word of one
    # ballot is not fused with the first word of the next.
    text = ' '.join(ballot.feedback for ballot in ballots)

    # Build a new set: an in-place "|=" would permanently extend the
    # module-level STOPWORDS shared by every other caller.
    all_stop_words = STOPWORDS | set(stop_words)

    cloud_options = dict(background_color="white", stopwords=all_stop_words,
                         max_font_size=80, random_state=42)
    if mask_file:
        # The mask does not depend on word_count; load it once.
        color_mask = imread(mask_file)
        cloud_options.update(mask=color_mask,
                             color_func=ImageColorGenerator(color_mask))

    for word_count in word_counts:
        wc = WordCloud(max_words=word_count, **cloud_options)
        wc.generate(text)
        plt.imshow(wc)
        plt.axis("off")
        image_name_with_count = '{0}-{1}.png'.format(image_name, str(word_count))
        logger.info('...creating word cloud {0}'.format(image_name_with_count))
        save_location = os.path.join(chart_directory, image_name_with_count)
        plt.savefig(save_location)
        plt.close()
Example #7
0
def cloudplot(person):
    """Render a word cloud of one person's e-mail text and return it as a PNG response.

    ``person`` has its ``+`` characters replaced by spaces before being
    passed to ``GetTextRange``. The cloud is shaped and coloured by
    ``static/img/hillarylogo.jpg`` and sent back through ``send_file``.
    """
    person = re.sub(r'\+', ' ', person)

    text = GetTextRange(Emails, person)
    text = rmBoring(rmNonAlpha(text)).decode('ascii', 'ignore')

    plt.clf()

    d = path.dirname(path.abspath(__file__))

    hilcolor = np.array(Image.open(path.join(d, "static/img/hillarylogo.jpg")))

    # set.add() returns None and mutates the shared STOPWORDS; build an
    # extended copy so the intended stop words are actually passed.
    stop_words = STOPWORDS.union(["said"])

    wc = WordCloud(background_color="white", max_words=150, mask=hilcolor,
                   stopwords=stop_words,
                   max_font_size=80, random_state=42,
                   relative_scaling=0.5)

    wc.generate(text)
    image_colors = ImageColorGenerator(hilcolor)

    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")

    # Serialise the current figure into an in-memory buffer for the response.
    fig = plt.gcf()
    img = StringIO.StringIO()
    fig.savefig(img)
    img.seek(0)

    return send_file(img, mimetype='image/png')
Example #8
0
def main():
    """Draw a word cloud of themes weighted by the square root of their story count.

    Prints every layout record and then opens the rendered image.
    """
    width, height = 1000, 500
    themes = lib.datastats.themes_with_usage()
    data = {}
    for theme_name, theme_obj in themes.iteritems():
        data[theme_name] = len(theme_obj.stories) ** 0.5

    mask = np.array(Image.open("ellipse1000x500.png"))
    cloud_options = dict(
        font_path='Helvetica.ttf',
        max_words=5000,
        max_font_size=20,
        prefer_horizontal=1.0,
        width=width,
        height=height,
        scale=2,
        mask=mask,
        relative_scaling=1.0,
    )
    wordcloud = WordCloud(**cloud_options).fit_words(data)

    for rec in wordcloud.layout_:
        print(rec)

    wordcloud.to_image().show()
Example #9
0
def create_wc(words_in):
    """Build a grey-scale WordCloud from word frequencies.

    Parameters
    ----------
    words_in : list of tuple
        Words to plot, with their corresponding frequencies.

    Returns
    -------
    wc : WordCloud() object
        The generated cloud, recoloured with ``_grey_color_func``.
    """

    # Transparent RGBA canvas with horizontal-only words.
    settings = dict(background_color=None,
                    mode='RGBA',
                    width=800,
                    height=400,
                    prefer_horizontal=1,
                    relative_scaling=0.5,
                    min_font_size=25,
                    max_font_size=80)
    wc = WordCloud(**settings)
    wc = wc.generate_from_frequencies(words_in)

    # Switch the colour scheme to grey (recolor works in place).
    wc.recolor(color_func=_grey_color_func, random_state=3)

    return wc
def create_wordcloud(frequencies, stop_words):
    """Show a word cloud built from ``frequencies``.

    Parameters
    ----------
    frequencies : sized collection of word frequencies
        Passed to ``WordCloud.generate_from_frequencies``.
    stop_words : collection of str
        Passed to ``WordCloud`` as ``stopwords``.

    Raises
    ------
    Exception
        If ``frequencies`` is empty.
    """
    if len(frequencies) == 0:
        # The exception used to be constructed but never raised.
        raise Exception("No history is found.")
    logger.debug("word frequencies count = %s" % len(frequencies))
    logger.debug("stop words = %s" % pformat(stop_words))
    wordcloud = WordCloud(background_color="black", width=900, height=600, stopwords=stop_words).generate_from_frequencies(frequencies)
    image = wordcloud.to_image()
    image.show()
Example #11
0
def twittersearch():
    """Search recent English tweets about DevOps and return an HTML page.

    The page lists each tweet (screen name, creation time, text) and embeds a
    word cloud of all tweet texts, saved under ``static/`` with a random name.

    Returns
    -------
    str
        The complete HTML document.
    """
    # Tweet content is untrusted; escape it before embedding it in HTML.
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    api = twitter.Api(
        consumer_key=twitter_consumer_key,
        consumer_secret=twitter_consumer_secret,
        access_token_key=twitter_access_token_key,
        access_token_secret=twitter_access_token_secret
    )

    search = api.GetSearch(term='DevOps', lang='en', result_type='recent', count=100, max_id='')

    parts = [
        '<html><body bgcolor="#0033FF"><font color="white"> <h2>Tweets about DevOps.... or <a href="'
        + escape(my_url, True) + '">click here to go back</a></h2>',
        '<ol>',
    ]
    tweet_texts = []
    for t in search:
        parts.append('<li>' + escape(t.user.screen_name, True)
                     + ' (' + escape(t.created_at, True) + ') : '
                     + escape(t.text, True) + '</li>')
        tweet_texts.append(t.text)

    # Join with a space so the last word of one tweet is not fused with the
    # first word of the next.
    to_wordcloud = ' '.join(tweet_texts)

    # Generate a word cloud image
    wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(to_wordcloud)
    wcimage = wordcloud.to_image()
    random_name = 'static/' + str(random.randint(0, 888888)) + '-image-wc.png'
    wcimage.save('./' + random_name)

    parts.append('</ol></font>')
    parts.append('<center><img src="/' + random_name + '" width="80%" height="80%"></center></body></html>')

    return ''.join(parts)
Example #12
0
def wcloud(wf, color, save_as=None):
    """Draw a word cloud from the word frequencies `wf`,
    coloured by a color function from `wc_colors.py`.

    Parameters
    ----------
    wf : list
        (token, value) tuples
    color : function
        from `wc_colors.py`
    save_as : str
        filename; when given, the figure is saved there

    Returns
    -------
    None
    """
    cloud = WordCloud(background_color=None,
                      mode='RGBA',
                      width=2400,
                      height=1600,
                      relative_scaling=0.5,
                      font_path='/Library/Fonts/Futura.ttc')
    cloud.generate_from_frequencies(wf)

    plt.figure()
    recoloured = cloud.recolor(color_func=color, random_state=42)
    plt.imshow(recoloured)
    plt.axis("off")

    if not save_as:
        return
    plt.savefig(save_as, dpi=300, transparent=True)
Example #13
0
def draw_wordCloud():
    '''
    Draw the word cloud for the module-level ``wordList``.
    :return:
    '''
    # Flatten wordList into one string, each word followed by a space.
    cut_text = "".join(word + " " for word in wordList)

    # Build the cloud; the working directory is switched to the project folder first.
    os.chdir(r"D:\STUDYING\MyProjects\pycharm\music163_EasonComments")
    base_dir = path.dirname(__file__)  # directory containing this file
    color_mask = imread("Eason.jpg")   # background image used as the mask
    plt.imshow(color_mask)

    cloud_options = dict(
        font_path=path.join(base_dir, 'simsun.ttc'),
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    word_cloud = WordCloud(**cloud_options).generate(cut_text)

    # Display the result.
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
Example #14
0
def make_word_cloud(text, save_path, background_color='black'):
    """Render a single-colour word cloud and save it under ``app/<save_path>``.

    Parameters
    ----------
    text : str or sequence of (word, count)
        Either the raw text, or pairs that are expanded into a string in
        which each word is repeated ``count`` times.
    save_path : str
        Output path relative to the ``app`` directory.
    background_color : str, optional
        Background colour of the image.
    """
    from wordcloud import WordCloud

    def col_fun(word, *args, **kw):
        # Every word is drawn in the same dark grey.
        return '#333'

    if isinstance(text, str):
        big_string = text
    else:
        big_string = ''.join((word[0] + ' ') * word[1] for word in text)

    # The cloud used to be generated twice (chained .generate() plus a
    # second call); a single pass is sufficient.
    wc = WordCloud(background_color=background_color,
                   color_func=col_fun,
                   max_words=10000,
                   height=200,
                   width=700,
                   font_path='app/static/fonts/NanumScript.ttc')
    wc.generate(big_string)
    wc.to_file('app/%s' % save_path)
Example #15
0
def run():
    """Segment ``words2.txt`` with jieba and draw a cloud shaped and coloured by ``bg.jpg``.

    Only tokens longer than one character are kept. The cloud is displayed
    and then written to ``words_result3.png``.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(u'words2.txt', 'r') as source:
        content = source.read()

    kept = [w for w in jieba.cut(content) if len(w) > 1]
    text = r' '.join(kept)

    bg = np.array(Image.open('bg.jpg'))
    wordcloud = WordCloud(
            background_color='white',
            mask=bg,
            font_path='C:/Windows/Fonts/simkai.ttf',
            ).generate(text)

    image_colors = ImageColorGenerator(bg)

    plt.imshow(wordcloud.recolor(color_func=image_colors))
    plt.axis('off')
    plt.show()
    wordcloud.to_file('words_result3.png')
    return
Example #16
0
def test_coloring_black_works():
    """Generating with an all-black colour image must not fail."""
    black_image = np.zeros((50, 50, 3))
    color_func = ImageColorGenerator(black_image)
    cloud = WordCloud(width=50,
                      height=50,
                      random_state=42,
                      color_func=color_func,
                      min_font_size=1)
    cloud.generate(THIS)
Example #17
0
def test_repeat():
    """Check layout size, scaling and frequencies when ``repeat=True``."""
    sample = "Some short text"

    # Without repeat every word appears exactly once.
    cloud = WordCloud(stopwords=[]).generate(sample)
    assert_equal(len(cloud.layout_), 3)

    cloud = WordCloud(max_words=50, stopwords=[], repeat=True).generate(sample)
    # multiple of word count larger than max_words
    assert_equal(len(cloud.layout_), 51)
    # relative scaling doesn't work well with repeat
    assert_equal(cloud.relative_scaling, 0)
    # all frequencies are 1
    assert_equal(len(cloud.words_), 3)
    assert_array_equal(list(cloud.words_.values()), 1)
    layout_freqs = [entry[0][1] for entry in cloud.layout_]
    assert_array_equal(layout_freqs, 1)

    # A text in which one word occurs twice.
    cloud = WordCloud(max_words=52, stopwords=[], repeat=True)
    cloud.generate("Some short text with text")
    assert_equal(len(cloud.words_), 4)
    # normalized frequencies
    assert_equal(cloud.words_['text'], 1)
    assert_equal(cloud.words_['with'], .5)
    assert_equal(len(cloud.layout_), cloud.max_words)
    layout_freqs = [entry[0][1] for entry in cloud.layout_]
    # check that frequencies are sorted
    assert_true(np.all(np.diff(layout_freqs) <= 0))
Example #18
0
def test_process_text():
    """``process_text`` must return a dict."""
    cloud = WordCloud(max_words=50)
    processed = cloud.process_text(THIS)

    # check for proper return type
    assert_true(isinstance(processed, dict))
Example #19
0
def test_generate_from_frequencies():
    """``generate_from_frequencies`` accepts a dict and returns a WordCloud."""
    cloud = WordCloud(max_words=50)
    frequencies = cloud.process_text(THIS)
    generated = cloud.generate_from_frequencies(frequencies)

    assert_true(isinstance(generated, WordCloud))
Example #20
0
    def create_wordclouds(self, text, name_of_cloud, additional_stop_list, max_words, width, height, bigram = False):
        """Clean ``text``, count its tokens (or bigrams) and save a word cloud PNG.

        The text has punctuation removed, is lower-cased, stripped of stop
        words and lemmatized. The ``max_words`` most common tokens (or
        space-joined bigrams when ``bigram`` is true) are scaled so the most
        frequent one has weight 1.0, then written to ``<name_of_cloud>.png``.

        Raises
        ------
        ValueError
            If no tokens remain after cleaning.
        """
        text_nopunc = self.remove_punctuation(text, "", "")
        text_lower = text_nopunc.lower()
        # Build a fresh list: extending self.stopwords in place made the
        # shared stop list grow with every call.
        stop = list(self.stopwords) + list(additional_stop_list)
        text_nostop = self.remove_stopword(text_lower, stop)
        tokens = wt(text_nostop)
        text_lem = self.lemmatize(tokens)
        tokens_lem = wt(text_lem)
        if bigram:
            bigram_merged = [first + ' ' + second
                             for first, second in nltk.bigrams(tokens_lem)]
            counts = collections.Counter(bigram_merged)
        else:
            counts = collections.Counter(tokens_lem)
        final = counts.most_common(max_words)
        if not final:
            raise ValueError("no words left to draw after cleaning the text")
        # most_common() is sorted by count, so the first entry is the maximum.
        max_count = final[0][1]
        final = [(name, count / float(max_count)) for name, count in final]

        word_cloud = WordCloud(font_path="fonts/Georgia.ttf",
            width=width, height=height, max_words=max_words, stopwords=stop)
        word_cloud.fit_words(final)
        word_cloud.to_file(name_of_cloud + ".png")
Example #21
0
def make_clouds(files, n_words=20):
    """Write one word cloud PNG per topic of a saved LDA model.

    Parameters
    ----------
    files : object with ``model`` and ``replacements`` attributes
        ``model`` is the path loaded with ``LdaModel.load``; ``replacements``
        is a JSON file mapping bug names to ids.
    n_words : int, optional
        Number of top-ranked words drawn per topic.

    Images are written to ``../browser/clouds/<model base name>/<topic>.png``.
    """
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map (closing the file afterwards), then invert to id<->bug
    with open(files.replacements) as replacements_file:
        bug_to_id = json.load(replacements_file)
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # Rank words by the geometric mean of the two normalised weights.
        t_rar = np.sqrt(pTW[i] * pWT[i])
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        # Map ids back to bug names where a replacement exists.
        top_words = [id_to_bug.get(word, word) for word in top_words]
        wc.fit_words(zip(top_words, t_rar[top_word_ids]))
        wc.to_file(output_d + str(i) + '.png')
    def get_tagcloud(self, tags, tag_limit=None):
        """Render the most used tags as a word cloud and return it base64-encoded.

        Parameters
        ----------
        tags : list of dict
            Each dict provides ``'tag_name'`` and ``'count'``.
        tag_limit : int, optional
            Keep only this many of the highest-count tags (default: all).

        Returns
        -------
        The result of ``encode_file_to_base64`` for the written PNG, using
        the prefix ``data:image/png;base64,``.
        """
        tag_limit = tag_limit or len(tags)
        tags = sorted(tags, key=lambda kv: -kv['count'])[:tag_limit]  # Get top X tags
        tag_dict = {t['tag_name']: t['count'] for t in tags}

        # Generate a word cloud image. The fallback height is computed with
        # integers only: "30 * n / 2" is a float on Python 3, which is not a
        # valid pixel size.
        wordcloud = WordCloud(
            background_color='white',
            min_font_size=10,
            max_font_size=60,
            width=self.tagcloud_width,
            height=self.tagcloud_height or 15 * len(tags) + 10,
            font_path=os.path.sep.join([settings.STATIC_ROOT, 'fonts', 'OpenSans-Regular.ttf'])
        ).generate_from_frequencies(tag_dict)

        # Split [0, max count] into one band per colour, highest band first.
        tag_counts = [t['count'] for t in tags]
        step = (float(max(tag_counts))) / len(self.color_selection)
        thresholds = list(reversed([int(round(i * step)) for i in range(len(self.color_selection))]))

        def get_color(word, font_size, position, orientation, random_state=None, **kwargs):
            # Pick the first band whose threshold the word's count reaches.
            index = next((i for i, t in enumerate(thresholds) if tag_dict[word] >= t), 0)
            return self.color_selection[index]

        wordcloud.recolor(color_func=get_color)
        image = wordcloud.to_image()
        filepath = self.get_write_to_path(ext="png")
        image.save(filepath)
        return encode_file_to_base64(filepath, "data:image/png;base64,")
Example #23
0
def draw_tag_cloud(users_tokens):
    """Show a word cloud of the users' token frequencies, coloured from ``pics/trump.png``."""
    from PIL import Image
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, ImageColorGenerator

    coloring_image = np.array(Image.open("pics/trump.png"))

    freq_pairs = get_full_frequencies(users_tokens).items()
    cloud = WordCloud(max_words=2000,
                      mask=coloring_image,
                      max_font_size=40,
                      random_state=42)
    cloud.generate_from_frequencies(freq_pairs)

    # Recolour from the image after generating; color_func could also be
    # passed directly to the WordCloud constructor.
    color_func = ImageColorGenerator(coloring_image)
    recoloured = cloud.recolor(color_func=color_func)

    plt.imshow(recoloured)
    plt.axis("off")
    plt.show()
Example #24
0
def wordCloud(text_array,name,keyword=""):
    """Build a masked word cloud for the texts containing ``keyword``.

    Parameters
    ----------
    text_array : iterable of str
        Candidate texts.
    name : str
        Used for the output file ``static/tool/img/<name>.png`` and for the
        CSS class of the generated list links.
    keyword : str, optional
        When non-empty, its second space-separated token is used as the
        filter word (an IndexError is raised if there is no such token).

    Returns
    -------
    tuple (img_tag, list_html)
        ``img_tag`` embeds the PNG as a base64 data URI; ``list_html`` holds
        up to 50 ``<li>`` entries, one per laid-out word, in layout order and
        coloured like the word in the cloud.
    """
    import base64  # local import: works on Python 2 and 3, unlike str.encode('base64')

    # Compare by value; "is not" on a string literal tests object identity.
    if keyword != "":
        keyword = keyword.split(" ")[1]
    matching_texts = [text for text in text_array if keyword in text]
    cloud_text = "".join(text + " " for text in matching_texts)

    # Extend a copy so the module-level STOPWORDS is not modified for good.
    stop_words = STOPWORDS.union(['police', 'traffic', 'sir'])

    image_mask = os.path.join(BASE_DIR, 'static/tool/img/nebula.png')
    coloring = imread(image_mask)

    wordcloud = WordCloud(stopwords=stop_words, background_color="white", mask=coloring, ranks_only=True, max_words=50).generate(cloud_text)
    filename = os.path.join(BASE_DIR, 'static/tool/img/' + name + '.png')

    wordcloud.recolor(color_func=ImageColorGenerator(coloring))
    wordcloud.to_file(filename)
    with open(filename, 'rb') as image_file:
        data_uri = base64.b64encode(image_file.read()).decode('ascii')

    img_tag = '<img src="data:image/png;base64,{0}" style="height:400px;">'.format(data_uri)

    # Collect (word, hex colour) per layout entry. The last element of an
    # entry is sliced as "rgb(r, g, b)": drop "rgb(" and ")" and parse.
    ranked_words = []
    for entry in wordcloud.layout_:
        channels = tuple(int(part) for part in entry[-1][4:-1].split(','))
        ranked_words.append((entry[0][0], '#%02x%02x%02x' % channels))

    # List at most 50 words. The former range(1, cap) loop dropped the last
    # word whenever fewer than 51 words had been laid out.
    items = []
    for rank, (word, colour) in enumerate(ranked_words[:50], 1):
        items.append('<li class="list-group-item" ><a class="cloud-key-' + name
                     + '" href="#" style="color:' + colour + '">'
                     + "#" + str(rank) + " " + word + '</a></li>')
    list_html = "".join(items)

    return (img_tag, list_html)
    def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
        """
        calculate and show hot words of Job Description (JD)

        For every file in ``jd_dir`` the '详情描述' column is read with
        ``pd.read_excel``, the top 30 weighted keywords are extracted with
        jieba, printed, and shown as a circular word cloud. The process
        exits if the directory is missing or empty.

        :param jd_dir: directory containing the JD spreadsheets
        :return: None
        """
        if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
            print('Error! No valid content in {0}'.format(jd_dir))
            sys.exit(0)

        import matplotlib.pyplot as plt

        # jieba configuration, the mask and the import do not depend on the
        # file being processed, so set them up once instead of per file.
        jieba.analyse.set_stop_words(STOPWORDS_PATH)
        jieba.load_userdict(USER_CORPUS)

        # Circular mask: 255 outside a radius-130 circle centred at (150, 150).
        x, y = np.ogrid[:300, :300]
        mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
        mask = 255 * mask.astype(int)

        # Keyed by the file name up to its first dot, as before, so files
        # sharing that stem still collapse to one entry.
        jd_and_dir = {name.split('.')[0]: os.path.join(jd_dir, name) for name in os.listdir(jd_dir)}

        for jd_path in jd_and_dir.values():
            text = "".join(pd.read_excel(jd_path)['详情描述'])
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {pair[0]: pair[1] for pair in hot_words_with_weights}

            print(frequencies)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Example #26
0
    def generate_wc(self, background_color='#ffffff'):
        """Generate the word cloud for ``self.text`` and write it to ``self.img_file``.

        Any exception raised while building or saving the cloud is stored in
        ``self.error_in_wordcloud_gen`` instead of propagating; on success
        that attribute is set to None and ``self.font_name`` is reset.
        """
        self.get_exclude_words()
        try:
            # Other masks can be extracted from
            # Font-Awesome (http://minimaxir.com/2016/05/wordclouds/)
            mask_path = os.path.join(self.curdir, self.wordcloud_mask)
            mask_array = np.array(Image.open(mask_path))

            # Download font or use the default one
            font_path = get_font(self.font_name)
            if self.allow_font_change:
                logger.info('Using {} font'.format(font_path))

            cloud = WordCloud(
                width=self.width,
                height=self.height,
                font_path=font_path,
                colormap=self.cmap,
                stopwords=self.exclude_words,
                background_color=background_color,
                mode='RGBA',
                mask=mask_array,
            )
            cloud = cloud.generate(self.text)

            self.make_img_file()
            cloud.to_file(self.img_file)
            self.error_in_wordcloud_gen = None

            self.font_name = None  # reset to default

        except Exception as e:
            self.error_in_wordcloud_gen = e
Example #27
0
def generate_word_cloud(img_bg_path,top_words_with_freq,font_path,to_save_img_path,background_color = 'white'):
    """Draw a masked word cloud from word frequencies, show it and save it.

    ``img_bg_path`` is the mask image, ``top_words_with_freq`` the
    frequencies passed to ``generate_from_frequencies``, ``font_path`` the
    font file and ``to_save_img_path`` the output image path.
    """
    # Read the background image that serves as the mask.
    img_bg = imread(img_bg_path)

    cloud_options = dict(
        font_path=font_path,                # font file
        background_color=background_color,  # image background, white by default
        max_words=500,                      # at most 500 words are drawn
        mask=img_bg,                        # background image mask
        max_font_size=50,                   # largest font size
        random_state=30,                    # random seed
        width=1000,                         # image width
        margin=5,                           # spacing between words
        height=700,                         # image height
    )
    wc = WordCloud(**cloud_options)

    # Fill the cloud from the supplied frequencies.
    wc.generate_from_frequencies(top_words_with_freq)

    # Display the cloud with matplotlib.
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

    # For a vividly coloured background image, ImageColorGenerator(img_bg)
    # passed to wc.recolor(color_func=...) tints the words like the image.

    # Save the cloud as an image file.
    wc.to_file(to_save_img_path)
Example #28
0
def generate_cloud():
    """Draw the text of ``messages.txt`` inside the shape of ``mask.png``.

    Both files are read from this module's directory; the result is written
    there as ``masked.jpg``.
    """
    d = path.dirname(__file__)
    # Close the file deterministically instead of leaking the handle.
    with open(path.join(d, 'messages.txt')) as messages:
        janice = messages.read()
    group_mask = misc.imread(path.join(d, "mask.png"), flatten=True)
    wc = WordCloud(background_color="white", max_words = 2000, mask=group_mask)
    # Generate from the text that was just read; the former code passed an
    # undefined name ``text``.
    wc.generate(janice)
    wc.to_file(path.join(d, "masked.jpg"))
Example #29
0
def topic_word_cloud(nmf, topic_idx, max_words=300, figsize=(14, 8), width=2400, height=1300, ax=None):
    ''' Draw the word cloud of one topic
    INPUT:
        nmf: NMFClustering object
        topic_idx: int
        max_words: int
            Max number of words to incorporate into the word cloud
        figsize: tuple (int, int)
            Size of the figure if an axis isn't passed
        width: int
        height: int
        ax: None or matplotlib axis object
    '''
    cloud = WordCloud(background_color='white',
                      max_words=max_words,
                      width=width,
                      height=height)

    # Fit the cloud to this topic's word frequencies.
    cloud.fit_words(nmf.topic_word_frequency(topic_idx))

    # Without a usable axis, create a new figure with a single subplot.
    if not ax:
        ax = plt.figure(figsize=figsize).add_subplot(111)
    ax.imshow(cloud)
    ax.axis('off')
def make_cloud(words, image, size=10, filename='figures/cloud.png', max_words=200, horizontal=0.8):
    """Save a word cloud of ``words`` shaped and coloured by the image at ``image``.

    The whitespace-separated tokens 'RT', 'amp' and 'lt' are dropped from
    ``words`` and also added to the stop words. The figure is ``size`` by
    ``size`` inches and is written to ``filename``.
    """
    # Drop the noise tokens before generating.
    my_stopwords = ['RT', 'amp', 'lt']
    kept_words = [token for token in words.split() if token not in my_stopwords]
    words_no_urls = ' '.join(kept_words)

    # Extend a copy of the default stop words with the same tokens.
    stopwords = STOPWORDS.copy()
    for extra in ("RT", 'amp', 'lt'):
        stopwords.add(extra)

    # The logo serves as both the mask and the colour source.
    logo = imread(image)
    image_colors = ImageColorGenerator(logo)

    wc = WordCloud(stopwords=stopwords,
                   mask=logo,
                   color_func=image_colors,
                   scale=0.8,
                   max_words=max_words,
                   background_color='white',
                   random_state=42,
                   prefer_horizontal=horizontal)
    wc.generate(words_no_urls)

    plt.figure(figsize=(size, size))
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(filename)
Example #31
0
        full_texts.append(tmp)

# write full_texts in txt file (one tweet per line); the context manager
# closes the file even if writing fails
temp_texts = [text + '\n' for text in full_texts]
with open('tweets with tag-{}.txt'.format(tag), 'w', encoding='utf-8') as fout:
    fout.writelines(temp_texts)
print('==== Tweets clawed and saved at root directory ====\n')

# create wordcloud for visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud

## assign font and container attributes
font_path = 'c:\\windows\\fonts\\Roboto-Regular.ttf'
wordcloud = WordCloud(font_path=font_path, width=800, height=800)

## create wordcloud
# Join with a space so the last word of one tweet is not fused with the
# first word of the next.
long_text = ' '.join(full_texts)
# Remove the searched tag itself. The text is lower-cased, so match the tag
# case-insensitively, and escape it so it is treated literally, not as a regex.
long_text = re.sub(re.escape(tag), '', long_text.lower(), flags=re.IGNORECASE)

wordcloud = wordcloud.generate(long_text)
fig = plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis("off")

## save it on png
fig.savefig('wordcloud with tag-{}.png'.format(tag))
print('==== WordCloud from tweets generated and saved at root directory ====')
Example #32
0
# 1. Read the text data from the first sheet of the xls workbook
data = xlrd.open_workbook('data.xls')
table = data.sheets()[0]
nrows = table.nrows
text = ''
for i in range(nrows):
    colnames =  table.row_values(i) # values of one row
    # NOTE(review): str.join requires every cell to be a string; presumably
    # the sheet holds text only - confirm, numeric cells would raise TypeError.
    text += ','.join(colnames)

# 2. Segment with jieba (precise mode by default). A custom dictionary userdict.txt can be
# added with jieba.load_userdict(file_name), where file_name is a file-like object or a path.
# The custom dictionary uses the same format as the default dict.txt: one word per line, each line
# made of word, frequency (optional) and part of speech (optional), separated by spaces, in that order.

cut_text = jieba.cut(text)
result = "/".join(cut_text)  # the tokens must be joined with some separator into one string, otherwise the cloud cannot be drawn
print(result)

# 3. Build the word cloud. WordCloud does not support Chinese by default, so a downloaded Chinese font is required here.
# Without a custom background image the pixel size must be given; the default background is black; for uniform text colour use mode='RGBA' and colormap='pink'
d = path.dirname(__file__)
round_coloring = imread(path.join(d, "maid.png"))
wc = WordCloud(font_path="Yahei.ttf", background_color='white', width=800,mask=round_coloring,
               height=600, max_font_size=50,
               max_words=150)  # ,min_font_size=10)#,mode='RGBA',colormap='pink')
wc.generate(result)
wc.to_file("wordcloud.png")  # save the cloud at the configured pixel size; sharper than the on-screen display below

# 4. Show the image
plt.figure("词云图")  # name of the figure window
plt.imshow(wc)  # display the word cloud as an image
plt.axis("off")  # hide the axes
plt.show()
Example #33
0
    "之下", "一只", "一半", "这个", "便是", "倘若", "突然", "只是", "不敢", "他们", "我们", "见到",
    "声音", "心想", "如此", "只见", "之中", "不能", "一个", "知道", "什么", "不想", "不是", "甚么",
    "一声", "咱们", "别人", "一句", "不知"
]

# Load the custom background image
bg_img = "fuyao.jpg"  # note: the image background was edited to white beforehand
image = Image.open(bg_img)
graph = np.array(image)

# WordCloud configuration
wc = WordCloud(
    font_path="simhei.ttf",  # font file
    background_color='white',  # background colour
    width=image.size[0],  # width, set equal to the background image width
    height=image.size[1],  # height, set equal to the background image height
    max_font_size=70,
    min_font_size=10,  # largest / smallest font size
    stopwords=no_name,  # stop words that are not shown in the cloud
    max_words=2000,  # maximum number of words shown
    mode='RGBA')
wc.generate(result)

# Colour the words using the background image as reference
image_color = ImageColorGenerator(graph)  # derive colours from the background image
wc.recolor(color_func=image_color)
# Name of the output image: the input file name up to its last dot plus a suffix
# NOTE(review): if filename has no dot, rfind returns -1 and the slice drops
# the last character - confirm filenames always carry an extension.
img_name = filename[:filename.rfind("."):] + "_词云图" + ".png"
# Write the image
wc.to_file(img_name)
# 4. Show the image
plt.figure("词云图")  # name of the figure window
    detokenized_data.append(t)
    
# Store the detokenized texts as a new column and use it as the document collection.
dataset['clean_text']= detokenized_data 
documents = dataset['clean_text']

"""# 6. Perform Exploratory Analysis
To verify whether the preprocessing happened correctly, we’ll make a word cloud using the wordcloud package to get a visual representation of most common words. It is key to understanding the data and ensuring we are on the right track, and if any more preprocessing is necessary before training the model.
"""

# import the wordcloud library
from wordcloud import WordCloud
# Join the processed documents into one comma-separated string.
long_string = ','.join(list(documents.values))

# create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# generate a word cloud
wordcloud.generate(long_string)

# visualize the word cloud (the returned image is displayed only when this is
# the last expression of a notebook cell)
wordcloud.to_image()

"""# 7. Create Document-Term Matrix
This is the first step towards topic modeling. We need to represent each and every term and document as a vector.We will use sklearn's TfidfVectorizer to create a document-term matrix using only 1000 terms (words) from our corpus.
"""

# set variables
# number of terms (words) for the document-term matrix described above
no_terms = 1000

# NMF uses tf-idf Vectorizer
Example #35
0
def generate_ldavis(lda, topic_count, word_count):
    """Print the model's topics and save a pyLDAvis HTML report.

    Args:
        lda: model object; its ``print_topics`` output is printed and the
            model is handed to ``pyLDAvis.gensim.prepare``.
        topic_count: value embedded in the name of the HTML file.
        word_count: second argument forwarded to ``lda.print_topics``.

    Reads the module-level ``doc_term_matrix`` and ``dictionary`` and writes
    ``visualization_all_<topic_count>.html``.
    """
    print("Latent Dirichlet Allocation......")
    topic_listing = lda.print_topics(-1, word_count)
    print(topic_listing)
    print_time()
    prepared = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary)
    report_name = f"visualization_all_{topic_count!s}.html"
    pyLDAvis.save_html(prepared, report_name)


start_time = datetime.now()
warnings.filterwarnings("ignore", category=DeprecationWarning)
print("Start time of program: " + str(start_time))
tokenizer = RegexpTokenizer(r'\w+')
stopWords = set(stopwords.words('english'))
wordcloud = WordCloud()
Lda = gensim.models.ldamodel.LdaModel

# Decode the raw CSV bytes as UTF-8, write them to the scratch file "clear",
# then load that file into a DataFrame indexed by the 'Unnamed: 0' column.
filename = "../data/fake_or_real_news.csv"
with open(filename, 'rb') as f:
    lines = f.read()
new = str(lines, 'utf-8')
# Write with an explicit encoding: the platform default may not be able to
# represent every character of the decoded text.
with open('clear', 'w', encoding='utf-8') as f2:
    f2.write(new)
df = pd.read_csv("clear", encoding='utf-8')
df = df.set_index('Unnamed: 0')

bigram = gensim.models.phrases.Phrases(df.text)
df['text_tokens'] = df.text.apply(process_text)
doc_clean = df.text_tokens
frequency = defaultdict(int)
lda_corpus = lda[corpus]
lda_corpus
lda_docs = [doc for doc in lda_corpus]
lda_docs[0:5]
len(lda_docs)

from wordcloud import WordCloud
import matplotlib.colors as mcolors

# Colour values of the Tableau palette, in palette order
# (more colors: 'mcolors.XKCD_COLORS').
cols = list(mcolors.TABLEAU_COLORS.values())

# The colour callback looks up the module-level loop index `i` when it is
# called, so each pass of the loop below paints with its own entry of `cols`.
cloud = WordCloud(
    stopwords=stop,
    background_color='white',
    width=2500,
    height=1800,
    max_words=10,
    colormap='tab10',
    color_func=lambda *args, **kwargs: cols[i],
    prefer_horizontal=1.0,
)

topics = lda.show_topics(formatted=False)

fig, axes = plt.subplots(1, 2, figsize=(10, 20), sharex=True, sharey=True)

# Draw one cloud per subplot from that topic's (word, weight) pairs.
for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    current_axes = plt.gca()
    current_axes.imshow(cloud)
    current_axes.set_title(f'Topic {i}', fontdict=dict(size=16))
    current_axes.axis('off')
Example #37
0
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import PIL
import numpy as np

# Read the corpus; the context manager closes the file as soon as the text
# has been loaded.
with open("ci.txt", encoding='utf8') as corpus_file:
    file_text = corpus_file.read()

# Segment the text with jieba (full mode) and join the tokens with spaces.
dict_list = jieba.cut(file_text, cut_all=True)

di = " ".join(dict_list)

# Image used as the mask of the word cloud.
alice_img = np.array(
    PIL.Image.open("C:\\Users\\Administrator\\Desktop\\timg.jpg"))
wc = WordCloud(width=1920,
               height=1080,
               background_color="#fff",
               margin=2,
               mask=alice_img,
               font_path="C:\\Windows\\Fonts\\msyhbd.ttf")\
    .generate(di)

# Show the cloud without axes.
plt.imshow(wc)
plt.axis("off")
plt.show()
# Extra filler words to exclude, added to the existing stop-word set.
stopwords.update((
    "also",
    "told",
    "one",
    "last",
    "new",
    "say",
    "year",
    "will",
    "yes",
    "no",
    "although",
    "first",
    "day",
))

# Build one word cloud per news category, all sharing the same stop words.
wordcloud1 = WordCloud(stopwords=stopwords).generate(politics_text)
wordcloud2 = WordCloud(stopwords=stopwords).generate(film_text)
wordcloud3 = WordCloud(stopwords=stopwords).generate(football_text)
wordcloud4 = WordCloud(stopwords=stopwords).generate(business_text)
wordcloud5 = WordCloud(stopwords=stopwords).generate(technology_text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud1, interpolation='bilinear')
plt.axis("off")

plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis("off")

plt.imshow(wordcloud3, interpolation='bilinear')
Example #39
0
    data_pname = data[['pname']]
    data_requirement = data[['requirement']]
    data_workplace = data[['workplace']]
    data_nature = data[['nature']]

    # pandas 保存txt去除空格
    data_pname.to_csv('static/wordcloud_pname.txt',
                      header=None,
                      index=False,
                      sep=" ")
    file_pname = open('static/wordcloud_pname.txt', 'r', 1,
                      encoding='utf8').read()
    mytext_pname = " ".join(jieba.cut(file_pname))
    wordcloud = WordCloud(font_path='static/SimSun.ttf',
                          background_color="white",
                          width=1000,
                          height=860,
                          margin=2).generate(mytext_pname)
    wordcloud.to_file('static/img/wordcloud_pname.png')

    # pandas 保存txt去除空格
    data_requirement.to_csv('static/wordcloud_requirement.txt',
                            header=None,
                            index=False,
                            sep=" ")
    file_requirement = open('static/wordcloud_requirement.txt',
                            'r',
                            1,
                            encoding='utf8').read()
    mytext_requirement = " ".join(jieba.cut(file_requirement))
    wordcloud = WordCloud(font_path='static/SimSun.ttf',
Example #40
0
import numpy as np
from collections import Counter

# Read the text data; the context manager closes the file after loading.
with open('20190910_101329.csv', "r", encoding="utf-8") as source:
    text = source.read()

jieba.set_dictionary("jieba_dict/dict.txt.big")  # use the Traditional Chinese dictionary
with open("jieba_dict/stopWord_cloud.txt", "r",
          encoding="utf-8-sig") as f:  # load the stop words
    stops = f.read().split("\n")  # one stop word per line, kept in the list `stops`

# A set makes the per-token membership test O(1) instead of scanning the list.
stop_lookup = set(stops)

# Keep every segmented token that is not a stop word, then count them.
terms = [t for t in jieba.cut(text, cut_all=False) if t not in stop_lookup]
diction = Counter(terms)

font = "msyh.ttc"
mask = np.array(Image.open("Coins.png"))  # shape of the word cloud
# White background instead of the default black.
wordcloud = WordCloud(background_color="white", mask=mask,
                      font_path=font)
wordcloud.generate_from_frequencies(frequencies=diction)  # build the word cloud

# Show the figure
plt.figure(figsize=(6, 6))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

wordcloud.to_file("news_Wordcloud.png")
def write():
    """Render the dashboard page for one user's journal scores.

    Asks for a username and loads ``<username>.csv``. When the file can be
    read, the page plays a song, may show an encouraging note depending on
    the most recent score, and lays out three columns: a week/weekday heat
    map of the scores, a word cloud of the journal sentences, and a word
    search over those sentences.
    """
    # Imports are local so that importing this module does not load every page.
    import streamlit as st
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sea
    from wordcloud import WordCloud

    username = st.text_input("Enter username here:")
    if not username:
        return

    # NOTE(review): the username comes straight from user input and is used
    # as a file name; consider restricting it to safe characters.
    try:
        df = pd.read_csv(username + ".csv")
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: pandas parse, empty
        # data and decode errors are all ValueError subclasses.
        st.write("username doesn't exist!")
        return

    # plays song on the beach
    # eventually I should let this personalize
    with open('songonthebeach.ogg', 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/ogg')

    df = df.dropna()
    df.columns = ["score", "sentence", "date"]
    score = df["score"]

    # recent = the most recent score. Positional access is required here:
    # after dropna() the index labels are no longer guaranteed to be 0..n-1.
    try:
        recent = score.iloc[-1]
    except IndexError:
        st.write("Your username exists but we didn't save your score. Sorry about that! Please insert your journal entry again and press save my score again to save it officially. This is a known bug thart occurs when a username is first created, but not after!")
    else:
        if recent == 0:
            st.write("Days like these come, and it's perfectly fine to be upset when difficulties arise. What you should remember is that days like these pass too, and that even when these times are dark, you still have friends, family, external resources to reach out too. Check out the resources tab for ways you can improve now.") 
    # below are placeholders for personalized notes. should add functionality for this l8r
    # if recent == 2:
    #     st.write("You're doing well today. I hope you keep up the progress.")
    # if recent == 1:
    #     st.write("You're not feeling so great today, and that's okay. Know I'll always care about you.")

    # code where if the last five have been super happy play Photograph

    # Use st.columns when this Streamlit build has it, otherwise fall back to
    # the older st.beta_columns used originally.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2, col3 = make_columns(3)

    # need to make this graph look better. should add a time slider too. would be cool if when a person hovers over a point they see the journal entry for it.
    with col1:
        fig, ax = plt.subplots()
        # Work on a copy so the added columns never write into a view of df.
        df2 = df[["score", "date"]].copy()
        df2["date"] = pd.to_datetime(df2["date"])
        df2['week_num'] = df2['date'].dt.strftime("%W")
        df2['day_num'] = df2['date'].dt.weekday
        df_wide = df2.pivot_table(index='week_num', columns='day_num', values='score')
        sea.heatmap(df_wide, ax=ax)
        st.pyplot(fig)
    # have to take down this labelled_journal_entries csv before we release. If not, we release a lot of people's personal data.
    # need to add a slider for time here. maybe for mood too.
    with col2:
        wordcloud2 = WordCloud(background_color='white').generate(' '.join(df['sentence']))
        fig, ax = plt.subplots()
        ax.imshow(wordcloud2)
        ax.axis("off")
        st.pyplot(fig)
    # need to add sentiment-dependent emojis to output searches
    with col3:
        word = st.text_input("Input word you want to search for")
        if len(word) > 2:
            entries = ' '.join(df['sentence'])
            # Show every '.'-separated piece of the entries containing the word.
            for sentence in entries.split('.'):
                if word in sentence:
                    st.markdown(sentence)
Example #42
0
    if word_polarity > 0.25:
        positive.append(word)
    else:
        neutral.append(word)

# Count how often each positive word occurs, then keep only the words seen
# at least twice. The filter runs once after counting; running it inside the
# counting loop rescans the whole table for every word and shadows `word`.
positive_count = {}
for word in positive:
    positive_count[word] = positive_count.get(word, 0) + 1
positive_count2 = {
    word: count for word, count in positive_count.items() if count >= 2
}

word_cloud = WordCloud().generate_from_frequencies(positive_count2)
plt.imshow(word_cloud, interpolation = 'bilinear')
plt.show()

# Same counting and filtering for the neutral words.
neutral_count = {}
for word in neutral:
    neutral_count[word] = neutral_count.get(word, 0) + 1
neutral_count2 = {
    word: count for word, count in neutral_count.items() if count >= 2
}

word_cloud = WordCloud().generate_from_frequencies(neutral_count2)
plt.imshow(word_cloud, interpolation = 'bilinear')
Example #43
0
(WIDTH, HEIGHT, RESOLUTION) = (3840, 2160, 500)

##############################################################################
# Read artists file
##############################################################################
data = pd.read_csv(stp.DATA_PATH + stp.USR + '_cln.csv', parse_dates=[3])
songs = sorted(data.get('Song').unique())
songCount = data.groupby('Song').size().sort_values(ascending=False)

##############################################################################
# Wordcloud
##############################################################################
# Configure the cloud and lay it out from the per-song play counts.
wordcloudDef = WordCloud(width=WIDTH,
                         height=HEIGHT,
                         max_words=2000,
                         relative_scaling=1,
                         min_font_size=12,
                         background_color='Black',
                         colormap='Purples',
                         font_path=stp.FONT)
wordcloud = wordcloudDef.generate_from_frequencies(songCount)
# Create the target figure first so the frameless axes belong to it; creating
# the axes first puts them on a separate implicit figure that is never used.
plt.figure(figsize=(20, 20 * (HEIGHT / WIDTH)), facecolor='k')
ax1 = plt.axes(frameon=False)
plt.imshow(wordcloud, interpolation='bilinear')
plt.tight_layout(pad=0)
plt.axis("off")
plt.savefig(stp.IMG_PATH + '/SNG_WDC.png',
            dpi=RESOLUTION,
            facecolor='k',
            edgecolor='w',
            orientation='portrait',
            papertype=None,
Example #44
0

# Read the input text; the context manager closes the file after loading.
with open(path.join(d, 'stopwords.txt'), encoding='utf-8') as text_file:
    text = text_file.read()

# For Chinese input:
#text = processChinese(text)  # Chinese is hard to tokenise; segment it with Jieba

# read the mask / color image
# taken from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
# Background image
back_coloring = np.array(Image.open('/Users/xxxx1/Pictures/alice.jpeg'))

wc = WordCloud(font_path = "/System/Library/Fonts/STHeiti Light.ttc",
                background_color="white",  # background colour
                max_words=2000,  # maximum number of words shown in the cloud
                mask=back_coloring,  # background image used as mask
                # max_font_size=100,  # largest font size
                # random_state=42,
                )
# Build the cloud. generate() takes the whole text (Chinese is hard to
# tokenise); alternatively compute word frequencies first and call
# generate_from_frequencies.
wc.generate(text)
# wc.generate_from_frequencies(txt_freq)
# example txt_freq: [('词a', 100),('词b', 90),('词c', 80)]
# Derive colour values from the background image
image_colors = ImageColorGenerator(back_coloring)


# plt.figure(figsize=(8,6), dpi=800)
# 以下代码显示图片
# plt.imshow(wc)
# plt.axis("off")
Example #45
0
        keiyou_count = keiyou_count + 1
        keiyou_list.append(nodes.surface)
    else:
        pass
    nodes = nodes.next

# Join all nouns, verbs and adjectives with single spaces. Joining the three
# lists separately and concatenating the results glues the last word of one
# list to the first word of the next.
text = " ".join([*meishi_list, *doushi_list, *keiyou_list])

# In[27]:

# Word cloud preparation

stop_words = ["https", "co", "てる", "する", "そう", "すぎ", "いい", "さん", "こと"]
fpath = "/Library/Fonts//ヒラギノ丸ゴ ProN W4.ttc"

wc = WordCloud(font_path=fpath,
               background_color="white",
               max_words=2000,
               collocations=False,
               stopwords=set(stop_words))

# Run WordCloud
wc.generate(text)
wc.to_file("word_cloud.png")

plt.figure(figsize=(15, 12))
plt.imshow(wc)
plt.axis("off")
plt.show()
Example #46
0
                # Omit unnecessary words
                if token.base_form not in [
                        "こと", "よう", "そう", "これ", "それ", "本田", "カードバトル", "pepsi",
                        "フォロー", "jpn", "バトル", "コイン", "ペプシ", "フォロー", "RT",
                        "ジャパン", "コーラ", "カード", "プレゼント", "毎日", "挑戦", "ケース", "記念"
                ]:
                    words_count[token.base_form] += 1
                    words.append(token.base_form)
    return words_count, words


import os

# Collect, for each non-empty row, the part of its first column that precedes
# the first "http". newline='' is what the csv module expects of its file.
with open('data/twitter_list.txt', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    texts = [row[0].split('http')[0] for row in reader if len(row) > 0]

words_count, words = counter(texts)
text = ' '.join(words)

# My font in fontbook. The path is handed to the font loader as-is, so "~"
# must be resolved to the home directory here.
fpath = os.path.expanduser("~/Library/Fonts/Ricty-Bold.ttf")
wordcloud = WordCloud(background_color="White",
                      font_path=fpath,
                      width=900,
                      height=500).generate(text)

wordcloud.to_file("./wordcloud.png")
# Strip frequently occurring words from the review text before drawing.
# word.txt holds words that are likely to appear in reviews; the context
# manager closes the file once it has been read.
with open('d://project//word.txt', 'r', encoding='utf-8') as file:
    word = file.read().split(' ')  # split word.txt on spaces
for i in word:  # visit the separated tokens one at a time
    # Replace every occurrence of the token with '' (e.g. '있다' -> '').
    # NOTE(review): the token is used as a regex pattern as-is, so tokens
    # containing regex metacharacters are not matched literally - confirm
    # whether re.escape(i) is wanted.
    text = re.sub(i, '', text)
    print(text)
    # Common words cannot all be removed by hand, so the loop blanks them out.

# Draw the word cloud.
wordcloud = WordCloud(
    font_path='d://Windows//Fonts//gulim',  # font
    stopwords=STOPWORDS,  # stop words to exclude
    max_words=1000,  # maximum number of words drawn
    background_color='white',  # background colour
    max_font_size=100,  # largest font size
    min_font_size=1,  # smallest font size
    mask=usa_mask,  # shape of the cloud
    colormap='jet').generate(text).to_file('d://project//digimon_cloud.png')
# File name of the word cloud image written to the project folder

plt.figure(figsize=(15, 15))  # 15 x 15 figure
plt.imshow(wordcloud, interpolation='bilinear')  # interpolation used when drawing
plt.axis("off")

#%%
#Q.라라랜드 리뷰 txt에서 평가 점수가 6점 이상인 리뷰들만 출력하시오
stev = open("d:\\data\\lalaland.txt", encoding="UTF8")
stev2 = stev.readlines()  #어절별로 분리해서 stev2라는 리스트에 담는다.
    filtered_sent.append(review)

from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
text_tf = tf.fit_transform(filtered_sent)
# get_feature_names() is gone from newer scikit-learn releases; prefer
# get_feature_names_out() and fall back for older installations.
if hasattr(tf, "get_feature_names_out"):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()
dense = text_tf.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

#plotting wordcloud on TFIDF
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Iterating a DataFrame yields its column labels, i.e. the vocabulary terms.
cloud = ' '.join(df)

wordcloud = WordCloud(background_color='black', width=1800,
                      height=1400).generate(cloud)
plt.imshow(wordcloud)

##Importing positive words to plot positive word cloud

with open("E:\\Assignment\\11) Text mining\\positive-words.txt", "r") as pos:
    poswords = pos.read().split("\n")

# Drop the first 36 lines of the word list file.
poswords = poswords[36:]

# A set keeps the vocabulary filter linear instead of scanning the list for
# every term.
positive_lookup = set(poswords)
pos_words = ' '.join([w for w in df if w in positive_lookup])

cloud_pos = WordCloud(background_color='black', width=1800,
                      height=1400).generate(pos_words)
plt.imshow(cloud_pos)
Example #49
0
plt.xlabel('Polarity')
plt.ylabel('Rate')
plt.grid(True)
plt.hist(polarity , bins = 5)
plt.axis([-1.00, 1.00, 0, 60])
plt.show()

plt.title('Sub Histogram')
plt.xlabel('Subjectivity')
plt.ylabel('Rate')
plt.grid(True)
plt.hist(subjectivity, bins = 5)
plt.axis([-1.00, 1.00, 0, 60])
plt.show()

plt.scatter(polarity, subjectivity)
plt.show()

# Concatenate the text of every tweet into one comma-separated string.
all_tweets = ', '.join(tweet['text'] for tweet in tweetData)
tb = TextBlob(all_tweets)
#filtered_words = []
print(all_tweets)

# NOTE(review): the `tweet` used inside the generator expression above is
# local to that expression, so this line depends on a `tweet` variable bound
# earlier in the script - confirm it exists, otherwise this is a NameError.
print(str(tweet))

# Build the word cloud from the combined text and display it without axes.
wordcloud = WordCloud().generate(all_tweets)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
#generate_from_text(text)
"""

import os

from os import path
from wordcloud import WordCloud

# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text; the context manager closes the file after loading.
#text = open(path.join(d, 'constitution.txt')).read()
with open(path.join(d, 'garda.csv')) as text_file:
    text = text_file.read()

# Generate a word cloud image
wordcloud = WordCloud().generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('cloud.png')
plt.show()
from wordcloud import WordCloud
import jieba

# Read the raw bytes of the input; the context manager closes the file.
with open("fulian3.txt", "rb") as source:
    text = source.read()
# Segment the text with jieba
wordlist = jieba.cut(text, cut_all=True)
wl = " ".join(wordlist)
# print(wl)  # print the segmented text

# Write the segmented text to a file
# fenciTxt  = open("fenciHou.txt","w+")
# fenciTxt.writelines(wl)
# fenciTxt.close()

# Configure the word cloud
wc = WordCloud(
    background_color="black",  # background colour
    # mask = "图片",  # background image
    max_words=2000,  # maximum number of words shown
    # stopwords = "",  # stop words
    font_path="fangsong_GB2312.ttf",
    # Chinese-capable font so the words render (per the original author, the
    # default "DroidSansMono.ttf" font does not support Chinese)
    max_font_size=50,  # largest font size
    random_state=30,  # seed for the random generator, for a reproducible result
)
myword = wc.generate(wl)  # build the word cloud

# Show the word cloud
plt.imshow(myword)
plt.axis("off")
plt.show()
Example #52
0
# Drop every run of digits from the text.
text = re.sub("[0-9]+", '', text)

logger.info('Words')
print(text)

#####################
# Make Word Cloud
#####################
# Add the project-specific extras to the shared STOPWORDS set, then snapshot it.
extra_stopwords = EXTRA_STOPWORDS
STOPWORDS.update(extra_stopwords)

stopwords = set(STOPWORDS)
wc = WordCloud(
    background_color="white",
    max_words=2000,
    stopwords=stopwords,
    mode="RGBA",
    colormap='BuPu',
)
# Lay the words out from the cleaned text.
wc.generate(text)

# Save the rendered cloud.
wc.to_file(FILENAME)

# Send the saved picture with a caption through the notifier.
n = utils.Notify()
payload = {
    'chat_id': '@whalepoolbtcfeed',
    'message': KEYWORD + ' related google trends for the last 7 days',
    'picture': FILENAME
}
n.telegram(payload)
print('Saved: ' + FILENAME)
Example #53
0
from wordcloud import WordCloud

import matplotlib.pyplot as plt

votes = getVoteData()

# For each vote value: print its 20 most frequent words with their relative
# frequency, then save a word cloud of the speeches in that group.
for name, group in votes.groupby('Vote'):
    print(name)
    speeches = group['Discurso']
    # Build the combined text in one pass; each speech is preceded by a space.
    concatSpeeches = ''.join(
        ' ' + sanitizeString(str(speech)) for speech in speeches)

    words = concatSpeeches.split()
    nWord = len(words)
    mostCommonWords = Counter(words).most_common()
    # Slicing tolerates groups with fewer than 20 distinct words, where
    # indexing 0..19 would raise IndexError.
    for word, count in mostCommonWords[:20]:
        print(word, count / (nWord))

    wordcloud = WordCloud(max_font_size=40,
                          relative_scaling=.5,
                          background_color='white',
                          max_words=50).generate(
                              concatSpeeches.replace('nao',
                                                     '').replace('sim', ''))

    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    # Formatting also accepts non-string group keys.
    plt.savefig(f'Temp/WordCloud_{name}.png')
    plt.close()
Example #54
0
print('불필요한 키워드 제거중...')
# Blank out each unwanted keyword where it appears between two spaces.
for d in rmkeys:
    data = data.replace(' ' + d + ' ', ' ')

# HTML entities, URL prefixes and signatures replaced by a space, in this order.
for fragment in (
        '&gt;',
        '&lt;',
        'https://www.youtube.com/watch?v=',
        'https://youtu.be/',
        '- dc official App',
        '- 라하마갤 와주는데스http://gall.dcinside.com/loudhouse',
        'https://gall.dcinside.com/mgallery/board/view/?id=',
        'https://gall.dcinside.com/board/lists/?id=',
        'https://',
):
    data = data.replace(fragment, ' ')

print('워드클라우드 생성 중...')
wc_title = WordCloud(
    font_path='font.otf',
    width=2000,
    height=1800,
    background_color='white',
    collocations=False,
    max_words=2000,
).generate(data)

print('이미지 저장 중...')
wc_title.to_file('wordcloud.png')

# (word, weight) pairs of the cloud, heaviest first.
hk = sorted(wc_title.words_.items(), key=(lambda x: x[1]), reverse = True)
#hotkey = hk[0][0] + ', ' + hk[1][0] + ', ' + hk[2][0] + ', ' + hk[3][0] + ', ' + hk[4][0]
#print('핵심 키워드:', hotkey)
#pkeys = ''
#for s in hk: pkeys += s[0] + '\n'

# One line per pair, each rendered with str().
keys = ''.join(str(k) + '\n' for k in hk)
Example #55
0
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


matplotlib.rcParams['figure.figsize'] = (16.0, 9.0)
# Load the source text; the context manager closes the file after reading.
with open("spec.txt") as script_file:
    script = script_file.read()
stopwords = set(STOPWORDS)
# Image used as the mask of the word cloud.
bond = np.array(Image.open("images.jpg"))

# Custom colour map built from four hex colours.
from matplotlib.colors import LinearSegmentedColormap as lsc
colors = ["#000000", "#0060A8", "#484848", "#FFF200"]
cmap = lsc.from_list("mycmap", colors)
wc = WordCloud(background_color="white", stopwords=stopwords, mask=bond,
               width=1987, height=787, colormap=cmap)
wc.generate(script)
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the whole CSV as plain text; the context manager closes the file.
with open('debate.csv', 'r') as source:
    text = source.read()
wordcloud = WordCloud(max_font_size=100, width=1520, height=535).generate(text)
# Show the cloud on a 16 x 9 figure without axes.
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
Example #57
0
Counter(words).most_common()
# Treat every word that occurs more than 3 times as a stop word.
stop_words = [word for word, count in Counter(
    words).most_common() if count > 3]

# Recreate document-term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean.transcript)
# get_feature_names() is gone from newer scikit-learn releases; prefer
# get_feature_names_out() and fall back for older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_stop.index = data_clean.index

# Pickle it for later use; the context manager closes the file so the dump is
# flushed to disk deterministically.
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("dtm_stop.pkl")


wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)


plt.rcParams['figure.figsize'] = [16, 6]

names = ['bird-and-whale', 'chick-little', 'goldilocks', 'threepigs',
         'petitchaperonrouge', 'uglyduckling']

# Create subplots for each comedian
# for index, name in enumerate(data.columns):
#     wc.generate(data_clean.transcript[name])
#
#     plt.subplot(3, 4, index+1)
#     plt.imshow(wc, interpolation="bilinear")
#     plt.axis("off")
#     plt.title(names[index])
Example #58
0
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import random
import TKinter as tk
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random dark-grey colour as an ``hsl(...)`` string.

    All arguments are accepted but ignored. The lightness is an integer drawn
    from the global ``random`` module, between 10 and 50 inclusive; hue and
    saturation are always 0.
    """
    lightness = random.randint(10, 50)
    return "hsl(0, 0%, {}%)".format(lightness)

# Read the source text; the context manager closes the file after loading.
with open('/Users/Tristan/Downloads/const.txt') as f:
    text = f.read()
font_path = '/Users/Tristan/books/datasets/Open_Sans_Condensed/' + 'Open_Sans_Condensed/OpenSansCondensed-Light.ttf'

from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
# w = WordCloud()
w = WordCloud(stopwords=stopWords, background_color='white', min_font_size=14, max_words=1000 ,font_path=font_path ,normalize_plurals=True)
wordcloud = w.generate(text)
# Repaint the words in shades of grey.
wordcloud.recolor(color_func=grey_color_func)
# plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('/Users/Tristan/books/datasets/output/wordclouds/cloud2.png')
plt.close()
Example #59
0
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Read the document; the context manager closes the file after loading.
with open('Ev_Scan.txt', 'r') as f:
    document = f.read()
print(len(document))

wordcloud = WordCloud(width=600, height=480,
                      colormap="Oranges_r").generate(document)
# Show the cloud edge to edge: no axes, no margins, no padding.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.tight_layout(pad=0)
plt.show()
Example #60
0
# because the interactive session continues into the self check.

# Loading the Text
from pathlib import Path

text = Path('RomeoAndJuliet.txt').read_text()

# Loading the Mask Image that Specifies the Word Cloud’s Shape
import imageio

mask_image = imageio.imread('mask_heart.png')

# Configuring the WordCloud Object
from wordcloud import WordCloud   

# 1000 x 1000 canvas, 'prism' colour map, white background, shaped by mask_image.
wordcloud = WordCloud(width=1000, height=1000, 
    colormap='prism', mask=mask_image, background_color='white')
   

# Generating the Word Cloud
# The name `wordcloud` is rebound to the result of generate().
wordcloud = wordcloud.generate(text)

# Saving the Word Cloud as an Image File
# NOTE(review): `wordcloud` is rebound again to whatever to_file() returns -
# confirm that value is still the WordCloud before it is reused.
wordcloud = wordcloud.to_file('RomeoAndJulietHeart.png')

%matplotlib

import matplotlib.pyplot as plt

plt.imshow(wordcloud)

# Section 12.3.2 Self Check snippets