def make_word_cloud(product, sentiment):
    if sentiment == "all":
        pos, neg = get_top_five_phrases(product, sentiment)

        pos.index = range(0,len(pos))
        neg.index = range(0,len(neg))

        pos_words_array = []
        neg_words_array = []
        for i in range(0,len(pos)):
            pos_words_array.append((pos["vocab"][i].upper(), float(pos["count"][i])))

        for i in range(0,len(neg)):
            neg_words_array.append((neg["vocab"][i].upper(), float(neg["count"][i])))

        wc = WordCloud(background_color="white", max_words=2000,
               max_font_size=300, random_state=42)

        # generate word cloud for positive
        positive_name = '../app/static/img/pos_wordcloud.png'
        wc.generate_from_frequencies(dict(pos_words_array))  # current wordcloud versions expect a dict
        wc.recolor(color_func=pos_color_func, random_state=3)
        wc.to_file(positive_name)

        # generate word cloud for negative
        negative_name = '../app/static/img/neg_wordcloud.png'
        wc.generate_from_frequencies(dict(neg_words_array))  # current wordcloud versions expect a dict
        wc.recolor(color_func=neg_color_func, random_state=3)
        wc.to_file(negative_name)

        return positive_name, negative_name
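
The helpers get_top_five_phrases, pos_color_func, and neg_color_func are not part of this snippet. A minimal sketch of what the two color functions might look like, following the color_func signature that WordCloud.recolor expects (the palettes here are assumptions):

import random

def pos_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # hypothetical palette: shades of green for positive phrases
    return "hsl(120, 70%%, %d%%)" % random.randint(25, 55)

def neg_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # hypothetical palette: shades of red for negative phrases
    return "hsl(0, 70%%, %d%%)" % random.randint(25, 55)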
Example no. 2
def generate_word_cloud(img_bg_path, top_words_with_freq, font_path, to_save_img_path, background_color='white'):
    # read the background image
    img_bg = imread(img_bg_path)

    # create the word cloud object
    wc = WordCloud(font_path=font_path,  # set the font
                   background_color=background_color,  # background color of the image, white by default
                   max_words=500,  # show at most 500 words
                   mask=img_bg,  # background image mask
                   max_font_size=50,  # maximum font size
                   random_state=30,  # number of random color schemes
                   width=1000,  # image width
                   margin=5,  # margin between words
                   height=700)  # image height

    # generate the cloud contents from top_words_with_freq
    wc.generate_from_frequencies(top_words_with_freq)

    # draw and show the word cloud with matplotlib
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

    # if the background image has distinct colors, the following two lines
    # derive a color function from it and recolor the cloud to match
    #img_bg_colors = ImageColorGenerator(img_bg)
    #plt.imshow(wc.recolor(color_func = img_bg_colors))

    # save the word cloud as an image file
    wc.to_file(to_save_img_path)
Example no. 3
def draw_tag_cloud(users_tokens):
    from PIL import Image
    import numpy as np
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, ImageColorGenerator

    trump_coloring = np.array(Image.open("pics/trump.png"))

    freqs = get_full_frequencies(users_tokens)
    wc = WordCloud(max_words=2000, mask=trump_coloring,
                   max_font_size=40, random_state=42)
    wc.generate_from_frequencies(freqs)  # current wordcloud versions expect a dict, not dict.items()

    image_colors = ImageColorGenerator(trump_coloring)

    # plt.imshow(wc)
    # plt.axis("off")
    #
    # plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors))
    # recolor wordcloud and show
    # we could also give color_func=image_colors directly in the constructor
    # plt.imshow(trump_coloring, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
Example no. 4
def generate_image(words, image):
    graph = np.array(image)
    wc = WordCloud(font_path=os.path.join(CUR_DIR, 'fonts/simhei.ttf'),
                   background_color='white', max_words=MAX_WORDS, mask=graph)
    wc.generate_from_frequencies(words)
    image_color = ImageColorGenerator(graph)
    return wc, image_color
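
A possible usage sketch for the function above, assuming words is a word -> frequency dict, Image and plt are imported as in the neighbouring examples, and 'mask.png' is a placeholder file name:

wc, image_color = generate_image({'python': 10, 'wordcloud': 5}, Image.open('mask.png'))
plt.imshow(wc.recolor(color_func=image_color), interpolation='bilinear')
plt.axis('off')
plt.show()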
Example no. 5
def wcloud(wf, color, save_as=None):
    """Create a word cloud based on word frequencies,
    `wf`, using a color function from `wc_colors.py`

    Parameters
    ----------
    wf : dict
        token -> value mapping (current wordcloud versions require a dict)
    color : function
        from `wc_colors.py`
    save_as : str
        filename

    Returns
    -------
    None
    """
    wc = WordCloud(background_color=None, mode='RGBA',
                   width=2400, height=1600, relative_scaling=0.5,
                   font_path='/Library/Fonts/Futura.ttc')
    wc.generate_from_frequencies(wf)
    plt.figure()
    plt.imshow(wc.recolor(color_func=color, random_state=42))
    plt.axis("off")
    if save_as:
        plt.savefig(save_as, dpi=300, transparent=True)
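
A hedged usage example (the frequencies and the blue color function are assumptions; wc_colors.py is not shown here, and the hard-coded Futura font path presumes macOS):

from wc_colors import blue  # hypothetical color function
wcloud({'alpha': 10, 'beta': 6, 'gamma': 3}, blue, save_as='cloud.png')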
Example no. 6
    def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
        """
        calculate and show hot words of Job Description (JD)
        :param jd_dir:
        :return:
        """
        if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
            print('Error! No valid content in {0}'.format(jd_dir))
            sys.exit(1)  # exit with a non-zero status on error
        else:
            jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

            for k, v in jd_and_dir.items():
                text = "".join(pd.read_excel(v)['详情描述'])
                jieba.analyse.set_stop_words(STOPWORDS_PATH)
                jieba.load_userdict(USER_CORPUS)
                hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

                frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

                print(frequencies)

                # circular mask: 255 (masked out) outside a radius-130 circle centered at (150, 150)
                x, y = np.ogrid[:300, :300]
                mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
                mask = 255 * mask.astype(int)

                wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                      repeat=False,
                                      mask=mask)
                wordcloud.generate_from_frequencies(frequencies)

                import matplotlib.pyplot as plt
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                plt.show()
Example no. 7
def generate_image(files, src_image):
    content = get_content(files)
    graph = np.array(Image.open(src_image))
    wc = WordCloud(font_path=os.path.join(CUR_DIR, 'fonts/simhei.ttf'),
                   background_color='white', max_words=MAX_WORDS, mask=graph)
    words = process_text(content)
    wc.generate_from_frequencies(words)
    image_color = ImageColorGenerator(graph)
    return wc, image_color
Example no. 8
    def create_Cloud(self, data):
        print('creating wordpair graph...')
        self.twitter_mask = np.array(Image.open(path.join(path.dirname(__file__), 'MASK/twitter_mask.png')))
        for word in data:
            wordcloud = WordCloud(font_path=path.join(path.dirname(__file__), 'FONT/CabinSketch-Bold.ttf'), relative_scaling=.5, width=1800, height=1400, stopwords=None, mask=self.twitter_mask)
            wordcloud.generate_from_frequencies(data[word])  # current wordcloud versions expect a dict
            wordcloud.to_file(path.join(path.dirname(__file__), 'WORDPAIRS/' + word + '.png'))

        return
Example no. 9
def create_wordcloud(wordcloud_data):
    mask = imread(MASK_PATH)
    wordcloud = WordCloud(max_words=1000, mask=mask, stopwords=None, margin=10, random_state=1,
                          font_path=FONT_PATH, prefer_horizontal=1.0, width=WORD_CLOUD_WIDTH,
                          height=WORD_CLOUD_HEIGHT, background_color='black', mode='RGBA')
    word_importance_list = [(dct['word'], dct['importance']) for dct in wordcloud_data['words']]
    partisanship_list = [dct['partisanship'] for dct in wordcloud_data['words']]
    # note: the stock WordCloud.generate_from_frequencies takes no extra kwargs;
    # 'word_partisanship' implies a customized/forked wordcloud build
    kwargs = {'word_partisanship': partisanship_list}
    wordcloud.generate_from_frequencies(word_importance_list, **kwargs)
    return wordcloud
Example no. 10
def generateWordCloud():
    words_old = [  #some words to visualize
        { 
            'word': 'this', 
            'size': 55,
            'color': COLOR_RED,
            'font': '\'Indie Flower\', cursive',
            'angle': '45'
        },
        { 
            'word': 'Test', 
            'size': 73,
            'color': COLOR_BLUE,
            'font': '\'Open Sans\', sans-serif',
            'angle': '-30'
        },
        { 
            'word': 'kinDA', 
            'size': 153,
            'color': COLOR_GREEN,
            'font': '\'Indie Flower\', cursive',
            'angle': '-150'
        },
        { 
            'word': 'WERKS', 
            'size': 33,
            'color': COLOR_PURPLE,
            'font': '\'Open Sans\', sans-serif',
            'angle': '90'
        }
    ]

    # Read the whole text.
    words = [('chipotle', 55), ('McDonalds', 15), ('burgerking', 12), ('wendies', 41), ('using', 1), ('font', 2), ('randomize', 1), ('yet', 1), ('HHBs', 1), ('knowledge', 1), ('generator', 1), ('everything', 3), ('implementation', 2), ('simple', 2), ('might', 1), ('pixel', 1), ('real', 1), ('designs', 1), ('good', 1), ('without', 1), ('checking', 1), ('trees', 2), ('famous', 1), ('boxes', 1), ('every', 1), ('optimal', 1), ('front', 1), ('integer', 1), ('bit', 2), ('now', 2), ('easily', 1), ('shape', 1), ('fs', 1), ('stuff', 1), ('found', 1), ('works', 1), ('view', 1), ('right', 1), ('force', 1), ('generation', 3), ('hard', 1), ('back', 1), ('second', 1), ('sure', 1), ('Hopefully', 1), ('portrait', 1), ('best', 1), ('really', 2), ('speed', 1), ('method', 2), ('dataset', 2), ('figuring', 1), ('modify', 1), ('understanding', 1), ('represented', 1), ('come', 1), ('generate', 2), ('last', 2), ('fit', 1), ('Tweak', 1), ('study', 1), ('studied', 1), ('turn', 1), ('place', 2), ('isn', 1), ('uses', 2), ('implement', 1), ('sprites', 1), ('adjustable', 1), ('render', 1), ('color', 2), ('one', 1), ('fashion', 1), ('fake', 1), ('cloud', 5), ('size', 2), ('guess', 1), ('working', 1), ('Separate', 1), ('sake', 1), ('placing', 1), ('brute', 1), ('least', 2), ('insider', 1), ('lot', 1), ('basic', 1), ('prototype', 1), ('start', 1), ('empty', 1), ('sort', 1), ('testing', 1), ('spiral', 1), ('overlapping', 1), ('else', 1), ('controller', 1), ('part', 2), ('somewhat', 1), ('varying', 1), ('MySQL', 1), ('quad', 2), ('copy', 1), ('also', 1), ('bundled', 1), ('word', 9), ('algorithm', 2), ('typography', 1), ('will', 1), ('fll', 1), ('following', 2), ('bet', 1), ('perfecting', 1), ('proved', 1), ('orientation', 2), ('wordle', 1), ('JavaScript', 1), ('collision', 2), ('reads', 1), ('want', 1), ('ready', 1), ('compressing', 1), ('apparently', 1), ('check', 1), ('inefficient', 1), ('preferably', 1), ('end', 2), ('thing', 2), ('efficient', 1), ('make', 3), ('note', 1), ('python', 3), ('need', 3), ('complex', 1), ('instead', 1), ('hierarchical', 1), ('used', 1), ('ft', 1), ('see', 1), ('though', 2), ('moving', 1), ('preliminary', 1), ('data', 1), ('fm', 1), ('Figure', 2), ('database', 1), ('author', 1), ('together', 1), ('think', 1), ('provide', 1), ('definitely', 1), ('time', 1), ('position', 2), ('model', 2), ('D3', 1)]
    
    alice_mask = np.array(Image.open(path.join(d,"alice_mask.png")))
    burrito_mask = np.array(Image.open(path.join(d,"burrito2.png")))

    print alice_mask.shape
    print burrito_mask.shape

    # Generate a word cloud image
    wordcloud = WordCloud(
        background_color="white",
        max_words=1500,
        mask=burrito_mask)
    wordcloud.generate_from_frequencies(dict(words))  # current wordcloud versions expect a dict

    # The pil way (if you don't have matplotlib)
    image = wordcloud.to_image()
    #words = wordcloud.process_text(text)
    #image.show()

    return serveImg(image)
Example no. 11
def makeImage(text):
    alice_mask = np.array(Image.open("alice_mask.png"))

    wc = WordCloud(background_color="white", max_words=1000, mask=alice_mask)
    # generate word cloud ('text' is expected to be a word -> frequency dict)
    wc.generate_from_frequencies(text)

    # show
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()
Example no. 12
def _save_word_cloud_img(frequencies, file_path):
    """
    Save a word cloud image to the given file path.
    Reference: http://amueller.github.io/word_cloud/index.html
    :param frequencies: word/frequency pairs (a dict in current wordcloud versions)
    :param file_path: path of the image file
    """
    # the path to a Japanese font must be configured correctly
    font_path = config.JAPANESE_FONT_PATH
    wc = WordCloud(background_color='white', max_font_size=320, font_path=font_path, width=900, height=500)
    wc.generate_from_frequencies(frequencies)
    wc.to_file(file_path)
Example no. 13
def create_word_cloud(df, mask_file, font_path):

    mask = np.array(Image.open(mask_file))

    wc = WordCloud(relative_scaling=0.5,
                   mask=mask,
                   prefer_horizontal=1.0,
                   background_color='white',
                   font_path=font_path)

    # df rows are assumed to be (word, count) pairs; convert to the dict the API expects
    wc.generate_from_frequencies(dict(df.values))

    return wc
Example no. 14
def word_cloud(dictionary,topic_index,topic_word):
    
    wd = {}
    b_1 = np.argsort(topic_word[topic_index, :])[::-1]
    cloud_word = [str(dictionary[i]) + ' ' for i in b_1]
    for j in b_1:
        wd[str(dictionary[j])] = topic_word[topic_index, j] / np.sum(topic_word[topic_index, :])

    huaji = imread('250px.png')
    wc = WordCloud(width=1920, height=1080, background_color="white")
    wc.generate_from_frequencies(wd)  # current wordcloud versions expect a dict, not dict.items()
    plt.figure()
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
Example no. 15
def test_generate_from_frequencies():
    # test that generate_from_frequencies() takes input argument dicts
    wc = WordCloud(max_words=50)
    words = wc.process_text(THIS)
    result = wc.generate_from_frequencies(words)

    assert_true(isinstance(result, WordCloud))
Example no. 16
def get_comments():
    productURL='https://sclub.jd.com/comment/'\
               'productPageComments.action?'\
               'productId=11461683&score=0&'\
               'sortType=3&page='
    
    pageURL='&pageSize=10&isShadowSku=0&callback=fetchJSON_comment98vv14008'
    
    for i in range(399):
        i=str(i)
        url=productURL+i+pageURL
        print(url)
        html=requests.get(url).content
        time.sleep(0.2)
        with open(r"jd_books.txt","ab") as f:
            f.write(html)
    
    # visualize the collected info
    html = open("jd_books.txt", encoding="utf-8").read()  # read into a string for the regex below
    print(html)

    content = re.findall(r'"content":(.*?),', html)
    content_list=[]
    for i in content:
        if "img" not in i:
            content_list.append(str(i))
    
    # word cloud visualization
    contents = ''.join(content_list)
    contents_rank = jieba.analyse.extract_tags(
        contents, topK=40, withWeight=True)

    key_words = dict()
    for i in contents_rank:
        key_words[i[0]] = i[1]
    print(key_words)

    # visualization
    wc = WordCloud(font_path='/System/Library/Fonts/PingFang.ttc',
                   background_color='White',
                   max_words=50)
    wc.generate_from_frequencies(key_words)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
Example no. 17
def test_generate_from_frequencies():
    # test that generate_from_frequencies() takes input argument of class
    # 'dict_items'
    wc = WordCloud(max_words=50)
    words = wc.process_text(THIS)
    items = words.items()
    result = wc.generate_from_frequencies(items)

    assert_true(isinstance(result, WordCloud))
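
Both test snippets above exercise an older wordcloud API. Current releases of generate_from_frequencies expect a dict (the method calls .items() on its argument), so list or dict_items inputs like those used in several examples on this page need an explicit conversion, e.g.:

from wordcloud import WordCloud

pairs = [('python', 10), ('cloud', 5)]     # (word, weight) tuples
wc = WordCloud(max_words=50)
wc.generate_from_frequencies(dict(pairs))  # convert to a dict before calling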
Example no. 18
def generatewordcloud(freqTable, inputImageFileName, outputImageFileName):
    global stopwordshearing

    ImageFile.LOAD_TRUNCATED_IMAGES = True

    img = Image.open(inputImageFileName)
    img = img.resize((980, 1080), Image.LANCZOS)
    sl = STOPWORDS | stopwordshearing
    speakerArray = np.array(img)
    wc = WordCloud(background_color="white", max_words=1000, mask=speakerArray, stopwords=sl,
                   random_state=42)

    wc.generate_from_frequencies(freqTable)
    # print(wc.words_)
    # create coloring from image
    image_colors = ImageColorGenerator(speakerArray)
    wc.recolor(color_func=image_colors)
    wc.to_file(outputImageFileName)
Example no. 19
def WordCloudTopic(items, imagePath=None):
    # Generate a word cloud image

    if imagePath:
        alice_coloring = np.array(Image.open(imagePath))

        # build the stopword set first; set.add() returns None
        stopwords = STOPWORDS | {"said"}
        wc = WordCloud(background_color="white", max_words=200, mask=alice_coloring,
                       stopwords=stopwords,
                       max_font_size=300)
        # generate word cloud
        wc.generate_from_frequencies(items)
        image_colors = ImageColorGenerator(alice_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
    else:
        wc = WordCloud(background_color="white", max_words=300,
                       max_font_size=40, random_state=42)
        wordcloud = wc.generate_from_frequencies(items)
        plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Example no. 20
def generate_image(files, image_name):
    content = ''
    for f in files:
        content += open(f, 'r', encoding='utf-8').read()  # text mode, so str concatenation works
        content += '\n'
    graph = np.array(Image.open(image_name))
    wc = WordCloud(font_path=os.path.join(CUR_DIR, 'fonts/simhei.ttf'),
                   background_color='white', max_words=MAX_WORDS, mask=graph)
    words = process_text(content)
    print(len(words))
    wc.generate_from_frequencies(words)
    image = ImageColorGenerator(graph)

    plt.imshow(wc)
    plt.axis("off")
    plt.imshow(wc.recolor(color_func=image))
    plt.axis("off")
    plt.figure()
    plt.imshow(graph, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
Example no. 21
def generate_word_cloud(top_words_with_freq, font_path, to_save_img_path, background_color='white'):
    # create the word cloud object
    wc = WordCloud(font_path=font_path,  # set the font
                   background_color=background_color,  # background color, white by default
                   max_words=100,  # show at most 100 words
                   max_font_size=80,  # maximum font size
                   random_state=50,  # number of random color schemes
                   width=500,  # image width
                   margin=2,  # margin between words
                   height=300)  # image height

    # generate the cloud contents from top_words_with_freq
    wc.generate_from_frequencies(top_words_with_freq)

    # draw and show the word cloud with matplotlib
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

    # save the word cloud as an image file
    wc.to_file(to_save_img_path)
Example no. 22
def plotCloud():
    while True:
        try:
            ipt = input('Topic:')
        except (EOFError, KeyboardInterrupt):
            print('invalid input')
            break
        cloud_word_tuple = LDA.get_topic_terms(ipt, topn=50)
        cloud_word = [str(dictionary[i[0]]) + ' ' for i in cloud_word_tuple]
        wd = {}
        for i in cloud_word_tuple:
            wd[str(dictionary[i[0]])] = i[1]
        huaji = imread('250px.jpg')
        wc = WordCloud()
        wc.generate_from_frequencies(wd)  # current wordcloud versions expect a dict
        plt.figure()
        plt.imshow(wc)
        plt.axis('off')
        plt.show()
        if ipt == 'exit()':
            break
Example no. 23
class MyWordCloud(object):
    def __init__(self):
        self.stopwords = {}
        self.seg_list = []
        self.m_wordcloud = None

    def StopWord(self, filename):
        f = open(filename, 'r', encoding='utf-8')
        line = f.readline().rstrip()
        while line:
            self.stopwords[line] = 1
            line = f.readline().rstrip()
        f.close()
        return self.stopwords


    def WordCut(self, stopwords, inputfile):
        with open(inputfile) as f:
            text = f.readlines()
            text = ' '.join(text)

            seg_generator = jieba.cut(text)

            self.seg_list = [i for i in seg_generator if i not in stopwords]
            self.seg_list = [i for i in self.seg_list if i != u' ']
            self.seg_list = ' '.join(self.seg_list)

        return self.seg_list

    def GenWordCloud(self,
                     seg_list=None,
                     font_path=None,
                     background_color="black",
                     margin=5,
                     width=1800, height=800, flag=1):
        self.m_wordcloud = WordCloud(font_path=font_path,
                                     background_color=background_color,
                                     margin=margin,
                                     width=width,
                                     height=height)
        if flag == 0:
            self.m_wordcloud = self.m_wordcloud.generate_from_frequencies(seg_list)
        else:
            self.m_wordcloud = self.m_wordcloud.generate(seg_list)
        return self.m_wordcloud
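
A hypothetical end-to-end usage of MyWordCloud (file names and font are placeholders):

mwc = MyWordCloud()
stops = mwc.StopWord('stopwords.txt')
seg = mwc.WordCut(stops, 'article.txt')
cloud = mwc.GenWordCloud(seg, font_path='simhei.ttf')  # flag=1 (default) uses generate() on the joined text
cloud.to_file('wordcloud_output.png')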
Example no. 24
def save_word_cloud(subreddit, frequencies, stopwords=STOPWORDS):
    try:
        # download images for subreddit
        download_images(['--score', MIN_SCORE, '--num', NUM_PHOTOS, '--sort-type', 'topall', subreddit, subreddit])
        # get a list of downloaded file names
        coloring = []
        for file in ext_files(subreddit, 'jpg') + ext_files(subreddit, 'png'):
            base_file = os.path.basename(file)
            filename = os.path.join(BASE_DIR, subreddit, base_file)
            # get the number of colors in the image and compare
            image = Image.open(filename)
            w, h = get_image_size(filename)
            if w > WIDTH and h > HEIGHT and all_colors(filename) > COLORS and num_colors(filename) >= DOMINANT_COLORS:
                coloring = np.array(image)
                break
        shutil.rmtree(subreddit)
        if not len(coloring):
            # get previews for gifs
            coloring = get_gif_coloring(subreddit)
        if not len(coloring):
            raise Exception('No suitable image found')
        wc = WordCloud(font_path=os.path.join(BASE_DIR, 'fonts', 'Viga-Regular.otf'), background_color="white", width=WIDTH, height=HEIGHT, max_words=500, mask=coloring, min_font_size=18)
        # generate word cloud
        wc.generate_from_frequencies(frequencies)

        # create coloring from image
        image_colors = ImageColorGenerator(coloring)

        # recolor wordcloud and show
        # we could also give color_func=image_colors directly in the constructor
        plt.imshow(wc.recolor(color_func=image_colors))
        plt.axis("off")
        fig = plt.gcf()
        # save wordcloud for subreddit
        fig.savefig('{}.png'.format(subreddit), transparent=True)
        return "generated image for {}".format(subreddit)
    except Exception as e:
        print(str(e))
Example no. 25
def make_word_cloud(content):
    # read the mask image
    d = path.dirname(__file__)
    # alice_mask = np.array(Image.open(path.join(d, "mask/terran.jpg")))
    mask = np.array(Image.open(path.join(d, mask_img)))
    # font__dir = '/var/www/FlaskApp/FlaskApp/word_cloud_min/_fonts/lth.ttf'
    # font__dir = 'C:\Users\zjsep_000\PycharmProjects\myDrone\word_cloud_min\_fonts\lth.ttf'
    # font__dir = '_fonts/lth.ttf'

    wc = WordCloud(background_color="white", max_words=1000, mask=mask)

    # give the absolute dir for font ttf file
    # wc.font_path = 'C:\Users\JI\Documents\GitHub\PycharmProjects\myDrone\word_cloud\_fonts\lth.ttf'
    wc.font_path = abs_font_dir
    # wc.font_path = 'C:\Users\zjsep_000\PycharmProjects\myDrone\word_cloud_min\_fonts\lth.ttf'

    # wc.font_path = '_fonts/lth.ttf'
    # wc.font_path = '/var/www/FlaskApp/FlaskApp/word_cloud_min/_fonts/lth.ttf'
    # brush options: {'shoujin_brush.ttf','Japan_brush.ttf','qingke_fangzheng.ttf','KouzanBrushFont.ttf'}
    # serfi-fonts:[]

    wc.generate_from_frequencies(content)
    # generate word cloud
    # wc.generate(text)


    # store to file
    wc.to_file(path.join(d, "img/output.png"))
    # store to static foder in web server
    # wc.to_file(path.join(d, "../static/output.png"))

    # show
    plt.imshow(wc)
    plt.axis("off")
    plt.figure()
    plt.imshow(mask, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
Example no. 26
def main():
	
	# read the whole text
	reader = csv.reader(open('/Users/kudari/workspaces/final_homework/output/test.csv', 'r'))
	d = {}
	for k, v in reader:
		d[k] = int(v)
	# read the pic ('d' above is the frequency dict, so the absolute path is used directly)
	coloring = np.array(Image.open("/Users/kudari/workspaces/final_homework/pic2.jpg"))

	# set stopwords
	stopwords = set(STOPWORDS)

	# gerenate a wordcloud image
	# create a wordcloud
	wc = WordCloud(
		max_font_size = 88, 
		background_color = 'white',
		font_path = "/Users/kudari/workspaces/final_homework/Fins-Regular.otf",
		width = 1000,
		height = 860,
		# set the word cloud shape
		mask = coloring,
		stopwords = stopwords,
		max_words = 500,
		)

	# generate the cloud
	wc.generate_from_frequencies(frequencies = d)

	# create coloring from image
	image_colors = ImageColorGenerator(coloring)

	# show
	plt.imshow(wc.recolor(color_func = image_colors), interpolation="bilinear")
	plt.axis("off")
	plt.show()
	wc.to_file("/Users/kudari/workspaces/final_homework/test2.png")
Example no. 27
def plot_wordcloud_with_property(topicWeightedWords, topicsByProperty):
    figure(figsize=(16, 40))
    for idx, topic in enumerate(topicWeightedWords):
        wc = WordCloud(background_color="white")
        img = wc.generate_from_frequencies(
            {word: weight for weight, word in topic})  # current wordcloud versions expect a dict
        subplot(len(topicWeightedWords), 2, 2 * idx + 1)
        imshow(img)
        axis('off')

        subplot(len(topicWeightedWords), 2, 2 * idx + 2)
        plot(topicsByProperty[:, idx])
        axis([10, 100, 0, 1.0])
        title('Topic #%2d' % (idx))
Example no. 28
def phrase2pic(phrase_file, out_png, font_path, mask_file):
  phrase_dict = txt2dict(phrase_file)
  pic_address = path.abspath(mask_file)
  pic = imread(pic_address)  # read the mask image
  pic_color = ImageColorGenerator(pic)  # build a color function from the image
  wc = WordCloud(background_color='white',  # construct the WordCloud object
    mask=pic,
    width=750,
    height=750,
    max_font_size=80,
    random_state=30,
    font_path=font_path,
    max_words=500,
    min_font_size=2,
    color_func=pic_color
  )
  wc.generate_from_frequencies(phrase_dict)
  # wc.generate(new_textlist)    # generate the cloud from raw text instead
  plt.figure()  # plot
  plt.imshow(wc)
  plt.axis("off")
  plt.show()
  wc.to_file(out_png)  # save the image
Example no. 29
def colorWordCould(e_dist):
    from PIL import Image
    import numpy as np
    bcimg = Image.open('D:/PS素材教程/漫威/mmexport148828636136611.png')
    bd = np.array(bcimg)
    wcld = WordCloud(background_color='white',  # background color
                     mask=bd,
                     max_words=2000,  # maximum number of words shown
                     max_font_size=50,  # maximum font size
                     random_state=30,  # number of random color schemes
                     )
    wordc = wcld.generate_from_frequencies(e_dist)
    ims = wordc.to_image()
    ims.show()
Example no. 30
    def create_wordcloud(self, filename=None):
        '''
        create a wordcloud of the top words in a cluster
        '''
        plt.figure()
        for idx, topic in enumerate(self.topic_weights):
            wc = WordCloud(background_color="white")
            ww = {word: weight for word, weight in topic.items()}
            img = wc.generate_from_frequencies(ww)
            plt.subplot(len(self.topic_weights), 2, 2 * idx + 1)
            plt.axis('off')
            plt.imshow(img)
        if filename is None:
            plt.show()
        else:
            plt.savefig(filename, dpi=300)
        plt.close()
Example no. 31
wc = WordCloud(width=1000,
               height=1000,
               background_color="black",
               max_words=100,
               mask=sword_mask)
wc.generate(words_no_characters)
wc.recolor(color_func=grey_color_func, random_state=3)
wc.to_file("word_cloud/no_characters_wc_sword2.png")

wc = WordCloud(width=1000,
               height=1000,
               background_color="black",
               max_words=100,
               mask=sword_mask)
wc.generate_from_frequencies(word_could_dict)
wc.recolor(color_func=grey_color_func, random_state=3)
wc.to_file("word_cloud/characters_wc_sword2.png")

wc = WordCloud(width=1000,
               height=1000,
               background_color="white",
               max_words=100,
               mask=throne_mask)
wc.generate(words)
wc.to_file("word_cloud/words_wc_throne.png")

wc = WordCloud(width=1000,
               height=1000,
               background_color="white",
               max_words=100,
Example no. 32
#-*- coding:utf-8 -*-
from scipy.misc import imread
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
from collections import Counter

files = open('yanjiang.txt', encoding='utf-8', errors='ignore').read()
text_jieba = list(jieba.cut(files))
c = Counter(text_jieba)
common_c = c.most_common(100)
bg_pic = imread('b.png')
wc = WordCloud(font_path='1.4.ttf', background_color='red', width=1000, height=800,
               mask=bg_pic, max_words=2000, max_font_size=1000)
wc.generate_from_frequencies(dict(common_c))
# generate and show the image
plt.figure()
plt.imshow(wc)
plt.axis('off')
plt.show()
# save the image
wc.to_file('anne.jpg')
Example no. 33
# -*- coding: utf-8 -*-
"""
Demo: generate a word cloud from a tuple of (word, frequency) pairs
with generate_from_frequencies
"""

# import the wordcloud and matplotlib modules
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# a set of word/frequency pairs
text_dict = (('you', 3000), ('love', 3500), ('I', 3000), ('Li', 2800),
             ('Phone', 3000), ('my', 300), ('mu~', 2000), ('mu~~', 510),
             ('heart', 500), ('sweet', 180))
# generate the word cloud
wc = WordCloud()
wordcloud = wc.generate_from_frequencies(frequencies=dict(text_dict))  # the API expects a dict
# show the word cloud image
plt.figure(num="demo", figsize=(5, 6), dpi=500, facecolor='w', edgecolor='w')
plt.axis('off')
plt.imshow(wordcloud)
plt.show()
# save the image
wc.to_file('outputFiles/demo1_output.jpg')
plt.close()
Example no. 34
        if other_drugs[i] in filt_absList[j]:
            freq = FreqDist(filt_absList[j])
            other_freq_df = pd.DataFrame(list(freq.items()),
                                         columns=["Word", "Frequency"])

# sort in order of frequency, most common first
app_freq_df.sort_values(by=['Frequency'], ascending=False, inplace=True)
other_freq_df.sort_values(by=['Frequency'], ascending=False, inplace=True)

## -------- Wordcloud of word frequency ---------------------
d = {}
for a, x in app_freq_df.values:
    d[a] = x

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear", cmap='RdBu')
plt.axis("off")
plt.title('approved drugs')
plt.show()

d = {}
for a, x in other_freq_df.values:
    d[a] = x

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=d)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
Example no. 35
        occurence.append([k, v])
    occurence.sort(key=lambda x: x[1], reverse=True)
    return occurence, keyCount


occurence, dum = wordCounter(movies_ds, 'genres', genreNames)

# Create the dictionary to produce a wordcloud of the movie genres
genres = dict()
trunc_occurences = occurence[0:18]
for name in trunc_occurences:
    genres[name[0]] = name[1]

# Create and display the wordcloud
genre_wordcloud = WordCloud(width=1000, height=400, background_color='white')
genre_wordcloud.generate_from_frequencies(genres)
f, ax = plot.subplots(figsize=(16, 8))
plot.imshow(genre_wordcloud, interpolation="bilinear")
plot.axis('off')
plot.show()

# Break up the big genre string into a string array
movies_ds['genres'] = movies_ds['genres'].str.split('|')
# Convert genres to string value
movies_ds['genres'] = movies_ds['genres'].fillna("").astype('str')

tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0,
                     stop_words='english')
tfidf_matrix = tf.fit_transform(movies_ds['genres'])
Example no. 36
adv_uni_dict = {}
for i in range(len(sorted_adv_unigrams)):
    adv_uni_dict[''.join(sorted_adv_unigrams[i][0])] = sorted_adv_unigrams[i][1]

adv_bi_dict = {}
for i in range(len(sorted_adv_bigrams)):
    adv_bi_dict[''.join(sorted_adv_bigrams[i][0])] = sorted_adv_bigrams[i][1]

#instantiate wordcloud object into variable

#to display max of highest 30 words, image height, width and background color to display
wordCloud = WordCloud(max_words=30, height=1000, width=1500, background_color='white')

#generate the word cloud and store it as image files in current project location
poswc_unigrams = wordCloud.generate_from_frequencies(pos_uni_dict)
poswc_unigrams.to_file('poswc_unigrams.png')
conwc_unigrams = wordCloud.generate_from_frequencies(con_uni_dict)
conwc_unigrams.to_file('conwc_unigrams.png')
advwc_unigrams = wordCloud.generate_from_frequencies(adv_uni_dict)
advwc_unigrams.to_file('advwc_unigrams.png')
poswc_bigrams = wordCloud.generate_from_frequencies(pos_bi_dict)
poswc_bigrams.to_file('poswc_bigrams.png')
conwc_bigrams = wordCloud.generate_from_frequencies(con_bi_dict)
conwc_bigrams.to_file('conwc_bigrams.png')
advwc_bigrams = wordCloud.generate_from_frequencies(adv_bi_dict)
advwc_bigrams.to_file('advwc_bigrams.png')

#to view in console using matplotlib
# plt.title('Pro Unigrams words')
# plt.imshow(poswc_unigrams, interpolation='bilinear')
Example no. 37
c = Counter(text_jieba)  # count word occurrences
word = c.most_common(500)  # keep the top 500

bg_pic = imread('src.jpg')

wc = WordCloud(
    #font_path='C:\Windows\Fonts\微软雅黑.TTF',  # set a Chinese font
    background_color='white',  # background color
    max_words=2000,  # maximum number of words shown
    mask=bg_pic,  # background image mask
    max_font_size=200,  # maximum font size
    random_state=20  # number of random color schemes
)
wc.generate_from_frequencies(dict(word))  # generate the cloud
print(dict(word))

wc.to_file('result.jpg')

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(bg_pic, cmap=plt.cm.gray)
plt.axis("off")
Example no. 38
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('./dat/news_word_top.csv')
df.set_index('word', inplace=True)
freq_dict = df.to_dict()['count']

font_path = './font/NotoSansKR-Regular.otf'

wc = WordCloud(background_color='white',
               max_words=1000,
               font_path=font_path,
               width=1920,
               height=1080)
wc.generate_from_frequencies(freq_dict)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
Example no. 39
            f.write(content_str)

#------------------------------------------------------------
# 4) Morphological analysis of the collected results
#------------------------------------------------------------
# extract only the nouns from the collected news bodies
nlp = Okt()
nouns = nlp.nouns(news_content)
count = Counter(nouns)  # count the frequency of each noun
most = count.most_common(100)  # take the 100 most frequent words

# reshape the results into the format the word cloud expects
# --> {"word": frequency, "word": frequency ...}
tags = {}
for n, c in most:
    if len(n) > 1:
        tags[n] = c

#------------------------------------------------------------
# 5) Build the word cloud from the collected results
#------------------------------------------------------------
# create the word cloud object
wc = WordCloud(font_path="NanumGothic",
               max_font_size=200,
               width=1200,
               height=800,
               background_color='#ffffff')

wc.generate_from_frequencies(tags)  # generate from the prepared dictionary
wc.to_file("news_%s.png" % datetime)  # save the word cloud image
Example no. 40
        lyric += f.read()

result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)

keywords = dict()
for i in result:
    keywords[i[0]] = i[1]
print(keywords)

image = Image.open('Mrs.jpeg')
graph = np.array(image)
wc = WordCloud(font_path='DroidSansFallback.ttf',
               background_color='White',
               max_words=50,
               mask=graph)
wc.generate_from_frequencies(keywords)
image_color = ImageColorGenerator(graph)
plt.imshow(wc.recolor(color_func=image_color))
plt.axis("off")
plt.show()
wc.to_file('output.png')

X = []
Y = []

for key in keywords:
    X.append(key)
    Y.append(keywords[key])

num = len(X)
Example no. 41
text = konlpy.utils.read_txt('이용수할머니기자회견문.txt', encoding=u'utf-8')
nouns = okt.nouns(text)

words = []
for i in nouns:
    if len(i) > 1:
        words.append(i)

count = Counter(words)

most = count.most_common(100)

tags = {}
for i, j in most:
    tags[i] = j

wc = WordCloud(font_path='NANUMSQUARE.TTF',
               width=1200,
               height=1200,
               scale=2.0,
               max_font_size=250)

gen = wc.generate_from_frequencies(tags)

plt.figure()
plt.imshow(gen, interpolation='bilinear')
wc.to_file('mh2.png')

plt.close()
Example no. 42
    ' '.join(data_es_it[(data_es_it.user_location == candidate)].text.tolist())
    for candidate in text_cloud
]

cv = CountVectorizer(stop_words=spanish_stopwords, ngram_range=(1, 3))
X = cv.fit_transform(corpus_es)
X = X.toarray()
bow = pd.DataFrame(X, columns=cv.get_feature_names())
bow.index = text_cloud

text_es = bow.loc['spain'].sort_values(ascending=False)[:4000]
text2_dict = bow.loc['spain'].sort_values(ascending=False).to_dict()

# create the WordCloud object
wordcloud = WordCloud(min_word_length=3, background_color='white')
wordcloud.generate_from_frequencies(text2_dict)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# create a dictionary for Italian
text_cloud_it = data_es_it.user_location.unique()
corpus_it = [
    ' '.join(data_es_it[(data_es_it.user_location == candidate)].text.tolist())
    for candidate in text_cloud_it  # iterate over the Italian locations
]

# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words=italian_stopwords, ngram_range=(1, 3))
Example no. 43
han.pos(s_park[0])
han.nouns(s_park[0])

# step4: plotting
# word cloud for Park
s_park_noun = extract_nouns(s_park)
count = Counter(s_park_noun)
tags = count.most_common(100)
# WordCloud, matplotlib: draw the word cloud
font_path = "C:/WINDOWS/Fonts/NANUMGOTHIC.TTF"
wc = WordCloud(font_path=font_path,
               background_color='white',
               width=800,
               height=600)
cloud = wc.generate_from_frequencies(dict(tags))
plt.figure(figsize=(10, 8))
plt.axis('off')
plt.imshow(cloud)
plt.savefig('park.png', dpi=600)
plt.show()

# word cloud for Moon
s_moon_noun = extract_nouns(s_moon)
count = Counter(s_moon_noun)
tags = count.most_common(100)
# WordCloud, matplotlib: draw the word cloud
font_path = "C:/WINDOWS/Fonts/NANUMGOTHIC.TTF"
wc = WordCloud(font_path=font_path,
               background_color='white',
               width=800,
Example no. 44
def get_chapter_cloud(chapters, chapter):
    chapter_cloud_data = []
    unique_words = set(chapter)
    for word in unique_words:        
        weight = utility.tf_idf(word, chapters, chapter)
        chapter_cloud_data.append((word, int(weight * 100)))
    weight = lambda element: element[1]
    chapter_cloud_data.sort(key=weight, reverse=True)
    return chapter_cloud_data


if __name__ == '__main__':
    book = utility.get_text_file_as_list('shrek.txt')
    chapters = utility.split_by_delimiter(book, "#" * 10)
    preprocessed_chapters = [preprocess_text(chapter) for chapter in chapters]
    cloud_data = prepare_word_cloud_data(preprocessed_chapters)
    
    for i, data in enumerate(cloud_data): 
        wc = WordCloud(background_color="white", max_words=2000, contour_width=3, contour_color='steelblue')
        wc.generate_from_frequencies(dict(data[5:]))
        wc.to_file(f'clouds/shrek_cloud{i}.png')

    # subexercise 5
    preprocessed_book = preprocess_text(book)
    cloud = get_chapter_cloud(preprocessed_book, preprocessed_book)
    wc = WordCloud(background_color="white", max_words=2000, contour_width=3, contour_color='steelblue')
    wc.generate_from_frequencies(dict(cloud[15:]))
    wc.to_file('clouds/book_tf_idf.png')
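
utility.tf_idf is not shown here; a minimal sketch consistent with how it is called above, where each chapter is a list of tokens (the names and formula details are assumptions):

import math

def tf_idf(word, documents, document):
    tf = document.count(word) / len(document)        # term frequency in this chapter
    df = sum(1 for doc in documents if word in doc)  # chapters containing the word
    idf = math.log(len(documents) / df)              # df > 0, since 'word' comes from 'document'
    return tf * idf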

Example no. 45
    plt.xticks(np.arange(0, (2*top_features)), feature_names[top_coefficients], rotation = 60, ha = "right")
    plt.show()
plot_coefficients()


###Create Word Cloud#######
mystopwords = set(stopwords.words('english'))
# Read a text file and calculate frequency of words in it
with open("C:/Users/becky/Desktop/deceptions.txt", "r") as f:
    words = f.read().split()
data = dict()
for word in words:
    word = word.lower()
    if word in mystopwords:
        continue
    data[word] = data.get(word, 0) + 1
word_cloud = WordCloud(
    background_color='white',  # assumed values: background_color/width/height were undefined in this snippet
    width=1000,
    height=800,
    collocations=False,
    stopwords=mystopwords
)
word_cloud.generate_from_frequencies(data)
word_cloud.to_file('image7.png')





Example no. 46
def plot_word_cloud_single(output_dir, grades, index,
                           color=None):
    import seaborn as sns

    if color is not None:
        colormap = sns.dark_palette(color, as_cmap=True)
    else:
        colormap = None

    # Scrap non-interesting contrasts
    contrasts = list(filter(
        lambda x: 'effects_of_interest' not in x and 'gauthier' not in x,
        grades))[:15]
    frequencies_cat = defaultdict(lambda: 0.)
    frequencies_single = defaultdict(lambda: 0.)
    occurences = defaultdict(lambda: 0.)
    for contrast in contrasts:
        grade = grades[contrast]
        study, contrast = contrast.split('::')
        contrast = contrast.replace('_', ' ').replace('&', ' ').replace('-',
                                                                        ' ')
        terms = contrast.split(' ')
        cat_terms = []
        for term in terms:
            if term == 'baseline':
                break
            if term in ['vs']:
                break
            cat_terms.append(term)
        for term in cat_terms:
            frequencies_single[term] += grade
            occurences[term] += 1
        cat_terms = ' '.join(cat_terms)
        frequencies_cat[cat_terms] += grade

    frequencies_single = {term: freq / math.sqrt(occurences[term]) for term, freq
                          in frequencies_single.items()}
    width, height = (900, 450)
    wc = WordCloud(prefer_horizontal=1,
                   background_color="rgba(255, 255, 255, 0)",
                   width=width, height=height,
                   mode='RGBA',  # RGBA mode so the transparent background takes effect
                   colormap=colormap,
                   relative_scaling=0.7)
    wc.generate_from_frequencies(frequencies=frequencies_single)
    wc.to_file(join(output_dir, 'wc_single_%i.png' % index))

    width, height = (900, 300)

    wc = WordCloud(prefer_horizontal=1,
                   background_color="rgba(255, 255, 255, 0)",
                   width=width, height=height,
                   mode='RGBA',
                   colormap=colormap,
                   relative_scaling=0.8)
    wc.generate_from_frequencies(frequencies=frequencies_cat)
    wc.to_file(join(output_dir, 'wc_cat_%i.png' % index))

    width, height = (1200, 300)

    wc = WordCloud(prefer_horizontal=1,
                   background_color="rgba(255, 255, 255, 0)",
                   width=width, height=height,
                   mode='RGBA',
                   colormap=colormap,
                   relative_scaling=0.8)
    wc.generate_from_frequencies(frequencies=frequencies_cat)
    wc.to_file(join(output_dir, 'wc_cat_%i_wider.png' % index))
Example no. 47
def display(input_dict, display_type='first_appear'):
    if display_type == 'first_appear':
        input_dict = dict(sorted(input_dict.items(), key=lambda x: x[1][0]))
        print('Appearance Sequence: ')
        seq = pd.Series(key for key in input_dict.keys())
        print(seq)
    elif display_type == 'print_name_dict':
        global file_line_count
        if len(input_dict) >= 1:
            input_dict = dict(sorted(input_dict.items(), key=lambda x: x[0]))
            print("Characters and their information: ")
            name = []
            first_appearance = []
            mentioned = []
            for key, value in input_dict.items():
                name.append(key)
                first_appearance.append(
                    round(value[0] / file_line_count * 100, 2))
                mentioned.append(len(value))
            df_out = pd.DataFrame()
            df_out['Name'] = name
            df_out['FirstMentioned(%)'] = first_appearance
            df_out['Mentioned'] = mentioned
            print(df_out)
    elif display_type == 'print_family_name_dict':
        print("Family Name Dictionary:")
        for family_name, info in input_dict.items():
            mat = "{:25}\t{:5}\t{:}"
            family_member = [member[0] for member in info[1]]
            print(
                mat.format(family_name, str(info[0]),
                           " / ".join(family_member)))
    elif display_type == 'print_appearance_dict(after projection)':
        for line, appear_member in input_dict.items():
            if len(appear_member):
                mat = "{:<5}: {:}"
                print(mat.format(line, " / ".join(appear_member)))
    elif display_type == 'characters_word_cloud':
        wc = WordCloud(background_color='white')
        wc.generate_from_frequencies(input_dict)
        plt.figure()
        plt.imshow(wc)
        plt.axis('off')
        plt.show()
    elif display_type == 'characters_relation_graph(csv generate)':
        characters_relation_extract(input_dict)
        with open(save_csv_path + file_name[0:file_name.find(".")] +
                  '_node.csv',
                  'w',
                  encoding='utf-8') as file:
            file.write('id,label,weight\n')
            for name, freq in freq_dict.items():
                file.write(name + ',' + name + ',' + str(freq) + '\n')
        with open(save_csv_path + file_name[0:file_name.find(".")] +
                  '_edge.csv',
                  'w',
                  encoding='utf-8') as file:
            file.write('source,target,weight\n')
            for name, freq in relation_score_dict.items():
                file.write(name[0] + ',' + name[1] + ',' + str(freq) + '\n')
    elif display_type == 'train_relation_classifier':
        if build_relation_record_dict:
            if generate_graph:
                lengths = []
                scores = []
                for length in range(20, len(relation_record_dict), 10):
                    lengths.append(length)
                    score = relation_predict.train(relation_record_dict,
                                                   file_name_prefix,
                                                   name_dict.keys(), length,
                                                   True)
                    scores.append(score)
                pylab.plot(lengths, scores, '-bo')
                pylab.title(
                    'Related Word Freq Classifier Performance with Varying Relation Set Size'
                )
                pylab.xlabel('Relation Set Size')
                pylab.ylabel('Accuracy(Cross Validation)')
                pylab.show()
                with open(save_csv_path + file_name[0:file_name.find(".")] +
                          '_scores.csv',
                          'w',
                          encoding='utf-8') as file:
                    file.write('feature set size,score\n')
                    for i in range(len(lengths)):
                        file.write(
                            str(lengths[i]) + ',' + str(scores[i]) + '\n')
                    file.close()
            else:
                res = input(
                    'Load an existing feature set (saves about a minute)? [y/n]'
                )
                res = (res == 'y')
                relation_predict.train(relation_record_dict, file_name_prefix,
                                       name_dict.keys(), best_length, res)
        else:
            print("[WARNING] Need to build relation record dict first")
Example no. 48
def mk_wordcloud(words, filename_out, strings_exclude):
    """
    words : string
    filename_out : string
    strings_exlucde list : ['xxx', 'yyy'] # If you want to remove any particular word form text which does not contribute much in meaning
    """
    WNL = nltk.WordNetLemmatizer()
    text = words
    # Lowercase and tokenize
    text = text.lower()
    # Remove single quote early since it causes problems with the tokenizer.
    text = text.replace("'", "")
    # Remove numbers from text
    # remove_digits = str.maketrans('', '', digits)
    # text = text.translate(remove_digits)
    tokens = nltk.word_tokenize(text)
    text1 = nltk.Text(tokens)

    # Remove extra chars and remove stop words.
    text_content = [
        "".join(re.split("[ .,;:!?‘’``''@#$%^_&*()<>{}~\n\t\\\-]", word))
        for word in text1
    ]

    # set the stopwords list
    stopwords_wc = set(STOPWORDS)

    customised_words = strings_exclude

    new_stopwords = stopwords_wc.union(customised_words)
    text_content = [word for word in text_content if word not in new_stopwords]

    # After the punctuation above is removed it still leaves empty entries in the list.
    text_content = [s for s in text_content if len(s) != 0]

    # Best to get the lemmas of each word to reduce the number of similar words
    text_content = [WNL.lemmatize(t) for t in text_content]

    nltk_tokens = nltk.word_tokenize(text)
    bigrams_list = list(nltk.bigrams(text_content))
    # print(bigrams_list)
    dictionary2 = [" ".join(tup) for tup in bigrams_list]
    # print (dictionary2)

    # Using count vectoriser to view the frequency of bigrams
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    bag_of_words = vectorizer.fit_transform(dictionary2)
    vectorizer.vocabulary_
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    # print (words_freq[:100])

    # Generating wordcloud and saving as jpg image
    words_dict = dict(words_freq)
    WC_height = 1000
    WC_width = 1500
    WC_max_words = 200
    wordCloud = WordCloud(
        max_words=WC_max_words,
        height=WC_height,
        width=WC_width,
        stopwords=new_stopwords,
    )
    wordCloud.generate_from_frequencies(words_dict)
    plt.title(
        "Most frequently occurring bigrams connected by same colour and font size"
    )
    plt.imshow(wordCloud, interpolation="bilinear")
    plt.axis("off")
    # plt.show()
    return wordCloud.to_file(filename_out)
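
A hedged usage example for mk_wordcloud (the file names and exclusion words are placeholders):

with open('reviews.txt', encoding='utf-8') as f:
    text = f.read()
mk_wordcloud(text, 'bigram_cloud.jpg', ['product', 'company'])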
Example no. 49
def main(city, keyword, region, pages):
    '''
    Main function
    '''
    csv_filename = 'zl_' + city + '_' + keyword + '.csv'
    txt_filename = 'zl_' + city + '_' + keyword + '.txt'
    headers = [
        'job', 'years', 'education', 'salary', 'company', 'scale', 'job_url'
    ]
    salaries = []

    write_csv_headers(csv_filename, headers)
    for i in range(pages):
        '''
        Fetch all job postings on this page and write them to the csv file
        '''
        job_dict = {}
        html = get_one_page(city, keyword, region, i)
        items = parse_one_page(html)
        for item in items:
            html = get_detail_page(item.get('job_url'))
            job_detail = get_job_detail(html)

            job_dict['job'] = item.get('job')
            job_dict['years'] = job_detail.get('years')
            job_dict['education'] = job_detail.get('education')
            job_dict['salary'] = item.get('salary')
            job_dict['company'] = item.get('company')
            job_dict['scale'] = job_detail.get('scale')
            job_dict['job_url'] = item.get('job_url')

            # clean the data: drop punctuation and anything else that would skew the word counts
            pattern = re.compile(r'[一-龥]+')  # keep only CJK characters
            filterdata = re.findall(pattern, job_detail.get('requirement'))
            write_txt_file(txt_filename, ''.join(filterdata))
            write_csv_rows(csv_filename, headers, job_dict)

    sal = read_csv_column(csv_filename, 3)
    # skip the first item and convert to int, building a new list
    for i in range(len(sal) - 1):
        # a salary of '0' means the posting said 'negotiable'; exclude it from the stats
        if not sal[i] == '0':
            salaries.append(int(sal[i + 1]))

    plt.hist(
        salaries,
        bins=10,
    )
    plt.show()

    content = read_txt_file(txt_filename)
    segment = jieba.lcut(content)
    words_df = pd.DataFrame({'segment': segment})

    stopwords = pd.read_csv("stopwords.txt",
                            index_col=False,
                            quoting=3,
                            sep=" ",
                            names=['stopword'],
                            encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    words_stat = words_df.groupby(by=['segment'])['segment'].agg(
        {"计数": numpy.size})  # '计数' = count
    words_stat = words_stat.reset_index().sort_values(by=["计数"],
                                                      ascending=False)

    # word cloud settings
    color_mask = imread('background.jfif')
    wordcloud = WordCloud(
        font_path="simhei.ttf",  # a font that can render Chinese
        background_color="white",  # background color
        max_words=100,  # maximum number of words shown
        mask=color_mask,  # background image mask
        max_font_size=100,  # maximum font size
        random_state=42,
        width=1000,
        height=860,
        margin=2,
        # width/height set the default image size; with a mask image the saved
        # image follows the mask's dimensions; margin is the spacing around words
    )

    # generate the cloud: generate() takes raw text, but since the word counts
    # are already computed we use generate_from_frequencies
    word_frequence = {x[0]: x[1] for x in words_stat.head(100).values}
    wordcloud.generate_from_frequencies(word_frequence)
    # derive color values from the background image
    image_colors = ImageColorGenerator(color_mask)
    # recolor accordingly
    wordcloud.recolor(color_func=image_colors)
    # save the image
    wordcloud.to_file('output.png')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Example no. 50
def print_word_cloud(freqs):
    wc = WordCloud(background_color='white', width=1000, height=800)
    wc.generate_from_frequencies(freqs)  # expects a word -> frequency dict
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
Example no. 51
def WordCloud_Color(pd_csv, video_id):
    pwd = os.path.dirname(__file__)
    jieba.set_dictionary(pwd + '/dict.txt.big_new.txt')
    # also load our custom dictionary built from Wikipedia (self_define_dict.txt),
    # so proper nouns such as 蔡英文 and 韓國瑜 are segmented correctly
    # (remember to save this custom dictionary with utf-8 encoding)
    jieba.load_userdict(pwd + '/self_define_dict.txt')
    # read stopwords.txt and build the stopwords dictionary
    stopwords = {}
    with open(pwd + r'/test_dict_stop.txt', 'r', encoding='UTF-8') as file:
        for st_word in file.readlines():
            st_word = st_word.strip()  # strip() removes leading/trailing whitespace
            stopwords[st_word] = 1
    FilePath = pwd
    ImgPath = pwd + r'/static/images'
    wd_dict = {}
    # master news file
    # some Chinese characters (e.g. 堃 in 游錫堃) cannot be decoded with Python's
    # defaults, so encoding='utf-8-sig' is used when reading
    for j, content in enumerate(pd_csv['clean_con']):
        # replace this script fragment embedded in udn article bodies with an empty string
        content = content.replace(
            '''domready(function() {if ( !countDownStatus ) getCountdown();if ( !highChartsStatus ) getHighcharts();});domready(function() {var channelId = 2;var actArea = "poll_at_story_0_v773";var actCode = "v773";var actTemplate = "bar2";var elemDiv = document.createElement('div');elemDiv.id = actArea;elemDiv.className ='vote_body area';var scr = document.getElementById(actArea+'_script');scr.parentNode.insertBefore(elemDiv, scr);$.getScript('/funcap/actIndex.jsp?actCode=' + actCode + '&channelId=' + channelId , function() {actTemplate = eval('objAct_' + actCode + '.d1.actTemplate');$.ajaxSetup({ cache: true });$.getScript('/func/js/' + actTemplate + '_min.js?2019122401', function() {$.ajaxSetup({ cache: false });piepkg();loadTemplateJs(actTemplate);eval(actTemplate + 'view("' + '#' + actArea + '");')})});});''',
            '')
        content = content.strip('').strip('\n').strip('')  # strip blanks and line breaks around the article
        seg_con_list = jieba.cut(content)
        # use the stopwords to clean up the tokens produced by jieba
        for wd in seg_con_list:
            wd = wd.strip('')
            if is_alphabet(wd) != True:
                if stopwords.get(wd) == None and len(wd) > 1:
                    if wd_dict.get(wd) == None:  # count tokens: add unseen words to the dict
                        wd_dict[wd] = 1
                    else:  # and increment the count of words already seen
                        wd_dict[wd] += 1
            # when an article is done, move on to the next one

    print("影片ID:{}".format(video_id))
    # === deal with similarity_dict ===
    fw = open(pwd + r'/similarity_dict.txt', 'r', encoding='utf-8-sig')
    sy_list = []
    while True:
        line = fw.readline()
        b = line.strip('\n').strip(' ')
        a = b.split(',')
        sy_list.append(a)
        if not line:
            break
    fw.close()
    sy_list.pop()  # drop the trailing empty list
    ncount = 0
    for n, syn in enumerate(sy_list):
        for i in range(len(syn)):
            ncount += wd_dict.get(syn[i], 0)
            if wd_dict.get(syn[i]) != None:
                del wd_dict[syn[i]]
        wd_dict[syn[0]] = ncount
        ncount = 0
    print(wd_dict)

    # del wd_dict['不拘']
    # ===== generate the word cloud ======
    def random_color_func(word=None,
                          font_size=None,
                          position=None,
                          orientation=None,
                          font_path=None,
                          random_state=None):
        h = randint(0, 240)
        # s = int(100.0 * 255.0 / 255.0)
        s = randint(70, 100)
        l = int(100.0 * float(randint(60, 120)) / 255.0)
        return "hsl({}, {}%, {}%)".format(h, s, l)

    ### HSL color reference: http://csscoke.com/2015/01/01/rgb-hsl-hex/ ###
    # the font file was taken from the Windows fonts folder; just place it
    # in the same working directory as the program
    font = pwd + '/NotoSansCJKtc-Black.otf'
    # wordcloud = WordCloud(background_color='white', font_path=font, scale=5)
    wordcloud = WordCloud(background_color='white',
                          font_path=font,
                          max_font_size=50,
                          min_font_size=10,
                          scale=10,
                          max_words=500)
    # build the cloud from word frequencies; the input is the word-count dict (wd_dict)
    my_wordcloud = wordcloud.generate_from_frequencies(frequencies=wd_dict)
    # recolor and draw the word cloud
    my_wordcloud.recolor(color_func=random_color_func)
    plt.axis("off")
    wordcloud.to_file(ImgPath + '/{}.png'.format(video_id))
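
# The synonym-folding loop above can be isolated into a reusable helper. A
# minimal sketch (merge_synonyms is a hypothetical name, not part of the
# original script), assuming each row of similarity_dict.txt lists a canonical
# headword first, followed by its variants:
def merge_synonyms(wd_dict, sy_list):
    for syn in sy_list:
        total = sum(wd_dict.pop(w, 0) for w in syn)
        if total:  # only keep headwords that actually appeared
            wd_dict[syn[0]] = total
    return wd_dict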
Esempio n. 52
def generateTfIdfWordClouds(pathToMemex):
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  #[:500]

    print("\taggregating texts into documents...")
    docList = []
    docIdList = []

    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))

        docId = citeKey
        doc = " ".join(docData.values())

        # clean doc
        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)  # re-join words hyphenated across line breaks
        doc = re.sub(r'\W+', ' ', doc)
        doc = re.sub(r'_+', ' ', doc)
        doc = re.sub(r'\d+', ' ', doc)
        doc = re.sub(r' +', ' ', doc)

        # update lists
        docList.append(doc)
        docIdList.append(docId)

    print("\t%d documents generated..." % len(docList))

    # PART 2: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    stopWords = functions.loadMultiLingualStopWords(["deu", "eng", "fre"])
    vectorizer = CountVectorizer(ngram_range=(1, 1),
                                 min_df=2,
                                 max_df=0.5,
                                 stop_words=stopWords)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(
        countVectorized)  # generates a sparse matrix

    print("\tconverting and filtering tfidf data...")
    tfidfTable = pd.DataFrame(vectorized.toarray(),
                              index=docIdList,
                              columns=vectorizer.get_feature_names())
    tfidfTable = tfidfTable.transpose()
    tfidfTableDic = tfidfTable.to_dict()
    tfidfTableDic = filterTfidfDictionary(tfidfTableDic, 0.03, "more")

    #tfidfTableDic = json.load(open("/Users/romanovienna/Dropbox/6.Teaching_New/BUILDING_MEMEX_COURSE/_memex_sandbox/_data/results_tfidf_publications.dataJson"))

    # PART 4: generating wordclouds
    print("\tgenerating wordclouds...")
    wc = WordCloud(width=1000,
                   height=600,
                   background_color="white",
                   random_state=2,
                   relative_scaling=0.5,
                   #color_func=lambda *args, **kwargs: (179, 0, 0),  # single color
                   #colormap="copper",  # Oranges, Reds, YlOrBr, YlOrRd, OrRd; copper
                   colormap="autumn")  # binary, gray
    # https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html

    counter = len(tfidfTableDic)
    citeKeys = list(tfidfTableDic.keys())
    random.shuffle(citeKeys)

    for citeKey in citeKeys:
        savePath = functions.generatePublPath(pathToMemex, citeKey)
        savePath = os.path.join(savePath, "%s_wCloud.jpg" % citeKey)

        if not os.path.isfile(savePath):
            wc.generate_from_frequencies(tfidfTableDic[citeKey])
            # plotting
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            #plt.show() # this line shows the plot
            plt.savefig(savePath, dpi=200, bbox_inches='tight')

            print("\t%s (%d left...)" % (citeKey, counter))
            counter -= 1

        else:
            print("\t%s --- already done" % (citeKey))
            counter -= 1
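
# filterTfidfDictionary is called above but not defined in this snippet. A
# minimal sketch of a plausible implementation -- an assumption, not the
# original helper -- keeping only the scores above ("more") or below the
# given threshold for each publication:
def filterTfidfDictionary(dictionary, threshold, mode):
    filtered = {}
    for docId, scores in dictionary.items():
        if mode == "more":
            kept = {w: s for w, s in scores.items() if s >= threshold}
        else:
            kept = {w: s for w, s in scores.items() if s <= threshold}
        if kept:
            filtered[docId] = kept
    return filtered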
Esempio n. 53
overlap.render('主要城市评论数_平均分.html')  # render the "review count / average score by major city" chart

# word cloud
tomato_str = ' '.join(tomato_com['comment'])
words_list = []
word_generator = jieba.cut_for_search(tomato_str)
for word in word_generator:
    words_list.append(word)
words_list = [k for k in words_list if len(k) > 1]
back_color = imread('tomato.jpg')  # read the mask image
stopwords = set(STOPWORDS)
stopwords.add('苟利国')  # built-in stopwords plus a custom entry (set.add() returns None, so the set must be built first, not passed inline)
wc = WordCloud(
    background_color='white',  # background color
    max_words=200,  # maximum number of words
    mask=back_color,  # mask image; when set, width and height are ignored
    max_font_size=300,  # maximum font size
    stopwords=stopwords,
    font_path="C:/Windows/Fonts/STFANGSO.ttf",
    random_state=42,  # seed the layout so results are reproducible
    # width=1000,  # image width
    # height=860  # image height
)
tomato_count = Counter(words_list)
wc.generate_from_frequencies(tomato_count)
# generate colors sampled from the mask image
image_colors = ImageColorGenerator(back_color)
# draw the word cloud
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis('off')
wc.to_file(path.join(d, "词云.png"))
Esempio n. 54
from re import match
from wordcloud import WordCloud, STOPWORDS

data = pd.read_csv("C:/Users/user/Desktop/2020_text_mining/jobkorea_data.csv")

komoran = Komoran()

%time komoran_nouns = komoran.nouns(' '.join(data['답변'].fillna('').astype(str)))  # join all answers (str(Series) would only yield a truncated repr)
komoran_nouns[-10:]

DBA = data.loc[data['직무분야'] == "ERP·시스템분석·설계", "답변"]
nouns = komoran.nouns(' '.join(DBA.fillna('').astype(str)))
nouns = [n for n in nouns if len(n) > 1]  # drop single-character tokens
nouns = [n for n in nouns if not match('^[0-9]', n)]  # drop tokens starting with a digit
count = Counter(nouns)
top = count.most_common(40)

# remove stopwords
stopwords = set(STOPWORDS)
stopwords.add('제가')

wordcloud = WordCloud(font_path='C:/Users/user/Desktop/2020_text_mining/NanumGothic.ttf',
                      background_color='white', width=800, height=600, stopwords=stopwords)

# note: generate_from_frequencies() does not apply the stopwords set (only
# generate()/process_text() do), so the frequencies should already be clean
cloud = wordcloud.generate_from_frequencies(dict(top))

plt.figure(figsize=(10,8))
plt.imshow(wordcloud)
plt.tight_layout(pad=0)
plt.axis('off')
plt.show()
Esempio n. 55
import matplotlib.colors as mcolors
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)
topics = optimal_model.show_topics(formatted=False)
fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')
plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()


# Here we also visualize the first 4 topics in our document along with their top 10 keywords; each keyword's weight is reflected in the size of its text.
# 
# Based on the visualization, we see the following topics:
# - Topic 0: Employer Quality
# - Topic 1: Management Quality
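
# To read the same information numerically, the keyword/weight pairs behind
# each cloud can be printed directly -- a small sketch, assuming optimal_model
# is the fitted gensim LDA model used above:
for topic_id, pairs in optimal_model.show_topics(num_topics=4, num_words=10, formatted=False):
    print('Topic %d: %s' % (topic_id,
          ', '.join('%s (%.3f)' % (word, weight) for word, weight in pairs)))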
Esempio n. 56
    def apply_words(self, words, evaluation, options):
        'WordCloud[words_List, OptionsPattern[%(name)s]]'
        ignore_case = self.get_option(options, 'IgnoreCase',
                                      evaluation) == Symbol('True')

        freq = dict()
        for word in words.leaves:
            if not isinstance(word, String):
                return
            py_word = word.get_string_value()
            if ignore_case:
                key = py_word.lower()
            else:
                key = py_word
            record = freq.get(key, None)
            if record is None:
                freq[key] = [py_word, 1]
            else:
                record[1] += 1

        max_items = self.get_option(options, 'MaxItems', evaluation)
        if isinstance(max_items, Integer):
            py_max_items = max_items.get_int_value()
        else:
            py_max_items = 200

        image_size = self.get_option(options, 'ImageSize', evaluation)
        if image_size == Symbol('Automatic'):
            py_image_size = (800, 600)
        elif image_size.get_head_name() == 'System`List' and len(
                image_size.leaves) == 2:
            py_image_size = []
            for leaf in image_size.leaves:
                if not isinstance(leaf, Integer):
                    return
                py_image_size.append(leaf.get_int_value())
        elif isinstance(image_size, Integer):
            size = image_size.get_int_value()
            py_image_size = (size, size)
        else:
            return

        # inspired by http://minimaxir.com/2016/05/wordclouds/
        import random
        import os

        def color_func(word,
                       font_size,
                       position,
                       orientation,
                       random_state=None,
                       **kwargs):
            return self.default_colors[random.randint(0, 7)]

        font_base_path = os.path.dirname(
            os.path.abspath(__file__)) + '/../fonts/'

        font_path = os.path.realpath(font_base_path + 'AmaticSC-Bold.ttf')
        if not os.path.exists(font_path):
            font_path = None

        from wordcloud import WordCloud
        wc = WordCloud(width=py_image_size[0],
                       height=py_image_size[1],
                       font_path=font_path,
                       max_font_size=300,
                       mode='RGB',
                       background_color='white',
                       max_words=py_max_items,
                       color_func=color_func,
                       random_state=42,
                       stopwords=set())
        # freq maps each key to a [word, count] pair; wordcloud's
        # generate_from_frequencies expects a {word: count} mapping
        wc.generate_from_frequencies(dict(freq.values()))

        image = wc.to_image()
        return Image(numpy.array(image), 'RGB')
Esempio n. 57
def word_count(email, keyword, savedDate, optionList, analysisName):    
    # fetch the preprocessing result from Mongo (one call; index 0 is the docs, index 1 the token count)
    preprocessed = getPreprocessing(email, keyword, savedDate, optionList)
    doc, nTokens = preprocessed[0], preprocessed[1]
    doc = sum(doc, [])  # flatten the nested list into one list
    #print(doc, nTokens)    

    vectorizer = CountVectorizer(analyzer='word', max_features=int(optionList), tokenizer=None)
    words=vectorizer.fit(doc)
    words_fit = vectorizer.fit_transform(doc)
 
    word_list = vectorizer.get_feature_names()  # = sorted(vectorizer.vocabulary_)
    #print("vocabulary:", word_list, '\nfrequencies:', words_fit.toarray().sum(axis=0))
    count_list = words_fit.toarray().sum(axis=0)

    df=pd.DataFrame()
    df["words"] = word_list
    df["count"] = count_list

    count_list = list([int(x) for x in count_list])
    df = df.sort_values(by=['count'], axis=0, ascending=False)
    #dict_words = dict(zip(word_list,count_list))
    dict_words = df.set_index('words').T.to_dict('records') #type: list
    dict_words = dict_words[0]
    print("빈도수 분석결과\n", df, '\n', dict_words)

    ## save as a CSV file
    # with open('wc_csvfile.csv','w') as f:
    #     w = csv.writer(f)
    #     for k, v in dict_words.items():
    #         w.writerow([k, v])

    ## draw the bar chart
    FONT_PATH='TextMining/NanumBarunGothic.ttf'
    fontprop = fm.FontProperties(fname=FONT_PATH, size=8)
    plt.figure(figsize=(20,5))
    plt.bar(word_list, count_list)
    plt.xticks(rotation=40, ha='right', fontproperties=fontprop)
    plt.savefig('wc_barchart.jpg')

    ## word cloud visualization
    wordcloud = WordCloud(
        font_path = FONT_PATH,
        width = 1500,
        height = 1000,
        background_color="white",
    )
    wordcloud = wordcloud.generate_from_frequencies(dict_words)
    #plt.savefig('wordcloud.png', bbox_inches='tight')
    print("The frequency-analysis word cloud file has been created.")
    wordcloud.to_file('wc_wordcloud.jpg')
    
    
    # print file sizes to check that the image files match the binaries stored in Mongo
    from os.path import getsize
    BarFile = 'wc_barchart.jpg'    
    WcFile = 'wc_wordcloud.jpg'
    bar_file_size = getsize(BarFile) #wc_barchart.jpg: 95129
    wc_file_size = getsize(WcFile) #wc_wordcloud.jpg: 223997

    print('File Name: %s \tFile Size: %d' %(BarFile, bar_file_size))
    print('File Name: %s \tFile Size: %d' %(WcFile, wc_file_size))
    
    ### save to Mongo ###
    client=MongoClient(host='localhost',port=27017)
    #print('Connected to MongoDB.')
    db=client.textMining
    nTokens = optionList
    now = datetime.datetime.now()
    #print("time: ", now,'\n', now.strftime("%Y-%m-%dT%H:%M:%S.%fZ")) #형식
    
    ## store the bar chart image in Mongo as binary
    print("\nSaving the frequency-analysis bar chart to MongoDB.")
    fs = gridfs.GridFS(db, 'count')  # creates the count.files and count.chunks collections
    with open(BarFile, 'rb') as f:
        contents = f.read()
    fs.put(contents, filename='wc_bar')

    ## store the word cloud image as binary in Mongo's count.files & count.chunks collections
    print("Saving the frequency-analysis word cloud to MongoDB.\n")
    with open(WcFile, 'rb') as f:
        contents = f.read()
    fs.put(contents, filename='wc_wordcloud')

    barBinary = getBinaryImage(bar_file_size, analysisName)
    wcBinary = getBinaryImage(wc_file_size, analysisName)

    doc={
        "userEmail" : email,
        "keyword" : keyword,
        "savedDate": savedDate,
        "analysisDate" : datetime.datetime.now(),
        #"duration" : ,
        "nTokens" : nTokens,
        "resultJson" : json.dumps(dict_words, ensure_ascii=False),
        "resultBar" : barBinary,
        "resultWC" : wcBinary,
        #"resultCSV" :,
    }
    db.count.insert_one(doc)  
    
    print("MongoDB에 저장되었습니다.")
    
    return dict_words
 
#word_count('*****@*****.**', '북한', "2021-07-08T11:46:03.973Z", 100, 'count')
#word_count('*****@*****.**', '북한', "2021-07-08T11:46:03.973Z", 100, 'count')
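
# Reading one of the stored images back out of GridFS -- a minimal sketch,
# assuming the same local MongoDB and the 'count' GridFS prefix used above:
def read_wordcloud_from_mongo(out_path='wc_wordcloud_readback.jpg'):
    client = MongoClient(host='localhost', port=27017)
    fs = gridfs.GridFS(client.textMining, 'count')
    grid_out = fs.find_one({'filename': 'wc_wordcloud'})  # a file stored under that name
    if grid_out is not None:
        with open(out_path, 'wb') as f:
            f.write(grid_out.read())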
Esempio n. 58
# 5. preprocessing + word count: drop one-syllable words and numbers
nouns_count = {}  # word counts
for noun in nouns_word:
    if len(noun) > 1 and not match('^[0-9]', noun):
        # key: noun, value: occurrence count
        nouns_count[noun] = nouns_count.get(noun, 0) + 1
nouns_count
len(nouns_count)  # 19143
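
# Equivalent one-pass count using collections.Counter (already used for the
# top-20 step below) -- a sketch applying the same filtering rules:
nouns_count = Counter(n for n in nouns_word
                      if len(n) > 1 and not match('^[0-9]', n))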

# 6. WordCloud

# 6-1) top 20 words
word_count = Counter(nouns_count)  # dict
top20_word = word_count.most_common(20)
top20_word

# 6-2) wordcloud
wc = WordCloud(font_path='C:/Windows/Fonts/malgun.ttf',
               width=800,
               height=600,
               max_words=100,
               max_font_size=200,
               background_color='white')
wc_result = wc.generate_from_frequencies(dict(top20_word))
#wc_result # <wordcloud.wordcloud.WordCloud at 0x1e5dc3f5a48>
plt.figure(figsize=(12, 8))
plt.imshow(wc_result)
plt.axis('off')  # hide the x/y axes
plt.show()
Esempio n. 59
    stopwords = pd.read_csv('ChineseStopwords.txt',
                            index_col=False,
                            quoting=3,
                            sep="\t",
                            header=None,
                            names=['stopword'])
    words_df = words_df[~words_df.segment.isin(stopwords['stopword'])]  # drop stopword rows (isin needs the column, not the DataFrame)

    # count word frequencies (the dict-renaming form of .agg was removed in newer pandas)
    words_stat = words_df.groupby('segment', as_index=False).agg(计数=('segment', 'size'))
    words_stat = words_stat.sort_values(by=["计数"], ascending=False)  # sort by frequency, descending
    #    a = words_stat.head()
    #    print(a)

    # initialize a word cloud
    wordcloud = WordCloud(
        font_path='./font/simhei.ttf',
        background_color='white',
        max_font_size=230,
        mask=graph,
    )

    # take the 1000 most frequent words
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}  # frequency dict
    wordcloud = wordcloud.generate_from_frequencies(word_frequence)

    wordcloud.to_file(outpath)  # save the image
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Esempio n. 60
lda = LatentDirichletAllocation(n_components=total_topics,
                                max_iter=15, learning_method='online',
                                learning_offset=15, random_state=1234)
ldaTransform = lda.fit_transform(td)
# number of terms we need per topic
terms_count = 25
# loop over the LDA components to get each topic's terms with the highest probabilities
for idx,topic in enumerate(lda.components_):    
    print('Topic# ',idx+1)
    abs_topic = abs(topic)
    topic_terms_sorted = [[terms[i], topic[i]] for i in abs_topic.argsort()[:-terms_count - 1:-1]]
    topic_words = []
    for i in range(terms_count):
        topic_words.append(topic_terms_sorted[i][0])
    print(','.join(topic_words))
    print("")
    dict_word_frequency = {}
    
    for i in range(terms_count):
        dict_word_frequency[topic_terms_sorted[i][0]] = topic_terms_sorted[i][1]    
    wcloud = WordCloud(background_color="white", mask=None, max_words=100,
                       max_font_size=60, min_font_size=10, prefer_horizontal=0.9,
                       contour_width=3, contour_color='black')
    wcloud.generate_from_frequencies(dict_word_frequency)       
    plt.imshow(wcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig("Topic#"+str(idx+1), format="png")