Example no. 1
def test_unicode_stopwords():
    wc_unicode = WordCloud(stopwords=[u'Beautiful'])
    try:
        words_unicode = wc_unicode.process_text(unicode(THIS))
    except NameError:  # PY3
        words_unicode = wc_unicode.process_text(THIS)

    wc_str = WordCloud(stopwords=['Beautiful'])
    words_str = wc_str.process_text(str(THIS))

    assert words_unicode == words_str
Example no. 2
def test_unicode_stopwords():
    wc_unicode = WordCloud(stopwords=[u'Beautiful'])
    try:
        words_unicode = wc_unicode.process_text(unicode(THIS))
    except NameError:  # PY3
        words_unicode = wc_unicode.process_text(THIS)

    wc_str = WordCloud(stopwords=['Beautiful'])
    words_str = wc_str.process_text(str(THIS))

    assert_true(words_unicode == words_str)
Example no. 3
def test_process_text_default_patterns():
    wc = WordCloud(stopwords=set(), include_numbers=True, min_word_length=2)
    words = wc.process_text(THIS)

    wc2 = WordCloud(stopwords=set(), include_numbers=True, min_word_length=1)
    words2 = wc2.process_text(THIS)

    assert "a" not in words
    assert "3" not in words

    assert "a" in words2
    assert "3" in words2
Example no. 4
    def generate(text, mask, filtered_words):
        """Generate wordcloud"""

        wc = WordCloud(max_words=4000,
                       mask=mask,
                       repeat=False,
                       stopwords=filtered_words)
        wc.process_text(text)
        wc.generate(text)
        img = wc.to_image()
        b = BytesIO()
        img.save(b, 'png')
        b.seek(0)
        return b
Example no. 5
def generate_wordcloud(data: list):
    """Generate a word cloud from the text in data[5]."""

    text = ' '.join(data[5])
    # background_Image = plt.imread('job.jpg')
    stopwords = set()
    stopwords.update(['be', 'am', 'are', 'is', 'was', 'were', 'being', 'been', 'can', 'could', 'dare', 'do', 'does',
                      'did', 'have', 'has', 'had', 'having', 'may', 'might', 'must', 'need', 'ought', 'shall', 'should',
                      'will', 'would', 'he', 'she', 'they', 'i', 'me', 'my', 'mine', 'you', 'yours', 'their', 'a'])

    wc = WordCloud(
        background_color='white',
        font_path=r'C:\Windows\Fonts\Arial.TTF',
        max_words=2000,
        max_font_size=150,
        random_state=30,
        stopwords=stopwords
    )

    wc.generate_from_text(text)

    # inspect the most frequent words, useful for tuning the stopword list
    process_word = wc.process_text(text)
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(sort[:50])

    # note: recoloring would need ImageColorGenerator(image_array), not a
    # color name, so the cloud keeps its default colors here
    img_name = '{}_wordcloud.png'.format(data[0])
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(img_name, dpi=180)
    plt.show()

    return img_name
Example no. 6
def gera_word_cloud(cod_assunto, nome_assunto):

    docs = df_amostra[(df_amostra['cd_assunto_nivel_3'] == cod_assunto)]
    docs = docs.head(2000)

    pool = mp.Pool(7)
    docs['texto_processado_2'] = pool.map(
        processa_texto, [row for row in docs['texto_processado']])
    pool.close()

    texto_docs = ''
    for index, row in docs.iterrows():
        texto_docs = texto_docs + ' ' + row['texto_processado_2']

    wc = WordCloud(background_color="white",
                   max_words=200,
                   width=1024,
                   height=350,
                   stopwords=stopwords_processadas,
                   collocations=False,
                   colormap="twilight",
                   normalize_plurals=False)
    t = wc.process_text(texto_docs)
    wc.generate_from_frequencies(t)
    wc.to_file(
        "/media/DATA/classificadorDeAssuntos/Dados/Resultados/word_clouds/word_cloud_"
        + str(cod_assunto) + "_" + nome_assunto + ".png")
Example no. 7
def plot_wordcloud(data, selected_job):
    selected_data = data.loc[data.Category == selected_job].reset_index(
        drop=True)
    selected_data[
        "All_Qualifications"] = selected_data.Basic_Qualifications + selected_data.Preferred_Qualifications
    # join with a space so words at row boundaries do not run together
    selected_qualifications = ' '.join(selected_data.All_Qualifications)
    my_stopwords = {
        'and', 'experience', 'e', 'g', 'in', 'a', 'years', 'of', 'with',
        'ability', 'to', 'such', 'as', 'working', 'the', 'related', 'field',
        'or', 'work', 'for', 'using', 'etc', 'other', 'At', 'least', 'similar',
        'equivalent', 's', 'on', 'M', 'one', 'degree', 'knowledge', 'building',
        'strong', 'skill', 'skills', 'relevant', 'advanced', 'R',
        'demonstrated', 'tools', 'proficiency', 'environment', 'technical',
        'engineering', 'an', 'Amazon', 'i', 'Minimum', 'education',
        'reporting', 'highly', 'is', 'including', 'detail', 'this', 'role',
        'Meets', 'exceeds', 'project', 'able'
    }

    wc = WordCloud(background_color='white',
                   min_font_size=8,
                   prefer_horizontal=1,
                   stopwords=my_stopwords)
    wc.generate(selected_qualifications)
    frequencies = wc.process_text(selected_qualifications)
    return wc.to_image(), frequencies
Example no. 8
def create_wordcloud(df):
    """
    Generate a word cloud of subway station names
    """
    # tokenize with jieba
    text = ''
    for line in df['station']:
        text += ' '.join(jieba.cut(line, cut_all=False))
        text += ' '
    background_Image = plt.imread('rocket.jpg')
    wc = WordCloud(
        background_color='white',
        mask=background_Image,
        font_path=r'C:\Windows\Fonts\华康俪金黑W8.TTF',
        max_words=1000,
        max_font_size=150,
        min_font_size=15,
        prefer_horizontal=1,
        random_state=50,
    )
    wc.generate_from_text(text)
    img_colors = ImageColorGenerator(background_Image)
    wc.recolor(color_func=img_colors)
    # check which words come out most frequent
    process_word = wc.process_text(text)
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(sort[:50])
    plt.imshow(wc)
    plt.axis('off')
    wc.to_file("地铁名词云.jpg")
    print('Word cloud generated successfully!')
Example no. 9
def get_wc(df: pd.DataFrame, field: str) -> (dict, WordCloud):
    """Returns a dict with the frequencies and the word cloud"""
    words = ""
    for row in df[field].values:
        words += remove_accents(str(row)) + "\n"

    # custom stopwords on top of wordcloud's built-in ones;
    # prepositions are dropped because they add little signal
    stopwords = set([
        # prepositions
        "el", "para", "en", "de", "la", "del",
        "nan", "los", "las", "se", "con", "al",
        "es", "lo",
        # escaped HTML
        "nbsp", "li", "br", "aacute", "hr", "col",
    ]).union(STOPWORDS)

    wc = WordCloud(
        width=800, height=800,
        stopwords=stopwords,
        background_color='white',
        collocations=False,  # avoid collocations (e.g. "en casa", "en venta")
    )

    freq = wc.process_text(words)
    # in case we want to print the frequencies in order:
    # print({k: v for k, v in sorted(freq.items(), key=lambda item: item[1], reverse=True)})

    wc.fit_words(freq)

    return freq, wc
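
A minimal usage sketch for get_wc; the DataFrame and field name below are hypothetical, and remove_accents is the module helper the function itself relies on:

df = pd.DataFrame({"descripcion": ["Depto en venta", "Casa para la familia"]})
freq, wc = get_wc(df, "descripcion")
print(freq)           # e.g. {'depto': 1, 'venta': 1, 'casa': 1, 'familia': 1}
wc.to_file("wc.png")  # fit_words() has already laid out the cloud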
Example no. 10
def test_generate_from_frequencies():
    # test that generate_from_frequencies() takes input argument dicts
    wc = WordCloud(max_words=50)
    words = wc.process_text(THIS)
    result = wc.generate_from_frequencies(words)

    assert_true(isinstance(result, WordCloud))
Example no. 11
def test_process_text():
    # test that process function returns a dict
    wc = WordCloud(max_words=50)
    result = wc.process_text(THIS)

    # check for proper return type
    assert_true(isinstance(result, dict))
Example no. 12
def creat_wordcloud(df):

    text = ''
    for line in df['title']:
        text += ' '.join(jieba.cut(line, cut_all=False))
        text += ' '

    background_Image = plt.imread('data/image.jpg')
    wc = WordCloud(background_color='white',
                   mask=background_Image,
                   font_path='msyh.ttc',
                   max_words=1000,
                   max_font_size=150,
                   min_font_size=15,
                   prefer_horizontal=1,
                   random_state=50)
    wc.generate_from_text(text)
    img_colors = ImageColorGenerator(background_Image)
    wc.recolor(color_func=img_colors)

    process_word = wc.process_text(text)
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(sort[:50])
    plt.imshow(wc)
    plt.axis('off')
    wc.to_file('商家标题词云.jpg')
    print('Word cloud generated successfully')
Example no. 13
def make_wordcloud(readtext, imagename):

    text_from_file_with_apath = open(readtext, encoding='utf-8-sig').read()

    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
    wl_space_split = " ".join(wordlist_after_jieba)  # jieba Chinese word segmentation

    stopwords = set()  # stopwords
    stopwords.update(['https:imgur', 'https', 'imgur', 'jpg', 'com',
                      'dcard', 'tw', 'www', 'http', 'png'])

    # mask expects an image array; convert in case a file path is passed
    mask = np.array(Image.open(imagename))
    my_wordcloud = WordCloud(max_font_size=35,  # max_font_size: the largest font size
                             mask=mask,
                             stopwords=stopwords,
                             font_path='C:/Windows/Fonts/MSYH.TTC').generate(
                                 wl_space_split)

    # inspect word frequencies, handy for extending the stopword list
    process_word = my_wordcloud.process_text(wl_space_split)
    sort = sorted(process_word.items(), key=lambda e: e[1],
                  reverse=True)  # sort is a list
    print(sort[:50])

    plt.imshow(my_wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
Example no. 14
def ana_result():
    file = r'E:\pytotal\bilibllcomments\result.csv'
    df = pd.read_csv(file, header=None)
    # tokenize with jieba
    text = ''
    for line in df[1]:
        text += ' '.join(jieba.cut(line))
    # generate the word cloud
    bgi = pt.imread(r'E:\pytotal\bilibllcomments\佩奇.jpg')
    wc = WordCloud(
        background_color='white',
        mask=bgi,
        max_words=2000,
        max_font_size=80,
        random_state=30,
    )
    wc.generate_from_text(text)
    process_word = wc.process_text(text)
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=False)
    img_colors = ImageColorGenerator(bgi)
    wc.recolor(color_func=img_colors)
    pt.imshow(wc)
    pt.axis('off')
    wc.to_file(r'E:\pytotal\bilibllcomments\result.jpg')
    print('done')
Example no. 15
def create_wordcloud(pandas_series,
                     max_words=2000,
                     max_font_size=None,
                     filepath=None):
    mask = np.array(Image.open("images/NYC_silhouette.png"))
    mask[mask > 0] = 255
    # this is because the WordCloud library uses bigrams and if we do not shuffle data, we will
    # see noise like: Noise Residential, Residential Noise
    pandas_series = pandas_series.sample(frac=1)["Complaint Type"]

    wordcloud = WordCloud(width=3000,
                          height=2000,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          background_color='black',
                          stopwords=STOPWORDS,
                          random_state=1,
                          mask=mask,
                          contour_width=3,
                          contour_color='white')

    text = pandas_series.astype(str).values
    processed_text = wordcloud.process_text(" ".join(text))
    wordcloud.generate_from_frequencies(processed_text)
    if filepath:
        wordcloud.to_file(filepath)
Example no. 16
def wc_chinese():
    text = open(path.join(d, 'langchao2.txt'), encoding='UTF-8-SIG').read()
    font_path = r'C:\Windows\Fonts\STXIHEI.TTF'
    background_Image = np.array(Image.open(path.join(d, "circle.jpg")))
    img_colors = ImageColorGenerator(background_Image)

    stopwords = set()

    wc = WordCloud(
        font_path=font_path,
        margin=2,
        mask=background_Image,
        scale=2,
        max_words=200,
        min_font_size=4,
        max_font_size=100,
        stopwords=stopwords,
        random_state=42,
        background_color='white',
    )
    wc.generate_from_text(text)

    # get the sorted word frequencies; useful for tuning the stopwords
    process_word = wc.process_text(text)
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(sort[:50])  # the 50 most frequent words

    wc.recolor(color_func=img_colors)

    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('浪潮basic2.png', dpi=200)
    plt.show()
Example no. 17
def test_process_text():
    # test that process function returns a dict
    wc = WordCloud(max_words=50)
    result = wc.process_text(THIS)

    # check for proper return type
    assert isinstance(result, dict)
Example no. 18
def test_generate_from_frequencies():
    # test that generate_from_frequencies() takes input argument dicts
    wc = WordCloud(max_words=50)
    words = wc.process_text(THIS)
    result = wc.generate_from_frequencies(words)

    assert isinstance(result, WordCloud)
Example no. 19
    def generate_from_str(text_str, output_name):
        now = time.strftime('%Y-%m-%d', time.localtime())
        #text_str = ''.join(text_list)

        # tokenization step
        #text_str = jieba.cut(text_str)
        #text_str = ''.join(text_str)

        wc = WordCloud(
            font_path='./fonts/msyh.ttc',
            background_color='white',
            width=4961,  # A3
            height=3508,
            max_words=1000,
            color_func=colorFunc)
        print('Computing word frequencies...')
        freqs = wc.process_text(text_str)
        print(freqs)

        print('Writing the frequency JSON...')
        if not os.path.exists('./wordcloud_json'):
            os.mkdir('./wordcloud_json')
        with open('./wordcloud_json/' + output_name + '_' + now + '.json',
                  'w',
                  encoding='utf-8') as jsonf:
            json.dump(freqs, jsonf, ensure_ascii=False)
Example no. 20
def create_wordcloud(df, save_name=None):
    """
    Creates a word cloud based on enrichment array

    Must have column 'term_name'.
    It takes this column, flattens it into a large string.
    Then we pass it to the python package wordcloud, which generates the
    wordcloud.

    It returns the figure with a save method and a dictionary of counts.

    Parameters
    ----------
    df : pd.DataFrame
    save_name : str

    Returns
    -------
    wordcloud : WordCloud
        Word cloud with an attached `plot` method, a `word_dict` of counts,
        and a `data` DataFrame sorted by count.
    """
    data = df.apply(_cleanup_term_name, axis=1)
    text = ' '.join(data)
    # Generate a word cloud image
    wc = WordCloud(margin=0, background_color=None, mode='RGBA',
                   # min_count=1,
                   width=800, height=600, collocations=True,
                   stopwords=basic_words)
    wordcloud = wc.generate(text)
    word_dict = wc.process_text(text)

    def plot(self, save_name=None, figsize=(8, 5)):
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
        ax.imshow(self, interpolation='bilinear')
        plt.xticks([])
        plt.yticks([])
        plt.axis("off")
        if save_name is not None:
            plt.savefig('{}.png'.format(save_name), bbox_inches='tight',
                        dpi=150)
            plt.title(save_name)
        return fig

    wordcloud.plot = types.MethodType(plot, wordcloud)

    if save_name is not None:
        wordcloud.plot(save_name)

    wordcloud.word_dict = word_dict

    df1 = pd.DataFrame(list(word_dict.items()),
                       columns=['words', 'counts'])

    df1.sort_values('counts', ascending=False, inplace=True)
    wordcloud.data = df1

    return wordcloud
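
A minimal usage sketch; the DataFrame below is hypothetical and only needs the 'term_name' column that the module's _cleanup_term_name helper expects:

df = pd.DataFrame({'term_name': ['cell cycle', 'dna repair', 'cell cycle arrest']})
wordcloud = create_wordcloud(df, save_name='enrichment_terms')
print(wordcloud.data.head())    # word counts as a sorted DataFrame
wordcloud.plot(figsize=(8, 5))  # re-plot without saving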
Example no. 21
def test_generate_from_frequencies():
    # test that generate_from_frequencies() takes input argument of class
    # 'dict_items'
    wc = WordCloud(max_words=50)
    words = wc.process_text(THIS)
    items = words.items()
    result = wc.generate_from_frequencies(items)

    assert_true(isinstance(result, WordCloud))
Example no. 22
def test_generate_from_frequencies():
    # test that generate_from_frequencies() takes input argument of class
    # 'dict_items'
    wc = WordCloud(max_words=50)
    words = wc.process_text(THIS)
    items = words.items()
    result = wc.generate_from_frequencies(items)

    assert_true(isinstance(result, WordCloud))
Example no. 23
    def get_comments_wordcloud(self):
        print('Enter the name of the series to build a word cloud for:')
        while True:
            fanOperaname = str(input())
            sql = "select media_id from media where media_name = '%s'" % (fanOperaname)
            self.cursor.execute(sql)
            temp = self.cursor.fetchall()
            if len(temp) != 0:
                result = temp[0][0]
                break
            else:
                print('No such title found, please enter the name again:')
        image_path = '词云相关数据/' + str(fanOperaname) + '.jpeg'
        data = self.get_data(result)

        # precise segmentation with jieba; returns a list
        data = jieba.lcut(data, cut_all=False)
        text = '。'.join(data)

        # load the stopwords
        stop_words = self.load_stopwords()

        # Chinese text needs a Chinese font, otherwise the cloud renders garbled
        font_path = 'SourceHanSansCN-Regular.ttf'
        # mask image for the word cloud
        background_image = numpy.array(Image.open(image_path))
        # sample colors from the mask so different regions get different colors
        img_colors = ImageColorGenerator(background_image)
        stopwords = set(stop_words)
        wc = WordCloud(
            font_path=font_path,
            margin=2,
            mask=background_image,
            scale=2,
            max_words=400,
            min_font_size=4,
            stopwords=stopwords,
            random_state=42,
            background_color='white',
            max_font_size=100,
        )
        wc.generate(text)  # generate the word cloud

        # get the sorted word frequencies
        process_word = wc.process_text(text)
        sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
        print(sort[:50])
        wc.recolor(color_func=img_colors)

        ciyun_image = str(image_path.split('.')[0] + '.png')

        wc.to_file(ciyun_image)

        # overlay the word-cloud image on the original image
        self.imageOverlay(image_path, ciyun_image)
        return True
Example no. 24
 def word_cloud(self, show_plot=True):
     word_cloud_obj = WordCloud()
     freq_dict = word_cloud_obj.process_text(' '.join(self.text))
     if show_plot:
         word_cloud = word_cloud_obj.generate_from_frequencies(freq_dict)
         plt.figure()
         plt.imshow(word_cloud, interpolation='bilinear')
         plt.axis("off")
         plt.show()
     return freq_dict
Example no. 25
def analysis9(data):
    jieba.load_userdict("userdict.txt")
    jieba.add_word('区块链')

    text=''
    for i in data['title'].values:
    # for i in data[data.year == 2018]['title'].values:
        # strip punctuation and other useless characters
        symbol_to_replace = '[!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
        # data['name'].str.replace(symbol_to_replace,'',inplace=True,regex=True)
        i = re.sub(symbol_to_replace, '', i)
        # print(i)
        text += ' '.join(jieba.cut(i, cut_all=False))

    # text = jieba.del_word('如何')
    d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

    background_Image = np.array(Image.open(path.join(d, "tiger.png")))
    # background_Image = plt.imread('E:\my_Python\training\1exercise\tiger.png')

    font_path = r'C:\Windows\Fonts\SourceHanSansCN-Regular.otf'  # Source Han Sans; alternatively SimHei (simhei.ttf)
    # add stopwords
    stopwords = set()
    # run the frequency count on the text first, then pick additional stopwords from the sorted output
    stopwords.update(['如何','怎么','一个','什么','为什么','还是','我们','为何','可能','不是','没有','哪些','成为','可以','背后','到底','就是','这么','不要','怎样','为了','能否','你们','还有','这样','这个','真的','那些'])

    wc = WordCloud(
        # background_color='#3F3F3F',
        # background_color='white',
        background_color='black',
        font_path=font_path,
        mask=background_Image,
        stopwords=stopwords,
        max_words=200,
        # width=1000, height=600,
        margin=2,
        max_font_size=100,
        random_state=42,
        scale=2,
        # colormap='viridis'
    )
    wc.generate_from_text(text)

    process_word = wc.process_text(text)
    # sort the dict by frequency
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)  # sort is a list
    print(sort[:50])  # print the 50 most frequent words, then add unwanted ones to stopwords.update() above
    img_colors = ImageColorGenerator(background_Image)
    wc.recolor(color_func=img_colors)  # colors follow the image

    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()  # trim the whitespace margins so the full figure is shown
    plt.savefig('huxiu5.png', dpi=200)
    plt.show()
Example no. 26
def generate_wordclouds(candidate):

    texts = {
        'total': candidate['text'],
        'positive': candidate['text_positive'],
        'negative': candidate['text_negative']
    }
    scores = {
        'total': candidate['score_palavras'],
        'positive': candidate['score_palavras_pos'],
        'negative': candidate['score_palavras_neg']
    }

    brazil_mask = np.array(Image.open("brazil_mask.png"))

    colors = {}
    for score in scores.items():
        colors[score[0]] = SimpleGroupedColorFunc(
            {p[0]: color_to_use[math.floor(p[1] * 9.99)]
             for p in score[1]}, default_color)

    # one WordCloud configuration is shared by all three texts
    wordcloud = WordCloud(width=1200,
                          height=1200,
                          background_color="white",
                          mask=brazil_mask,
                          stopwords=stopwords_pt,
                          font_path='/Library/Fonts/Times New Roman.ttf')

    for text in texts.items():
        wordcloud_total = wordcloud.generate(text[1])
        wordcloud_total.recolor(color_func=colors[text[0]])

        plt.axis("off")
        plt.imshow(wordcloud_total, interpolation="bilinear")
        plt.show()

        #image_total = wordcloud_total.to_image()
        #image_total.show()
        wordcloud_total.to_file('../imagens/{}_{}_{}.png'.format(
            'pyspark' if candidate['type'] == 'csv' else 'azure',
            candidate['candidate'], text[0]))

    # generate CSV
    if candidate['type'] == 'csv':
        freq = wordcloud.process_text(texts['total'])

        with open('../data/frequency/pyspark_frequency_{}.csv'.format(
                candidate['candidate']),
                  'w',
                  encoding='utf-8') as f:  # Just use 'w' mode in 3.x
            csv_out = csv.writer(f)
            csv_out.writerow(['Word', 'Count'])
            for key in freq.keys():
                csv_out.writerow((key, freq[key]))
Example no. 27
def xnet_input_to_communities_wordcloud(input_file,
                                        output_file,
                                        minYear=minYear,
                                        minKCore=minKCore):
    graph = xn.xnet2igraph(input_file)
    verticesToDelete = np.where(
        np.logical_or(
            np.array(graph.vs["year"]) < minYear,
            np.array(graph.vs["KCore"]) < minKCore))[0]
    graph.delete_vertices(verticesToDelete)
    graph = graph.clusters(mode="WEAK").giant()
    communities = graph.vs["Community"]
    sortedCommunities = sortByFrequency(communities)[0:maxCommunities]
    fig = plt.figure(figsize=(20, 5 * math.ceil(len(sortedCommunities) / 2)))
    allAbstracts = "\n".join(graph.vs["paper_abstract"])
    allFrequencies = WordCloud(
        max_words=maxAllWords).process_text(allAbstracts)
    amask = np.zeros((500, 1000), dtype='B')
    amask[:10, :] = 255
    amask[-10:, :] = 255
    amask[:, :10] = 255
    amask[:, -10:] = 255
    for index, community in enumerate(sortedCommunities):
        communityColor = (_styleColors[index]
                          if index < len(_styleColors) else "#aaaaaa")
        abstracts = "\n".join([
            vertex["paper_abstract"] for vertex in graph.vs
            if vertex["Community"] == community
        ])
        plt.subplot(math.ceil(len(sortedCommunities) / 2), 2, index + 1)
        wc = WordCloud(background_color="white",
                       max_words=maxInternalWords,
                       width=1000,
                       height=500,
                       mask=amask,
                       contour_width=10,
                       contour_color=communityColor,
                       random_state=3,
                       color_func=generateColorFunction(communityColor))

        inCommunityFrequency = wc.process_text(abstracts)
        relativeFrequencies = {
            key: frequency / math.log(allFrequencies[key] + 1)
            for key, frequency in inCommunityFrequency.items()
            if key in allFrequencies
        }
        wc.generate_from_frequencies(relativeFrequencies)

        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close(fig)
Example no. 28
 def topWord(self, top=100):
     wc = WordCloud()
     result = wc.process_text(self.sentense)
     sortlist = sorted(result.items(), key=lambda item: item[1], reverse=True)
     top_dict = {}
     try:
         for i in range(0, top):
             top_dict[sortlist[i][0]] = sortlist[i][1]
     except IndexError:
         print('not enough %s words, please set top parameter down' % (top))
     print(top_dict)
     return top_dict
Example no. 29
 def show_worldcloud(self, stopwords):
     stopwords = set(STOPWORDS) | set(stopwords)
     wc = WordCloud(width=800,
                    height=400,
                    stopwords=stopwords,
                    collocations=False)
     a = " ".join(x.text for x in self.tweets)
     self.words = wc.process_text(a)
     wc.generate_from_frequencies(self.words)
     plot.imshow(wc)
     plot.axis("off")
     plot.rcParams["figure.figsize"] = (20, 10)
     plot.show()
Example no. 30
def get_wordcloud_from_wordlist(wordlist, background_image='background', slow_connection_mode=False):
    from PIL import Image

    fpath = "/usr/share/fonts/opentype/noto/NotoSansCJK-Medium.ttc"

    # stopword setup
    stop_words = [
        'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'ない',
        'くれる', 'やる', 'くださる', 'そう', 'せる', 'した', 'して',
        'て', 'に', 'を', 'は', 'の', 'が', 'と', 'た', 'し', 'で', 'も', 'な', 'い', 'か',
        'こと', 'これ', 'それ', 'ここ', 'もの',
        'ので', 'よう',
        'いい',
        '思う',
        '人', '気', '何',
        '私', '僕', '自分', 'やつ', 'さん', 'くん', 'ちゃん',
        '今日', '今', 'とき', 'まだ', 'もう', 'みたい',
    ]
    
    img_array = np.array(Image.open(background_image))
    
    pastel_colors = [f"hsl({hue}, 25%, 66%)" for hue in [0, 60, 120, 180]]
    def pastel_color_func(word, font_size, position, orientation, random_state=None,
                          **kwargs):
        import random
        return pastel_colors[random.randint(0, 3)]
    
    wordcloud = WordCloud(regexp=r"\w[\w']*",
                          normalize_plurals=False,
                          background_color="white",
                          font_path=fpath,
                          mask=img_array,
                          color_func=pastel_color_func if slow_connection_mode else ImageColorGenerator(img_array),
                          scale=1.5,
                          stopwords=set(stop_words),
#                          max_font_size=55, 
                         )
    text = ' '.join(wordlist)
    words = wordcloud.process_text(text)
    wordcloud.generate_from_frequencies(words)
    
    if slow_connection_mode:
        (wordcloud.to_image()
            .resize((400, 400), resample=Image.BOX)
            .convert(mode="P", palette=Image.ADAPTIVE, colors=8)
            .save('/tmp/wordcloud.png'))
    else:
        wordcloud.to_file("/tmp/wordcloud.png")
    
    return wordcloud, words
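
A quick usage sketch under assumed inputs (the word list and the mask file are made up; the mask must be an image PIL can open):

words_in = ['桜', '花見', '桜', '春']
wc, freqs = get_wordcloud_from_wordlist(words_in, background_image='mask.png',
                                        slow_connection_mode=True)
print(freqs)  # e.g. {'桜': 2, '花見': 1, '春': 1}
# the rendered cloud is written to /tmp/wordcloud.png either way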
Example no. 31
def wc_english():
    # directory of the current file
    d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

    # read the source text
    text = open(path.join(d, r'world_cloud\file\legend1900.txt')).read()

    # load the mask image
    background_Image = np.array(
        Image.open(path.join(d, r'world_cloud\mask1900.jpg')))

    # build a color generator from the mask image
    img_color = ImageColorGenerator(background_Image)

    # English stopwords: use wordcloud's built-in set
    stopwords = set(STOPWORDS)
    stopwords.add('one')
    wc = WordCloud(
        margin=2,  # page margin
        mask=background_Image,  # mask image
        scale=2,  # scale factor
        max_words=200,  # maximum number of words
        min_font_size=4,  # minimum font size
        max_font_size=150,  # maximum font size
        stopwords=stopwords,
        random_state=42,
        background_color='white',  # background color
        colormap='Blues')

    # inspect the sorted word frequencies to tune the stopwords
    # (e.g. drop the frequent filler word 'one')
    process_word = wc.process_text(text)
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)

    # generate the word cloud
    wc.generate_from_text(text)

    # recolor from the image colors, or use a custom grey color function
    # wc.recolor(color_func=img_color)
    wc.recolor(color_func=grey_color_func)
    # display the image; bilinear interpolation smooths the rendering
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()

    # save the image
    wc.to_file(d + r'\world_cloud\1900pro1.png')
    # or plt.savefig('1900_basic.png', dpi=200)

    plt.show()
Example no. 32
def wc_chinese():
    d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
    # detect the text's encoding first
    text = open(path.join(d, 'langchao.txt'), 'rb').read()
    text_charInfo = chardet.detect(text)
    #print(text_charInfo)
    # result: {'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}

    text = open(path.join(d, r'langchao.txt'), encoding='UTF-8-SIG').read()
    text = ' '.join(jieba.cut(text, cut_all=False))  # cut_all=False selects precise mode

    # set a Chinese font
    font_path = r'C:\Windows\Fonts\STXIHEI.TTF'
    # load the mask image
    background_Image = np.array(Image.open(path.join(d, 'circle.jpg')))
    # build a color generator from the mask image
    img_colors = ImageColorGenerator(background_Image)
    # Chinese stopwords
    stopwords = set()
    stopwords.update([
        '但是', '一个', '自己', '因此', '没有', '很多', '可以', '这个', '虽然', '因为', '这样', '已经',
        '现在', '一些', '比如', '不是', '当然', '可能', '如果', '就是', '同时', '比如', '这些', '必须',
        '由于', '而且', '并且', '他们'
    ])

    wc = WordCloud(font_path=font_path,
                   margin=2,
                   mask=background_Image,
                   scale=2,
                   max_words=200,
                   min_font_size=4,
                   max_font_size=100,
                   stopwords=stopwords,
                   random_state=42,
                   background_color='white')
    wc.generate_from_text(text)

    # get the sorted word frequencies to tune the stopwords
    process_word = wc.process_text(text)
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    #print(sort[:50])  # the 50 most frequent words

    # recolor based on the image colors
    wc.recolor(color_func=img_colors)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('浪潮basic01.png', dpi=200)
    plt.show()
Example no. 33
def writeFreq(text, outFile, words):
    """
    Writes frequencies of words into the specified file
    """

    excludewords = STOPWORDS.copy()

    for word in words:
        excludewords.add(word)

    wordcloud = WordCloud(max_words=NUM_OF_WORDS, stopwords=excludewords)
    # process_text returns a dict mapping word -> count
    freqList = wordcloud.process_text(text)

    for word, count in freqList.items():
        outFile.write(word + ',' + str(count) + '\n')
Example no. 34
def analysis_09(data):
    jieba.load_userdict('userdict.txt')
    jieba.add_word('区块链')

    text = ''
    for i in data['title'].values:
        # strip useless characters
        symbol_to_replace = '[!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
        i = re.sub(symbol_to_replace, '', i)
        text += ' '.join(jieba.cut(i, cut_all=False))

    d = path.dirname(__file__) if '__file__' in locals() else os.getcwd()

    background_Image = np.array(Image.open(path.join(d, "tiger.png")))

    font_path = r'C:\Windows\Fonts\STFANGSO.TTF'
    # add stopwords
    stopwords = set()
    # run the frequency count on the text first, then pick the stopwords to add from the sorted output
    stopwords.update([
        '如何', '怎么', '一个', '什么', '为什么', '还是', '我们', '为何', '可能', '不是', '没有',
        '哪些', '成为', '可以', '背后', '到底', '就是', '这么', '不要', '怎样', '为了', '能否', '你们',
        '还有', '这样', '这个', '真的', '那些'
    ])

    wc = WordCloud(background_color='black',
                   font_path=font_path,
                   mask=background_Image,
                   stopwords=stopwords,
                   max_words=200,
                   margin=2,
                   max_font_size=100,
                   random_state=42,
                   scale=2)
    wc.generate_from_text(text)

    process_word = wc.process_text(text)
    # sort the dict by frequency
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(sort[:50])
    img_colors = ImageColorGenerator(background_Image)
    wc.recolor(color_func=img_colors)  # colors follow the image

    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('词云图.png', dpi=200)
    plt.show()
Example no. 35
def wc_english_improve_03():
    d = path.dirname(__file__) if '__file__' in locals() else os.getcwd()
    text = open(path.join(d, 'legend1900.txt')).read()
    # load the mask image
    background_Image = np.array(Image.open(path.join(d, 'mask1900.jpg')))
    # or
    # background_Image = imread(path.join(d,'mask1900.jpg'))
    # build a color generator from the mask image
    #img_colors = ImageColorGenerator(background_Image)
    # English stopwords filter out unwanted words such as a, an, the
    stopwords = set(STOPWORDS)
    stopwords.add('one')

    wc = WordCloud(margin=2,
                   mask=background_Image,
                   scale=2,
                   max_words=200,
                   min_font_size=4,
                   max_font_size=150,
                   stopwords=stopwords,
                   random_state=42,
                   background_color='black')
    # generate the word cloud
    wc.generate_from_text(text)

    # sorted word frequencies, used to tune the stopwords;
    # process_text returns a dict mapping each token to its count
    process_word = wc.process_text(text)
    sort = sorted(process_word.items(), key=lambda e: e[1], reverse=True)

    #print(sort[:50])  # the 50 most frequent words; 'one' appears 60 times

    def grey_color_func(word,
                        font_size,
                        position,
                        orientation,
                        random_state=None,
                        **kwargs):
        return "hsl(0, 0%%, %d%%)" % random.randint(50,
                                                    100)  # 随机设置hsl值,色相,饱和度,明度

    wc.recolor(color_func=grey_color_func)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('1900_basic_04.png', dpi=200)
    plt.show()
Example no. 36
__author__ = "Nick"

from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

text = open("Alice.txt").read()

# set.add() returns None, so it cannot be passed inline; use a union instead
wc = WordCloud(background_color="white", max_words=100, stopwords=STOPWORDS | {"said"})
wc.generate(text)

print(wc.process_text(text=text))

plt.imshow(wc)
plt.axis("off")
plt.show()
Example no. 37
try:
    seed = int(sys.argv[1])
except (IndexError, ValueError):
    seed = 0

wc = WordCloud(max_words=10000,
               stopwords=stopwords,
               margin=5,
               random_state=seed,
               height=height,
               max_font_size=max_font_size,
               width=width,
               prefer_horizontal=0.75)

# process_text returns a dict of word -> count in current wordcloud
# versions; rank it into (word, count) pairs before splitting
freqs = sorted(wc.process_text(text).items(), key=lambda kv: kv[1], reverse=True)

# split the ranked list into two interleaved halves
freqs = [freqs[1::2],
         freqs[::2]]

count = [0, 0]

for i in range(min([len(x) for x in freqs])):
    if((len(freqs[0][i][0]) > len(freqs[1][i][0]) and count[0] > count[1]) or
       (len(freqs[0][i][0]) < len(freqs[1][i][0]) and count[0] < count[1])):
        freqs[0][i], freqs[1][i] = freqs[1][i], freqs[0][i]
        print("Swapped %d" % i, freqs[0][i], freqs[1][i], count)
Example no. 38
df = pd.read_csv('music_message.csv', header=None)

text = ''
for line in df[2]:
    text += ' '.join(jieba.cut(line, cut_all=False))
background_Image = plt.imread('job.jpg')
stopwords = set()
stopwords.update(['封面', 'none介绍', '介绍', '歌单', '歌曲', '我们', '自己', '没有', '就是', '可以', '知道', '一起', '不是', '因为', '什么', '时候', '还是', '如果', '不要', '那些', '那么', '那个', '所有', '一样', '一直', '不会', '现在', '他们', '这样', '最后', '这个', '只是', '有些', '其实', '开始', '曾经', '所以', '不能', '你们', '已经', '后来', '一切', '一定', '这些', '一些', '只有', '还有'])

wc = WordCloud(
    background_color='white',
    mask=background_Image,
    font_path=r'C:\Windows\Fonts\STZHONGS.TTF',
    max_words=2000,
    max_font_size=150,
    random_state=30,
    stopwords=stopwords
)
wc.generate_from_text(text)
# check which words rank highest and weed out the useless ones
process_word = wc.process_text(text)
sort = sorted(process_word.items(), key=lambda e:e[1], reverse=True)
print(sort[:50])
img_colors = ImageColorGenerator(background_Image)
wc.recolor(color_func=img_colors)
plt.imshow(wc)
plt.axis('off')
wc.to_file("活着.jpg")
print('Word cloud generated successfully!')

Example no. 39
def test_stopwords_lowercasing():
    # test that capitalized stopwords work.
    wc = WordCloud(stopwords=["Beautiful"])
    processed = wc.process_text(THIS)
    words = list(processed)  # process_text returns a dict keyed by word
    assert_true("Beautiful" not in words)
Example no. 40
def test_include_numbers():
    wc_numbers = WordCloud(include_numbers=True)
    wc = wc_numbers.process_text(THIS)

    assert '14' in wc.keys()
Example no. 41
def test_min_word_length():
    wc_numbers = WordCloud(min_word_length=5)
    wc = wc_numbers.process_text(THIS)
    word_lengths = [len(word) for word in wc.keys()]

    assert min(word_lengths) == 5
Example no. 42
def test_process_text_regexp_parameter():
    # test that word processing is influenced by `regexp`
    wc = WordCloud(max_words=50, regexp=r'\w{5}')
    words = wc.process_text(THIS)

    assert_false('than' in words)
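
For reference, the regexp parameter replaces the default tokenizer pattern, so only matches of the supplied expression can become words. A small sketch on a made-up string:

wc = WordCloud(regexp=r'\w{5}')
words = wc.process_text("rather than shorter tokens")
# re.findall(r'\w{5}', ...) yields only five-character chunks
# ('rathe', 'short', 'token'), so 'than' can never appear
assert 'than' not in words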