Example #1
    def create_wordclouds(self, text, name_of_cloud, additional_stop_list, max_words, width, height, bigram = False):
        text_nopunc = self.remove_punctuation(text, "", "")
        text_lower = text_nopunc.lower()
        stop = self.stopwords
        stop.extend(additional_stop_list)
        text_nostop = self.remove_stopword(text_lower, stop)
        tokens = wt(text_nostop)
        text_lem = self.lemmatize(tokens)
        tokens_lem = wt(text_lem)
        if bigram:
            my_bigrams = nltk.bigrams(tokens_lem)  # only needed in bigram mode
            bigram_merged = [w1 + ' ' + w2 for w1, w2 in my_bigrams]
            counts = collections.Counter(bigram_merged)
        else:
            counts = collections.Counter(tokens_lem)
        final = counts.most_common(max_words)
        max_count = max(final, key=operator.itemgetter(1))[1]
        final = [(name, count / float(max_count)) for name, count in final]

        # tags = make_tags(final, maxsize=max_word_size)
        # create_tag_image(tags, name_of_cloud+'.png', size=(width, height), layout=3, fontname='Crimson Text', background = (255, 255, 255))

        # temp_cloud = " ".join(text for text, count in final)
        word_cloud = WordCloud(font_path="fonts/Georgia.ttf",
            width=width, height=height, max_words=max_words, stopwords=stop)
        word_cloud.fit_words(final)
        word_cloud.to_file(name_of_cloud + ".png")
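A note on the call above: fit_words(final) is handed a list of (word, weight) tuples, which only worked in early wordcloud releases. In current releases fit_words is an alias for generate_from_frequencies and expects a word -> weight mapping. A minimal, hedged adapter for the `final` list built above:

# Hedged sketch: feed the [(word, weight), ...] list from Example #1
# to a current wordcloud release, which wants a dict instead.
from wordcloud import WordCloud

frequencies = dict(final)                      # word -> normalized weight
word_cloud = WordCloud(width=800, height=400)  # illustrative size
word_cloud.fit_words(frequencies)
word_cloud.to_file("example1_cloud.png")       # hypothetical output name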
Example #2
def make_clouds(files, n_words=20):
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<-> bug
    bug_to_id = json.loads(open(files.replacements).read())
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR
        t_rar = np.sqrt(pTW[i] * pWT[i])
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words]
        wc.fit_words(dict(zip(top_words, t_rar[top_word_ids])))  # dict() keeps this valid on Python 3
        wc.to_file(output_d + str(i) + '.png')
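The loop above ranks words by RAR = sqrt(p(topic|word) * p(word|topic)), built from the two normalize calls on the model's expElogbeta matrix. A self-contained toy version of that scoring (sklearn's default L2 normalization, as used above):

# Hedged mini-demo of the RAR ranking on a made-up 2-topic, 4-word beta.
import numpy as np
from sklearn.preprocessing import normalize

beta = np.array([[4.0, 1.0, 1.0, 2.0],
                 [1.0, 3.0, 2.0, 1.0]])
pTW = normalize(beta, axis=0)      # columns scaled per word  (~ p(topic | word))
pWT = normalize(beta, axis=1)      # rows scaled per topic    (~ p(word | topic))
t_rar = np.sqrt(pTW[0] * pWT[0])   # geometric mean, topic 0
top_ids = t_rar.argsort()[:-3:-1]  # indices of the two best words
print(top_ids, t_rar[top_ids])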
Example #3
def topic_word_cloud(nmf, topic_idx, max_words=300, figsize=(14, 8), width=2400, height=1300, ax=None):
    ''' Create word cloud for a given topic
    INPUT:
        nmf: NMFClustering object
        topic_idx: int
        max_words: int
            Max number of words to incorporate into the word cloud
        figsize: tuple (int, int)
            Size of the figure if an axis isn't passed
        width: int
        height: int
        ax: None or matplotlib axis object
    '''
    wc = WordCloud(background_color='white', max_words=max_words, width=width, height=height)
    word_freq = nmf.topic_word_frequency(topic_idx)

    # Fit the WordCloud object to the specific topics word frequencies
    wc.fit_words(word_freq)

    # Create the matplotlib figure and axis if they weren't passed in
    if not ax:
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
    ax.imshow(wc)
    ax.axis('off')
Example #4
 def get_wordcloud_img(self, interval_id):
     text_freq = self.get_word_frequencies(interval_id)
     wordcloud = WordCloud(font_path=FONT_PATH, width=self.image_width, height=int(self.image_width * .75))
     wordcloud.fit_words(list(reversed(text_freq[-100:])))
     img_io = StringIO()
     wordcloud.to_image().save(img_io, 'JPEG', quality=70)
     img_io.seek(0)
     return img_io
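get_wordcloud_img writes JPEG bytes into a StringIO, which only works on Python 2. On Python 3 the buffer must be binary; a hedged port of the same idea using io.BytesIO (names are illustrative, not the author's):

# Hedged Python 3 sketch of the buffered-JPEG pattern above.
import io
from wordcloud import WordCloud

def wordcloud_jpeg_bytes(frequencies, width=600):
    wc = WordCloud(width=width, height=int(width * .75))
    wc.fit_words(frequencies)      # frequencies: word -> weight dict
    buf = io.BytesIO()             # binary buffer, required on Python 3
    wc.to_image().save(buf, 'JPEG', quality=70)
    buf.seek(0)
    return buf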
 def generate_cloud(self, tags, sizeX, sizeY, filename=None):
     # `tags` is already a word -> weight dict, which is what fit_words expects
     wordcloud = WordCloud(width=sizeX, height=sizeY, relative_scaling=0.6)
     wordcloud.fit_words(tags)
     plt.figure(figsize=(20, 10), facecolor='k')
     plt.imshow(wordcloud)
     plt.axis("off")
     if filename is not None:
         plt.savefig(filename, facecolor='k', bbox_inches='tight')
     plt.show()
Example #6
def make_cloud(docs):
    from wordcloud import WordCloud

    flat_doc = count_words(docs)
    wc = WordCloud(ranks_only=True, font_path='/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf')
    wc.fit_words(flat_doc)

    plt.imshow(wc)
    plt.axis("off")
    plt.show()
Example #7
def generate_wordcloud(vocab, metric, name):
    ''' Generate a simple word cloud of text '''
    list_tuples = []
    for w, c in zip(vocab, metric):
        list_tuples.append((w, int(c * 100)))

    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white")

    wordcloud.fit_words(list_tuples)

    plt.imshow(wordcloud)
    plt.axis('off')
    plt.savefig('../../figures/' + name)
def paint_clouds(genre, cloud_words):
    '''
    For a given genre (text), paint a word cloud of at most cloud_words (int)
    words. Call the load_frequencies function to get a frequency list with 50
    more words in it than are needed for the word cloud, in case some don't fit.
    '''
    freq_list = load_frequencies(genre, cloud_words+50)
    wc = WordCloud(background_color="white", max_words=cloud_words,
                   max_font_size=40, random_state=42)
    wc.fit_words(freq_list)
    fig = plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.title(genre)
    plt.show()
    filename = '../data/cloud_' + genre + '.png'
    fig.savefig(filename)
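paint_clouds depends on a load_frequencies helper that is not shown in this example. A minimal stand-in under an assumed file layout (one "word count" pair per line in '../data/<genre>.txt'; the author's real format is unknown):

# Hypothetical load_frequencies; the file layout is an assumption.
def load_frequencies(genre, n_words):
    freqs = {}
    with open('../data/' + genre + '.txt') as f:
        for line in f:
            word, count = line.split()
            freqs[word] = int(count)
    # keep only the n_words most frequent entries
    top = sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)[:n_words]
    return dict(top)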
Example #9
def topic_time_and_cloud(df, topic, feature_names, nmf, title, source=False, normalize=False, freq='W', year=True, max_words=300, positivity=True, show=True):
    fig = plt.figure(figsize=(14, 8.5))
    ax1 = fig.add_axes([0.05, 0.5, 0.93, 0.41])
    article_count_by_time(df, topic=topic, source=source, normalize=normalize, freq=freq, year=year, fig=fig, label=topic_labels[topic[1]], show=False)
    ax1.xaxis.labelpad = -4
    plt.suptitle(title, fontsize=20)

    fig.text(0.05, 0.44, 'Author: Erich Wellinger', fontsize=10, alpha=0.7)
    fig.text(0.33, 0.8, 'github.com/ewellinger/election_analysis', fontsize=20, color='gray', alpha=0.5)

    outlets = [('nyt', 'NYT', '#4c72b0'), ('foxnews', 'FOX', '#c44e52'), ('npr', 'NPR', '#55a868'), ('guardian', 'GUA', '#8172b2'), ('wsj', 'WSJ', '#ccb974')]

    # Create a boolean mask for whether each document is in the topic or not
    labels_mask = topic[0][:, topic[1]]
    num_articles = labels_mask.sum()
    percent_by_source = [float(len(df.loc[(labels_mask) & (df['source'] == outlet)])) / num_articles for outlet in zip(*outlets)[0]]
    normalized = [percent / np.sum(df['source'] == outlet) for percent, outlet in zip(percent_by_source, zip(*outlets)[0])]
    normalized = [percent / np.sum(normalized) for percent in normalized]

    plt.title('Number of Articles in Topic: {}'.format(num_articles), x=0.4825)

    ''' You should incorporate the word_cloud function in here!!! '''
    if not positivity:
        ax2 = fig.add_axes([0.025, 0, 0.79, 0.43])
        wc = WordCloud(background_color='white', max_words=max_words, width=1900, height=625)
    else:
        num_sources = 0
        for idx in xrange(len(outlets)):
            if len(df.loc[(labels_mask) & (df['source'] == outlets[idx][0])]) >= 5:
                num_sources += 1
        ax2 = fig.add_axes([0.025, 0, 0.712125-(num_sources*0.034425), 0.43])
        wc = WordCloud(background_color='white', max_words=max_words, width=1715-(num_sources*83), height=625)
        ax4 = fig.add_axes([0.782125-(num_sources*0.034425), 0.035, 0.034425+(num_sources*0.034425), 0.375])
    word_freq = topic_word_freq(nmf.components_, topic[1], feature_names)
    wc.fit_words(word_freq)
    ax2.imshow(wc)
    ax2.axis('off')
    ax3 = fig.add_axes([0.825, 0.01, 0.15555, 0.4])
    normalized_source_barchart(df, topic, outlets, ax3)
    if positivity:
        sentiment_source_barchart(df.loc[labels_mask], outlets, ax=ax4)
        if num_sources < 3:
            ax4.set_title('')
    if show:
        plt.show()
    return ax1
Example #10
def generate_wordcloud(y, vocab):
    ''' Generate a simple word cloud of text '''
    ingred_counts = np.sum(y, axis=0)

    word_cloud_text = []

    for count, word in zip(ingred_counts, vocab):
        word_cloud_text.append((str(word), int(count)))

    # Generate a word cloud image
    wordcloud = WordCloud(background_color = "white")

    wordcloud.fit_words(word_cloud_text)

    plt.imshow(wordcloud)
    plt.axis('off')
    plt.savefig('../../figures/vocab_wordcloud.png')
def post_process():

    #with open('clda_data/out_prism', 'r') as fin:
    #    phi_prism = [np.array(ast.literal_eval(line.strip())) for line in fin]
    #phi_prism = np.array(phi_prism)

    #theta_pb = np.load('/tmp/peircebayes/avg_samples.npz')
    #theta_pb = np.load('/home/rares/Desktop/peircebayes_all_no_sampling/last_sample.npz')
    theta_pb = np.load('data/avg_samples.npz')
    phi = theta_pb['arr_1']
    print phi.shape

    vocab = pickle.load(open('data/vocab.pkl', 'r'))
    inv = dict((v, k) for k, v in vocab.iteritems())

    axis = 1
    index = list(np.ix_(*[np.arange(i) for i in phi.shape]))
    index[axis] = phi.argsort(axis)
    a = phi[index][:,-20:]
    counts = np.rint(a/np.sum(a, axis=1).reshape(-1,1)*1000).tolist()
    idx_l = index[axis][:,-20:].tolist()
    words = [[inv[i] for i in subl] for subl in idx_l]
    #pprint(words)

    # phi_prism is only defined when the commented-out PRISM loading above is
    # restored, so this ranking is commented out to match
    #index_prism = list(np.ix_(*[np.arange(i) for i in phi_prism.shape]))
    #index_prism[axis] = phi_prism.argsort(axis)
    #a_prism = phi_prism[index_prism][:,-20:]
    #idx_l_prism = index_prism[axis][:,-20:].tolist()
    #words_prism = [[inv[i] for i in subl] for subl in idx_l_prism]

    #pprint(words_prism)

    # topic 1
    freq1 = list(reversed(zip(words[0], list(a[0,:]))))
    # topic 2
    freq2 = list(reversed(zip(words[1], list(a[1,:]))))

    # topic 1
    #freq1_prism = list(reversed(zip(words_prism[19], list(a_prism[19,:]))))
    # topic 2
    #freq2_prism = list(reversed(zip(words_prism[18], list(a_prism[18,:]))))


    wc = WordCloud(background_color="white", width=400, height=400,
        random_state=1234).fit_words(freq1)

    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3))
    plt.axis("off")
    plt.savefig('data/topic_1.pdf', format='pdf')
    plt.close()

    plt.imshow(wc.fit_words(freq2).recolor(color_func=grey_color_func, random_state=3))
    plt.axis("off")
    plt.savefig('data/topic_2.pdf', format='pdf')
    plt.close()
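The np.ix_/argsort combination in post_process extracts the 20 highest-probability words per topic row. The same indexing trick on a small array, for reference:

# Hedged mini-demo of the per-row top-k selection used above.
import numpy as np

phi = np.array([[0.1, 0.5, 0.2, 0.2],
                [0.4, 0.1, 0.4, 0.1]])
index = list(np.ix_(*[np.arange(i) for i in phi.shape]))
index[1] = phi.argsort(axis=1)      # sort each row's columns ascending
top2 = phi[tuple(index)][:, -2:]    # last two columns = largest per row
top2_ids = index[1][:, -2:]         # their original column positions
print(top2)
print(top2_ids)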
Example #12
def make_word_cloud(topic_num, max_words=1000, width=10, height=10):
    post_nmf = pickle.load(open(POST_NMF_PICKLE))
    post_tfidf = pickle.load(open(POST_TFIDF_PICKLE))
    words = np.array(post_tfidf.get_feature_names())
    freq_sum = np.sum(post_nmf.components_[topic_num])
    frequencies = [val / freq_sum for val in post_nmf.components_[topic_num]]
    word_freq = dict(zip(words, frequencies))  # dict keeps this working on Python 3


    wc = WordCloud(background_color='white')

    wc.fit_words(word_freq)


    #fig = plt.figure(figsize=(10,10))
    #ax = fig.add_subplot(111)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    return word_freq
    def topic_word_cloud(self, topic_num, max_words=200, figsize=None, width=2400, height=1300, ax=None, mask_fname=None, inherit_color=False):
        ''' Create word cloud for a given topic
        INPUT:
            topic_idx: int
            max_words: int (default 200)
                Max number of words to incorporate into the word cloud
            figsize: tuple (int, int)
                Size of the figure if an axis isn't passed
            width: int (default 2400)
            height: int (default 1300)
            ax: None or matplotlib axis object
            mask_fname: None or str
                None if no mask is desired, otherwise a string providing the path the image being used as the mask
            inherit_color: bool, default False
                Indicates whether the wordcloud should inherit the colors from the image mask
        '''
        if figsize is None:
            figsize = self.figsize

        if mask_fname:
            mask = np.array(Image.open(mask_fname))
            wc = WordCloud(background_color='white', max_words=max_words, mask=mask, width=width, height=height)
        else:
            wc = WordCloud(background_color='white', max_words=max_words, width=width, height=height)
        word_freq = self.nmf.topic_word_frequency(topic_num)

        # Fit the WordCloud object to the specific topic's word frequencies
        wc.fit_words(word_freq)

        # Create the matplotlib figure and axis if they weren't passed in
        if not ax:
            fig = plt.figure(figsize=figsize)
            ax = fig.add_subplot(111)

        if mask_fname and inherit_color:
            image_colors = ImageColorGenerator(imread(mask_fname))
            plt.imshow(wc.recolor(color_func=image_colors))
            plt.axis('off')
        else:
            ax.imshow(wc)
            ax.axis('off')
Example #14
def make_word_cloud(cluster_to_words, colormaps):
    b64_figures = []
    font_path = "./crover/data/font/NotoSansJP-Regular_subset.otf"  # subset font covering the commonly used kanji

    for i in range(len(cluster_to_words)):
        wordcloud = WordCloud(font_path=font_path, background_color="white",
                              width=500, height=500, colormap=colormaps[i % len(colormaps)])
        logger.info('fit word cloud')
        if len(cluster_to_words[i]) == 0:
            cluster_to_words[i] = {'ベクトル未割り当てワードなし': 1}  # placeholder meaning "no words assigned to a vector"
        wordcloud.fit_words(cluster_to_words[i])

        logger.info('save word cloud')
        # save the image into an in-memory buffer and return it to the HTML page
        buf = io.BytesIO()
        img = wordcloud.to_image()
        img.save(buf, 'PNG')
        #img.save(buf, 'JPEG')
        logger.info('b64 encode')
        qr_b64str = base64.b64encode(buf.getvalue()).decode("utf-8")
        b64_figures.append("data:image/png;base64,{}".format(qr_b64str))
        #b64_figures.append("data:image/jpg;base64,{}".format(qr_b64str))

    return b64_figures
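Because make_word_cloud returns data-URI strings, the PNGs can be embedded straight into markup without touching disk. An illustrative (not the author's) consumer:

# Illustrative use of the data-URI strings returned above.
figures = make_word_cloud(cluster_to_words, colormaps)  # caller-provided inputs
html = ''.join('<img src="{}">'.format(uri) for uri in figures)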
Example #15
 def plot_top_words_with_filters(num_word_instances, stop_words,
                                 small_words, lower, more_stop_words):
     tweets = bok_tweets.text
     if lower:
         tweets = tweets.str.lower()
     if stop_words:
         tweets = tweets.apply(remove_stopwords)
     if small_words:
         tweets = tweets.str.findall(r'\w{3,}').str.join(' ')
     if len(more_stop_words) > 0:
         extra_stops = {w.strip() for w in more_stop_words.split(',')}
         tweets = tweets.apply(lambda text: ' '.join(
             w for w in text.split() if w not in extra_stops))
     tdm_df = create_term_document_matrix2(tweets, min_df=2)
     word_frequencies = tdm_df[[x for x in tdm_df.columns
                                if len(x) > 1]].sum()
     sorted_words = word_frequencies.sort_values(ascending=False)
     top_sorted_words = sorted_words[:num_word_instances]
     wordcloud = WordCloud(max_font_size=40)
     wordcloud.fit_words(top_sorted_words.to_dict())
     plt.figure()
     plt.imshow(wordcloud, interpolation="bilinear")
     plt.axis("off")
     plt.show()
Example #16
def creating_cloud(given_dict_with_freq_words):
    ### keep only the 200 most frequent words
    # (assumes the incoming dict is already ordered by frequency)
    dict_two_hundred = {}
    for counting, (word, freq) in enumerate(given_dict_with_freq_words.items()):
        if counting == 200:
            break
        dict_two_hundred[word] = freq

    wordcloud = WordCloud(colormap='prism', background_color='white')
    wordcloud = wordcloud.fit_words(dict_two_hundred)
    wordcloud.to_file('PrideAndPrejudice.png')
Example #17
def gen_word_cloud_picture(words_stat,
                           font_path="./demo.ttf",
                           mask_file="./data/heart.jpg",
                           word_color_img="./data/pink.jpg",
                           background_color="white"):
    # apply a custom image mask and render the word cloud
    mask_img = imread(mask_file)
    wordcloud = WordCloud(background_color=background_color,
                          mask=mask_img,
                          font_path=font_path)
    word_frequence = {x[0]: x[1] for x in words_stat.head(20000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    color_img = imread(word_color_img)
    mask_color = ImageColorGenerator(color_img)
    return wordcloud.recolor(color_func=mask_color)
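gen_word_cloud_picture returns the recolored WordCloud object instead of saving it, leaving rendering to the caller. A hedged usage sketch, assuming a words_stat dataframe of (word, count) rows as in the neighboring examples:

# Hypothetical caller; file names are illustrative.
wc = gen_word_cloud_picture(words_stat)
wc.to_file("heart_cloud.png")   # or: plt.imshow(wc); plt.axis("off"); plt.show()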
Example #18
def show(words_stat):
    # render the word cloud
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    wordcloud = WordCloud(font_path='hanyiqihei.ttf',
                          background_color="white",
                          max_font_size=80)  # font, max font size and colors
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()
Example #19
def draw_wordcloud(period_i, i):
    word_count = np.array(collections.Counter(period_i).most_common())
    tf = {
        word_count[j][0]: int(word_count[j][1])
        for j in range(len(word_count))
    }  # word-frequency dictionary
    coloring = np.array(Image.open("zhongxing.jpg"))  # mask image
    my_wordcloud = WordCloud(background_color="white",
                             max_words=2000,
                             mask=coloring,
                             max_font_size=60,
                             random_state=42,
                             scale=2,
                             font_path=os.environ.get(
                                 "FONT_PATH", "C:/Windows/Fonts/simfang.ttf"))
    my_wordcloud.fit_words(tf)
    image_colors = ImageColorGenerator(coloring)
    plt.figure(figsize=(18.5, 10.5))
    plt.imshow(my_wordcloud.recolor(color_func=image_colors))
    plt.xticks([]), plt.yticks([])  # hide the axis ticks
    plt.axis("off")
    plt.imshow(my_wordcloud)
    plt.savefig("period_" + str(i) + "_wordcloud.jpg")
    plt.show()
def main():
    # fetch the first 10 pages of comments for one of the now-playing movies
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[1]['id'], num)
        commentList.append(commentList_temp)

    # join the list entries into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # strip punctuation with a regular expression (keep Chinese characters only)
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    print(cleaned_comments)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3 disables quoting
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"num": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["num"], ascending=False)

    # render as a word cloud
    d = path.dirname(__file__)
    alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))
    wordcloud = WordCloud(font_path=r'simhei.ttf', background_color="white",
                          max_font_size=80, mask=alice_mask)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}

    wordcloud = wordcloud.fit_words(word_frequence)  # fit_words expects the dict itself
    plt.imshow(wordcloud)
    wordcloud.to_file('wwxd.png')  # save the cloud to disk
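Several of the Douban examples here build the frequency table with words_df.groupby(...)['segment'].agg({"计数": numpy.size}). Passing a renaming dict to a SeriesGroupBy.agg was deprecated in pandas 0.20 and later removed, so on a current pandas the same table is easier to get from value_counts; a hedged equivalent:

# Hedged modern-pandas replacement for the deprecated .agg({...: np.size}) idiom.
words_stat = (words_df['segment']
              .value_counts()             # word -> count, sorted descending
              .rename_axis('segment')
              .reset_index(name='count'))
word_frequence = dict(zip(words_stat['segment'].head(1000),
                          words_stat['count'].head(1000)))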
Example #21
def main():
    # fetch the first page of comments for the first now-playing movie
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(1):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentList.append(commentList_temp)

    # join the list entries into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    # with the symbols removed, this is now one plain string
    cleaned_comments = ''.join(filterdata)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords
    stopwords = pd.read_csv(file_path + "stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3 disables quoting
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]  # ~ negates the mask

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    bg_pic = imread(file_path + '3.jpg')
    # render as a word cloud
    if 'Windows' in platform.system():
        wordcloud = WordCloud(mask=bg_pic, font_path="simhei.ttf", background_color="white", max_font_size=80)
    else:
        wordcloud = WordCloud(font_path="/Library/Fonts/Songti.ttc", background_color="white", max_font_size=80)

    word_frequence = {x[0]: x[1] for x in words_stat.head(100).values}

    wordcloud = wordcloud.fit_words(word_frequence)  # the argument must be a dict
    image_colors = ImageColorGenerator(bg_pic)

    # show the cloud
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
Example #22
def character_view_picture(data):
    # backgroud_Image = plt.imread('1.jpg')  # use a background mask if you have one, otherwise skip it
    picture = WordCloud(
        # mask=backgroud_Image,
        width=1024,
        height=768,
        background_color='white',
        font_path="C:\simhei.ttf",
        max_font_size=400,
        random_state=50)
    picture = picture.fit_words({x[0]: x[1] for x in data.head(100).values})
    plt.imshow(picture, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    base_path = path.dirname(__file__)
    picture.to_file(path.join(base_path, "yuebing.png"))
Example #23
def main():
    # fetch the first 20 pages of comments for the first movie
    commentslist = []
    nowplayingmovie_list = getmovie_list()
    for i in range(20):
        num = i + 1
        commentlist_temp = get_comment(nowplayingmovie_list[0]['id'], num)
        commentslist.append(commentlist_temp)
    # join the list entries into one string
    comments = ''
    for com in range(len(commentslist)):
        comments = comments + (str(commentslist[com])).strip()
    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    clean_comments = ''.join(filterdata)
    #print(clean_comments)
    # tokenize the Chinese text with jieba
    segment = jieba.lcut(clean_comments)
    words_df = pd.DataFrame({'segment': segment})
    # drop stopwords
    stopwords = pd.read_csv("stopwords.txt",
                            index_col=False,
                            quoting=3,
                            sep="\t",
                            names=['stopwords'],
                            encoding='gb2312')
    words_df = words_df[~words_df.segment.isin(stopwords.stopwords)]
    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg(
        {"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"],
                                                      ascending=False)
    # render as a word cloud
    wordcloud = WordCloud(font_path="C:/windows/fonts/simhei.ttf",
                          background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    #plt.savefig("result.jpg")
    plt.axis('off')
    plt.show()
def generate_word_cloud(data, stopwords, **kwargs):
    """
    生成词云
    :param data        词云数据
    :param stopwords   停用词
    :param kwargs
    :return:
    """
    movie_name = kwargs.get('movie_name')
    if movie_name:
        data_com_X = data[data.movie == movie_name]
    else:
        data_com_X = data

    content_X = data_com_X.comment.dropna().values.tolist()
    # load the comments and tokenize
    segment = []
    for line in content_X:
        try:
            segs = jieba.lcut(line)
            for seg in segs:
                if len(seg) > 1 and seg != '\r\n':
                    segment.append(seg)
        except Exception as e:
            # print(line)
            continue

    # drop stopwords
    words_df = pd.DataFrame({'segment': segment})
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg(
        {'计数': np.size})
    words_stat = words_stat.reset_index().sort_values(by=['计数'],
                                                      ascending=False)
    # print(words_stat.head())

    # word cloud
    word_cloud = WordCloud(font_path='./data/simhei.ttf',
                           background_color='white',
                           max_font_size=80)
    words_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    print(words_frequence)
    word_cloud = word_cloud.fit_words(words_frequence)
    plt.imshow(word_cloud)

    return True
Example #25
def make_wordcloud():
    for folder in os.listdir('LRC'):
        os.chdir('E:\代码\python\CloudSpider')

        # read each lyric file in turn, tokenize, build all_words,
        # then filter against the stopword list into all_words_new
        all_words = []
        outstr = ''
        for filename in os.listdir('LRC/' + folder):
            with open('LRC/' + folder + '/' + filename, encoding='utf-8') as f:
                lyrics = f.read()
                data = jieba.cut(lyrics)
                all_words.extend(set(data))
        for word in all_words:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        all_words_new = outstr.split(" ")  # back to a list

        # count the words in all_words_new and sort by frequency
        count = Counter(all_words_new)
        result = sorted(count.items(), key=lambda x: x[1], reverse=True)
        for r in result:
            if r[0] == '' or r[0] == '\ufeff':
                result.remove(r)
        for r in result:
            if r[0] == '\n':
                result.remove(r)
        # print(result[0:20])

        # render the word cloud
        word_dic = dict(count.items())
        # let matplotlib display Chinese
        mpl.rcParams['font.sans-serif'] = ['SimHei']        # set the default font
        mpl.rcParams['axes.unicode_minus'] = False          # keep '-' from rendering as a box in saved images
        color_mask = imread('bg_love.jpg')                  # background image
        cloud = WordCloud(
            font_path='msyh.ttc',                           # point this at a font file on the local machine
            width=600,
            height=480,
            background_color='black',
            mask=color_mask,
            max_words=350,
            max_font_size=150)
        world_cloud = cloud.fit_words(word_dic)
        os.chdir('word_picture')
        world_cloud.to_file(folder + '.jpg')
Example #26
def wcfigure(
    wordsdf,
    path=r'd:\test.jpg'
):  # draw the cloud from a dataframe and save it to `path`, e.g. wdcounts.head(2000).itertuples(index=False)
    wordcloud = WordCloud(font_path=r'c:\windows\fonts\STCAIYUN.TTF',
                          background_color="white",
                          margin=5,
                          width=1800,
                          height=1000)
    # a CJK font must be loaded, otherwise Chinese renders as boxes
    wordcloud = wordcloud.fit_words(wordsdf.itertuples(index=False))
    plt.figure(num=None, figsize=(25, 16), dpi=8, facecolor='w', edgecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(path)
    plt.show()
    plt.close()
Example #27
def main():

    commentList = []
    '''
    for i in range(15):
        pagenum = i + 1
        commentList_temp = getComment(nowplaying_list[0]['id'], pagenum)
        commentList.append(commentList_temp)
    '''

    comments = ''
    for i in range(len(commentList)):
        comments = comments + (str(commentList[i])).strip()

    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    stopwords = pd.read_csv("stopwords.txt",
                            index_col=False,
                            quoting=3,
                            sep="\t",
                            names=['stopword'],
                            encoding='GBK')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    words_stat = words_df.groupby(by=['segment'])['segment'].agg(
        {"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"],
                                                      ascending=False)

    wordcloud = WordCloud(font_path="simhei.ttf",
                          background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}

    word_frequence_list = []
    for key in word_frequence:
        temp = (key, word_frequence[key])
        word_frequence_list.append(temp)
    wordcloud = wordcloud.fit_words(dict(word_frequence_list))
    plt.imshow(wordcloud)
    pylab.show()
Example #28
def main():
    # fetch the first 10 pages of comments for the movie
    commentList = []
    # NowPlayingMovie_list = getNowPlayingMovie_list()
    movieId = getMovieIdByName()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(movieId, num)
        # commentList_temp = getCommentsById('27133303', num)
        commentList.append(commentList_temp)

    # join the list entries into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3 disables quoting
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # render as a word cloud
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80, width=800, height=400)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}

    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Example #29
def one_movie():
    comment_list = []
    nowplaying_movie_list = getNowPlayingMovie_list()
    for i in range(10):  # first 10 pages
        num = i + 1
        comment_list_temp = getCommentsById(nowplaying_movie_list[0]['id'],
                                            num)  # index 0 is 《八佰》, currently showing
        comment_list.append(comment_list_temp)
        # print(comment_list)
    # join the list entries into one string
    comments = ''
    for j in range(len(comment_list)):
        comments = comments + (str(comment_list[j])).strip()
    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filter_data = re.findall(pattern, comments)
    cleaned_comments = ''.join(filter_data)
    # tokenize with jieba and get a weight-ranked keyword list
    result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
    keywords = {}
    for k in result:
        keywords[k[0]] = k[1]  # turn the list of pairs into a dict
    print('before delete stopword:', keywords)
    # read the stopword file into a set
    stopwords = set()
    f = open('./movie_stopwords.txt', encoding='utf-8')
    while True:
        word = f.readline()
        if word == '':
            break
        stopwords.add(word[:-1])
    print(stopwords)
    keywords = {x: keywords[x] for x in keywords if x not in stopwords}
    print('after delete stopword:', keywords)
    # build the word cloud
    wordcloud = WordCloud(font_path='simhei.ttf',
                          background_color='white',
                          max_font_size=80,
                          stopwords=stopwords)
    word_frequence = keywords
    myword = wordcloud.fit_words(word_frequence)
    plt.axis('off')
    plt.imshow(myword)
    plt.savefig('movie_comments_from_douban.png', dpi=300, bbox_inches='tight')
    plt.show()  # save first; calling show() first would leave an empty figure
Example #30
def main():
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]["id"], num)
        commentList.append(commentList_temp)

    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    stopwords = pd.read_csv("stopwords.txt",
                            index_col=False,
                            quoting=3,
                            sep='\t',
                            names=['stopword'],
                            encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    words_stat = words_df.groupby(by=['segment'])['segment'].agg(
        {'计数': numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"],
                                                      ascending=False)

    wordcloud = WordCloud(font_path='simhei.ttf',
                          background_color='white',
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    print(word_frequence)
    word_frequence_list = []
    for key in word_frequence:
        temp = (key, word_frequence[key])
        word_frequence_list.append(temp)

    wordcloud = wordcloud.fit_words(dict(word_frequence_list))
    plt.imshow(wordcloud)
    plt.savefig("result.jpg")
Example #31
 def draw_wc(self, words, stopword=None, title=''):
     if stopword:
         data = self.make_df(words, stopword)
     else:
         data = self.make_df(words)
     bg_pic = imread('source/luhan.jpg')
     wordcloud = WordCloud(background_color='black',
                           max_font_size=110,
                           mask=bg_pic,
                           min_font_size=10,
                           mode='RGBA',
                           font_path='source/SimHei.ttf')
     word_frequence = {x[0]: x[1] for x in data.values}
     wordcloud = wordcloud.fit_words(word_frequence)
     plt.title(title, fontsize=16)
     plt.imshow(wordcloud)
     plt.axis("off")
     plt.show()
Example #32
def word_count():
    """
    绘制WordCloud
    :return:
    """
    df_all_words = pd.DataFrame({"all_words": all_word_list})
    words_count = df_all_words.groupby(by=["all_words"])["all_words"].agg(
        {"count": np.size})
    words_count = words_count.reset_index().sort_values(by=["count"],
                                                        ascending=False)
    matplotlib.rcParams["figure.figsize"] = (10.0, 5.0)
    word_cloud = WordCloud(font_path="simhei.ttf",
                           background_color="white",
                           max_font_size=80)
    word_frequency = {x[0]: x[1] for x in words_count.head(100).values}
    word_cloud = word_cloud.fit_words(word_frequency)
    plt.imshow(word_cloud)
    plt.show()
def plot_wordcloud(words_count):
    image = imread(blackground_pic)

    wordcloud = WordCloud(background_color="white",
                          mask=image,
                          font_path='./data/simhei.ttf',
                          max_words=5000,
                          scale=1.5)

    wordcloud = wordcloud.fit_words(words_count)

    plt.figure(figsize=(10, 6), dpi=100)

    plt.axis("off")
    wordcloud.to_file('./data/wordcloud.png')
    plt.imshow(wordcloud)
    # plt.show()
    plt.close()
Example #34
def get_word_list():
    with open("dan_mu.txt", encoding="utf-8") as f:
        word = f.read()
    word_list = word.split()

    # tokenize each entry with jieba's lcut()
    data_cut = [jieba.lcut(x) for x in word_list]
    print(data_cut)

    # 3. read the stopword list
    with open(r"cn_stopwords.txt", encoding="utf-8") as f:
        stop = f.read()
    stop = stop.split()
    stop = [" ", "道", "说道", "说"] + stop
    # 4. final tokens after stopword removal
    s_data_cut = pd.Series(data_cut)
    all_words_after = s_data_cut.apply(
        lambda x: [i for i in x if i not in stop])
    # 5. count word frequencies
    all_words = []
    for i in all_words_after:
        all_words.extend(i)
    word_count = pd.Series(all_words).value_counts()

    # 6. draw the word cloud
    # 1) read the background image
    back_picture = imread("kakaxi.jpg")

    # 2) configure the word cloud
    wc = WordCloud(font_path="FZNiNSJW.TTF",
                   background_color="white",
                   max_words=2000,
                   mask=back_picture,
                   max_font_size=200,
                   random_state=42)
    wc2 = wc.fit_words(word_count)

    # 3) render it
    plt.figure(figsize=(16, 8))
    plt.imshow(wc2)
    plt.axis("off")
    plt.show()
    wc.to_file("ciyun.png")
Example #35
    def drawWordCloud(self,
                      word_text,
                      filename,
                      dict_type=False,
                      background_image='image/tom2.jpeg'):
        """

        :param word_text:
        :param filename:
        :param dict_type:
        :param background_image: shape/mask of the word-cloud background
        :return:
        """
        mask = Image.open(BASE_DIR + background_image)
        mask = np.array(mask)
        my_wordcloud = WordCloud(
            background_color='white',  # background color
            mask=mask,  # background image
            max_words=2000,  # maximum number of words shown
            stopwords=STOPWORDS,  # stopword list
            font_path=self.system_font,  # without a font set, Chinese will not display
            max_font_size=50,  # maximum font size
            random_state=30,  # number of random color schemes to draw from
            scale=1.3)
        if not dict_type:
            my_wordcloud = my_wordcloud.generate(word_text)
        else:
            my_wordcloud = my_wordcloud.fit_words(word_text)
        image_colors = ImageColorGenerator(mask)
        my_wordcloud.recolor(color_func=image_colors)
        # display the image
        plt.imshow(my_wordcloud)
        plt.axis("off")
        # save the image
        if not self.from_web:
            my_wordcloud.to_file(filename=self.image_path + filename + '.jpg')
            print("result file path:", self.image_path + filename + '.jpg')
            plt.show()
        else:
            my_wordcloud.to_file(filename=self.web_image_bash_path + filename +
                                 '.jpg')
            print("result file path:",
                  self.web_image_bash_path + filename + '.jpg')
def word_segmentation():
    data = pd.read_csv(filepath, encoding='utf-8').fillna('')
    image = imread(os.path.join(PROJECT_ROOT, LAGOU.BACKGROUND))

    df = pd.read_csv(os.path.join(PROJECT_ROOT, LAGOU.STOPWORDS),
                     encoding='utf8',
                     index_col=False)

    jieba.load_userdict(os.path.join(PROJECT_ROOT, LAGOU.USERDICT))  ## add extra vocabulary
    stopwords = list(df['stopword'].unique())
    stopwords.extend([u'技术', u'高端', u'职位', u'企业', ',', '工程师', '类', '实施',
                      'IT'])  ## add extra stopwords

    ## job-category and skill words
    word_list = flatten(
        [list(jieba.cut(row['firstType'])) for _, row in data.iterrows()])
    word_list.extend(
        flatten([
            list(jieba.cut(row['secondType'])) for _, row in data.iterrows()
        ]))
    word_list.extend(
        flatten(
            [list(jieba.cut(row['thirdType'])) for _, row in data.iterrows()]))

    word_list = [word for word in word_list if word not in stopwords]
    words_count = pd.value_counts(word_list).to_dict()
    print pd.value_counts(word_list)

    wordcloud = WordCloud(background_color="white",
                          mask=image,
                          font_path=LAGOU.TTF,
                          max_words=5000,
                          scale=1.5)

    wordcloud = wordcloud.fit_words(words_count)

    plt.figure(figsize=(10, 6), dpi=100)

    plt.axis("off")
    wordcloud.to_file(filedir + 'data_wordcloud.png')
    plt.imshow(wordcloud)
    plt.show()
    plt.close()
Example #37
def main():
    # fetch the first 10 pages of comments for the first movie
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for i in range(10):
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
        commentList.append(commentList_temp)

    # join the list entries into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # drop stopwords
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3 disables quoting
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count word frequencies
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)

    # render as a word cloud
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}

    wordcloud = wordcloud.fit_words(word_frequence)  # fit_words needs the dict, not a list of tuples
    plt.imshow(wordcloud)
Example #38
def show(words_stat):
    import matplotlib.pyplot as plt
    #%matplotlib inline

    import matplotlib
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    from wordcloud import WordCloud  # word-cloud package

    wordcloud = WordCloud(font_path="simhei.ttf",
                          background_color="white",
                          max_font_size=80)  # font, max font size and colors
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()
Example #39
def analyze_data_and_generate_word_cloud():
    df_allwords_clean_dist = pd.DataFrame({'allwords': allwords_clean_dist})

    word_count = df_allwords_clean_dist.allwords.value_counts().reset_index()
    word_count.columns = ['word', 'count']

    wc = WordCloud(width=1024,
                   height=768,
                   background_color='white',
                   font_path="simhei.ttf",
                   max_font_size=400,
                   random_state=50)

    wc = wc.fit_words({x[0]: x[1] for x in word_count.head(100).values})

    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    wc.to_file(path.join(path.dirname(__file__), "yuebing.png"))
Example #40
def description(job_list, filename):
    comments = ''
    for k in range(len(job_list)):
        comments = comments + (str(job_list[k])).strip()
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    result = jieba.analyse.extract_tags(cleaned_comments,
                                        topK=100,
                                        withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]

    wordcloud = WordCloud(font_path="simhei.ttf",
                          background_color="white",
                          max_font_size=80)
    word_frequence = keywords
    myword = wordcloud.fit_words(word_frequence)
    myword.to_file('wordCloud/' + filename + '.png')
Example #41
def statistics(words_df, moviename):
    words_stat = words_df.groupby(by=["segment"])["segment"].agg(
        {"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"],
                                                      ascending=False)
    print(words_stat)
    wordcloud = WordCloud(
        font_path="C:\\Windows\\Fonts\\msyh.ttc",
        background_color="white",
        max_font_size=150,
        width=1000,
        height=860,
        margin=2,
    )
    word_frequence = {x[0]: x[1] for x in words_stat.head(500).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show(block=False)
    img_name = "./" + moviename + ".jpg"
    wordcloud.to_file(img_name)
Example #42
 def make_wordclouds(self, commons):
     d = path.dirname(__file__)
     msk = np.array(
         Image.open(
             path.join(
                 d,
                 "C:/Users/27438/PycharmProjects/huzhou/tss/asset/me.jpg")))
     wordcloud = WordCloud(font_path="simhei.ttf",
                           background_color="#DDDDDD",
                           max_font_size=250,
                           width=1920,
                           height=1080,
                           mask=msk)  # font, max font size and colors
     word_frequence = {
         x[0]: x[1]
         for x in self.data_clear(commons).head(200).values
     }
     wordcloud = wordcloud.fit_words(word_frequence)
     plt.imshow(wordcloud)
     plt.axis("off")
     plt.show()
Example #43
def word(words):
    """
    生词词云图片
    :param words:
    :type words:
    :return:
    :rtype:
    """
    words_count = words.groupby(by=['word_cloud'])['word_cloud'].agg({"count"})
    words_count = words_count.reset_index().sort_values(by=["count"], ascending=False)

    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

    wordcloud=WordCloud(font_path="./data/simhei.ttf",background_color=None,max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
    wordcloud=wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    wordcloud.to_file('./data/wordcloud.png')

    url = qiniu_picture()

    return url
Example #44
def main():
    # fetch the first 10 pages of comments for the first movie
    commentList = []
    movieList = getLatestShowMovieList()
    for i in range(10):
        commentListTmp = getCommentsById(movieList[0]['id'], i + 1)
        commentList.append(commentListTmp)
    # join the list entries into one string
    comments = ''
    for comm in commentList:
        comments = comments + str(comm)
    # strip punctuation with a regular expression
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterData = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterData)
    # tokenize the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words = pd.DataFrame({'segment': segment})
    # # drop stopwords
    # stopwords = pd.read_csv("D:\Program\PythonCrawler\DouBan_Movie\stopword.txt",
    #                         index_col=False, quoting=3, sep="\t", names=['stopword'],
    #                         encoding='utf-8')  # quoting=3 disables quoting
    # words = words[~words.segment.isin(stopwords.stopword)]
    # count word frequencies
    wordsFreq = words.groupby(by=['segment'])['segment'].agg(
        {"计数": numpy.size})
    wordsFreq = wordsFreq.reset_index().sort_values(by=["计数"], ascending=False)
    # render as a word cloud
    wordcloud = WordCloud(font_path="simhei.ttf",
                          background_color="white",
                          max_font_size=80)
    word_frequence = {x[0]: x[1] for x in wordsFreq.head(1000).values}

    wordcloud = wordcloud.fit_words(word_frequence)  # pass the dict itself
    plt.imshow(wordcloud)
print("--- LDA trained : %s minutes ---" % round(((time.time() - start_lda_time)/60),2)) 


#################################
##### Display WordCloud #########
#################################
curr_topic = 0
wc = WordCloud(background_color="black", max_words=2000,max_font_size=40, width=120, height=120, random_state=42)
for line in final_topics:
    line = line[1]
    scores = [float(x.split("*")[0]) for x in line.split(" + ")]
    words = [x.split("*")[1] for x in line.split(" + ")]
    freqs = []
    for word, score in zip(words, scores):
        freqs.append((word, score))

    elements = wc.fit_words(freqs)
    fig = plt.figure()
    plt.imshow(elements)
    plt.axis("off")
    fig.savefig('images/topic' + str(curr_topic))
    curr_topic += 1
    
plt.show()

program = os.path.basename(sys.argv[0])
logging.info("finished running %s" % program)


Example #46
            cur.execute(sql)
            conn.commit()
        blockSize = 10000
        record = []
        sql = u'%s,' * 207
        sql = u'insert into TB_BEER_STEM_PERCENT values(' + sql
        sql = sql[:-1] + u')'
        for i in dfBeer.index.values:
            record.append((i,) + tuple(dfBeer.loc[i]))
            if i and not i % blockSize:
                #  print sql
                cur.executemany(sql, record)
                record = []
                conn.commit()
        if len(record):
            cur.executemany(sql, record)
            record = []
            conn.commit()

    #  Generate a word cloud
    beer_mask = imread("/home/bobbruno/BeerApp/Analysis/beer-glass-mask.png", as_grey=True)
    pylab.rcParams['figure.figsize'] = (30.0, 40.0)
    wc = WordCloud(width=800, height=500, background_color='white', ranks_only=False,
                   #  font_path='/usr/share/fonts/truetype/msttcorefonts/Verdana.ttf',
                   color_func=my_color_func, mask=beer_mask, prefer_horizontal=0.1)
    wc.fit_words(counts)
    wc2 = ndimage.rotate(wc.to_array(), -60, cval=255)  # rotate the rendered array, not the WordCloud object
    plt.imshow(wc2)
    plt.tight_layout()
    plt.axis("off")
    plt.show()
Example #47
def ouput_word_cloud_by_frequency(frequency_list=None, outputFileName='word_cloud.png', maskFile=None, font_path='../resource/fonts/simhei.ttf', background_color='white', max_words=2000):
    pic_mask = np.array(Image.open(maskFile))
    wc = WordCloud(font_path=font_path, background_color=background_color, max_words=max_words, mask=pic_mask)
    wc.fit_words(frequency_list)
    wc.to_file(outputFileName)
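Despite the name, frequency_list must be a word -> weight mapping for fit_words on current wordcloud releases, and maskFile cannot stay None because Image.open(None) fails. A hedged call:

# Hypothetical usage; the mask path is illustrative.
ouput_word_cloud_by_frequency(frequency_list={'python': 10, 'cloud': 4},
                              outputFileName='demo.png',
                              maskFile='../resource/mask.png')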
Example #48
def print_top_tokens(model, feature_names, n_top_words, category):
    """
    Prints top words for category to text file 
    """
    with open('results_m4w_4.txt', 'wb') as fid:
        for topic_idx, topic in enumerate(model.components_):
            fid.write('\n')
            fid.write(category)
            fid.write('\n')
            fid.write("Topic #%d:" % topic_idx)
            fid.write(' '.join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
            #fid.write('\n')
            #fid.write( for i in topic.argsort()[:-n_top_words -1:-1]])
            fid.write('\n')

def topic_word_freq(topics, idx, feature_names): 
    """
    Calculates word frequencies 
    Returns iterator of tuples
    """
    freq_sum = np.sum(topics[idx]) 	
    frequencies =  [val/freq_sum for val in topics[idx]] 
    return zip(feature_names, frequencies) 

def get_data():
    """
    Reads pandas dataframe from .pkl
    Returns dataframe
    """
    with open('dataframe_for_eda.pkl', 'rb') as fid:
        df = cPickle.load(fid)
    return df

def reduce_dimensions(total_mat, n_topics):
    """
    Calculates and returns nmf 
    Input is data matrix, shape (n_samples, n_features)
    returns W array, shape (n_samples, n_components)
    """
    nmf = NMF(n_components = n_topics, random_state=42, alpha=.2,  l1_ratio=0.5)
    nmf.fit(total_mat)
    X = nmf.transform(total_mat) 
    w = nmf.components_ 
    return nmf 

def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(70, 100)

if __name__=='__main__':
    df = get_data()
    mask_path = False
    #Use tfidf features for NMF
    for category in df.category_code.unique().tolist():
        if category == 'mis':
            #df_cat = df.ix[df['category_code']==category, :]
            text_mat, text_features = md.tfidf_matrix(df.ix[df['category_code'] == category, 'total_text'])
            #Fit NMF
            n_samples = text_mat.shape[0]
            n_features = text_mat.shape[1]
            n_topics = 10 
            n_top_words = 500
            print category 
            print 'Fitting the NMF model with tf-idf features'
            nmf = reduce_dimensions(text_mat, n_topics) 
            #print_top_tokens(nmf,text_features,n_top_words, category)

            word_freq = topic_word_freq(nmf.components_, 2, text_features)
            wc = WordCloud(stopwords=tp.custom_stop_words(), background_color='black', max_words=n_top_words, width=2000, height=1800)
            wc.fit_words(word_freq)
            plt.figure()
            plt.imshow(wc)
            #wc.recolor(color_func=grey_color_func, random_state=3)
            #wc.to_file('background.png')
            plt.axis('off')
Example #49
def unknowncoll(filename='unknownwords.p', stem=False):
    """
    Word cloud from sentiment analysis.
    
    Finds the bi-collocation of unknown words (words without sentiment) 
    and displays the 10 most common words based on frequency in a word-cloud, 
    colored green for words seen mostly in positive sentiments and red 
    for the opposite. Comparison is made on all comments concatenated
    
    -> filename: name of the file to load unknown words from
    -> stem: stem the words
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    subreddits = scraper.load_data('sub-reddits.txt')
    fullcomment = []
    
    print 'building comment'
    for name, data in subreddits.items():
        for sub_id, sub in data.items():
            fullcomment += [fixer(comment, True, stem).split() for comment in sub.comments]

    print 'getting unknowns'
    unknownwords = unknownsent(filename)
    
    #flatten the comment structure
    fullcomment = [word for comment in fullcomment for word in comment]
    
    basefinder = BigramCollocationFinder.from_words(fullcomment)
    count = 0
    
    for unknown, unknownscore in unknownwords:
        finder = copy.copy(basefinder)
        
        print '\n' + unknown
        #only bigrams that contain the unknown word
        finder.apply_ngram_filter(lambda w1, w2: unknown != w1 and unknown != w2)
        
        wordcloud = WordCloud()
        wordcloud.font_path = r'C:\Windows\Fonts\comic.ttf'
        #trick the wordcloud to accept custom input
        wordcloud.generate('generate')
        
        colls = finder.score_ngrams(bigram_measures.raw_freq)
        colls = colls[:10]        
        maximum = colls[1][1]
        
        #generate the tuple (word, score) for the wordcloud
        cloudwords = [(word, score) for ((word, _), score) in colls if word != unknown]
        cloudwords += [(word, score) for ((_, word), score) in colls if word != unknown]
        
        #normalize the scores
        cloudwords = [(word, score / maximum) for (word, score) in cloudwords]
        
        #tricking part 2.
        wordcloud.fit_words(cloudwords)
        wordcloud.to_image()
        if(unknownscore > 0):
            wordcloud = wordcloud.recolor(color_func=green_color_func, random_state=3)
        else:
            wordcloud = wordcloud.recolor(color_func=red_color_func, random_state=3)
        
        count += 1
        plt.figure(count)
        plt.title(unknown)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig('plots/' + unknown + '.png', bbox_inches='tight')
        plt.close()
Example #50
from wordcloud import WordCloud
import matplotlib.pyplot as plt

line='0.475*director + 0.302*band + 0.169*school + 0.014*student + 0.011*ingredient + 0.000*aicf + 0.000*development + 0.000*foundation + 0.000*parent + 0.000*life'
scores = [x.split("*")[0] for x in line.split(" + ")]
words = [x.split("*")[1] for x in line.split(" + ")]
freqs = []
w=[]
curr_topic = 0

for word, score in zip(words, scores):
    freqs.append((word, float(score)))
    w.append(word)
wc=WordCloud()
cloud=wc.fit_words(freqs)
plt.imshow(cloud)
plt.axis("off")
plt.show()
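Parsing gensim's formatted topic string, as above, can be avoided: show_topic returns the (word, probability) pairs directly. A hedged equivalent, assuming a trained LdaModel named lda:

# Hedged alternative to string parsing; `lda` is an assumed trained model.
freqs = dict(lda.show_topic(0, topn=10))  # list of (word, prob) -> dict
cloud = WordCloud().fit_words(freqs)
plt.imshow(cloud)
plt.axis("off")
plt.show()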
Example #51
segStat = segmentDF.groupby(
            by=["segment"]
        )["segment"].agg({
            "计数": numpy.size
        }).reset_index().sort(
            columns=["计数"],
            ascending=False
        )
    
segStat.head(100)


# draw the word cloud
# http://www.lfd.uci.edu/~gohlke/pythonlibs/
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wordcloud = WordCloud(
    font_path='D:\\simhei.ttf', 
    background_color="black"
)

words = segStat.set_index('segment').to_dict()

wordcloud = wordcloud.fit_words(words['计数'])

plt.imshow(wordcloud)

plt.close()
# remove words with just one occurence
word_counts2 = defaultdict(lambda: 0)
total_count = float(total_count)
for word in word_counts:
    if word_counts[word] > 1:
        word_counts2[word] = word_counts[word]/total_count
word_counts2 = sorted(word_counts2.items(), key=itemgetter(1), reverse=True)

print "Creating wordcloud in wordcloud.png.."
print word_counts2
wordcloud = WordCloud(font_path='OpenSans-Bold.ttf',
                      background_color='black',
                      width=1920,
                      height=1080)
wordcloud.fit_words(word_counts2)
wordcloud.to_file('./wordcloud.png')

# generate email heatmap
print email_datetime
cols = ["Midnight", "6 AM", "Noon", "6 PM"]
rows = ['M','Tu','W','Th','F','Sa','Su']
plt.pcolor(email_datetime)
plt.xticks(np.arange(0,24,6)+0.5,cols)
plt.yticks(np.arange(0,7)+0.5,rows)
plt.colorbar()
plt.title("When the Most Emails are Being Received?")
plt.xlabel("Time of Day")
plt.ylabel("Day of Week")
plt.savefig("./heatmap.png", dpi=300)
Example #53
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = path.dirname(__file__)

with open('haizi.txt', 'r') as poet:
    s = poet.read()

seg_list = [x for x in jieba.cut(s) if len(x) > 1 and x not in [u'一个', u'一只', u'一样', u'一直', u'一种']]

alice_coloring = np.array(Image.open(path.join(d, "haizi.jpg")))

font = "/Library/Fonts/Lantinghei.ttc"
wc = WordCloud(background_color="white", font_path=font, mask=alice_coloring, max_font_size=80, random_state=42, scale=1.5)

# generate word cloud
wc.fit_words(Counter(seg_list))  # a Counter is already a word -> count mapping

# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)


# show
plt.imshow(wc)
plt.axis("off")
plt.figure()

# recolor wordcloud and show
# we could also give color_func=image_colors directly in the constructor
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
Example #54
count = float(count)

print "Sum: %f" % count

words = []

# get the words from the whitelist and calculate their frequences
sql = """SELECT word, count FROM word_whitelist ORDER BY `count` DESC"""
for word, frequency in query(sql):
    words.append((word, float(frequency) / count))

print "Creating cloud."

from scipy.misc import imread

mask = imread(MASK_FILE)

# generate the word cloud. This takes a while because the library is not parallelized.
wordcloud = WordCloud(font_path="/usr/share/fonts/truetype/msttcorefonts/Georgia.ttf", ranks_only=True, max_words = len(words),
    mask=mask, background_color="white")
wordcloud.fit_words(words)

print "Creating LOW RES image."
wordcloud.to_file(LOW_RES)

# now let's beef up the scale
wordcloud.scale = 12

print "Creating HI RES image."
img = wordcloud.to_image()
img.save(HIGH_RES, dpi=(100000,100000))
Example #55
# (reconstructed opening: the grouped segment counts, as in Example #51)
segStat = segmentDF.groupby(
    by="segment")['segment'].agg({
    'count':numpy.size}).reset_index().sort(
    columns=['count'],
    ascending=False
)

segStat.to_csv('segStat.csv',encoding='utf-8')
print 'save segStat to segStat.csv...'
#segStat=pandas.read_csv('D:\\python\\code\\movie\\comments\\segStat.csv')
stopwords= pandas.read_csv(
    'D:\\python\\code\\movie\\comments\\stop.txt',
    error_bad_lines=False,
)

fSegStat=segStat[~segStat.segment.isin(stopwords.stopword)]

fSegStat.to_csv('fSegStat.csv',encoding='utf-8')
print 'save fSegStat to fSegStat.csv...'
#fSegStat=pandas.read_csv('D:\\python\\code\\movie\\comments\\fSegStat.csv',encoding='utf-8')
wordcloud=WordCloud(
    font_path='D:\\python\\simhei.ttf',
    background_color='black'
)
#fSegStat_file=pandas.read_csv('D:\\python\\code\\movie\\comments\\fSegStat.csv')

wordcloud=wordcloud.fit_words(fSegStat.itertuples(index=False))
plt.imshow(wordcloud)
plt.show()
plt.close()

__author__ = 'sandip'

import os
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

MODELS_DIR = "models"

final_topics = open("lda_topics.txt", 'r', encoding='ascii')
curr_topic = 0
lines = ""
for line in final_topics:
    lines = line
lines = lines.strip("[").strip("]")
for line in lines.split(","):
    scores = [float(x.split("*")[0].replace("'","")) for x in line.split(" + ")]
    words = [x.split("*")[1] for x in line.split(" + ")]
    freqs = []
    for word, score in zip(words, scores):
        freqs.append((word, score))
    STOPWORDS.add("said")  # set.add returns None, so it cannot be passed inline
    wc = WordCloud(background_color="white", max_words=2000,
                   stopwords=STOPWORDS)
    elements = wc.fit_words(frequencies=freqs)
    plt.imshow(elements)
    plt.axis("off")
    plt.show()
    curr_topic += 1
final_topics.close()