Example #1
import data_preprocess
import word_cloud
import community_detection


def main():
    data_preprocess.split_data('./data/emails.csv')
    data_preprocess.convert_email()
    data_preprocess.build_corpus()
    data_preprocess.build_email_corpus_by_selected_person()
    word_cloud.create_word_cloud()
    word_cloud.output_word_frequencies(200)
    community_detection.output_gexf_file(graph_type="unweighted",
                                         min_degree=100,
                                         max_degree=200)
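A standard entry-point guard (not part of the original snippet) so the pipeline runs only when this file is executed directly:

if __name__ == '__main__':
    main()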
Example #2
    def visualizeHam(self):
        # Join all ham-labelled messages into one space-separated string.
        ham_words = ' '.join(dataSet[dataSet['label'] == 'ham']['message'])
        ham_wc = WordCloud(width=512, height=512).generate(ham_words)

        plt.figure(figsize=(10, 8), facecolor='k')
        plt.imshow(ham_wc)
        plt.axis('off')
        plt.tight_layout(pad=0)
        plt.show()
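The ham and spam views differ only in the label filter, so a single parameterized method avoids the duplication; a minimal sketch, assuming the same global dataSet DataFrame and imports as above:

    def visualizeLabel(self, label):
        # label is 'ham' or 'spam'.
        words = ' '.join(dataSet[dataSet['label'] == label]['message'])
        wc = WordCloud(width=512, height=512).generate(words)
        plt.figure(figsize=(10, 8), facecolor='k')
        plt.imshow(wc)
        plt.axis('off')
        plt.tight_layout(pad=0)
        plt.show()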
Example #3
def get_vod_Keyword(id):
    kw_VODs = {"status": "false", "content": []}

    keyword_path = "./data/" + id + "_keyword.txt"
    # Build the keyword file from the saved chat log if it is missing.
    if not os.path.isfile(keyword_path) and os.path.isfile("./data/VoD_" + id + ".txt"):
        WordCloud.PrintTFIDFTotxt(id)

    if os.path.isfile(keyword_path):
        # A context manager ensures the file handle is closed.
        with open(keyword_path, 'r', encoding='utf-8-sig') as f:
            kw_VODs['content'] = f.readlines()[0]
        kw_VODs['status'] = "true"

    body = json.dumps(kw_VODs)
    return Response(content_type='application/json; charset=utf-8', body=body)
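On success the handler responds with {"status": "true", "content": "<first line of the keyword file>"}; if the keyword file cannot be found or generated, the default {"status": "false", "content": []} is returned.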
Example #4
File: stats.py Project: wsrtka/Hyperreal
def show_word_cloud(drug,
                    raw_docs,
                    narkopedia_map,
                    doc_freq,
                    filter_numeric=True,
                    length=1,
                    top=100):
    """
    Shows word cloud of ngrams associated with given drug.
    :param drug: string containing drug name
    :param raw_docs: list of strings containing texts for analysis
    :param narkopedia_map: dictionary of drug names and their alternative forms
    :param doc_freq: dictionary containing ngrams and number of docs in which they appeared
    :param filter_numeric: boolean, True if drug ngrams should not contain numbers
    :param length: int length of ngrams to create
    :param top: int number of top ngrams to use
    :return: print word cloud of ngrams associated with drug
    """
    fig, axes = plt.subplots(3, 2, figsize=(25, 18))

    for i, metric in enumerate(["tf", "tfidf"]):
        for j, n in enumerate([1, 2, 3]):
            top_ngrams = ngrams_describing_drug(narkopedia_map[drug],
                                                raw_docs,
                                                doc_freq[n],
                                                filter_numeric=filter_numeric,
                                                length=length,
                                                top=top,
                                                metric=metric)

            text_scores = {" ".join(k): v for k, v in top_ngrams}

            wc = WordCloud(height=400, width=800)
            wc.generate_from_frequencies(text_scores)

            axes[j, i].imshow(wc)

    plt.tight_layout()
    plt.show()
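A hedged usage sketch; the drug name and the data objects below are illustrative placeholders, not values from the Hyperreal project:

# doc_freq is assumed to map n -> {ngram: document frequency}.
show_word_cloud("mephedrone", raw_docs, narkopedia_map, doc_freq,
                filter_numeric=True, length=1, top=100)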
Example #5
def getChat(videoId, video_type):
    import requests, sys

    if video_type == 'h':
        video_type = 'highlight'
    elif video_type == 'v':
        video_type = 'VoD'

    # API url
    url = 'https://rechat.twitch.tv/rechat-messages'

    # Query with start=0: the error detail reports the valid begin and
    # end timestamps for this video.
    url1 = url + '?start=0&video_id=v' + videoId
    res = requests.get(url1).json()
    detail = res['errors'][0]['detail'].split(' ')
    start, stop = int(detail[4]), int(detail[6])
    total = stop - start

    messageIds = set()

    file_name = video_type + '_' + videoId + '.txt'
    fw = open('./data/' + file_name, 'w', encoding='utf-8-sig')
    # Download all the messages from the chatroom for time=start to stop.
    timestamp = start
    print('Start downloading ' + video_type + ' ' + videoId +
          '... Please wait for a while.')
    while timestamp <= stop:
        url2 = url + '?start=' + str(timestamp) + '&video_id=v' + videoId

        res = requests.get(url2).json()
        try:
            data = res['data']
        except KeyError:
            print("No chats for this video")
            fw.close()
            return None

        timestamp += 1

        for datum in data:
            # Skip messages already written (the per-second windows overlap).
            if datum['id'] not in messageIds:
                messageIds.add(datum['id'])
                user = datum['attributes']['from']
                message = datum['attributes']['message']
                progress = timestamp - start
                percentage = min(round(progress * 100 / float(total), 2), 100.0)
                sys.stdout.write('Downloading... (' + str(percentage) + '%)\r')
                sys.stdout.flush()
                fw.write(
                    str(int(datum['attributes']['timestamp']) // 1000) + ' ' +
                    user + ': ' + message + '\n')
                # Jump to the timestamp of the last message in this frame to
                # avoid re-requesting seconds we already have.
                timestamp = int(datum['attributes']['timestamp'] / 1000)
    fw.close()
    print('Finished downloading ' + videoId + '\n')
    WordCloud.PrintTFIDFTotxt(videoId)
    return start
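A hedged usage sketch (the video ID is a placeholder, and the rechat.twitch.tv endpoint must still be reachable for the call to succeed):

start = getChat('123456789', 'v')  # writes ./data/VoD_123456789.txt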
Example #6
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from mysql_query import search_keywords

def gen_word_cloud(return_size, source_input=[], category_input=[], date_input=str(72), sort_type="rdate"):
    df = search_keywords(return_size, source_input, category_input, date_input, sort_type)

    # Repeat each word according to its frequency so WordCloud weights it.
    comment_words = ''
    for index, row in df.iterrows():
        word = row['word']
        freq = row['freq']
        comment_words += (word + ' ') * freq

    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=STOPWORDS,  # wordcloud's built-in English stopword list
                          min_font_size=10).generate(comment_words)

    # plot the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig('xxxxxxxx.png')
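Since search_keywords already returns per-word frequencies, WordCloud.generate_from_frequencies can consume them directly instead of rebuilding a repeated-word string; a minimal sketch reusing the imports and signature above:

def gen_word_cloud_from_freq(return_size, source_input=[], category_input=[],
                             date_input=str(72), sort_type="rdate"):
    df = search_keywords(return_size, source_input, category_input,
                         date_input, sort_type)
    # Feed the frequency table to WordCloud directly.
    freqs = dict(zip(df['word'], df['freq']))
    wordcloud = WordCloud(width=800, height=800, background_color='white',
                          min_font_size=10).generate_from_frequencies(freqs)
    plt.figure(figsize=(8, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig('xxxxxxxx.png')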
Example #7
"""
For unit testing; use as appropriate.
Author: Zhiyi Wang
Date: 04-11-2021
Version: 1.0
"""

import WordCloud as word_cloud

word_cloud.more_test_cloud_word(200)
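Example #8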
# Pickle the CountVectorizer and document-term matrix for later use
# (cv and data_stop are defined in earlier cells of this notebook).
import pickle
pickle.dump(cv, open("cv_stop.pkl", "wb"))
data_stop.to_pickle("dtm_stop.pkl")


# In[24]:


# Let's make some WORD CLOUDS!
# PowerShell: python -m pip install wordcloud
# Anaconda Prompt: conda install -c conda-forge wordcloud

from wordcloud import WordCloud

wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2", max_font_size=150, random_state=42)


# In[25]:


# Reset the output dimensions
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [16, 6]

full_names = ['Ali Wong', 'Anthony Jeselnik', 'Bill Burr', 'Bo Burnham', 'Dave Chappelle', 'Hasan Minhaj',
              'Jim Jefferies', 'Joe Rogan', 'John Mulaney', 'Louis C.K.', 'Mike Birbiglia', 'Ricky Gervais']

# Create subplots for each comedian
for index, comedian in enumerate(data.columns):
    # Assumes data_clean.transcript holds each comedian's transcript text
    # (defined in an earlier cell of the notebook).
    wc.generate(data_clean.transcript[comedian])
    plt.subplot(3, 4, index + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(full_names[index])

plt.show()
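Example #9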
    def retranslateUi(self, AnalysisWindow):
        _translate = QtCore.QCoreApplication.translate
        AnalysisWindow.setWindowTitle(_translate("AnalysisWindow", "Analysis"))
        self.label.setText(_translate("AnalysisWindow", "Feature Relevancy"))
        self.label_2.setText(_translate("AnalysisWindow", "Accuracy of relevant features extracted"))
        self.label_3.setText(_translate("AnalysisWindow", "Score Comparison"))
        self.label_4.setText(_translate("AnalysisWindow", "Our Rating"))
        self.label_5.setText(_translate("AnalysisWindow", "Amazon Rating"))
        self.label_6.setText(_translate("AnalysisWindow", "Word Cloud"))

        #our stuff
        asin=Asin.getAsinValue()
        WordCloud.wc()
        self.label_7.setStyleSheet("#label_7{border-image:url(images/"+asin+"_wc.png)}")


        key_rating_d = algo.noun_dict()
        objects = key_rating_d.keys()

        # Each entry looks like "<score>/<count>"; keep the score part.
        performance = []
        for ratings in key_rating_d.values():
            for rating in ratings:
                score = rating.split('/')[0]
                performance.append(float(score))

        # Average the keyword scores, rescale to a 5-point rating, and
        # blend with the phone's own word rating.
        avg = sum(performance) / len(performance) / 2
        phone_word_rating = Asin.getPhone()
        new_avg = (avg + phone_word_rating / 2) / 2

        Asin.set_Prod_avg(round(new_avg, 2))
        Asin.set_Ama_avg(asin)

        amazon_rating = Asin.get_Amazon_avg()
        product_rating = Asin.get_Prod_avg()
        self.ama_rating.setText(str(amazon_rating))
        self.prod_rating.setText(str(product_rating))

        relcount = 0
        for i, obj in enumerate(objects, start=1):
            relevancy = Asin.check(str(obj))
            if relevancy == 'relevant':
                relcount += 1
            self.listWidget.addItem(str(i) + ". " + str(obj) + " \t : \t" + str(relevancy))

        # Percentage of extracted features judged relevant, rounded to 2 dp.
        rel = str(round(relcount / len(objects) * 100, 2)) + "%"
        self.relevancyAccuracyLabel.setText(rel)
Example #10
def main(pData,
         Desc,
         sntmnt=False,
         wrdcld=True,
         viz=True,
         Market=False,
         is_preprocess=False):
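    # Note: pSubUnit, n, ngram, and pNodeName are referenced below but not
    # defined here; they are presumably module-level globals in the source
    # project.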
    try:
        if is_preprocess:
            lstatusPreprocessing, pDataProcess = preprocessing.preprocess(
                pData, Desc)
            pDataProcess = pDataProcess.reset_index()
            pDataProcess = pDataProcess[~pDataProcess.Sample.str.
                                        contains("nan", regex=False, na=False)]
        else:
            pDataProcess = pData
            pDataProcess['Sample'] = pDataProcess[Desc]
        if Market:
            pMarketUnitList = pDataProcess[pSubUnit].unique().tolist()
            for index in range(len(pMarketUnitList)):
                pMarketUnitData = pDataProcess.loc[pDataProcess[pSubUnit] ==
                                                   pMarketUnitList[index]]
                if sntmnt:
                    sentiment.sentiment(pMarketUnitData,
                                        Desc=pDataProcess['Sample'],
                                        filename=str(pMarketUnitList[index]))
                if wrdcld:
                    WordCloud.plotwordcloud(pMarketUnitData,
                                            Desc=pDataProcess['Sample'],
                                            filename=str(
                                                pMarketUnitList[index]))
                if viz:
                    visualization.plotmostfrqKwds(pMarketUnitData,
                                                  n,
                                                  filename=str(
                                                      pMarketUnitList[index]))
                    visualization.plotFreqngram(pMarketUnitData,
                                                n,
                                                ngram,
                                                filename=str(
                                                    pMarketUnitList[index]))
                    visualization.plotngramnetwork(pMarketUnitData,
                                                   pNodeName,
                                                   filename=str(
                                                       pMarketUnitList[index]))
        else:
            if sntmnt:
                sentiment.sentiment(pDataProcess,
                                    Desc=pDataProcess['Sample'],
                                    filename='AllData')
            if wrdcld:
                WordCloud.plotwordcloud(pDataProcess,
                                        Desc=pDataProcess['Sample'],
                                        filename='AllData')
            if viz:
                visualization.plotmostfrqKwds(pDataProcess,
                                              n,
                                              filename='AllData')
                visualization.plotFreqngram(pDataProcess,
                                            n,
                                            ngram,
                                            filename='AllData')
                visualization.plotngramnetwork(pDataProcess,
                                               pNodeName,
                                               filename='AllData')
                visualization.plottablefrqKwds(pDataProcess,
                                               n,
                                               filename='AllData')

    except Exception:
        print('Error occurred in main()')
        print(traceback.format_exc())
        return -1
    return 0
Example #11
import matplotlib.pyplot as plt
from wordcloud import WordCloud

text = 'These flannel wipes are OK, but in my opinion not worth keeping. I also ordered someImse Vimse Cloth Wipes-Ocean Blue-12 countwhich are larger, had a nicer, softer texture and just seemed higher quality. I use cloth wipes for hands and faces and have been usingThirsties 6 Pack Fab Wipes, Boyfor about 8 months now and need to replace them because they are starting to get rough and have had stink issues for a while t'
cloud = WordCloud(max_font_size=50, max_words=100,
                  background_color="white").generate(text)
plt.figure()
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
# Save before show(): show() clears the figure, so saving afterwards
# would write out a blank image.
plt.savefig('images/Name.png')
plt.show()

print(cloud)
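print(cloud) only shows the WordCloud object's repr. To inspect what was fitted, the words_ attribute (populated by generate()) maps each term to its normalized weight:

# Print the ten heaviest terms and their weights.
for word, weight in sorted(cloud.words_.items(), key=lambda kv: -kv[1])[:10]:
    print(word, round(weight, 3))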