Example #1
def wordcloud(datafile):

    #remove stop words, the most common words in a language
    vectorizer=CountVectorizer(stop_words='english')

    for word in vectorizer.get_stop_words():
        STOPWORDS.add(word)
    STOPWORDS.add("said")

    pony_mask = np.array(Image.open("../pinkyB.jpg"))
    wc = WordCloud(background_color="black", max_words=2000, mask=pony_mask, stopwords=STOPWORDS)

    #init dictionary with the five categories
    categoriesSet = set(datafile["Category"])
    categoriesDict = dict.fromkeys(categoriesSet,"")

    #Conditional Selection
    # business = datafile.loc[datafile["Category"]=="Business"]
    # print(business["Content"].size)

    #fill index with data from cv
    for index, row in datafile.iterrows():
        categoriesDict[row["Category"]] += str(row["Content"])

    for category, text in categoriesDict.items():  # iteritems() is Python 2 only
        wc.generate(text)
        image = wc.to_image()
        image.save("../wordcloud/wordcloud_" + category + ".jpg")
    return
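A recurring pitfall across the examples below: set.add() and set.update() return None, so passing STOPWORDS.add("said") straight into the WordCloud constructor silently sets stopwords=None. A minimal sketch of the safer pattern, assuming only the wordcloud package (the text and output filename are illustrative):

from wordcloud import WordCloud, STOPWORDS

# Copy the module-level set so other callers are unaffected
stopwords = set(STOPWORDS)
stopwords.update(["said", "also"])

# Mutate first, then pass the set itself to the constructor
wc = WordCloud(background_color="white", stopwords=stopwords)
wc.generate("he said one thing and she also said another thing")
wc.to_file("stopwords_demo.png")  # illustrative output path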
Example #2
def wordCloud(text_array,name,keyword=""):
	new_text_arr=[]
	if keyword != "":  # "is not" compares identity, not equality
		keyword=keyword.split(" ")[1]
	for text in text_array:
		if keyword in text:
			new_text_arr.append(text)

	text_array=new_text_arr

	cloud_text=""
	for text in text_array:
		cloud_text+=text+" "

	m_stopwords=['police','traffic','sir']

	for word in m_stopwords:
		STOPWORDS.add(word)

	image_mask = os.path.join(BASE_DIR, 'static/tool/img/nebula.png')
	coloring = imread(image_mask)
	
	wordcloud = WordCloud(stopwords=STOPWORDS,background_color="white",mask=coloring,ranks_only=True,max_words=50).generate(cloud_text)
	filename=os.path.join(BASE_DIR, 'static/tool/img/'+name+'.png')

	image_colors = ImageColorGenerator(coloring)
	wordcloud.recolor(color_func=image_colors)
	wordcloud.to_file(filename)
	data_uri = base64.b64encode(open(filename, 'rb').read()).decode()  # str.encode('base64') is Python 2 only; requires import base64

	img_tag = '<img src="data:image/png;base64,{0}" style="height:400px;">'.format(data_uri)
	
	layout=wordcloud.layout_
	words_colours={}
	count=1
	for lo in layout:
		entry={}
		entry['word']=lo[0][0]
		color=lo[len(lo)-1]
		color=color[4:]
		color=color[:-1]
		color_split=color.split(',')
		color_num=[int(x) for x in color_split]
		color_hex='#%02x%02x%02x' % tuple(color_num)
		# print color_num
		entry['color']=color_hex
		words_colours[count]=entry
		count+=1

	# print words_colours
	list_html=""
	cap=min(51,len(words_colours)+1)  # range() below is exclusive, so +1 keeps the final word

	for i in range(1,cap):
		list_html+='<li class="list-group-item" ><a class="cloud-key-'+name+'" href="#" style="color:'+words_colours[i]['color']+'">'
		list_html+="#"+str(i)+" "+words_colours[i]['word']+'</a></li>'

	return (img_tag,list_html)
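For context on the colour parsing above: after recolor(color_func=ImageColorGenerator(...)), each entry of wordcloud.layout_ ends with a colour string of the form 'rgb(r, g, b)', which the [4:] and [:-1] slices strip down to the channel values. The conversion in isolation:

# Convert a layout colour string such as "rgb(12, 34, 56)" to a hex code
color = "rgb(12, 34, 56)"
r, g, b = (int(x) for x in color[4:-1].split(','))
print('#%02x%02x%02x' % (r, g, b))  # -> #0c2238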
Example #3
def generateWordCloud(text, stop):
    d = path.dirname(outputdir)

    for w in stop:
        STOPWORDS.add(w)

    # Generate the wordcloud without the stop words    
    wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)

    # Draw the positioned words to a PNG file.
    wordcloud.to_file(path.join(d, 'diabetes-wordcloud.png'))
Example #4
def main(save_files = False, db_filename = '../output/database.sqlite'):
    conn = sqlite3.connect(db_filename)
    c = conn.cursor()

    # Retrieve papers
    c.execute('''SELECT *
                 FROM Papers''')

    paper_content = c.fetchall()
    conn.close()

    titles = ''

    for pc in paper_content:
        titles += pc[1]

    # A Marvin Minsky mask
    mask = np.array(Image.open("../files/minsky_mask.png"))

    wc = WordCloud(background_color="white", max_words=2000, mask=mask, stopwords=STOPWORDS.copy())
    # Generate word cloud
    wc.generate(titles)
    
    if (save_files):
        # Store to file
        wc.to_file("../files/title_cloud.png")
    
    # Show word cloud
    plt.imshow(wc)
    plt.axis("off")
    # Show mask
#    plt.figure()
#    plt.imshow(mask, cmap=plt.cm.gray)
#    plt.axis("off")
    plt.show()
Example #5
def cloudplot(person):

    person = re.sub(r'\+', ' ', person)

    text = GetTextRange(Emails, person)
    text = rmBoring(rmNonAlpha(text)).decode('ascii', 'ignore')

    plt.clf()

    d = path.dirname(path.abspath(__file__))

    hilcolor = np.array(Image.open(path.join(d, "static/img/hillarylogo.jpg")))

    wc = WordCloud(background_color="white", max_words=150, mask=hilcolor,
               stopwords=STOPWORDS.add("said"),
               max_font_size=80, random_state=42,
               relative_scaling = 0.5)


    wc.generate(text)
    image_colors = ImageColorGenerator(hilcolor)

    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")

    fig = plt.gcf()
    img = io.BytesIO()  # binary buffer for savefig; StringIO.StringIO is Python 2 only (requires import io)
    fig.savefig(img)
    img.seek(0)

    return send_file(img, mimetype='image/png')
Example #6
def word_cloud(csv_file, stopwords_path, pic_path):
    pic_name = csv_file+"_词云图.png"
    path = os.path.abspath(os.curdir)
    csv_file = path+ "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    content = []
    for i in d['content']:
        try:
            i = translate(i)
        except AttributeError as e:
            continue
        else:
            content.append(i)
    comment_after_split = jieba.cut(str(content), cut_all=False)
    wl_space_split = " ".join(comment_after_split)
    backgroud_Image = plt.imread(pic_path)
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for i in f.readlines():
            stopwords.add(i.strip('\n'))

    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image, font_path=r"C:\simhei.ttf",  # raw string avoids backslash escapes in the Windows path
                   stopwords=stopwords, max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)
    plt.imshow(wc)
    plt.axis('off')  
    plt.show() 
    wc.to_file(pic_name)
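WordCloud.generate splits its input on whitespace, which is why the snippet above joins the jieba tokens with spaces before calling generate_from_text. The segmentation step in isolation (requires the jieba package; the sample sentence is illustrative):

import jieba

comment = "今天天气很好"                      # "the weather is nice today"
tokens = jieba.cut(comment, cut_all=False)  # precise-mode segmentation
spaced = " ".join(tokens)                   # whitespace-delimited input for WordCloud
print(spaced)                               # e.g. "今天 天气 很 好"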
Example #7
def make_cloud(words, image, size=10, filename='figures/cloud.png', max_words=200, horizontal=0.8):

    # Remove URLs, 'RT' text, screen names, etc
    my_stopwords = ['RT', 'amp', 'lt']
    words_no_urls = ' '.join([word for word in words.split()
                              if word not in my_stopwords])

    # Add stopwords, if needed
    stopwords = STOPWORDS.copy()
    stopwords.add("RT")
    stopwords.add('amp')
    stopwords.add('lt')

    # Load up a logo as a mask & color image
    logo = imread(image)

    # Generate colors
    image_colors = ImageColorGenerator(logo)

    # Generate plot
    wc = WordCloud(stopwords=stopwords, mask=logo, color_func=image_colors, scale=0.8,
                   max_words=max_words, background_color='white', random_state=42, prefer_horizontal=horizontal)

    wc.generate(words_no_urls)

    plt.figure(figsize=(size, size))
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(filename)
Example #8
    def config_stopwords(self, more_stopwords=None):
        """
          (obj) -> None

          Configuring stopwords by adding more if required
        """

        if more_stopwords is not None:
            self.STOPWORDS = STOPWORDS.union(more_stopwords)
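Unlike the in-place add() calls elsewhere in this collection, STOPWORDS.union(...) returns a new set and leaves the shared one untouched. A hypothetical usage sketch (the CloudConfig host class is invented for illustration):

from wordcloud import STOPWORDS

class CloudConfig:  # hypothetical host class for the method above
    def config_stopwords(self, more_stopwords=None):
        if more_stopwords is not None:
            self.STOPWORDS = STOPWORDS.union(more_stopwords)

cfg = CloudConfig()
cfg.config_stopwords({"said", "amp"})
print("said" in cfg.STOPWORDS)  # True
print("said" in STOPWORDS)      # False: the shared default set is unchanged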
Example #9
def makeWordCloud(text):
	#preprocess
	stopwords = STOPWORDS.copy()
#        text.replace("State","")
#        text.replace("year","")
#        text.replace("Congress","")
#        text.replace("will","")
	wC = WordCloud(max_words=2000, stopwords=stopwords, margin=5, random_state=1, width = 1600, height = 800).generate(text)
	plt.imshow(wC)
	plt.show()
Example #10
def create_wordcloud(posts):
	wordcloud_str=' '.join(post['message'] for post in posts) #join all posts together
	aces_mask=imread("aces.png") #add aces mask
	STOPWORDS.add("will") #exclude "will": not an interesting word and it took up a large chunk of the wordcloud; set.add() returns None, so mutate before the constructor
	wc=WordCloud(background_color="BLACK", mask=aces_mask, stopwords=STOPWORDS)
	wc.generate(wordcloud_str)
	plt.axis("off")
	plt.imshow(wc)
	plt.show()
	wc.to_file("aces_wordcloud.png")
Example #11
def generate_wc(content):
    path = r'fzzqhj.TTF'
    bg_pic = imread('mo.png')  # read the mask image
    image_colors = ImageColorGenerator(bg_pic)  # build colour values from the background image
    STOPWORDS.add("said")  # set.add() returns None; update the set before the constructor
    wc = WordCloud(font_path=path, background_color="white",
                   mask=bg_pic,
                   stopwords=STOPWORDS,
                   max_font_size=40,
                   color_func=image_colors,
                   random_state=42)
    wc = wc.generate(content)
    wc.to_file(c.outputs_pictures_path + 'result.jpg')
Example #12
def plotTwiiterWordCloud():
	args = sys.argv
	tracefile = open(args[2], 'r')
	nLines = sum(1 for line in tracefile)
	tracefile.seek(0)

	dictTerms = dict()
	blacklist = STOPWORDS.copy()
	blacklist.add('rt')
	punctuation = set(string.punctuation)
	punctuation.remove('@')
	punctuation.remove('&')
	# punctuation.remove('#')
	for line in tqdm(tracefile, total=nLines):
		try:
			linesplited = line.split(', ')
			tweet = linesplited[6].lower()
			for p in punctuation:
				tweet = tweet.replace(p, '')
			terms = tweet.split(' ')
			for t in terms:
				if (len(t) > 1) and 'http' not in t and (t not in blacklist):
					try:
						dictTerms[t] += 1
					except KeyError:
						dictTerms[t] = 1
		except IndexError:
			print('IndexError')
	for t in blacklist:
		try:
			del dictTerms[t]
		except KeyError:
			continue
	popularTerms = sorted(dictTerms.keys(), key=lambda w:dictTerms[w], reverse=True)
	popularTerms = [p for p in popularTerms if (dictTerms[p]) > 1]
	print(len(popularTerms))
	text = list()
	terms = ''
	for p in popularTerms:
		text.append((p, dictTerms[p]))
		for i in range(dictTerms[p]):
			terms += ' ' + p
	# print terms
	maskfile = 'csgo-icon'
	mask = imread(maskfile + '.jpg') # mask=mask
	wc = WordCloud(mask=mask, background_color='white', width=1280, height=720).generate(terms) # max_words=10000
	default_colors = wc.to_array()
	plt.figure()
	plt.imshow(default_colors)
	plt.axis('off')
	plt.savefig(maskfile + '-wordcloud.png', dpi=500, bbox_inches='tight', pad_inches=0) # bbox_inches='tight'
	plt.show()
Example #13
def makeCloud(text, imgFile, words):
    """
    Makes a word cloud and stores it in a jpeg file
    """
    excludewords = STOPWORDS.copy()
    
    for word in words:
        excludewords.add(word)
    
    wordcloud = WordCloud(max_words=NUM_OF_WORDS, width=WIDTH, height=HEIGHT, stopwords=excludewords).generate(text)
    image = wordcloud.to_image()
    image.show()
    image.save(imgFile + '.jpeg')      
Example #14
def make_word_cloud(data):
  text = ''
  for d in data:
    text = text + d[0] + ' '

  # Generate a word cloud image
  STOPWORDS.add('watson')  # set.add() returns None; mutate the set first
  wordcloud = WordCloud(stopwords=STOPWORDS).generate(text)

  # Display the generated image:
  # the matplotlib way:
  import matplotlib.pyplot as plt
  plt.imshow(wordcloud)
  plt.axis("off")
  plt.show()
Example #15
def mainProcess(usernames):
	print("Processing "+str(len(usernames)-1)+" usernames")
	words4=""
	loginFacebook(driver)
	timeread=time.time()
	time0=time.perf_counter()  # time.clock() was removed in Python 3.8

	for username in usernames:
		if len(username) != 0:
			username=username.strip()
			time1=time.perf_counter()
			count, words3 =produce3(username)
			module.Database.edit2(username, count, conn)
			time2=time.perf_counter()
			words4=words4+" "+words3

	time3=time.perf_counter()
	timeread=time.time()-timeread
	print("TOTAL TIME")
	print(time3-time0)
	print(timeread)
	more_stopwords =["ja", "aga", "kui", "siis", "tongue", "nii", "ka", "et", "see", "ma","oma","oli", "emoticon", "ei","ning", "seda", "või", "smile", "grin", "Kas", "kes", "veel"]
	for more in more_stopwords:
		STOPWORDS.add(more)
	utf=["Translation", "nüüd", "või", "ära", "Kas"]
	for u in utf:
		words4=words4.replace(u, "")
	wordcloud = WordCloud(stopwords=STOPWORDS).generate(words4)
	image = wordcloud.to_image()
	image.save("words.png","PNG")
	driver.close()
	driver.quit()  # quit() was missing its call parentheses
	conn.commit()
	conn.close()
	print("Done")
Example #16
def writeFreq(text, outFile, words):
    """
    Writes frequencies of words into the specified file
    """

    excludewords = STOPWORDS.copy()
    
    for word in words:
        excludewords.add(word)
    
    wordcloud = WordCloud(max_words=NUM_OF_WORDS, stopwords=excludewords)
    freqs = wordcloud.process_text(text)  # a dict of word -> count in current wordcloud versions

    for word, count in freqs.items():
        outFile.write(word + ',' + str(count) + '\n')
Example #17
def title_wordcloud(dataFrame):
    from wordcloud import WordCloud, STOPWORDS
    from PIL import Image
    #WordCloud Visualization
    text = " ".join(list(dataFrame['track_name']))
    STOPWORDS = STOPWORDS.union(["feat","Remix","Edit","Radio","Version","Mix","Remastered"])
    spotify_mask = np.array(Image.open(path.join( "spotify-logo.jpg")))
    wordcloud = WordCloud(width=2880, height=1800,background_color="white",
                          stopwords=STOPWORDS,mask = spotify_mask).generate(text)
    # Open a plot of the generated image.
    plt.figure( figsize=(10,6))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig("project3_wordcloud.png")
    plt.show()
Example #18
def main():
    d = os.path.dirname(__file__)
    DOC_NAME = "cvpr2015papers.txt"
    text = open(os.path.join(d, DOC_NAME)).read()

    # adding computer vision specific stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add("image")

    wc = WordCloud(max_words=300, stopwords=stopwords, width=800, height=400)
    wc.generate(text)
    wc.to_file(os.path.join(d, "cvpr2015wordcloud.png"))

    plt.imshow(wc)
    plt.axis("off")
    plt.show()
Example #19
def generate_word_cloud(text, mask_filename):
    d = path.dirname(__file__)
    mask = imread(path.join(d, mask_filename))

    # adding movie script specific stopwords
    stopwords = STOPWORDS.copy()
    stopwords.add("info")
    stopwords.add("meetbot")
    stopwords.add("supybot")

    wc = WordCloud(max_words=1000, mask=mask, stopwords=stopwords, margin=10,
                random_state=1).generate(text)

    _, tmpfilename = tempfile.mkstemp('-wordcloud.png')
    wc.to_file(tmpfilename)
    return tmpfilename
Example #20
def create_cloud(word, img, out_path):

    # Read the whole text.
    # text = open(word_path).read()
    text = word.read().decode('utf-8')
    # read the mask image
    # taken from
    # http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
    alice_mask = np.array(Image.open(img))
    # alice_mask = np.array(img_path)
    STOPWORDS.add("said")  # set.add() returns None; update the set before constructing
    wc = WordCloud(font_path = '华文黑体.ttf' ,background_color="white", max_words=2000, mask=alice_mask,
                   stopwords=STOPWORDS, width=1000, height=2300, ranks_only=True, mode='RGBA')
    # generate word cloud
    wc.generate(text)
    # wc.generate_from_frequencies([()])
    # store to file
    wc.to_file(out_path)
Example #21
def generate_wordcloud(text):
    def my_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        """
        To change colors change the range for random ints below:
        Hue values are between 0 and 360
        Follows rainbow: 
        Red Orange Yellow Green Blue Indigo Violet
         0   50  100  150  200  250  300   360
        """
        hue_lower = 0
        hue_upper = 150

        saturation = 500

        light_lower = 80
        light_upper = 120

        return "hsl(%d, %d%%, %d%%)" % (
            random.randint(hue_lower, hue_upper),
            saturation,
            random.randint(light_lower, light_upper),
        )

    stopwords = STOPWORDS.copy()
    stopwords.add("us")
    stopwords.add("one")
    stopwords.add("will")
    stopwords.add("u")

    rand_num = random.randint(1, 100)

    wc = WordCloud(
        max_words=100, stopwords=stopwords, margin=10, random_state=rand_num, width=2000, height=1200
    ).generate(text)

    fig = plt.figure(figsize=(32, 20), dpi=100)
    plt.imshow(wc.recolor(color_func=my_color_func, random_state=1))

    # Save image
    outfilename = "tmp.png"
    wc.to_file(outfilename)
    plt.axis("off")

    plt.show()
Example #22
def makeWC(theText, mask_image, mw):
    SW = STOPWORDS.copy()
    mywords = ['and', 'the', 'to', 'by', 'in', 'of', 'up',
           'Facebook', 'Twitter', 'Pinterest', 'Flickr',
           'Google', 'Instagram', 'login', 'Login', 'Log',
           'website', 'Website', 'Contact', 'contact',
           'twitter', 'Branding', 'Tweet', 'pic', 'location',
           'Details'
           ] + list(bad_words())
    SW.update(mywords)  # bulk-add; a list comprehension used only for side effects is unidiomatic
    wordcloud = WordCloud(
                relative_scaling=0, 
                prefer_horizontal=random.uniform(0.5, 1), 
                stopwords=SW,
                background_color='black',
                max_words=mw, 
                mask = mask_image
                ).generate(theText)
    return wordcloud
Example #23
def generate_wc(text = "Hello World"):
    #if int(time.time()*10)%10 in [0]:
    d = path.dirname(__file__)


    # read the mask image
    alice_coloring = np.array(Image.open(path.join(d, '..','static','images',"heart.png")))

    wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
                   stopwords=STOPWORDS.add("said"),
                   max_font_size=40, random_state=42)
    # generate word cloud
    wc.generate(text)

    # generate word cloud image and save it 
    filename = "wordcloud.png"
    wc.to_file(path.join(d,'..','static','images',filename))
    del wc

    return filename
Example #24
def WordCloudTopic( items , imagePath = None):
    # Generate a word cloud image

    if imagePath:
        alice_coloring = np.array(Image.open(imagePath))

        STOPWORDS.add("said")  # set.add() returns None; mutate the set before constructing
        wc = WordCloud(background_color="white", max_words=200, mask=alice_coloring,
                       stopwords=STOPWORDS,
                       max_font_size=300)
        # generate word cloud
        wc.generate_from_frequencies(items)
        image_colors = ImageColorGenerator(alice_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
    else:
        wc = WordCloud(background_color="white", max_words=300,
                       max_font_size=40, random_state=42)
        wordcloud = wc.generate_from_frequencies(items)
        plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Example #25
def main():
    parser = argparse.ArgumentParser(description='Generate word cloud')
    parser.add_argument('artist', help='Artist to be searched')
    parser.add_argument('--sum', action='store_true')  # referenced below as args.sum but missing from the original parser
    args = parser.parse_args()
    artist = string_to_url(args.artist)
    #artist = "Gaslight Anthem"

    api_url = "http://lyrics.wikia.com/api.php?func=getArtist&artist=%s&fmt=realjson" % (artist, )
    data = json.load(urllib2.urlopen(api_url))
    art_data = data['albums']

    songs_by_album = [album['songs'] for album in art_data]
    songs = sum(songs_by_album, [])
    lyrics = ""
    for song in songs:
        song = song.strip(bad_chars)
        lyrics += get_lyrics(string_to_url(song), artist)
        STOPWORDS.add("said")  # set.add() returns None; mutate the set first
        wc = WordCloud(background_color="white", max_words=2000, stopwords=STOPWORDS)
        if not args.sum:
            wc.generate(lyrics)
            wc.to_file("%s_%s.png" %(artist,song,))
Example #26
def wordcloud(wordSource):
    #writes original category list to a text file
    d = os.path.dirname(__file__)
    file = open("catagory.txt", 'w')
    for item in wordSource:
        file.write("%s\n" % item)
    file.close()  # flush before re-reading below
    thefile = open(os.path.join(d, "catagory.txt")).read()

    #adds words to exclude list
    STOPWORDS.add("chronic")
    STOPWORDS.add("disease")
    STOPWORDS.add("obstructive")
    STOPWORDS.add("status")

    # generate word cloud
    wordcloud = WordCloud(stopwords=STOPWORDS,
        background_color="white",
        width = 650,
        height = 250).generate_from_text(thefile)

    #re-colors and saves wordcloud as png
    wordcloud.recolor(color_func=grey_color_func, random_state=3)
    wordcloud.to_file("wordcloud.png")
Example #27
def cloud_word_with_mask(file_name):
	text = open(file_name).read()
	# read the mask / color image
	# amazon_coloring = imread('amazon-logo_grey.png')

	wc = WordCloud(background_color="white", max_words=200, #mask=amazon_coloring,
	               stopwords=STOPWORDS.add("said"),
	               max_font_size=200, random_state=42, width=1800, height=1000)
	# generate word cloud
	wc.generate(text)

	# create coloring from image
	# image_colors = ImageColorGenerator(amazon_coloring)

	# recolor wordcloud and show
	# we could also give color_func=image_colors directly in the constructor
	# plt.imshow(wc.recolor(color_func=image_colors))
	plt.figure()
	plt.imshow(wc)
	plt.axis("off")
	# plt.show()
	plt.savefig(file_name.split('.')[0] + '.png')
Example #28
def generateWordcloud(wordlist, outfile, title, nwords=100):
    """

    :param wordlist: words in a list
    :param outfile: name of the output file to which to store the figure
    :param title: title of the figure
    :param nwords: maximum number of words to plot

    :return: None
    """
    # generate word cloud
    STOPWORDS.add("looking")  # set.add() returns None; mutate the set before passing it
    wc = WordCloudSMN(background_color="white", max_words=nwords,
                      width=800, height=400,
                      stopwords=STOPWORDS,
                      max_font_size=80, random_state=42)
    wc.generate_SMN(wordlist)

    # generate the figure
    plt.figure(figsize=(16, 16))
    plt.title(title)
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(outfile)
    plt.close()
Example #29
from src.twitter import Tweet
from nltk.tokenize.casual import TweetTokenizer
from wordcloud import STOPWORDS
import numpy as np
import nltk as nlp
import sys
import os
import re
from keras.preprocessing import sequence
from keras.models import model_from_json
import nltk
nltk.download('punkt')
nltk.download('wordnet')

STOPWORDS.add("rt")
STOPWORDS.add("s")
STOPWORDS.add("u")
STOPWORDS.add("amp")
STOPWORDS.add("th")
STOPWORDS.add("will")
STOPWORDS.add("t")
STOPWORDS.add("m")

__author__ = 'Shivchander Sudalairaj'

json_file = open('model/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
Example #30
def grey_color_func(word,
                    font_size,
                    position,
                    orientation,
                    random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)


d = os.getcwd()

#these are the extracted words from bob murphy's article titles
filename = 'words.txt'
text = open(filename).read()

stopwords = STOPWORDS.copy()

width = 2400
height = 2200
colors = ['white', 'black']

for number_of_words in list_of_number_of_words:
    for color in colors:

        print "Making a " + color + " wordcloud with " + str(
            number_of_words) + " words."
        wordcloud = WordCloud(max_words=number_of_words,
                              stopwords=stopwords,
                              margin=0,
                              random_state=1,
                              width=width,
Example #31
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS  # used below but missing from the original imports

# Read the whole text.
df = pd.read_csv("train_set.csv", sep="\t")
my_category = df['Category']
my_content = df['Content']

# read the mask image
# taken from
# http://rtyuiope.deviantart.com/art/Code-Geass-Wallpaper-374008098
zero_mask = numpy.array(Image.open("zero.png"))
STOPWORDS.add("said")
wc = WordCloud(background_color="red",
               max_words=2000,
               mask=zero_mask,
               stopwords=STOPWORDS)

# generate word cloud
text = ""
for b in range(len(my_category.index)):
    if (my_category[b] == "Politics"):
        text += my_content[b]

wc.generate(text)

# store to file
wc.to_file("politics_cloud.png")
Example #32
def index():

    if request.method == 'POST':
        hashtag_name = request.form['hashtag']
        number = request.form['number']
        splitted_hashtags = [ht.strip() for ht in re.split(", ", hashtag_name)]
        if check_if_hashtags_are_valid(splitted_hashtags):
            results = []
            for tweet in tweepy.Cursor(api.search,
                                       q=splitted_hashtags,
                                       lang="en").items(int(number)):
                results.append(tweet)
            data_set = tweets_df(results)

            text = data_set["text"]
            for i in range(0, len(text)):
                txt = ' '.join(word for word in text[i].split()
                               if not word.startswith('https:'))
                data_set.at[i, 'text2'] = txt
            # de-duplicate once after the loop; doing it inside the loop
            # reshuffles the index while it is still being iterated
            data_set.drop_duplicates('text2', inplace=True)
            data_set.reset_index(drop=True, inplace=True)
            data_set.drop('text', axis=1, inplace=True)
            data_set.rename(columns={'text2': 'text'}, inplace=True)

            # Join all the text from the 1000 tweets
            text_Combined = " ".join(text.values.astype(str))
            more_stopwords = {
                'https', 'RT', 'rt', 'CO', '@', 'el', 't', '&amp;', 'covid',
                'covid 19', hashtag_name, hashtag_name[1:], '#covid19', 'tco',
                'covid19', 'amp', '@drericding'
            }
            stopwords = STOPWORDS.union(more_stopwords)
            covid = " ".join([word for word in text_Combined.split()])
            wordcount = {}

            # To eliminate duplicates, strip punctuation and normalize case.
            for word in covid.lower().split():
                word = word.replace(".", "")
                word = word.replace(",", "")
                word = word.replace(":", "")
                word = word.replace("\"", "")
                word = word.replace("!", "")
                word = word.replace("“", "")
                word = word.replace("‘", "")
                word = word.replace("*", "")
                if word not in stopwords:
                    if word not in wordcount:
                        wordcount[word] = 1
                    else:
                        wordcount[word] += 1

            word_counter = collections.Counter(wordcount)

            # Create a data frame of the most common words
            lst = word_counter.most_common(100)
            df = pd.DataFrame(lst, columns=['Word', 'Count'])
            text1 = df["Word"]
            text_Combined = " ".join(text1.values.astype(str))
            covid = " ".join([word for word in text_Combined.split()])

            #Create a Word Cloud
            wc = WordCloud(background_color="White",
                           stopwords=STOPWORDS.union(more_stopwords),
                           width=600,
                           height=300,
                           relative_scaling=0,
                           max_words=50)
            wc.generate(covid)
            wc.to_file('static/temporary_files/fig100.png')
            full_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                         'fig100.png')

            return render_template("search.html", image=full_filename)
        else:
            return render_template("index.html")
    else:
        return render_template("index.html")
Example #33
import tweepy
from Teil_06_Twitter_Credentials import *
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

auth = tweepy.OAuthHandler(ConsumerKey, ConsumerSecret)
api = tweepy.API(auth)
text = ''

tweeds = api.user_timeline(screen_name='realDonaldTrump',
                           count=100,
                           include_rts=False,
                           tweet_mode='extended')
for tweed in tweeds:
    text = text + ' ' + tweed.full_text

STOPWORDS.update(['https', 'co', 'amp'])  # extend the shared stop-word set before generating
wordcloud = WordCloud(width=1920, height=1200)
wordcloud.generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
Example #34
# coding: utf-8
import jieba
from scipy.misc import imread  # image-reading helper (deprecated in newer SciPy)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

back_color = imread('./dataset/test_picture.jpeg')  # load the mask image

STOPWORDS.add('苟利国')  # built-in stopwords plus '苟利国'; set.add() returns None, so add before the constructor

wc = WordCloud(
    background_color='white',  # background colour
    max_words=1000,  # maximum number of words
    mask=back_color,  # draw inside this mask; when set, width and height are ignored
    max_font_size=100,  # maximum font size
    stopwords=STOPWORDS,
    font_path="./dataset/Xingkai.ttc",  # a CJK font is needed to render the glyphs; swap in one from C:/Windows/Fonts/ if missing
    random_state=42,  # one PIL colour per word
    # width=1000,  # image width
    # height=860  # image height
)
# see the wordcloud documentation for the meaning of each parameter

# Add custom words to jieba's dictionary: after adding '金三胖', the tokenizer
# keeps it as one word instead of splitting it into '金三' or '三胖'
jieba.add_word('金三胖')

# open the source text file
text = open('./dataset/wordcloud.txt').read()

# text = open('./dataset/poetry.txt').read()
Example #35
def tag_and_lem(element):
    sent = pos_tag(word_tokenize(element))
    return ' '.join([
        lemmer.lemmatize(sent[k][0], convert_tag(sent[k][1][0]))
        for k in range(len(sent))
    ])


data.loc[:, 'tweet'] = data['tweet'].apply(lambda x: tag_and_lem(x))
data.loc[:, 'hashtags'] = data['hashtags'].apply(
    lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# In[6]:

from wordcloud import WordCloud, STOPWORDS
STOPWORDS.add('amp')  # set.add() returns None, so don't assign its result
stopwords = STOPWORDS

all_words = ' '.join(data.tweet.values)
hatred_words = ' '.join(data[data.label == 1].tweet.values)

plt.figure(figsize=(16, 8))

cloud1 = WordCloud(width=400,
                   height=400,
                   background_color='white',
                   stopwords=stopwords).generate(all_words)
plt.subplot(121)
plt.imshow(cloud1, interpolation="bilinear")
plt.axis("off")
plt.title('All tweets', size=20)
Example #36
def loaddata(Text,mods):
         #read the preprocessed data from pickle file
         df = pd.read_pickle("corpus.pkl")
         
         STOPWORDS.add("rt")
         STOPWORDS.add("s")
         STOPWORDS.add("u")
         STOPWORDS.add("amp")
         STOPWORDS.add("th")
         STOPWORDS.add("will")
         STOPWORDS.add("t")
         STOPWORDS.add("m")
         STOPWORDS.add("today")
        
         
         #split the data into train and test set
         from sklearn.model_selection import train_test_split
         train, test = train_test_split(df, test_size=0.3, train_size=0.7, random_state=14)
         
         
         #performing stemming
         lt = LancasterStemmer()
         def token(text):
             txt = nltk.word_tokenize(text.lower())
             return [lt.stem(word) for word in txt]
         
         #document term matrix using Tfidf vectorizer
         tfv = TfidfVectorizer(tokenizer=token,stop_words=STOPWORDS,analyzer=u'word', min_df=4)
         X_train_tfv = tfv.fit_transform(train['clean_tweet']) 
         X_test_tfv = tfv.transform(test['clean_tweet']) 
     
        
         X_train_tfv = pd.DataFrame(X_train_tfv.toarray(), columns=tfv.get_feature_names())
         X_test_tfv = pd.DataFrame(X_test_tfv.toarray(), columns=tfv.get_feature_names())
         
         if(mods=="MNB"):
             
             st.success("Performing MNB Classification")
             #build the model
             nb = MultinomialNB()
             # Train the model
             nb.fit(X_train_tfv, train['Party_log'])
             
             #transform the entered text into document term matrix
             vec_text = tfv.transform(Text).toarray()
             #predicting the value for newly entered tweet
             result = nb.predict(vec_text)
             #if result is 1 then democrat else republican
         else:
             st.success("Performing Logistic Regression")
             #build the model
             lr = LogisticRegression()
             # Train the model
             lr.fit(X_train_tfv, train['Party_log'])
             
             #transform the entered text into document term matrix
             vec_text = tfv.transform(Text).toarray()
             #predicting the value for newly entered tweet
             result = lr.predict(vec_text)
             #if result is 1 then democrat else republican
             
         if result == 1:
              return "demo"
         elif result == 0:
              return "rep"
Example #37
def main():
    
    def loaddata(Text,mods):
            #read the preprocessed data from pickle file
            df = pd.read_pickle("corpus.pkl")
            
            STOPWORDS.add("rt")
            STOPWORDS.add("s")
            STOPWORDS.add("u")
            STOPWORDS.add("amp")
            STOPWORDS.add("th")
            STOPWORDS.add("will")
            STOPWORDS.add("t")
            STOPWORDS.add("m")
            STOPWORDS.add("today")
           
            
            #split the data into train and test set
            from sklearn.model_selection import train_test_split
            train, test = train_test_split(df, test_size=0.3, train_size=0.7, random_state=14)
            
            
            #performing stemming
            lt = LancasterStemmer()
            def token(text):
                txt = nltk.word_tokenize(text.lower())
                return [lt.stem(word) for word in txt]
            
            #document term matrix using Tfidf vectorizer
            tfv = TfidfVectorizer(tokenizer=token,stop_words=STOPWORDS,analyzer=u'word', min_df=4)
            X_train_tfv = tfv.fit_transform(train['clean_tweet']) 
            X_test_tfv = tfv.transform(test['clean_tweet']) 
        
           
            X_train_tfv = pd.DataFrame(X_train_tfv.toarray(), columns=tfv.get_feature_names())
            X_test_tfv = pd.DataFrame(X_test_tfv.toarray(), columns=tfv.get_feature_names())
            
            if(mods=="MNB"):
                
                st.success("Performing MNB Classification")
                #build the model
                nb = MultinomialNB()
                # Train the model
                nb.fit(X_train_tfv, train['Party_log'])
                
                #transform the entered text into document term matrix
                vec_text = tfv.transform(Text).toarray()
                #predicting the value for newly entered tweet
                result = nb.predict(vec_text)
                #if result is 1 then democrat else republican
            else:
                st.success("Performing Logistic Regression")
                #build the model
                lr = LogisticRegression()
                # Train the model
                lr.fit(X_train_tfv, train['Party_log'])
                
                #transform the entered text into document term matrix
                vec_text = tfv.transform(Text).toarray()
                #predicting the value for newly entered tweet
                result = lr.predict(vec_text)
                #if result is 1 then democrat else republican
                
            if result == 1:
                 return "demo"
            elif result == 0:
                 return "rep"
                
    
    st.title("Sentiment Analysis ")
    st.title("Democrats vs Republicans Twitter Data")
    # available NLP techniques
    activities=["Prediction","NLP"]
    
    #using streamlit sidebar option
    choice = st.sidebar.selectbox("Select Activity",activities)
    
    #if prediction is chosen 
    if choice == "Prediction":
        #read the text from the text_area
        Tweet_text = st.text_area("Enter Text","Type Here")
        
        #cleaning the tweet entered
        url_re = re.compile(r'http\S+')
        punc_re = re.compile('[%s]' % re.escape(string.punctuation))
        num_re = re.compile(r'(\d+)')
        alpha_num_re = re.compile("[^a-zA-Z]")
        # convert to lowercase
        Tweet_text = Tweet_text.lower()
        # remove hyperlinks
        Tweet_text = url_re.sub(' ', Tweet_text)
        # remove punctuation
        Tweet_text = punc_re.sub(' ', Tweet_text)
        # remove numeric 'words'
        Tweet_text = num_re.sub(' ', Tweet_text)
        Tweet_text = alpha_num_re.sub(' ', Tweet_text)
                    
        #just considering the model with highest accuracy, can include other models
        all_ml_models = ["MNB","LRM"]
        #display the models using streamlit selectbox
        model_choice=st.selectbox("Choose ML Model",all_ml_models)
        
        #streamlit button
        if st.button("Classify"):
            
            #displaying the preprocessed data
            st.text("Pre-Processed Data (stop words will be removed while creating document term matrix(tfidf Vectorizer))::\n{}".format([Tweet_text]))
            
            #if statement runs depending on the model chosen 
            if model_choice == "MNB":
                
                st.success("You have chosen Multinominal Naive Bayes model")
                #function loaddata returns the predicted party
                prediction = loaddata([Tweet_text],model_choice)
                
                
                if prediction == "demo":
                    #display the results
                    
                    st.success('Party:{}'.format("Democrat"))
                    #path for the image
                    image='Images/Democrat.jpg'
                    img=Image.open(image)
                    #display the image 
                    st.image(img,width=300)
                    
                else:
                    st.success('Party:{}'.format("Republican"))
                    image='Images/Republican.jpg'
                    img=Image.open(image)
                    st.image(img,width=300)
                    
            if model_choice == "LRM":
                
                st.success("You have chosen Logistic Regression model")
                #function loaddata returns the predicted party
                prediction = loaddata([Tweet_text],model_choice)
                
                
                if prediction == "demo":
                    #display the results
                    st.success('Party:{}'.format("Democrat"))
                    #path for the image
                    image='Images/Democrat.jpg'
                    img=Image.open(image)
                    #display the image 
                    st.image(img,width=300)
                else:
                    st.success('Party:{}'.format("Republican"))
                    image='Images/Republican.jpg'
                    img=Image.open(image)
                    st.image(img,width=300)
                    
                    
    
    #if chosen option is nlp
    if choice == 'NLP':
        st.info("Natural Language Processing")
        
        #enter the new tweet
        Tweet_text = st.text_area("Enter Here","Type Here")
        
        #cleaning the tweet entered
        url_re = re.compile(r'http\S+')
        punc_re = re.compile('[%s]' % re.escape(string.punctuation))
        num_re = re.compile(r'(\d+)')
        alpha_num_re = re.compile("[^a-zA-Z]")
        # convert to lowercase
        Tweet_text = Tweet_text.lower()
        # remove hyperlinks
        Tweet_text = url_re.sub(' ', Tweet_text)
        # remove punctuation
        Tweet_text = punc_re.sub(' ', Tweet_text)
        # remove numeric 'words'
        Tweet_text = num_re.sub(' ', Tweet_text)
        Tweet_text = alpha_num_re.sub(' ', Tweet_text)
        
        STOPWORDS.add("rt")
        STOPWORDS.add("s")
        STOPWORDS.add("u")
        STOPWORDS.add("amp")
        STOPWORDS.add("th")
        STOPWORDS.add("will")
        STOPWORDS.add("t")
        STOPWORDS.add("m")
        
        
       
        list_pos = 0
        cleaned_str = ''
        text = Tweet_text.split()
        for word in text:
            if word not in STOPWORDS:
                if list_pos == 0:
                    cleaned_str = word
                else:
                    cleaned_str = cleaned_str + ' ' + word
                list_pos += 1
            
        
        #clean tweet             
        Tweet_text = cleaned_str
        #options available
        nlp_options=["Tokenization","Lemmatization","POS Tags"]
        #selected option 
        nlp_choice=st.selectbox("Choose the NlP option",nlp_options)
        
        if st.button("Start"):
            
            #displaying the cleaned tweet
            st.info("Original Text::\n{}".format(Tweet_text))
            
            #nlp coverts text to processed docx object that is understood by spacy
            Sentence= nlp(Tweet_text)
            if nlp_choice=='Tokenization':
                result=[token.text for token in Sentence]
            elif nlp_choice == 'Lemmatization':
                result = ["'Token':{},'Lemma':{}".format(token.text,token.lemma_) for token in Sentence]
            elif nlp_choice == 'POS Tags':
                result = ["'Token':{},'POS':{},'Dependency':{}".format(word.text,word.tag_,word.dep_) for word in Sentence]

            st.json(result)
            
            #display the results in table form
        if st.button("Tabulize"):
            docx = nlp(Tweet_text)
            c_tokens = [token.text for token in docx ]
            c_lemma = [token.lemma_ for token in docx ]
            c_pos = [token.pos_ for token in docx ]
            
            #creating dataframe using the results
            new_df = pd.DataFrame(zip(c_tokens,c_lemma,c_pos),columns=['Tokens','Lemma','POS'])
            #display df
            st.dataframe(new_df)
        
        #display using wordcloud
        if st.checkbox("WordCloud"):
            fig, ax = plt.subplots(figsize=(15,5))
            c_text = Tweet_text
            wordcloud = WordCloud().generate(c_text)
            plt.imshow(wordcloud,interpolation='bilinear')
            plt.axis("off")
            st.pyplot(fig)
            st.set_option('deprecation.showPyplotGlobalUse', False)
Example #38
from os import path

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'sozler3.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "mask3.png")))

wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS.add("yorulunca"))
# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "mask_output3.png"))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
Example #39
#!/usr/bin/python3.7

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

with open("alice_in_wonderland.txt") as f:
    text = f.read()

STOPWORDS.add('said')
STOPWORDS.add('Illustration')  # extend the shared set before generate(), or it has no effect

worldcloud = WordCloud(width=1920, height=1200)
worldcloud.generate(text)

plt.imshow(worldcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
Example #40
f = open('tweets', 'wb')

for status in api.user_timeline():
    f.write(api.get_status(status.id).text.encode("utf-8"))
f.close()

words = ' '
count = 0
f = open('tweets', 'rb')
for line in f:
    words = words + line.decode("utf-8")
f.close()  # was missing the call parentheses

stopwords = {'https', 'co', 'RT'}

logomask = imread('twitter_mask.png')

wordcloud = WordCloud(font_path='/Users/Ryan/Library/Fonts/Inconsolata.otf',
                      stopwords=STOPWORDS.union(stopwords),
                      background_color='white',
                      mask=logomask,
                      max_words=500,
                      width=1800,
                      height=1400).generate(words)

plt.imshow(wordcloud.recolor(color_func=None, random_state=3))
plt.axis('off')
plt.savefig('./Twitter Word Cloud - ' + time.strftime("%Y%m%d") + '.png',
            dpi=300)
plt.show()
Example #41
from os import path

from scipy.misc import imread
#import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'hot_key.txt')).read()

# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = imread(path.join(d, "alice_mask.png"))

wc = WordCloud(font_path="simhei.ttf", background_color="white", max_words=2000, mask=alice_mask,
               stopwords=STOPWORDS.add("Qq"))
# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "alice_Chinese.png"))

# show
# plt.imshow(wc)
# plt.axis("off")
# plt.figure()
# plt.imshow(alice_mask, cmap=plt.cm.gray)
# plt.axis("off")
# plt.show()
Example #42
from os import path

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud, STOPWORDS
STOPWORDS.add('said')

d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

text = open(path.join(d, 'alice in Wonderland.txt')).read()

alice_mask = np.array(Image.open(path.join(d, 'alice_mask.png')))

STOPWORDS = set(STOPWORDS)  # local copy; rebinding the name no longer affects the library default
STOPWORDS.add("said")

wordcloud = WordCloud(background_color="white",
                      max_words=200,
                      mask=alice_mask,
                      stopwords=STOPWORDS,  # the copy must be passed explicitly to take effect
                      contour_width=3,
                      contour_color='steelblue')
wordcloud.generate(text)
wordcloud.to_file(path.join("alice.png"))

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.axis("off")
plt.show()
Example #43
#!/usr/bin/env python3

import argparse
import pathlib
import random

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

STOPWORDS.update(["self", 'item', 'refdef', 'http', 'https', 'null'])  # update() returns None, so don't assign it
stopwords = STOPWORDS

# def random_color_func(
#     word, font_size, position, orientation, random_state=None, **kwargs
# ):
#     h = int(360.0 * 143.0 / 255.0)
#     s = int(77.0 * 255.0 / 255.0)
#     l = int(100.0 * float(random.randint(44, 100)) / 255.0)
#     return "hsl({}, {}%, {}%)".format(h, s, l)


def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--filename",
                        "-f",
                        dest="filename",
                        required=True,
                        help="Markdown/text file")
    parser.add_argument(
        "--save_dir",
        "-s",
Example #44
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator  # visualize word frequencies

use_image = np.array(Image.open("apple_icon.png"))  # mask image
img_color = np.array(Image.open("apple_icon.png"))  # colour reference for ImageColorGenerator

jobs_speech = "I am honored to be with you today at your commencement from one of the finest universities in the world. I never graduated from college. Truth be told, this is the closest I’ve ever gotten to a college graduation. Today I want to tell you three stories from my life. That’s it. No big deal. Just three stories.Related to this story 2005 Stanford Commencement coverage The first story is about connecting the dots.I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit. So why did I drop out?It started before I was born. My biological mother was a young, unwed college graduate student, and she decided to put me up for adoption. She felt very strongly that I should be adopted by college graduates, so everything was all set for me to be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the last minute that they really wanted a girl. So my parents, who were on a waiting list, got a call in the middle of the night asking: “We have an unexpected baby boy; do you want him?” They said: “Of course.” My biological mother later found out that my mother had never graduated from college and that my father had never graduated from high school. She refused to sign the final adoption papers. She only relented a few months later when my parents promised that I would someday go to college.And 17 years later I did go to college. But I naively chose a college that was almost as expensive as Stanford, and all of my working-class parents’ savings were being spent on my college tuition. After six months, I couldn’t see the value in it. I had no idea what I wanted to do with my life and no idea how college was going to help me figure it out. And here I was spending all of the money my parents had saved their entire life. So I decided to drop out and trust that it would all work out OK. It was pretty scary at the time, but looking back it was one of the best decisions I ever made. The minute I dropped out I could stop taking the required classes that didn’t interest me, and begin dropping in on the ones that looked interesting.It wasn’t all romantic. I didn’t have a dorm room, so I slept on the floor in friends’ rooms, I returned Coke bottles for the 5¢ deposits to buy food with, and I would walk the 7 miles across town every Sunday night to get one good meal a week at the Hare Krishna temple. I loved it. And much of what I stumbled into by following my curiosity and intuition turned out to be priceless later on. Let me give you one example:Reed College at that time offered perhaps the best calligraphy instruction in the country. Throughout the campus every poster, every label on every drawer, was beautifully hand calligraphed. Because I had dropped out and didn’t have to take the normal classes, I decided to take a calligraphy class to learn how to do this. I learned about serif and sans serif typefaces, about varying the amount of space between different letter combinations, about what makes great typography great. It was beautiful, historical, artistically subtle in a way that science can’t capture, and I found it fascinating.None of this had even a hope of any practical application in my life. But 10 years later, when we were designing the first Macintosh computer, it all came back to me. And we designed it all into the Mac. It was the first computer with beautiful typography. 
If I had never dropped in on that single course in college, the Mac would have never had multiple typefaces or proportionally spaced fonts. And since Windows just copied the Mac, it’s likely that no personal computer would have them. If I had never dropped out, I would have never dropped in on this calligraphy class, and personal computers might not have the wonderful typography that they do. Of course it was impossible to connect the dots looking forward when I was in college. But it was very, very clear looking backward 10 years later.Again, you can’t connect the dots looking forward; you can only connect them looking backward. So you have to trust that the dots will somehow connect in your future. You have to trust in something — your gut, destiny, life, karma, whatever. This approach has never let me down, and it has made all the difference in my life.My second story is about love and loss.I was lucky — I found what I loved to do early in life. Woz and I started Apple in my parents’ garage when I was 20. We worked hard, and in 10 years Apple had grown from just the two of us in a garage into a $2 billion company with over 4,000 employees. We had just released our finest creation — the Macintosh — a year earlier, and I had just turned 30. And then I got fired. How can you get fired from a company you started? Well, as Apple grew we hired someone who I thought was very talented to run the company with me, and for the first year or so things went well. But then our visions of the future began to diverge and eventually we had a falling out. When we did, our Board of Directors sided with him. So at 30 I was out. And very publicly out. What had been the focus of my entire adult life was gone, and it was devastating.I really didn’t know what to do for a few months. I felt that I had let the previous generation of entrepreneurs down — that I had dropped the baton as it was being passed to me. I met with David Packard and Bob Noyce and tried to apologize for screwing up so badly. I was a very public failure, and I even thought about running away from the valley. But something slowly began to dawn on me — I still loved what I did. The turn of events at Apple had not changed that one bit. I had been rejected, but I was still in love. And so I decided to start over.I didn’t see it then, but it turned out that getting fired from Apple was the best thing that could have ever happened to me. The heaviness of being successful was replaced by the lightness of being a beginner again, less sure about everything. It freed me to enter one of the most creative periods of my life.During the next five years, I started a company named NeXT, another company named Pixar, and fell in love with an amazing woman who would become my wife. Pixar went on to create the world’s first computer animated feature film, Toy Story, and is now the most successful animation studio in the world. In a remarkable turn of events, Apple bought NeXT, I returned to Apple, and the technology we developed at NeXT is at the heart of Apple’s current renaissance. And Laurene and I have a wonderful family together.I’m pretty sure none of this would have happened if I hadn’t been fired from Apple. It was awful tasting medicine, but I guess the patient needed it. Sometimes life hits you in the head with a brick. Don’t lose faith. I’m convinced that the only thing that kept me going was that I loved what I did. You’ve got to find what you love. And that is as true for your work as it is for your lovers. 
Your work is going to fill a large part of your life, and the only way to be truly satisfied is to do what you believe is great work. And the only way to do great work is to love what you do. If you haven’t found it yet, keep looking. Don’t settle. As with all matters of the heart, you’ll know when you find it. And, like any great relationship, it just gets better and better as the years roll on. So keep looking until you find it. Don’t settle.My third story is about death.When I was 17, I read a quote that went something like: “If you live each day as if it was your last, someday you’ll most certainly be right.” It made an impression on me, and since then, for the past 33 years, I have looked in the mirror every morning and asked myself: “If today were the last day of my life, would I want to do what I am about to do today?” And whenever the answer has been “No” for too many days in a row, I know I need to change something.Remembering that I’ll be dead soon is the most important tool I’ve ever encountered to help me make the big choices in life. Because almost everything — all external expectations, all pride, all fear of embarrassment or failure — these things just fall away in the face of death, leaving only what is truly important. Remembering that you are going to die is the best way I know to avoid the trap of thinking you have something to lose. You are already naked. There is no reason not to follow your heart.About a year ago I was diagnosed with cancer. I had a scan at 7:30 in the morning, and it clearly showed a tumor on my pancreas. I didn’t even know what a pancreas was. The doctors told me this was almost certainly a type of cancer that is incurable, and that I should expect to live no longer than three to six months. My doctor advised me to go home and get my affairs in order, which is doctor’s code for prepare to die. It means to try to tell your kids everything you thought you’d have the next 10 years to tell them in just a few months. It means to make sure everything is buttoned up so that it will be as easy as possible for your family. It means to say your goodbyes.I lived with that diagnosis all day. Later that evening I had a biopsy, where they stuck an endoscope down my throat, through my stomach and into my intestines, put a needle into my pancreas and got a few cells from the tumor. I was sedated, but my wife, who was there, told me that when they viewed the cells under a microscope the doctors started crying because it turned out to be a very rare form of pancreatic cancer that is curable with surgery. I had the surgery and I’m fine now.This was the closest I’ve been to facing death, and I hope it’s the closest I get for a few more decades. Having lived through it, I can now say this to you with a bit more certainty than when death was a useful but purely intellectual concept:No one wants to die. Even people who want to go to heaven don’t want to die to get there. And yet death is the destination we all share. No one has ever escaped it. And that is as it should be, because Death is very likely the single best invention of Life. It is Life’s change agent. It clears out the old to make way for the new. Right now the new is you, but someday not too long from now, you will gradually become the old and be cleared away. Sorry to be so dramatic, but it is quite true.Your time is limited, so don’t waste it living someone else’s life. Don’t be trapped by dogma — which is living with the results of other people’s thinking. 
Don’t let the noise of others’ opinions drown out your own inner voice. And most important, have the courage to follow your heart and intuition. They somehow already know what you truly want to become. Everything else is secondary.When I was young, there was an amazing publication called The Whole Earth Catalog, which was one of the bibles of my generation. It was created by a fellow named Stewart Brand not far from here in Menlo Park, and he brought it to life with his poetic touch. This was in the late 1960s, before personal computers and desktop publishing, so it was all made with typewriters, scissors and Polaroid cameras. It was sort of like Google in paperback form, 35 years before Google came along: It was idealistic, and overflowing with neat tools and great notions.Stewart and his team put out several issues of The Whole Earth Catalog, and then when it had run its course, they put out a final issue. It was the mid-1970s, and I was your age. On the back cover of their final issue was a photograph of an early morning country road, the kind you might find yourself hitchhiking on if you were so adventurous. Beneath it were the words: “Stay Hungry. Stay Foolish.” It was their farewell message as they signed off. Stay Hungry. Stay Foolish. And I have always wished that for myself. And now, as you graduate to begin anew, I wish that for you.Stay Hungry. Stay Foolish.Thank you all very much."

# Add any banned keywords to this list so they are excluded from the cloud
add_STOPWORDS = []
for word in add_STOPWORDS:
    STOPWORDS.add(word)

# Word-cloud image settings
wordcloud = WordCloud(
    width=int(480 * 0.75),   # WordCloud expects integer pixel dimensions
    height=int(320 * 0.75),
    background_color="white",
    max_words=2000,
    mask=use_image,
    contour_width=0,
    contour_color="steelblue",
)

wordcloud.generate(jobs_speech)  # pass the variable holding the text data
image_colors = ImageColorGenerator(img_color)  # build a color generator from the source image
color_mark = wordcloud.recolor(color_func=image_colors)  # recolor the cloud with the image's colors
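
# A minimal, assumed continuation (not part of the original snippet): display
# the recolored cloud and save it. Assumes matplotlib.pyplot is imported as
# plt; the output filename is an assumption.
plt.figure(figsize=(8, 6))
plt.imshow(color_mark, interpolation="bilinear")
plt.axis("off")
wordcloud.to_file("jobs_speech_wordcloud.png")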
Ejemplo n.º 45
0
def Contacts_greater_than_5(filename):
    df = pd.read_csv(os.path.join('csvs',filename))
    os.remove(os.path.join('csvs', filename))
    df['Date'] = pd.to_datetime(df['Date'])
    df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
    # note: despite its name, `fig` holds a GridSpec used to index the subplots below
    fig = plt.GridSpec(13, 4, wspace=0.4, hspace=0.5)
    plt.figure(figsize=(16, 50))

    # title
    ax1 = plt.subplot(fig[0, :])
    ax1.text(0.2, 0.4, 'CHAT ANALYSIS', weight='bold',
            color='#470070', fontsize="60")
    #sb.despine(left=True, bottom=True, ax=ax1)
    plt.xticks([], [])
    plt.yticks([], [])


    # 1st Row----------------------------
    ax2 = plt.subplot(fig[1, 0])
    msgs = df.shape[0]
    ax2.text(0.5, 0.4, msgs, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax2.text(0.5, 0.1, 'Total Messages', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax2, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax3 = plt.subplot(fig[1, 1])
    members = np.unique(df['Contacts']).shape[0]
    ax3.text(0.5, 0.4, members, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax3.text(0.5, 0.1, 'Members', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax3, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax4 = plt.subplot(fig[1, 2])
    sDate = df['Date'][0]
    ax4.text(0.5, 0.4, sDate, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax4.text(0.5, 0.1, 'Start Date', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax4, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax5 = plt.subplot(fig[1, 3])
    eDate = df['Date'][df.shape[0]-1]
    ax5.text(0.5, 0.4, eDate, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax5.text(0.5, 0.1, 'End Date', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax5, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    # 2nd Row-----------------------------
    ax6 = plt.subplot(fig[2, 0])
    total_words = 0
    for msg in df['Messages']:
        total_words += len(str(msg).split(' '))

    avgMsg = str(total_words / df.shape[0])
    ax6.text(0.5, 0.4, avgMsg[:4]+' words', horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax6.text(0.5, 0.1, 'Average msg length', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax6, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax7 = plt.subplot(fig[2, 1])
    length = 0
    name = ""
    for msg in df['Messages']:
        if(length < len(str(msg).split(' '))):
            length = len(str(msg).split(' '))
            name = df[df['Messages'] == msg]['Contacts'].values[0]

    ax7.text(0.5, 0.4, str(length)+' words', horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax7.text(0.5, 0.1, 'Maximum msg length', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax7, left=True)
    plt.xticks([], [])
    plt.yticks([], [])

    ax8 = plt.subplot(fig[2, 2])
    week = {0: "Monday", 1: "Tuesday", 2: "Wednesday",
            3: "Thursday", 4: "Friday", 5: 'Saturday', 6: 'Sunday'}
    busy_day = week[Counter(pd.to_datetime(
        df['Date']).dt.weekday).most_common(1)[0][0]]
    ax8.text(0.5, 0.4, busy_day, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax8.text(0.5, 0.1, 'Most Busy WeekDay', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax8, left=True)
    plt.xticks([], [])
    plt.yticks([], [])


    ax9 = plt.subplot(fig[2, 3])
    month = {1: "January", 2: "February", 3: "March", 4: "April", 5: "May", 6: "June", 7: "July",
            8: "August", 9: "September", 10: "October", 11: "November", 12: "December"}
    busy_month = month[Counter(pd.to_datetime(
        df['Date']).dt.month).most_common(1)[0][0]]
    ax9.text(0.5, 0.4, busy_month, horizontalalignment='center',
            color='#9f21de', fontsize="30")
    ax9.text(0.5, 0.1, '    Most Busy Month    ', horizontalalignment='center',
            color='#8f8da6', fontsize="20")
    sb.despine(ax=ax9, left=True)
    plt.xticks([], [])
    plt.yticks([], [])


    # 3rd Row-----------------------------
    ax10 = plt.subplot(fig[3, :])
    ax10.set_facecolor('#9f21de')
    ax10.text(0.5, 0.4, name, weight='bold',
            horizontalalignment='center', color='white', fontsize="30")
    ax10.text(0.5, 0.1, 'Maximum Length Message Send By',
            horizontalalignment='center', color='#e9ddf0', fontsize="20")
    sb.despine(ax=ax10, left=True)
    plt.xticks([], [])
    plt.yticks([], [])


    # pie chart---------------------------
    pie_plot = plt.subplot(fig[4:6, :2])
    # label each message 'am' or 'pm' based on its timestamp
    df['Shift'] = ['am' if str(t).endswith('am') else 'pm' for t in df['Time']]

    recipe = list(df.groupby('Shift').count()['Time'].index)
    data = list(df.groupby('Shift').count()['Time'].values)
    labels = [recipe[0] + '\n' + str(data[0]) + ' msgs',
              recipe[1] + '\n' + str(data[1]) + ' msgs']

    pie_plot.pie(data, textprops=dict(fontsize=18, color="black"),
                 wedgeprops=dict(width=0.45), startangle=20, labels=labels)

    pie_plot.set_title("Messages in respective Meridian", fontsize=20)
    sb.despine(ax=pie_plot, left=True, bottom=True)

    # top active bar chart----------------
    top_active = plt.subplot(fig[4:8, 2:])
    sorted_active = df.groupby('Contacts').count()['Time'].sort_values()
    if(df.groupby('Contacts').count().shape[0] > 10):
        sb.barplot(sorted_active[-10:].values,
                sorted_active[-10:].index,
                ax=top_active,
                palette='spring'
                )
        j = -10
        for i, v in enumerate(sorted_active.values[-10:]):
            top_active.text(
                0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20)
            j += 1
    else:
        sb.barplot(sorted_active.values,
                sorted_active.index,
                ax=top_active,
                palette='spring'
                )
        j = -1*len(sorted_active.values)
        for i, v in enumerate(sorted_active.values):
            top_active.text(
                0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20)
            j += 1
    top_active.set_title("Most Active Members", fontsize=20)
    top_active.set_yticks([], [])
    top_active.set_ylabel("")
    sb.despine(ax=top_active, left=True)


    # least active data------------------
    least_active = plt.subplot(fig[6:8, :2])
    sorted_active = df.groupby('Contacts').count()['Time'].sort_values()
    if(df.groupby('Contacts').count().shape[0] > 5):
        sb.barplot(sorted_active[:5].values,
                sorted_active[:5].index,
                ax=least_active,
                palette='spring'
                )
        j = 0
        for i, v in enumerate(sorted_active.values[:5]):
            least_active.text(
                0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20)
            j += 1
    else:
        sb.barplot(sorted_active.values,
                sorted_active.index,
                ax=least_active,
                palette='spring'
                )
        j = 0
        for i, v in enumerate(sorted_active.values):
            least_active.text(
                0, i + 0.2, str(sorted_active.index[j]), color='black', fontsize=20)
            j += 1
    least_active.set_title("Least Active Members", fontsize=20)
    least_active.set_yticks([], [])
    least_active.set_ylabel("")
    sb.despine(ax=least_active, left=True)


    # weekday wise msgs------------------
    week_plot = plt.subplot(fig[8:10, :])
    weekday = Counter(pd.to_datetime(df['Date']).dt.weekday)
    od = collections.OrderedDict(sorted(weekday.items()))
    values = []
    for value in od.values():
        values.append(value)
    keys = []
    for key in od.keys():
        keys.append(key)
    week = ["Monday", 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday', 'Sunday']

    x = []
    for k in keys:
        x.append(week[k])

    sb.barplot(x, values, palette='plasma', ax=week_plot)
    week_plot.set_xticklabels(x, fontsize=16)
    week_plot.set_title("WeekDay-wise Messages", fontsize=20)
    sb.despine(ax=week_plot)


    # WordCloud---------------------------
    word_Cloud = plt.subplot(fig[10:, :])
    new_stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
                'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it',
                "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
                'that', "that'll",'nan','media','omitted','media omitted'
                'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
                'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
                'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
                'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
                'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
                'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
                'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
                'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
                'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
                "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", '1', '2', '3', '4', '5', '6', '7',
                '8', '9', '0', '.', ',', '/', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '+', '-'
                ]

    for stop in new_stop:
        STOPWORDS.add(stop)

    media_count = 0

    comment_words = ' '
    stopwords = set(STOPWORDS)

    # iterate through the messages column
    for val in df['Messages']:

        # cast each value to a string
        val = str(val)

        # count placeholder messages for removed media
        if "media omitted" in val:
            media_count += 1

        # split the message and lowercase each token
        tokens = [token.lower() for token in val.split()]

        for word in tokens:
            comment_words = comment_words + word + ' '


    wordcloud = WordCloud(width=1400, height=800,
                        background_color='white',
                        stopwords=stopwords,
                        min_font_size=15,
                        max_font_size=100,
                        colormap='plasma').generate(comment_words)

    word_Cloud.set_title("WORD CLOUD", fontsize=40)
    word_Cloud.imshow(wordcloud)
    word_Cloud.axis("off")

    plt.savefig(os.path.join('static/images/dashboard',filename+'.png'), bbox_inches='tight')
    return
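
# Example usage (assumed): the CSV is looked up under 'csvs/' and is deleted
# after loading, so the call is shown commented out.
# Contacts_greater_than_5('group_chat.csv')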
Ejemplo n.º 46
0
def stop_words_for_wordcloud(stop_words):
    for word in stop_words:
        STOPWORDS.add(word)
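
# Example usage (assumed): extend the shared STOPWORDS set before building a cloud.
stop_words_for_wordcloud(['http', 'amp', 'rt'])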
Ejemplo n.º 47
0
import tweepy
import wordcloud
from Credentialstwitter import *
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

auth = tweepy.OAuthHandler(ConsumerKey, ConsumerSecret)
api = tweepy.API(auth)
text = " "

tweets = api.user_timeline(screen_name="maiconkusterkkk",
                           count=1000,
                           include_rts=False,
                           tweet_mode="extended")
for tweet in tweets:
    #print(tweet.full_text)
    text = text + " " + tweet.full_text

wordcloud = WordCloud(width=1920, height=1200)
STOPWORDS.update(["hppt", "https", "co", "da","de","em","na","se","às","como","que", "para", "os", "dos", "das", "assim", "quais","feira","um", "uma", "mais", "ao", "por","pelo","pela",\
    "como", "nosso", "nossa", "zu", "das", "zu","die","der","dem","und","auf","ein","nicht","von","wie","wird", "daß", "dass","mit","für", "Sie","sie","er","noch","vor","ist", "bei",\
    "wenn", "sich", "den", "hat", "des", "diese", "diesen", "dieses", "dieser", "über", "eine", "einer", "einen", "eines", "auch", "es", "werden", "auch", "im", "als", "uns", "sehr",\
    "aber", "einem", "zur", "nun", "mehr", "zum", "durch", "sind", "kann", "man", "aus", "nur", "haben", "will", "é" ])
wordcloud.generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
Ejemplo n.º 48
0
# load config file
config = SafeConfigParser()
script_dir = path.dirname(__file__)
config_file = path.join(script_dir, 'config/settings.cfg')
config.read(config_file)

# tell script where to put the JSON files returned
logfile = config.get('files','logfile')
listfile = config.get('files','listfile')
outfolder = config.get('files','outfolder')

# get usernames
users = get_users(listfile)

# add stop words
STOPWORDS.add('https')

# create a word cloud for each user
for user in users:

    # get image masks for different users
    # from http://masterkoyo.deviantart.com/art/Template-Donald-Trump-35925789
    # from https://openclipart.org/detail/211473/jeb-bush-outlines
    # from http://www.spstencils.com/shop/politics/hilary-clinton-stencil/
    image_mask = None
    try:
        image_mask = imread(path.join(script_dir, ".".join([user,'jpg'])))
        print(user)
    except IOError:
        print('Cannot open file ' + user + '.jpg under directory ' + script_dir)
Ejemplo n.º 49
0
def is_alphabet(uchar):
    # True if the character is an ASCII letter (A-Z or a-z)
    if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061'
                                                       and uchar <= u'\u007a'):
        return True
    else:
        return False


def format_str(content):
    content_str = ''
    for i in content:
        if is_chinese(i) or is_alphabet(i):
            content_str = content_str + i
    return content_str


result_content = format_str(string)
cut = jieba.cut(result_content)  # segment the string/sentence with jieba
result_string = ' '.join(cut)  # join the segmented words with spaces
# k = jieba.analyse.extract_tags(result_content, topK=50, withWeight=True)
# print(k)
font = r'C:\Users\DELL\Downloads\FZFengYKSJ.TTF'
# STOPWORDS.add() returns None, so add the word first and pass the set itself
STOPWORDS.add("电影")  # exclude the word for "movie"
wc = WordCloud(font_path=font,
               background_color='white',
               width=1000,
               height=800,
               stopwords=STOPWORDS).generate(result_string)
wc.to_file('ss.png')
plt.imshow(wc)  # display the image with matplotlib
plt.axis('off')  # hide the axes
plt.show()  # show the figure
Ejemplo n.º 50
0
def create_wordcloud(df):
    complaints_text = list(df["Consumer complaint narrative"].dropna().values)

    # join all documents in corpus
    text = " ".join(list(complaints_text))
    print("Complaints received")
    print(len(complaints_text))

    d = getcwd()
    mask = np.array(Image.open(path.join(d, "thumbs-down.png")))

    STOPWORDS.add("XXXX")
    STOPWORDS.add("XX")
    STOPWORDS.add("xx")
    STOPWORDS.add("xxxx")
    # TODO: exclude the names of all banks here
    STOPWORDS.add("wells")
    STOPWORDS.add("fargo")

    wc = WordCloud(
        background_color="white",
        stopwords=STOPWORDS,
        max_words=1000,
        mask=mask,
        max_font_size=90,
        random_state=42,
        contour_width=1,
        contour_color="#119DFF",
    )
    wc.generate(text)

    # create wordcloud shape from image
    fig = plt.figure(figsize=[8, 8])
    ax = plt.imshow(wc.recolor(), interpolation="bilinear")
    plt.axis("off")
    out_url = fig_to_uri(fig, bbox_inches="tight")
    return out_url
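
# Example usage (assumed; the complaints dataframe and the fig_to_uri helper
# come from the surrounding app):
# out_url = create_wordcloud(pd.read_csv("complaints.csv"))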
Ejemplo n.º 51
0
def make_word_cloud(imagemaskurl, relative_scaling, nwords, text, title,
                    customstopwords, width, height, color, colormap, maxfont,
                    minfont, scale):
    if imagemaskurl is not None and imagemaskurl != '':
        try:
            if imagemaskurl.startswith('data:image'):
                # extract the base64 payload from the data URI
                imgstr = re.search(r'base64,(.*)', imagemaskurl).group(1)
                b = base64.b64decode(imgstr)
            else:
                r = requests.get(imagemaskurl)
                b = r.content
            image_bytes = io.BytesIO(b)
            im = Image.open(image_bytes).convert('RGBA')
            canvas = Image.new('RGBA', im.size, (255, 255, 255, 255))
            canvas.paste(im, mask=im)
            mask = np.array(canvas)
            width, height = im.size
        except:
            mask = None
            text = 'Invalid Image Mask!'
    else:
        mask = None
    from wordcloud import STOPWORDS
    STOPWORDS = list(STOPWORDS)

    for word in customstopwords:
        STOPWORDS.append(word)
        STOPWORDS.append(word + 's')
        STOPWORDS.append(word + "'s")
    if color == '':
        color = None
    cloud = WordCloud(width=width, height=height, mask=mask, background_color=color,
                      stopwords=STOPWORDS, max_words=nwords, colormap=colormap,
                      max_font_size=maxfont, min_font_size=minfont,
                      random_state=42, scale=scale, mode='RGBA',
                      relative_scaling=relative_scaling).generate(text)
    try:
        coloring = ImageColorGenerator(mask)
        cloud.recolor(color_func=coloring)
    except:
        pass
    image = cloud.to_image()

    byte_io = io.BytesIO()
    image.save(byte_io, 'PNG')
    byte_io.seek(0)
    data_uri = base64.b64encode(byte_io.getvalue()).decode('utf-8').replace('\n', '')
    src = 'data:image/png;base64,{0}'.format(data_uri)
    x = np.array(list(cloud.words_.keys()))
    y = np.array(list(cloud.words_.values()))
    order = np.argsort(y)[::-1]
    x = x[order]
    y = y[order]
    trace = go.Bar(x=x, y=y)
    layout = go.Layout(margin=go.Margin(l=10, r=0),
                       title='Relative frequency of words/bigrams')
    fig = go.Figure(data=[trace], layout=layout)
    children = [
        H2(title, className='card-title'),
        Img(src=src, width=image.size[0], height=image.size[1],
            style={'maxWidth': '100%', 'height': 'auto',
                   'margin': '0 auto', 'display': 'block'}),
        # Details([
        #     Summary('View Frequency Plot'),
        #     dcc.Graph(id='word-freq', figure=fig, config={'displayModeBar': False})
        # ])
    ]

    return children
Ejemplo n.º 52
0
event_ = PSTAT.event_

# Construct corpus: to lower case, strip numeric
corpus = {}
events = [26, 27] #[16, 83]
for event in events:
    docs = keydev['events'].find(
        {'keydeveventtypeid': {'$eq': event}}, {'_id': 0})
    corpus[event] = [re.sub(r'\b\w*[\d]\w*\b', ' ', " ".join(
            d[k] for k in ['headline', 'situation'])).lower() for d in docs]
DataFrame({'description': [event_[event] for event in corpus.keys()],
           'count': [len(lines) for lines in corpus.values()]},
          index=corpus.keys())

# Tokenize and remove stopwords
stop_words = STOPWORDS.union(['co', 'ltd', 'mr', 'mrs', 'inc', 'llc'])
for event, lines in corpus.items():
    corpus[event] = [[w for w in re.findall(r"\w\w+", line)
                      if w not in stop_words] for line in lines]
    
# Split shuffled into labelled training and test sets
train_data = []
test_data = []
split_frac = 0.9
for label, (event, lines) in enumerate(corpus.items()):
    np.random.shuffle(lines)
    n = int(split_frac * len(lines))   # split point of train and test sets
    train_data.extend([(label, corpus[event][p]) for p in range(n)])
    test_data.extend([(label, corpus[event][p]) for p in range(n, len(lines))])
N = len(train_data)
print('train/test:', N, [np.mean([label for label, _ in subset])
                         for subset in [train_data, test_data]])
Ejemplo n.º 53
0
from env import *  # holds all the secrets
import praw
from ray import Ray
from flappy_answers import answers
import json
import random
import os
import requests
import re
import datetime
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

STOPWORDS.add("game")
STOPWORDS.add("deleted")
STOPWORDS.add("f**k")
STOPWORDS.add("f*****g")
STOPWORDS.add("localray")
STOPWORDS.add("https")
STOPWORDS.add("reddit")
STOPWORDS.add("create")
STOPWORDS.add("wordcloud")
STOPWORDS.add("commets")
STOPWORDS.add("imgur")
# set up a praw instance to use as a listener
# let's listen to all comments on r/tampabayrays and highlight those that have the word cash in them
#works
ray = Ray()


def create_wordcloud(url):
    print("in create_wordlcoud")
Ejemplo n.º 54
0
def make_word_cloud(imagemaskurl, nwords, text, customstopwords):
    if imagemaskurl is not None and imagemaskurl != '':
        try:
            r = requests.get(imagemaskurl)
            b = r.content
            image_bytes = io.BytesIO(b)
            im = Image.open(image_bytes).convert('RGBA')
            canvas = Image.new('RGBA', im.size, (255, 255, 255, 255))
            canvas.paste(im, mask=im)
            mask = np.array(canvas)
            width, height = im.size
        except:
            mask = None
            text = 'Invalid Image Mask!'
    else:
        mask = None
        # no mask image to take dimensions from; fall back to assumed defaults
        width, height = 800, 400
    from wordcloud import STOPWORDS
    STOPWORDS = list(STOPWORDS)

    for word in customstopwords:
        STOPWORDS.append(word)
        STOPWORDS.append(word + 's')
        STOPWORDS.append(word + "'s")

    cloud = WordCloud(
        width=width,
        height=height,
        mask=mask,
        background_color='white',
    ).generate(text)

    try:
        coloring = ImageColorGenerator(mask)
        cloud.recolor(color_func=coloring)
    except:
        pass
    image = cloud.to_image()

    byte_io = io.BytesIO()
    image.save(byte_io, 'PNG')
    byte_io.seek(0)
    data_uri = base64.b64encode(byte_io.getvalue()).decode('utf-8').replace(
        '\n', '')
    src = 'data:image/png;base64,{0}'.format(data_uri)
    x = np.array(list(cloud.words_.keys()))
    y = np.array(list(cloud.words_.values()))
    order = np.argsort(y)[::-1]
    x = x[order]
    y = y[order]
    trace = go.Bar(x=x, y=y)
    layout = go.Layout(title='Relative frequency of words')
    fig = go.Figure(data=[trace], layout=layout)
    children = [
        Img(src=src,
            width=image.size[0],
            height=image.size[1],
            style={
                'maxWidth': '100%',
                'height': 'auto',
                'margin': '0 auto',
                'display': 'block'
            }),
    ]

    return children
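
# Example usage (assumed): no image mask, so the fallback dimensions apply,
# and two hypothetical custom stopwords are appended.
# children = make_word_cloud(None, 50, open('speech.txt').read(), ['said', 'also'])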
Ejemplo n.º 55
0
    if (l == '<EOF>'):
      break
    else:
      s=l[53:]
      words +=s[:s.find('\t')]+' '

no_urls_no_tags = " ".join([word for word in words.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and word != 'RT'
                            ])
                            
for c in string.punctuation:
  no_urls_no_tags= no_urls_no_tags.replace(c,"")                            
                            
STOPWORDS.add('amp')    
STOPWORDS.add('want')
STOPWORDS.add('new')
STOPWORDS.add('via')
STOPWORDS.add('man')
STOPWORDS.add('will')
STOPWORDS.add('here')
STOPWORDS.add('Heres')
STOPWORDS.add('Here')
                        
wordcloud = WordCloud(
                      font_path='C:/Tweets/cabin-sketch-v1.02/CabinSketch-Regular.ttf',
                      stopwords=STOPWORDS,
                      background_color='black',
                      width=1800,
                      height=1400
                      )
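
# Assumed continuation (the original snippet is truncated here): generate from
# the cleaned tweet text and save; the filename is an assumption.
wordcloud.generate(no_urls_no_tags)
wordcloud.to_file('tweet_wordcloud.png')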
Ejemplo n.º 56
0
d = path.dirname(__file__)
text_from_file_with_apath = open('dashuju.txt').read()

wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)
text = wl_space_split
# Read the whole text.
# text = open(path.join(d, 'au.txt')).read()
#
# read the mask image
# taken from
# http://www.stencilry.org/stencils/movies/alice%20in%20wonderland/255fk.jpg
alice_mask = np.array(Image.open(path.join(d, "huge.jpg")))

wc = WordCloud(background_color="white",
               max_words=2000,
               mask=alice_mask,
               stopwords=STOPWORDS.add("said"))
# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "alice.png"))

# show
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()
Ejemplo n.º 57
0
import matplotlib.pyplot as plt
from PIL import Image
import matplotlib.colors as colors

# read the whole file into a string; the file lives in the same directory as this script
f = open('introduction.txt', 'r').read()

background_Image = np.array(Image.open("TiDB-logo.jpg"))

colormaps = colors.ListedColormap(
    ['#E36209', '#E6b949', '#CCAC55', '#A5D10D', '#127d72'])

# STOPWORDS.add() returns None, so extend the set before building the cloud
STOPWORDS.add("store")

# create a word-cloud object
wordcloud = WordCloud(
    background_color="white",  # white background (the default is black)
    font_path="Ubuntu-Medium.ttf",
    stopwords=STOPWORDS,
    mask=background_Image,
    colormap=colormaps,
    width=1400,  # image width
    height=900,  # image height
    margin=10,  # image margin
    relative_scaling=0.3).generate(f)
# draw the image
plt.imshow(wordcloud)
# hide the axes
plt.axis("off")
# display the image
# plt.show()
# save the image
wordcloud.to_file('my_test4.png')
Ejemplo n.º 58
0
def get_trending(offset):
    conn = sqlite3.connect('data/newsdb.db')
    sql = '''
        SELECT tokenizer_content FROM article;
    '''
    rows = conn.execute(sql, )
    # each row is a 1-tuple; collect the text fields and join them into one string
    sentences = [r[0] for r in rows]
    cloud = " ".join(sentences)
    fig = plt.figure(figsize=(30, 20))

    # stop_words = stop_words + list(STOPWORDS)
    stop_words = []
    with codecs.open("StopWord/stopword.txt", 'r', encoding='utf8') as file_in:
        for line in file_in:
            stop_words.append(line.strip())
    for n in stop_words:
        STOPWORDS.add(n)
    word_cloud = wordcloud.WordCloud(stopwords=STOPWORDS,
                                     max_words=700,
                                     background_color="white",
                                     width=1000,
                                     height=400,
                                     mode="RGB").generate(
                                         str(cloud)).to_image()
    # convert word-clou image to base64
    img = io.BytesIO()
    word_cloud.save(img, "PNG")
    img.seek(0)
    img_b64 = base64.standard_b64encode(img.getvalue()).decode()
    # Send image to Imgur
    client_id = "3bc58602360427f"
    headers = {'Authorization': 'Client-ID ' + client_id}
    data = {
        'image': img_b64,
        'title': 'word cloud image'
    }  # create a dictionary.
    main_data = urllib.parse.urlencode(data)
    main_data = main_data.encode('utf-8')
    request = urllib.request.Request(url="https://api.imgur.com/3/upload.json",
                                     data=main_data,
                                     headers=headers)
    response = urllib.request.urlopen(request).read()
    parse = json.loads(response)
    image_url = parse['data']['link']

    t = WordCloud().process_text(str(cloud))
    lst_trending_word = sorted(t.items(), key=lambda x: x[1], reverse=True)

    top10 = lst_trending_word[:10]
    tmp = []
    for i in top10:
        tmp.append(i[0])
    sql2 = '''
        SELECT * FROM article WHERE tokenizer_content LIKE ? ORDER BY id DESC LIMIT 20 OFFSET ?'''
    trending_article = []
    for word in tmp:
        data = conn.execute(sql2, ('%' + word + '%', offset)).fetchall()
        for i in data:
            if i[0] not in trending_article:
                trending_article.append(i)
    conn.close()
    return trending_article, image_url
Ejemplo n.º 59
0
cursor = links.find({}, {"body": 1})

test = ""
for document in cursor:
    test = test + document['body']

with codecs.open("text_mining/my_stopwords.txt", "r", encoding="utf-8") as f:
    read_data = f.readlines()

stopwords = STOPWORDS.copy()

# strip the trailing newline from each custom stopword as it is added
for data in read_data:
    stopwords.add(data.strip())


mask_choko = np.array(Image.open("text_mining/chokomag.png"))


wordcloud = WordCloud(stopwords=stopwords, background_color="black", max_words=10000, mask=mask_choko).generate(test)
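
# Assumed continuation (not in the original snippet): persist the cloud to disk;
# the filename is an assumption.
wordcloud.to_file("text_mining/chokomag_cloud.png")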
Ejemplo n.º 60
0
"""
In order to make the graphs more useful we decided to prevent some words from being included
"""
ADDITIONAL_STOPWORDS = [
    "XXXX",
    "XX",
    "xx",
    "xxxx",
    "n't",
    "Trans Union",
    "BOA",
    "Citi",
    "account",
]
for stopword in ADDITIONAL_STOPWORDS:
    STOPWORDS.add(stopword)
"""
Proudly written for Plotly by Vildly in 2019. [email protected]


The aim with this dashboard is to demonstrate how Plotly's Dash framework
can be used for NLP based data analysis. The dataset is open and contains
consumer complaints from US banks ranging from 2013 to 2017.

Users can choose to run the dashboard on the whole dataset (which can be slow)
or on a smaller subset that is sampled evenly and consistently.

Once a data sample has been selected the user can select a bank to look into by
using the dropdown or by clicking one of the bars on the right with the top 10
banks listed by number of filed complaints. Naturally bigger banks tend to end
up in this top 10 since we do not adjust for number of customers.