def draw(infile, name):
    # Read in input
    # infile = open(file_name, 'r')
    infile.seek(0)
    str_in = infile.read()
    BlackList = open("BlackList", 'r')
    list_words = BlackList.read().split(" ") + [name]
    # Process the input: count how often each non-blacklisted word occurs
    # (the original `words, sums = sum(...)` could not unpack; a Counter
    # gives the parallel word/count arrays make_wordcloud expects)
    from collections import Counter
    freqs = Counter(x for x in str_in.split() if x not in list_words)
    words = np.array(list(freqs.keys()))
    sums = np.array(list(freqs.values()))
    # Actually makes the wordcloud itself
    wordcloud.make_wordcloud(words=words, counts=sums, fname="out.png",
                             width=1600, height=800)
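# A minimal usage sketch for draw() above, assuming the module-level imports
# the snippet relies on (numpy as np, the old word_cloud module as wordcloud)
# and a space-separated "BlackList" file in the working directory.
# "speech.txt" and the name are hypothetical, not from the original snippet:
if __name__ == "__main__":
    with open("speech.txt", "r") as f:   # hypothetical input file
        draw(f, "Obama")                 # the name is blacklisted too; writes out.png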
def ctm_word_cloud():
    # Topic proportions in the new document; keep only the top two topics
    with open('holdout-phi-sum.dat', 'r') as fp:
        topicProportion = fp.readlines()
    # the numbers in the list are floats
    topicProportion = map(lambda x: float(x.split()[0]), topicProportion)
    # tP = [[10,0],[90,1],[50,2],...]
    tP = []
    for i in range(len(topicProportion)):
        item1 = list((topicProportion[i], i))
        tP.append(item1)
    # sort ascending
    tP.sort()
    # keep the two heaviest topics, tP_new=[[75,8],[90,2]]: [topic 'weight', topic id]
    tP_new = tP[-2:]
    sum2 = tP_new[0][0] + tP_new[1][0]
    for i in range(2):
        tP_new[i][0] = tP_new[i][0] / sum2
    # now tP_new = [[0.4,8],[0.6,2]]
    topics = save_topics('final-log-beta.dat', 'vocab.dat')
    # Build the CTM word cloud
    wordsList = []
    countsList = []
    for (rate, topic) in tP_new:
        if rate >= 0.1:
            # round to the nearest integer
            wordsNum = int(round(WORDS * rate))
            # take wordsNum words from this topic, appending to words and counts
            for i in range(wordsNum):
                wordsList.append(topics[topic][i][1])
                countsList.append(topics[topic][i][0])
    words = np.array(wordsList)
    counts = np.array(countsList)
    font_path = r'C:\Windows\Fonts\simsun.ttc'  # SimSun
    imageName = 'ctm'
    wordcloud.make_wordcloud(words, counts, font_path, imageName)
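# save_topics() is called above but not defined in this snippet. A minimal
# sketch of what it plausibly does, under the assumption that CTM's
# final-log-beta.dat stores one row of log word probabilities per topic and
# vocab.dat one word per line; the function name is kept from the call above
# and the return shape is inferred from the indexing topics[topic][i] == (weight, word):
import math

def save_topics(beta_fname, vocab_fname):
    with open(vocab_fname) as fp:
        vocab = [line.strip() for line in fp]
    topics = []
    with open(beta_fname) as fp:
        for line in fp:
            logprobs = [float(v) for v in line.split()]
            pairs = [(math.exp(lp), w) for lp, w in zip(logprobs, vocab)]
            pairs.sort(reverse=True)  # highest-probability words first
            topics.append(pairs)
    return topics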
def format_and_draw(infile):
    # Read in input
    # infile = open(file_name, 'r')
    infile.seek(0)
    str_in = format(infile.read())
    BlackList = open("BlackList", 'r')
    list_words = BlackList.read().split(" ")
    # Process the input: count how often each non-blacklisted word occurs
    # (as in draw() above, the original `words, sums = sum(...)` was broken)
    from collections import Counter
    freqs = Counter(x for x in str_in.split() if x not in list_words)
    words = np.array(list(freqs.keys()))
    sums = np.array(list(freqs.values()))
    # Actually makes the wordcloud itself
    wordcloud.make_wordcloud(words=words, counts=sums, fname="out.png",
                             width=1600, height=800)
    # Automatically changes your desktop background. Spontaneously stopped
    # working; set the file manually as background instead.
    # call("gsettings set org.gnome.desktop.background picture-uri file://./out.png".split())
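# format() above shadows the builtin and is not defined in this snippet; it
# presumably normalizes the raw text before splitting. A hypothetical
# stand-in under that assumption (both the name and the behavior are guesses,
# written in the snippet's Python 2 style):
import string

def format(text):
    # drop punctuation and fold case so blacklist matching is consistent
    return text.translate(None, string.punctuation).lower()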
def display_wordcloud(words, counts):
    with tempfile.NamedTemporaryFile(suffix='.png') as tmp:
        temp_filename = tmp.name
        counts = wordcloud.make_wordcloud(words, counts, temp_filename,
                                          font_path='/Library/Fonts/Georgia.ttf',
                                          width=800, height=800, ranks_only=False)
        image_file = cbook.get_sample_data(temp_filename)
        image = plt.imread(image_file)
        fig, ax = plt.subplots()
        im = ax.imshow(image)
        plt.axis('off')
        plt.show()
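# Minimal call sketch for display_wordcloud() above, assuming the module-level
# imports it relies on (numpy as np, tempfile, wordcloud, matplotlib.pyplot as
# plt, matplotlib.cbook as cbook). The words and counts are made up, and the
# Georgia font path is macOS-specific:
demo_words = np.array(["python", "cloud", "words"])
demo_counts = np.array([10, 5, 3])
display_wordcloud(demo_words, demo_counts)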
def process(self, document):
    fdist = filter_stopwords(document['freqdist'], document['language'])
    words = numpy.array([t[0] for t in fdist])
    counts = numpy.array([t[1] for t in fdist])
    wordcloud_img = make_wordcloud(words, counts)
    fd = StringIO()
    wordcloud_img.save(fd, format="PNG")
    fd.seek(0)
    result = {'wordcloud': base64.b64encode(fd.read())}
    fd.close()
    return result
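# Sketch of the document dict process() expects, inferred from the keys it
# reads: 'freqdist' is a sequence of (word, count) pairs and 'language'
# drives the stopword filter. The values below are illustrative only, and the
# pipeline class name is hypothetical:
doc = {
    'freqdist': [('data', 12), ('cloud', 7), ('word', 5)],
    'language': 'english',
}
# result = SomePipelineStep().process(doc)
# result['wordcloud'] holds the rendered PNG, base64-encoded for transport.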
def make_clouds(auth_vec_fname, crawl_fname, out_dir="img", max_words=200):
    arch = np.load(auth_vec_fname)
    labels = arch["author_labels"]
    vectorizer = arch["vectorizer"][()]
    vectors = arch["vectors"][()]
    # get the author's ASCII name (as used for the IDI photo)
    tree = etree.parse(crawl_fname)
    authors = [unicode(e) for e in tree.xpath("//name/text()")]
    ascii_name = [splitext(basename(e))[0] for e in tree.xpath("//img/text()")]
    toascii = dict(zip(authors, ascii_name))
    vocab = np.array(vectorizer.get_feature_names())
    for author, vec in zip(labels, vectors):
        counts = vec.toarray().ravel()
        if counts.sum() > max_words:
            mask_inds = counts.argsort()[-max_words:]
            counts = counts[mask_inds]
            words = vocab[mask_inds]
        else:
            log.error(u"vector for author {} does not have enough words ({})".format(
                author, int(counts.sum())))
            continue
        out_fname = join(out_dir, toascii[author] + ".png")
        log.debug(u"writing " + out_fname)
        make_wordcloud(words, counts, out_fname,
                       # font_path="/usr/local/texlive/2012/texmf-dist/fonts/truetype/public/droid/DroidSansMono.ttf",
                       font_path="/usr/local/texlive/2012/texmf-dist/fonts/truetype/public/opensans/OpenSans-Regular.ttf",
                       width=1000, height=600, show_img=False)
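# make_clouds() expects an .npz archive shaped like the one sketched below,
# inferred from the keys it reads (a plain label array plus pickled
# vectorizer and sparse count-matrix objects, hence the [()] unwrapping).
# This builder is an assumption for illustration, not the original pipeline:
from sklearn.feature_extraction.text import CountVectorizer

def save_author_vectors(fname, authors, texts):
    vec = CountVectorizer()
    X = vec.fit_transform(texts)              # sparse authors x terms counts
    np.savez(fname, author_labels=np.array(authors),
             vectorizer=vec, vectors=X)       # objects stored as 0-d pickled arrays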
def make_wordcloud_rawtext(text, savename, width=400, height=200):
    import os
    import sys
    from sklearn.feature_extraction.text import CountVectorizer
    sources = [savename]
    cv = CountVectorizer(min_df=1, charset_error="ignore",
                         stop_words="english", max_features=200)
    counts = cv.fit_transform([text]).toarray().ravel()
    words = np.array(cv.get_feature_names())
    # throw away some words, normalize
    words = words[counts > 1]
    counts = counts[counts > 1]
    output_filename = (os.path.splitext(os.path.basename(sources[0]))[0] + ".bmp")
    counts = wordcloud.make_wordcloud(words, counts, output_filename,
                                      width=width, height=height)
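# Usage sketch for make_wordcloud_rawtext() above; the input file name is
# illustrative. Note that charset_error is the old scikit-learn parameter
# spelling (later renamed decode_error), so this snippet assumes an old
# sklearn release:
sample = open("article.txt").read()                 # hypothetical input file
make_wordcloud_rawtext(sample, "article.txt", width=800, height=400)
# writes article.bmp, keeping only words that occur more than once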
def generate_image(profile, field):
    fonts_to_use = [
        "JosefinSansStd-Light.ttf",
        "Neucha.ttf",
        "Molengo-Regular.ttf",
        "ReenieBeanie.ttf",
        "Lobster.ttf"
    ]
    assert len(fonts_to_use) == len(profile.field_dicts)
    fonts_dir = "/home/roman/Dropbox/django_practice/mysite/mysite/tag_cloud/fonts"
    # field_to_fonts = {}
    # for key, font in zip(profile.field_dicts.keys(), fonts_to_use):
    #     field_to_fonts[key] = font
    random_int = np.random.randint(len(fonts_to_use))
    # font_path = os.path.join(fonts_dir, field_to_fonts[field])
    font_path = os.path.join(fonts_dir, fonts_to_use[random_int])
    max_len = max(profile.words[field].apply(lambda x: len(x)))
    try:
        img = wordcloud.make_wordcloud(
            np.array(profile.words[field], dtype="S%d" % max_len),
            np.array(profile.scores[field]),
            font_path=font_path,
            width=640,
            height=480,
        )
    except Exception as e:
        print e
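# generate_image() reads profile.words[field] / profile.scores[field] as
# pandas Series and uses profile.field_dicts only for the font-count
# assertion. A hypothetical stand-in profile for testing, with its structure
# inferred from that usage (names and values are made up):
import pandas as pd

class FakeProfile(object):
    field_dicts = {k: {} for k in ["a", "b", "c", "d", "e"]}  # one key per font
    words = {"a": pd.Series(["django", "python", "cloud"])}
    scores = {"a": pd.Series([3.0, 2.0, 1.0])}

generate_image(FakeProfile(), "a")  # draws with a randomly chosen font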
def commRanking(self, numTopComms, prevTimeslots, xLablNum):
    import itertools, tfidf
    # from pymongo import MongoClient
    from pytagcloud.lang.stopwords import StopWords
    # from nltk.corpus import stopwords
    from wordcloud import make_wordcloud
    from PIL import Image
    '''Detect the evolving communities'''
    uniCommIdsEvol = self.uniCommIdsEvol
    timeslots = self.timeslots
    # structure: tempcommRanking = {Id: [persistence, stability, commCentrality, degreeness]}
    tempcommRanking = {}
    commRanking, fluctuation, lifetime = {}, {}, 0
    for Id in self.uniCommIds:
        uniqueTimeSlLen = len(set(uniCommIdsEvol[Id][0]))
        timeSlLen = len(uniCommIdsEvol[Id][0])
        tempcommRanking[Id] = []
        tempcommRanking[Id].append(uniqueTimeSlLen / timeslots)  # persistence
        tempcommRanking[Id].append((sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1) + 1) / (timeslots + 1))  # stability
        tempcommRanking[Id].append(product([x + 1 for x in uniCommIdsEvol[Id][1]]) / uniqueTimeSlLen)  # commCentrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][4]) / timeslots)  # degreeness
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][5]) / timeSlLen)  # degree centrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][6]) / timeSlLen)  # betweenness centrality
        # '''Checking Theseus Ship'''
        # theseus = 1 + len(list(set(uniCommIdsEvol[Id][3][0]) & set(uniCommIdsEvol[Id][3][-1]))) / len(set(np.append(uniCommIdsEvol[Id][3][0], uniCommIdsEvol[Id][3][-1])))
        # tempcommRanking[Id].append(theseus)
        commRanking[Id] = np.prod(tempcommRanking[Id])
        # Construct the average Jaccardian between timeslots for each dynamic community
        if timeSlLen not in fluctuation:
            fluctuation[timeSlLen] = [(sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1))]  # [1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)]
        else:
            fluctuation[timeSlLen].append((sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1)))  # 1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)
        lifetime = max(lifetime, timeSlLen)
    '''All the communities ranked in order of importance'''
    rankedCommunities = sorted(commRanking, key=commRanking.get, reverse=True)
    if numTopComms > len(rankedCommunities):
        numTopComms = len(rankedCommunities)
    '''Jaccardians for lifespans which appear only once are discarded (outliers)'''
    flux = []
    for lifeT in range(lifetime + 1):
        if lifeT in fluctuation and len(fluctuation[lifeT]) > 1:
            flux.append(sum(fluctuation[lifeT]) / len(fluctuation[lifeT]))
        else:
            flux.append(0)
    '''Constructing community size heatmap data'''
    commSizeHeatData = np.zeros([numTopComms, timeslots])
    for rCIdx, comms in enumerate(rankedCommunities[0:numTopComms]):
        for sizeIdx, timesteps in enumerate(uniCommIdsEvol[comms][0]):
            if commSizeHeatData[rCIdx, timesteps] != 0:
                commSizeHeatData[rCIdx, timesteps] = max(np.log(uniCommIdsEvol[comms][2][sizeIdx]), commSizeHeatData[rCIdx, timesteps])
            else:
                commSizeHeatData[rCIdx, timesteps] = np.log(uniCommIdsEvol[comms][2][sizeIdx])
    normedHeatdata = commSizeHeatData / commSizeHeatData.max()
    '''Writing ranked communities to json files + MongoDB'''
    dataset_name = self.dataset_path.split('/')
    dataset_name = dataset_name[-1]
    # Mongo--------------------
    # client = MongoClient()
    # db = client[dataset_name]
    # dyccos = db.dyccos
    # -------------------------
    rankedCommunitiesFinal = {}
    twitterDataFile = open(self.dataset_path + '/data/nonadaptive/results/rankedCommunities.json', "w")  # , encoding="utf-8-sig")
    jsondata = dict()
    jsondata["ranked_communities"] = []
    '''Create corpus and stopwords'''
    # stop = stopwords.words('english')
    stop = []
    # grstopwords = pickle.load(open("./greek_stopwords.pck", 'rb'))
    # stop.extend(grstopwords)
    definiteStop = ['gt', 'amp', 'rt', 'via']
    stop.extend(definiteStop)
    if not os.path.exists(self.dataset_path + "/data/nonadaptive/tmp/datasetCorpus.pck"):
        idf = self.corpusExtraction(rankedCommunities[:numTopComms])
    else:
        idf = pickle.load(open(self.dataset_path + "/data/nonadaptive/tmp/datasetCorpus.pck", 'rb'))
        print('loaded corpus from file')
    # -------------------------
    regex1 = re.compile("(?:\@|#|https?\://)\S+", re.UNICODE)
    regex2 = re.compile("\w+'?\w", re.UNICODE)
    width, height = 400, 200
    blank_image = Image.new("RGB", (timeslots * width, (numTopComms * 2 + 2) * height), (255, 255, 255))  # blank canvas for the collage
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2], width=width, height=height)
        blank_image.paste(timeimage, (tmptime * width, height))
    for rank, rcomms in enumerate(rankedCommunities[:numTopComms]):
        tmslUsrs, tmpTags, tmptweetids, commTwText, tmpUrls, topic, tmpkeywrds = [], [], [], [], [], [], []
        strRank = '{0}'.format(str(rank).zfill(2))
        rankedCommunitiesFinal[strRank] = [rcomms]
        rankedCommunitiesFinal[strRank].append(commRanking[rcomms])
        rankedCommunitiesFinal[strRank].append(uniCommIdsEvol[rcomms][3])
        timeSlotApp = [self.timeLimit[x] for x in uniCommIdsEvol[rcomms][0]]
        '''make and save wordclouds'''
        if not os.path.exists(self.dataset_path + "/data/nonadaptive/results/wordclouds/" + self.fileTitle + '/' + str(rank)):
            os.makedirs(self.dataset_path + "/data/nonadaptive/results/wordclouds/" + self.fileTitle + '/' + str(rank))
        for tmsl, users in enumerate(uniCommIdsEvol[rcomms][3]):
            uscentr, tmptweetText = [], []
            for us in users:
                uscentr.append([us, self.userPgRnkBag[uniCommIdsEvol[rcomms][0][tmsl]][us]])
                # uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                if us in self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpTags.extend(self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpUrls.append(self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetids.extend(self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetText.extend(self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
            uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
            tmslUsrs.append({str(uniCommIdsEvol[rcomms][0][tmsl]): uscentr})
            tmptweetText = [i.replace("\n", "").replace('\t', ' ') for i in tmptweetText]
            seen = set()
            seen_add = seen.add
            tmptweetText2 = [x for x in tmptweetText if x not in seen and not seen_add(x)]
            commTwText.append({timeSlotApp[tmsl]: tmptweetText2})
            # topic extraction
            topicList = " ".join(tmptweetText2)
            topicList = topicList.lower()
            topicList = regex1.sub('', topicList)
            topicList = regex2.findall(topicList)
            s = StopWords()
            s.load_language(s.guess(topicList))
            topicList = collections.Counter(topicList)
            tmpkeys = topicList.keys()
            if len(topicList) > 5:
                for i in list(tmpkeys):
                    if not i or i in stop or i.startswith(('htt', '(@', 't.co')) or len(i) <= 2 or s.is_stop_word(i):
                        del topicList[i]
            else:
                for i in list(tmpkeys):
                    if i in definiteStop or not i:
                        del topicList[i]
            timeSlLen = len(uniCommIdsEvol[Id][0])  # note: Id is left over from the ranking loop above; the value is unused here
            tmpTopic = tfidf.comm_tfidf(topicList, idf, 10)
            topic.append({timeSlotApp[tmsl]: tmpTopic})
            # tmpTopic = [x[0] for x in tmpTopic]
            '''wordcloud image'''
            popkeys = [x[0] for x in tmpTopic]
            popvals = [x[1] for x in tmpTopic]
            if len(popvals) < 2:
                try:
                    if popvals[0] < 1:
                        popvals[0] = 1
                except:
                    pass
            '''Create intermediate image'''
            position = (rank + 1) * 2
            backgroundcolor = int((1 - (normedHeatdata[rank, uniCommIdsEvol[rcomms][0][tmsl]])) * 255)
            locimage = make_wordcloud(popkeys, popvals, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/nonadaptive/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'/'+timeSlotApp[tmsl]+'.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, position * height))
            popusers = [x[0] for x in uscentr[:10]]
            popcentr = [x[1] * 100 for x in uscentr[:10]]
            locimage = make_wordcloud(popusers, popcentr, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/nonadaptive/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'/'+timeSlotApp[tmsl]+'usrs.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, (position + 1) * height))
            # tmpkeywrds.extend(tmpTopic)
        if tmpTags:
            popTags = [x.lower() for x in list(itertools.chain.from_iterable(tmpTags))]
            popTags = collections.Counter(popTags)
            popTags = popTags.most_common(10)
        else:
            popTags = []
        if tmpUrls:
            if tmpUrls[0]:
                tmpUrls = [x.lower() for x in list(itertools.chain.from_iterable(tmpUrls)) if x]
                popUrls = collections.Counter(tmpUrls)
                popUrls = popUrls.most_common(10)
            else:
                popUrls = []
        else:
            popUrls = []
        commTweetIds = list(set(tmptweetids))
        # popKeywords = collections.Counter(tmpkeywrds)
        # popKeywords = popKeywords.most_common(10)
        # popkeys = [x[0] for x in popKeywords]
        # popvals = [x[1] for x in popKeywords]
        # make_wordcloud(popkeys, popvals, self.dataset_path + '/data/nonadaptive/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '.pdf')
        dycco = {'community label': rcomms, 'rank': rank, 'timeslot appearance': timeSlotApp,  # 'text': commTwText,
                 'persistence:': tempcommRanking[rcomms][0], 'total score': commRanking[rcomms], 'topic': topic,
                 'stability': tempcommRanking[rcomms][1], 'community centrality': tempcommRanking[rcomms][2],
                 'community size per slot': uniCommIdsEvol[rcomms][2], 'users:centrality per timeslot': tmslUsrs,
                 'popTags': popTags, 'popUrls': popUrls}
        jsondycco = dycco.copy()
        # dyccos.insert(dycco)
        jsondata["ranked_communities"].append(jsondycco)
    twitterDataFile.write(json.dumps(jsondata, sort_keys=True))  # , ensure_ascii=False).replace("\u200f", "")
    twitterDataFile.close()
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2])
        blank_image.paste(timeimage, (tmptime * width, (position + 2) * height))  # note: position leaks from the last loop iteration above
    imsize = blank_image.size
    blank_image = blank_image.resize((round(imsize[0] / 2), round(imsize[1] / 2)), Image.ANTIALIAS)
    blank_image.save(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '_collage.pdf', quality=50)
    makefigures(commSizeHeatData, flux, self.fileTitle, self.day_month, commRanking, numTopComms, timeslots, uniCommIdsEvol, rankedCommunities, self.commPerTmslt, self.uniCommIds, prevTimeslots, self.dataset_path, self.xLablNum)
    return rankedCommunitiesFinal
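# The total score commRanking[Id] in commRanking() above is simply the product
# of the per-community factors (persistence, stability, community centrality).
# A toy recomputation for one community with made-up numbers, to show the
# shapes involved (appearances in timeslots 0, 1 and 3 out of 5; one
# centrality value per appearance); product() in the original behaves like
# np.prod here:
appearances = [0, 1, 3]
centralities = [0.2, 0.5, 0.1]
timeslots = 5
persistence = len(set(appearances)) / float(timeslots)                                   # 0.6
stability = (sum(np.diff(sorted(set(appearances))) == 1) + 1) / float(timeslots + 1)     # rewards consecutive slots
commCentrality = np.prod([x + 1 for x in centralities]) / float(len(set(appearances)))
score = persistence * stability * commCentrality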
for k in mycounts.iterkeys():
    words.append(k)
    counts.append(mycounts[k])
words = np.array(words)
counts = np.array(counts)
# throw away some words, normalize
words = words[counts > 1]
counts = counts[counts > 1]
subjects = {}
for w in words:
    subjects[w] = mysubjectind[most_common(subs[w])]
output_filename = "%s/cloud.png" % datetime.date.today()
print output_filename
os.system("mkdir %s" % datetime.date.today())
counts = make_wordcloud(words, counts, output_filename)
os.system("rm thisweekarxiv.png ")
os.system("ln -s %s thisweekarxiv.png " % output_filename)
_t = time.time()
wordlists = {}
i = 0
for row in content:
    i += 1
    user = row[0]
    if (i % 100 == 0):
        sys.stdout.write('\r%s' % user)
        sys.stdout.flush()
    wordlists[user] = {}
    wordlists[user]['words'] = set([s for s in row[2:] if s != ''])
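# most_common() is used above but not defined in this fragment; from its use
# it evidently returns the most frequent element of a list (the dominant
# arXiv subject for a word). A minimal stand-in under that assumption:
def most_common(lst):
    # ties are broken arbitrarily by max()'s scan order
    return max(set(lst), key=lst.count)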
import sys
import numpy as np
sys.path.insert(0, './word_cloud-master/')
import wordcloud

if len(sys.argv) < 3:
    sys.exit("Usage: python get_wordcloud.py wordlist.txt number")
wordlist_name = sys.argv[1]
cluster_number = sys.argv[2]
wordlist_file = open(wordlist_name, 'r')
words = []
dists = []  # the distances are to the centroid
for line in wordlist_file:
    word = line.split()[0]
    cluster = line.split()[1]
    dist = float(line.split()[2])
    if cluster == cluster_number:
        words.append(word)
        dists.append(dist)
max_dist = max(dists) * 1.05
weights = [max_dist - dist for dist in dists]
words_array = np.array(words)
weights_array = np.array(weights)
wordcloud.make_wordcloud(words_array, weights_array,
                         wordlist_name + '.' + cluster_number + '.png')
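# The script above expects wordlist.txt rows of "word cluster distance",
# e.g. (values illustrative):
#
#   galaxy 3 0.12
#   quasar 3 0.48
#   neuron 7 0.09
#
# and is invoked as:  python get_wordcloud.py wordlist.txt 3
# Words closer to the cluster centroid get larger weights via max_dist - dist,
# so the centroid's nearest words dominate the cloud.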
def batch_vs_online(articleName):
    # Load the dictionary and the corpus
    diction = gensim.corpora.Dictionary.load('people.dict')
    mm = gensim.corpora.MmCorpus('people_tfidf.mm')
    # Read in the new article
    with open(articleName, 'r') as fp:
        content = fp.read()
    list1 = fenci(content)  # stopwords already removed
    doc_bow = diction.doc2bow(list1)  # term-frequency vector representing the new article
    # Build batch_lda and online_lda
    # alpha and eta use the default parameters
    batch_lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=diction,
                                                num_topics=TOPICS, update_every=0, passes=20)
    """
    batch_lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=diction,
                                                num_topics=TOPICS, update_every=0, passes=80, alpha=50.0/TOPICS)
    """
    # with more topics use a larger chunksize; alpha has to be set by hand
    online_lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=diction,
                                                 num_topics=TOPICS, update_every=1, chunksize=5, passes=1)
    """
    online_lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=diction,
                                                 num_topics=TOPICS, update_every=1, chunksize=40, passes=80, alpha=50.0/TOPICS)
    """
    # within each topic the words are already sorted by probability, e.g.
    # list1 = lda.show_topic(0)
    # list1 = [(0.0017132658909227052, '\xb7\xa8\xd4\xba'), (0.0016304890553909524, '\xd4\xbd\xc0\xb4\xd4\xbd')]
    batchTopics = []
    onlineTopics = []
    for k in range(TOPICS):
        batchTopics.append(batch_lda.show_topic(k))
        onlineTopics.append(online_lda.show_topic(k))
    # Draw the batch_lda word cloud first.
    # Topic proportions within one document, as a list of (topic, weight) pairs
    doc_batch_lda = batch_lda[doc_bow]
    print doc_batch_lda
    tP_batch = []
    for yuanzu in doc_batch_lda:  # yuanzu: "tuple"
        tP_batch.append(list(yuanzu))
    # swap each pair so it reads [weight, topic id]
    for i in range(len(tP_batch)):
        tmp = tP_batch[i][0]
        tP_batch[i][0] = tP_batch[i][1]
        tP_batch[i][1] = tmp
    # sort ascending
    tP_batch.sort()
    # keep the three heaviest topics: [topic 'weight', topic id]
    tP_batch_new = tP_batch[-3:]
    # print tP_batch
    # normalize the weights so they sum to one
    sum0 = 0
    for i in range(len(tP_batch_new)):
        sum0 += tP_batch_new[i][0]
    for i in range(len(tP_batch_new)):
        tP_batch_new[i][0] = tP_batch_new[i][0] / sum0
    # now tP_new = [[0.1,8],[0.3,2],[0.6,1]]
    # for drawing the word cloud
    batchWordsList = []
    batchWordsCount = []
    for (rate, topic) in tP_batch_new:
        if rate >= 0.1:
            # round to the nearest integer
            wordsNum = int(round(WORDS * rate))
            # take wordsNum words from this topic, appending to words and counts
            for i in range(wordsNum):
                batchWordsList.append(batchTopics[topic][i][1])
                batchWordsCount.append(batchTopics[topic][i][0])
    words = np.array(batchWordsList)
    counts = np.array(batchWordsCount)
    font_path = r'C:\Windows\Fonts\simsun.ttc'  # SimSun
    imageName = 'batch_lda_' + articleName
    wordcloud.make_wordcloud(words, counts, font_path, imageName)
    # word cloud for online_lda
    doc_online_lda = online_lda[doc_bow]
    print doc_online_lda
    tP_online = []
    for yuanzu in doc_online_lda:
        tP_online.append(list(yuanzu))
    for i in range(len(tP_online)):
        tmp = tP_online[i][0]
        tP_online[i][0] = tP_online[i][1]
        tP_online[i][1] = tmp
    # sort ascending
    tP_online.sort()
    # keep the three heaviest topics: [topic 'weight', topic id]
    tP_online_new = tP_online[-3:]
    # normalize the weights so they sum to one
    sum0 = 0
    for i in range(len(tP_online_new)):
        sum0 += tP_online_new[i][0]
    for i in range(len(tP_online_new)):
        tP_online_new[i][0] = tP_online_new[i][0] / sum0
    # now tP_new = [[0.1,8],[0.3,2],[0.6,1]]
    onlineWordsList = []
    onlineWordsCount = []
    for (rate, topic) in tP_online_new:
        if rate >= 0.1:
            # round to the nearest integer
            wordsNum = int(round(WORDS * rate))
            # take wordsNum words from this topic, appending to words and counts
            for i in range(wordsNum):
                onlineWordsList.append(onlineTopics[topic][i][1])
                onlineWordsCount.append(onlineTopics[topic][i][0])
    words = np.array(onlineWordsList)
    counts = np.array(onlineWordsCount)
    font_path = r'C:\Windows\Fonts\simsun.ttc'  # SimSun
    imageName = 'online_lda_' + articleName
    wordcloud.make_wordcloud(words, counts, font_path, imageName)
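# fenci() ("word segmentation") is called above but not defined in this
# snippet; it returns a stopword-free token list for the Chinese article. A
# plausible stand-in built on jieba; both the library choice and the stopword
# file name are assumptions:
import jieba

def fenci(content, stopword_fname='stopwords.txt'):
    with open(stopword_fname) as fp:
        stops = set(line.strip() for line in fp)
    # jieba.cut yields segments; drop whitespace and stopwords
    return [tok for tok in jieba.cut(content)
            if tok.strip() and tok not in stops]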
    # (fragment begins inside the file-reading loop)
    table.append(r)
    line_count += 1
ifile.close()
print "Matrix shape: %d rows, %d columns" % (r_count, num_columns)
print "Header: " + str(header)

# CHECK THAT ALL ROWS HAVE THE SAME NUMBER OF COLUMNS
if not all(map(lambda x: x == num_columns, map(len, table))):
    sys.exit("ERROR: Not all rows have the same number of columns. Aborting.")

# TRANSPOSE
table_t = map(list, zip(*table))

# GENERATE WORDCLOUDS
# columns
words = np.asarray(table_t[0])
for j in xrange(1, num_columns):
    print "Processing column cloud #%d (%s)..." % (j, header[j])
    counts = np.asarray(table_t[j])
    fname = output_dir_cols + header[j] + "_wordcloud" + ext
    wordcloud.make_wordcloud(words, counts, fname, font_path, width, height)

# rows
words = np.asarray(header[1:])
for i in xrange(1, r_count):
    row_label = table[i][0]
    row_data = table[i][1:]
    print "Processing row cloud #%d (%s)..." % (i, row_label)
    counts = np.asarray(row_data)
    fname = output_dir_rows + row_label + "_wordcloud" + ext
    wordcloud.make_wordcloud(words, counts, fname, font_path, width, height)
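# The loops above assume a label matrix like the following (the parsing into
# `table` happens earlier, outside this fragment; values illustrative): a
# header row of column labels, then one row label plus counts per row.
#
#   term      doc1  doc2  doc3
#   apple     4     0     2
#   banana    1     3     0
#
# Column clouds weight the first-column terms by each document's counts;
# row clouds weight the document names by a single term's counts.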
def commRanking(self, numTopComms, prevTimeslots, xLablNum):
    import itertools, tfidf
    # from pymongo import MongoClient
    from pytagcloud.lang.stopwords import StopWords
    # from nltk.corpus import stopwords
    from wordcloud import make_wordcloud
    from PIL import Image
    '''Detect the evolving communities'''
    uniCommIdsEvol = self.uniCommIdsEvol
    timeslots = self.timeslots
    # structure: tempcommRanking = {Id: [persistence, stability, commCentrality, degreeness]}
    tempcommRanking = {}
    commRanking, fluctuation, lifetime = {}, {}, 0
    for Id in self.uniCommIds:
        uniqueTimeSlLen = len(set(uniCommIdsEvol[Id][0]))
        timeSlLen = len(uniCommIdsEvol[Id][0])
        tempcommRanking[Id] = []
        tempcommRanking[Id].append(uniqueTimeSlLen / timeslots)  # persistence
        tempcommRanking[Id].append((sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1) + 1) / (timeslots + 1))  # stability
        tempcommRanking[Id].append(product([x + 1 for x in uniCommIdsEvol[Id][1]]) / uniqueTimeSlLen)  # commCentrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][4]) / timeslots)  # degreeness
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][5]) / timeSlLen)  # degree centrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][6]) / timeSlLen)  # betweenness centrality
        # '''Checking Theseus Ship'''
        # theseus = 1 + len(list(set(uniCommIdsEvol[Id][3][0]) & set(uniCommIdsEvol[Id][3][-1]))) / len(set(np.append(uniCommIdsEvol[Id][3][0], uniCommIdsEvol[Id][3][-1])))
        # tempcommRanking[Id].append(theseus)
        commRanking[Id] = np.prod(tempcommRanking[Id])
        # Construct the average Jaccardian between timeslots for each dynamic community
        if timeSlLen not in fluctuation:
            fluctuation[timeSlLen] = [(sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1))]  # [1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)]
        else:
            fluctuation[timeSlLen].append((sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1)))  # 1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)
        lifetime = max(lifetime, timeSlLen)
    '''All the communities ranked in order of importance'''
    rankedCommunities = sorted(commRanking, key=commRanking.get, reverse=True)
    if numTopComms > len(rankedCommunities):
        numTopComms = len(rankedCommunities)
    '''Jaccardians for lifespans which appear only once are discarded (outliers)'''
    flux = []
    for lifeT in range(lifetime + 1):
        if lifeT in fluctuation and len(fluctuation[lifeT]) > 1:
            flux.append(sum(fluctuation[lifeT]) / len(fluctuation[lifeT]))
        else:
            flux.append(0)
    '''Constructing community size heatmap data'''
    commSizeHeatData = np.zeros([numTopComms, timeslots])
    for rCIdx, comms in enumerate(rankedCommunities[0:numTopComms]):
        for sizeIdx, timesteps in enumerate(uniCommIdsEvol[comms][0]):
            if commSizeHeatData[rCIdx, timesteps] != 0:
                commSizeHeatData[rCIdx, timesteps] = max(np.log(uniCommIdsEvol[comms][2][sizeIdx]), commSizeHeatData[rCIdx, timesteps])
            else:
                commSizeHeatData[rCIdx, timesteps] = np.log(uniCommIdsEvol[comms][2][sizeIdx])
    normedHeatdata = commSizeHeatData / commSizeHeatData.max()
    '''Writing ranked communities to json files + MongoDB'''
    dataset_name = self.dataset_path.split('/')
    dataset_name = dataset_name[-1]
    # Mongo--------------------
    # client = MongoClient()
    # db = client[dataset_name]
    # dyccos = db.dyccos
    # -------------------------
    rankedCommunitiesFinal = {}
    twitterDataFile = open(self.dataset_path + '/data/results/rankedCommunities.json', "w")  # , encoding="utf-8-sig")
    jsondata = dict()
    jsondata["ranked_communities"] = []
    '''Create corpus and stopwords'''
    # stop = stopwords.words('english')
    stop = []
    # grstopwords = pickle.load(open("./greek_stopwords.pck", 'rb'))
    # stop.extend(grstopwords)
    definiteStop = ['gt', 'amp', 'rt', 'via']
    stop.extend(definiteStop)
    if not os.path.exists(self.dataset_path + "/data/tmp/datasetCorpus.pck"):
        idf = self.corpusExtraction(rankedCommunities[:numTopComms])
    else:
        idf = pickle.load(open(self.dataset_path + "/data/tmp/datasetCorpus.pck", 'rb'))
        print('loaded corpus from file')
    # -------------------------
    regex1 = re.compile("(?:\@|#|https?\://)\S+", re.UNICODE)
    regex2 = re.compile("\w+'?\w", re.UNICODE)
    width, height = 400, 200
    blank_image = Image.new("RGB", (timeslots * width, (numTopComms * 2 + 2) * height), (255, 255, 255))  # blank canvas for the collage
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2], width=width, height=height)
        blank_image.paste(timeimage, (tmptime * width, height))
    for rank, rcomms in enumerate(rankedCommunities[:numTopComms]):
        tmslUsrs, tmpTags, tmptweetids, commTwText, tmpUrls, topic, tmpkeywrds = [], [], [], [], [], [], []
        strRank = '{0}'.format(str(rank).zfill(2))
        rankedCommunitiesFinal[strRank] = [rcomms]
        rankedCommunitiesFinal[strRank].append(commRanking[rcomms])
        rankedCommunitiesFinal[strRank].append(uniCommIdsEvol[rcomms][3])
        timeSlotApp = [self.timeLimit[x] for x in uniCommIdsEvol[rcomms][0]]
        '''make and save wordclouds'''
        if not os.path.exists(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '/' + str(rank)):
            os.makedirs(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '/' + str(rank))
        for tmsl, users in enumerate(uniCommIdsEvol[rcomms][3]):
            uscentr, tmptweetText = [], []
            for us in users:
                uscentr.append([us, self.userPgRnkBag[uniCommIdsEvol[rcomms][0][tmsl]][us]])
                # uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                if us in self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpTags.extend(self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpUrls.append(self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetids.extend(self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetText.extend(self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
            uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
            tmslUsrs.append({str(uniCommIdsEvol[rcomms][0][tmsl]): uscentr})
            tmptweetText = [i.replace("\n", "").replace('\t', ' ') for i in tmptweetText]
            seen = set()
            seen_add = seen.add
            tmptweetText2 = [x for x in tmptweetText if x not in seen and not seen_add(x)]
            commTwText.append({timeSlotApp[tmsl]: tmptweetText2})
            # topic extraction
            topicList = " ".join(tmptweetText2)
            topicList = topicList.lower()
            topicList = regex1.sub('', topicList)
            topicList = regex2.findall(topicList)
            s = StopWords()
            s.load_language(s.guess(topicList))
            topicList = collections.Counter(topicList)
            tmpkeys = topicList.keys()
            if len(topicList) > 5:
                for i in list(tmpkeys):
                    if not i or i in stop or i.startswith(('htt', '(@', 't.co')) or len(i) <= 2 or s.is_stop_word(i):
                        del topicList[i]
            else:
                for i in list(tmpkeys):
                    if i in definiteStop or not i:
                        del topicList[i]
            timeSlLen = len(uniCommIdsEvol[Id][0])  # note: Id is left over from the ranking loop above; the value is unused here
            tmpTopic = tfidf.comm_tfidf(topicList, idf, 10)
            topic.append({timeSlotApp[tmsl]: tmpTopic})
            # tmpTopic = [x[0] for x in tmpTopic]
            '''wordcloud image'''
            popkeys = [x[0] for x in tmpTopic]
            popvals = [x[1] for x in tmpTopic]
            if len(popvals) < 2:
                try:
                    if popvals[0] < 1:
                        popvals[0] = 1
                except:
                    pass
            '''Create intermediate image'''
            position = (rank + 1) * 2
            backgroundcolor = int((1 - (normedHeatdata[rank, uniCommIdsEvol[rcomms][0][tmsl]])) * 255)
            locimage = make_wordcloud(popkeys, popvals, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'/'+timeSlotApp[tmsl]+'.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, position * height))
            popusers = [x[0] for x in uscentr[:10]]
            popcentr = [x[1] * 100 for x in uscentr[:10]]
            locimage = make_wordcloud(popusers, popcentr, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/results/wordclouds/'+self.fileTitle+'/'+str(rank)+'/'+timeSlotApp[tmsl]+'usrs.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, (position + 1) * height))
            # tmpkeywrds.extend(tmpTopic)
        if tmpTags:
            popTags = [x.lower() for x in list(itertools.chain.from_iterable(tmpTags))]
            popTags = collections.Counter(popTags)
            popTags = popTags.most_common(10)
        else:
            popTags = []
        if tmpUrls:
            if tmpUrls[0]:
                tmpUrls = [x.lower() for x in list(itertools.chain.from_iterable(tmpUrls)) if x]
                popUrls = collections.Counter(tmpUrls)
                popUrls = popUrls.most_common(10)
            else:
                popUrls = []
        else:
            popUrls = []
        commTweetIds = list(set(tmptweetids))
        # popKeywords = collections.Counter(tmpkeywrds)
        # popKeywords = popKeywords.most_common(10)
        # popkeys = [x[0] for x in popKeywords]
        # popvals = [x[1] for x in popKeywords]
        # make_wordcloud(popkeys, popvals, self.dataset_path + '/data/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '.pdf')
        dycco = {'community label': rcomms, 'rank': rank, 'timeslot appearance': timeSlotApp,  # 'text': commTwText,
                 'persistence:': tempcommRanking[rcomms][0], 'total score': commRanking[rcomms], 'topic': topic,
                 'stability': tempcommRanking[rcomms][1], 'community centrality': tempcommRanking[rcomms][2],
                 'community size per slot': uniCommIdsEvol[rcomms][2], 'users:centrality per timeslot': tmslUsrs,
                 'popTags': popTags, 'popUrls': popUrls}
        jsondycco = dycco.copy()
        # dyccos.insert(dycco)
        jsondata["ranked_communities"].append(jsondycco)
    twitterDataFile.write(json.dumps(jsondata, sort_keys=True))  # , ensure_ascii=False).replace("\u200f", "")
    twitterDataFile.close()
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2])
        blank_image.paste(timeimage, (tmptime * width, (position + 2) * height))  # note: position leaks from the last loop iteration above
    imsize = blank_image.size
    blank_image = blank_image.resize((round(imsize[0] / 2), round(imsize[1] / 2)), Image.ANTIALIAS)
    blank_image.save(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '_collage.pdf', quality=50)
    makefigures(commSizeHeatData, flux, self.fileTitle, self.day_month, commRanking, numTopComms, timeslots, uniCommIdsEvol, rankedCommunities, self.commPerTmslt, self.uniCommIds, prevTimeslots, self.dataset_path, self.xLablNum)
    return rankedCommunitiesFinal
        # update the summed-area table with the newly placed word
        if y > 0:
            partial_integral += integral[x:, y - 1][:, np.newaxis]
        integral[x:, y:] = partial_integral

    # redraw in color
    img = Image.new("RGB", (width, height),
                    (backgroundweight, backgroundweight, backgroundweight))
    draw = ImageDraw.Draw(img)
    everything = zip(words, font_sizes, positions, orientations)
    for word, font_size, position, orientation in everything:
        font = ImageFont.truetype(font_path, font_size)
        # transpose font optionally
        transposed_font = ImageFont.TransposedFont(font, orientation=orientation)
        draw.setfont(transposed_font)
        draw.text((position[1], position[0]), word,
                  # fill="red")
                  fill="hsl(%d" % random.randint(0, 50) + ", 80%, 50%)")
    # img.show()
    try:
        img.save(fname)
    except:
        pass
    return img


if __name__ == "__main__":
    x = ['qqqqq', 'wwww', 'eeee', 'rrrr', 'ddddd', 'hhnhhhh']
    co = [1, 2, 3, 4, 5, 6]
    from wordcloud import make_wordcloud
    make_wordcloud(x, co, 'wordy.jpg')
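# The integral[] updates in the fragment above maintain a summed-area table
# over the occupancy mask, so checking whether a candidate box is free costs
# four lookups instead of a full scan. A self-contained toy illustration of
# that query (the mask, box, and helper name are made up for this example):
import numpy as np

mask = np.zeros((6, 6), dtype=np.uint32)
mask[2:4, 2:4] = 1                              # pretend a word occupies this patch
integral = mask.cumsum(axis=0).cumsum(axis=1)   # summed-area table

def box_occupancy(integral, x, y, h, w):
    # summed-area lookup for mask[x:x+h, y:y+w]
    total = integral[x + h - 1, y + w - 1]
    if x > 0:
        total -= integral[x - 1, y + w - 1]
    if y > 0:
        total -= integral[x + h - 1, y - 1]
    if x > 0 and y > 0:
        total += integral[x - 1, y - 1]
    return total

assert box_occupancy(integral, 0, 0, 2, 2) == 0   # empty corner: word fits
assert box_occupancy(integral, 2, 2, 2, 2) == 4   # fully occupied patch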