# Python 2 variant. re, itemgetter and StopWords below are shared by all of
# the get_tag_counts variants in this section.
import re
from operator import itemgetter

from pytagcloud.lang.stopwords import StopWords


def get_tag_counts(text1, text2):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words1 = map(lambda x: x.lower(), re.findall(r'\w+', text1, re.UNICODE))
    words2 = map(lambda x: x.lower(), re.findall(r'\w+', text2, re.UNICODE))
    s = StopWords()
    s.load_language(s.guess(words1))
    s.load_language(s.guess(words2))
    counted = {}
    for word in words1:
        if not s.is_stop_word(word) and len(word) > 1:
            if counted.has_key(word):
                counted[word] += 1
            else:
                counted[word] = 1
    for word in words2:
        if not s.is_stop_word(word) and len(word) > 1:
            if counted.has_key(word):
                counted[word] -= 1
            else:
                counted[word] = -1
    return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)
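# A minimal usage sketch for the two-text variant above (Python 2); the
# sample strings and the printed result are hypothetical. Words from text1
# accumulate positive counts and words from text2 negative ones, so the
# sorted output contrasts the two texts.
diff = get_tag_counts(u"python draws tag clouds, clouds everywhere",
                      u"java also draws tag clouds")
print diff  # e.g. [(u'clouds', 1), (u'python', 1), ..., (u'java', -1)]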
# Python 2 variant that decodes UTF-8 input before tokenizing.
def get_tag_counts(text):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    text = unicode(text, 'UTF-8')
    words = map(lambda x: x.lower(), re.findall(r'\w+', text, re.UNICODE))
    # Workaround because the regex mangled the unicode
    # _PUNCTUATION = u'.,<>;:/?\\\"\'-_+=!@\#$%^&*()'
    # words = text.split()
    # words = map(lambda x: x.lower(), words)
    # words = map(lambda x: unicode(x, 'UTF-8'), words)
    # words = map(lambda x: x.translate(dict.fromkeys(map(ord, _PUNCTUATION))),
    #             words)
    s = StopWords()
    s.load_language(s.guess(words))
    counted = {}
    for word in words:
        if not s.is_stop_word(word) and len(word) > 1:
            if counted.has_key(word):
                counted[word] += 1
            else:
                counted[word] = 1
    if len(counted.keys()) == 0:
        return None
    else:
        return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)
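# Note on the unicode(text, 'UTF-8') call above (Python 2): this variant
# expects UTF-8-encoded byte strings. The sample below is hypothetical;
# passing an already-decoded unicode object would raise TypeError.
raw = 'caf\xc3\xa9 con leche'          # UTF-8 bytes for u'café con leche'
print get_tag_counts(raw)              # decodes first, then tokenizes
# get_tag_counts(u'café con leche')    # TypeError: decoding Unicode is not supported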
# Python 2 variant: defaultdict(int) replaces the has_key bookkeeping, and
# the token regex keeps apostrophes ("don't" stays one word).
from collections import defaultdict


def get_tag_counts(text):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x: x.lower(), re.findall(r"[\w']+", text, re.UNICODE))
    s = StopWords()
    s.load_language(s.guess(words))
    counted = defaultdict(int)
    for word in words:
        if not s.is_stop_word(word) and len(word) > 1:
            counted[word] += 1
    return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)
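# Design note: defaultdict(int) here does the same job as collections.Counter
# (missing keys start at zero). A small sketch on a hypothetical sentence;
# Counter shows the raw counts before the function's stop-word filtering.
from collections import Counter
sample = "tag clouds count tags and tag clouds draw tags"
print get_tag_counts(sample)[:2]  # e.g. [('tags', 2), ('tag', 2)]
print Counter(w.lower() for w in re.findall(r"[\w']+", sample)).most_common(2)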
# Python 3 port of the defaultdict variant.
def get_tag_counts(text):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x: x.lower(), re.findall(r"[\w']+", text, re.UNICODE))
    # words above is an iterator, which would be consumed by guess(words),
    # so convert it to a list, as it would have been in Python 2.
    words = [word for word in words]
    s = StopWords()
    s.load_language(s.guess(words))
    counted = defaultdict(int)
    for word in words:
        if not s.is_stop_word(word) and len(word) > 1:
            counted[word] += 1
    return sorted(counted.items(), key=itemgetter(1), reverse=True)
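# Why the list() conversion above matters in Python 3: map() returns a
# one-shot iterator, so guess(words) would exhaust it and the counting loop
# would then see no words at all. A minimal demonstration:
words = map(str.lower, ["Tag", "Cloud"])
print(sum(1 for _ in words))  # 2 -- the first pass consumes the iterator
print(sum(1 for _ in words))  # 0 -- nothing left for a second pass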
# Python 2. HEADER_DICT (the column layout of a tweet row) and get_words
# (the tokenizer) are defined elsewhere in the same script.
import re
import webbrowser
from collections import Counter

from pytagcloud import create_tag_image, make_tags
from pytagcloud.colors import COLOR_SCHEMES
from pytagcloud.lang.stopwords import StopWords


def word_frequency(tweets):
    c = Counter()
    hash_c = Counter()
    at_c = Counter()
    s = StopWords()
    s.load_language("english")
    for tweet in tweets:
        for word in get_words(tweet[HEADER_DICT['text']]):
            if not s.is_stop_word(word):
                if c.has_key(word):
                    c[word] += 1
                else:
                    c[word] = 1
        for word in re.findall('@\w+', tweet[HEADER_DICT['text']]):
            at_c[word.lower()] += 1
        for word in re.findall('\#[\d\w]+', tweet[HEADER_DICT['text']]):
            hash_c[word.lower()] += 1
    print c.most_common(50)
    print hash_c.most_common(50)
    print at_c.most_common(50)
    # Making word clouds for your most common words, most common @replies
    # and most common #hashtags.
    ctags = make_tags(c.most_common(100), maxsize=90,
                      colors=COLOR_SCHEMES['audacity'])
    create_tag_image(ctags, 'c_most_common.png', size=(900, 600),
                     fontname='Lobster')
    webbrowser.open('c_most_common.png')
    hash_ctags = make_tags(hash_c.most_common(100), maxsize=100,
                           colors=COLOR_SCHEMES['citrus'])
    create_tag_image(hash_ctags, 'hash_c_most_common.png', size=(900, 600),
                     fontname='Cuprum')
    webbrowser.open('hash_c_most_common.png')
    at_ctags = make_tags(at_c.most_common(100), maxsize=90)
    create_tag_image(at_ctags, 'at_c_most_common.png', size=(900, 600),
                     fontname='Yanone Kaffeesatz')
    webbrowser.open('at_c_most_common.png')
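# Hedged driver sketch for word_frequency (Python 2). HEADER_DICT and
# get_words live elsewhere in the original script; the stand-ins and the
# tweet rows below are hypothetical.
HEADER_DICT = {'text': 0}                            # assumed column layout
get_words = lambda t: re.findall(r'\w+', t.lower())  # stand-in tokenizer
tweets = [["Drawing #wordclouds with @pytagcloud today"],
          ["More #wordclouds tips via @pytagcloud"]]
word_frequency(tweets)  # prints top words, #hashtags and @mentions, then
                        # renders and opens the three tag-cloud PNGs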
def get_tag_counts(text):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x: x.lower(), re.findall(r'\w+', text, re.UNICODE))
    words = list(words)  # ** Added this
    s = StopWords()
    s.load_language(s.guess(words))
    counted = {}
    for word in words:
        print(word)
        if not s.is_stop_word(word) and len(word) > 1:
            if word in counted:  # ** if counted.has_key(word):
                counted[word] += 1
            else:
                counted[word] = 1
    # ** iteritems to items
    return sorted(counted.items(), key=itemgetter(1), reverse=True)
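# The has_key -> in rewrite above is the standard Python 3 change; the whole
# if/else can also collapse to dict.get, as this small sketch shows:
counted = {}
for word in ["tag", "cloud", "tag"]:
    counted[word] = counted.get(word, 0) + 1
print(counted)  # {'tag': 2, 'cloud': 1}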
# Python 3. np (numpy), product, re, os, pickle, collections, json,
# itemgetter and makefigures come from module-level imports in the
# original file.
def commRanking(self, numTopComms, prevTimeslots, xLablNum):
    import itertools, tfidf
    # from pymongo import MongoClient
    from pytagcloud.lang.stopwords import StopWords
    # from nltk.corpus import stopwords
    from wordcloud import make_wordcloud
    from PIL import Image

    '''Detect the evolving communities'''
    uniCommIdsEvol = self.uniCommIdsEvol
    timeslots = self.timeslots
    tempcommRanking = {}
    # structure: tempcommRanking={Id:[persistence,stability,commCentrality,degreeness]}
    commRanking, fluctuation, lifetime = {}, {}, 0
    for Id in self.uniCommIds:
        uniqueTimeSlLen = len(set(uniCommIdsEvol[Id][0]))
        timeSlLen = len(uniCommIdsEvol[Id][0])
        tempcommRanking[Id] = []
        tempcommRanking[Id].append(uniqueTimeSlLen / timeslots)  # persistence
        tempcommRanking[Id].append((sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1) + 1) / (timeslots + 1))  # stability
        tempcommRanking[Id].append(product([x + 1 for x in uniCommIdsEvol[Id][1]]) / uniqueTimeSlLen)  # commCentrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][4]) / timeslots)  # Degreeness
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][5])/timeSlLen)  # degree centrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][6])/timeSlLen)  # betweeness centrality
        # '''Checking Theseus Ship'''
        # theseus = 1 + len(list(set(uniCommIdsEvol[Id][3][0]) & set(uniCommIdsEvol[Id][3][-1]))) / len(set(np.append(uniCommIdsEvol[Id][3][0], uniCommIdsEvol[Id][3][-1])))
        # tempcommRanking[Id].append(theseus)
        commRanking[Id] = np.prod(tempcommRanking[Id])
        # Construct average jaccardian between timeslots for each dyn comm
        if timeSlLen not in fluctuation:
            fluctuation[timeSlLen] = [(sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1))]  # [1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)]
        else:
            fluctuation[timeSlLen].append((sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1)))  # 1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1))
        lifetime = max(lifetime, timeSlLen)

    '''All the communities ranked in order of importance'''
    rankedCommunities = sorted(commRanking, key=commRanking.get, reverse=True)
    if numTopComms > len(rankedCommunities):
        numTopComms = len(rankedCommunities)

    '''Jaccardian for lifespans which appear only once are discarded (outliers)'''
    flux = []
    for lifeT in range(lifetime + 1):
        if lifeT in fluctuation and len(fluctuation[lifeT]) > 1:
            flux.append(sum(fluctuation[lifeT]) / len(fluctuation[lifeT]))
        else:
            flux.append(0)

    '''Constructing community size heatmap data'''
    commSizeHeatData = np.zeros([numTopComms, timeslots])
    for rCIdx, comms in enumerate(rankedCommunities[0:numTopComms]):
        for sizeIdx, timesteps in enumerate(uniCommIdsEvol[comms][0]):
            if commSizeHeatData[rCIdx, timesteps] != 0:
                commSizeHeatData[rCIdx, timesteps] = max(np.log(uniCommIdsEvol[comms][2][sizeIdx]), commSizeHeatData[rCIdx, timesteps])
            else:
                commSizeHeatData[rCIdx, timesteps] = np.log(uniCommIdsEvol[comms][2][sizeIdx])
    normedHeatdata = commSizeHeatData / commSizeHeatData.max()

    '''Writing ranked communities to json files + MongoDB'''
    dataset_name = self.dataset_path.split('/')
    dataset_name = dataset_name[-1]
    # Mongo--------------------
    # client = MongoClient()
    # db = client[dataset_name]
    # dyccos = db.dyccos
    # -------------------------
    rankedCommunitiesFinal = {}
    twitterDataFile = open(self.dataset_path + '/data/nonadaptive/results/rankedCommunities.json', "w")  # , encoding="utf-8-sig")
    jsondata = dict()
    jsondata["ranked_communities"] = []

    '''Create corpus and stopwords'''
    # stop = stopwords.words('english')
    stop = []
    # grstopwords = pickle.load(open("./greek_stopwords.pck", 'rb'))
    # stop.extend(grstopwords)
    definiteStop = ['gt', 'amp', 'rt', 'via']
    stop.extend(definiteStop)
    if not os.path.exists(self.dataset_path + "/data/nonadaptive/tmp/datasetCorpus.pck"):
        idf = self.corpusExtraction(rankedCommunities[:numTopComms])
    else:
        idf = pickle.load(open(self.dataset_path + "/data/nonadaptive/tmp/datasetCorpus.pck", 'rb'))
        print('loaded corpus from file')
    # -------------------------
    regex1 = re.compile("(?:\@|#|https?\://)\S+", re.UNICODE)
    regex2 = re.compile("\w+'?\w", re.UNICODE)
    width, height = 400, 200
    blank_image = Image.new("RGB", (timeslots * width, (numTopComms * 2 + 2) * height), (255, 255, 255))  # blank canvas for the collage
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2], width=width, height=height)
        blank_image.paste(timeimage, (tmptime * width, height))
    for rank, rcomms in enumerate(rankedCommunities[:numTopComms]):
        tmslUsrs, tmpTags, tmptweetids, commTwText, tmpUrls, topic, tmpkeywrds = [], [], [], [], [], [], []
        strRank = '{0}'.format(str(rank).zfill(2))
        rankedCommunitiesFinal[strRank] = [rcomms]
        rankedCommunitiesFinal[strRank].append(commRanking[rcomms])
        rankedCommunitiesFinal[strRank].append(uniCommIdsEvol[rcomms][3])
        timeSlotApp = [self.timeLimit[x] for x in uniCommIdsEvol[rcomms][0]]
        '''make and save wordclouds'''
        if not os.path.exists(self.dataset_path + "/data/nonadaptive/results/wordclouds/" + self.fileTitle + '/' + str(rank)):
            os.makedirs(self.dataset_path + "/data/nonadaptive/results/wordclouds/" + self.fileTitle + '/' + str(rank))
        for tmsl, users in enumerate(uniCommIdsEvol[rcomms][3]):
            uscentr, tmptweetText = [], []
            for us in users:
                uscentr.append([us, self.userPgRnkBag[uniCommIdsEvol[rcomms][0][tmsl]][us]])
                # uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                if us in self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpTags.extend(self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpUrls.append(self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetids.extend(self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetText.extend(self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
            uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
            tmslUsrs.append({str(uniCommIdsEvol[rcomms][0][tmsl]): uscentr})
            tmptweetText = [i.replace("\n", "").replace('\t', ' ') for i in tmptweetText]
            seen = set()
            seen_add = seen.add
            tmptweetText2 = [x for x in tmptweetText if x not in seen and not seen_add(x)]
            commTwText.append({timeSlotApp[tmsl]: tmptweetText2})
            # topic extraction
            topicList = " ".join(tmptweetText2)
            topicList = topicList.lower()
            topicList = regex1.sub('', topicList)
            topicList = regex2.findall(topicList)
            s = StopWords()
            s.load_language(s.guess(topicList))
            topicList = collections.Counter(topicList)
            tmpkeys = topicList.keys()
            if len(topicList) > 5:
                for i in list(tmpkeys):
                    if not i or i in stop or i.startswith(('htt', '(@', 't.co')) or len(i) <= 2 or s.is_stop_word(i):
                        del topicList[i]
            else:
                for i in list(tmpkeys):
                    if i in definiteStop or not i:
                        del topicList[i]
            timeSlLen = len(uniCommIdsEvol[Id][0])  # note: Id is left over from the ranking loop above; this value is not used below
            tmpTopic = tfidf.comm_tfidf(topicList, idf, 10)
            topic.append({timeSlotApp[tmsl]: tmpTopic})
            # tmpTopic = [x[0] for x in tmpTopic]
            '''wordcloud image'''
            popkeys = [x[0] for x in tmpTopic]
            popvals = [x[1] for x in tmpTopic]
            if len(popvals) < 2:
                try:
                    if popvals[0] < 1:
                        popvals[0] = 1
                except:
                    pass
            '''Create intermediate image'''
            position = (rank + 1) * 2
            backgroundcolor = int((1 - (normedHeatdata[rank, uniCommIdsEvol[rcomms][0][tmsl]])) * 255)
            locimage = make_wordcloud(popkeys, popvals, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/nonadaptive/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '/' + timeSlotApp[tmsl] + '.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, position * height))
            popusers = [x[0] for x in uscentr[:10]]
            popcentr = [x[1] * 100 for x in uscentr[:10]]
            locimage = make_wordcloud(popusers, popcentr, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/nonadaptive/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '/' + timeSlotApp[tmsl] + 'usrs.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, (position + 1) * height))
            # tmpkeywrds.extend(tmpTopic)
        if tmpTags:
            popTags = [x.lower() for x in list(itertools.chain.from_iterable(tmpTags))]
            popTags = collections.Counter(popTags)
            popTags = popTags.most_common(10)
        else:
            popTags = []
        if tmpUrls:
            if tmpUrls[0]:
                tmpUrls = [x.lower() for x in list(itertools.chain.from_iterable(tmpUrls)) if x]
                popUrls = collections.Counter(tmpUrls)
                popUrls = popUrls.most_common(10)
            else:
                popUrls = []
        else:
            popUrls = []
        commTweetIds = list(set(tmptweetids))
        # popKeywords = collections.Counter(tmpkeywrds)
        # popKeywords = popKeywords.most_common(10)
        # popkeys = [x[0] for x in popKeywords]
        # popvals = [x[1] for x in popKeywords]
        # make_wordcloud(popkeys, popvals, self.dataset_path + '/data/nonadaptive/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '.pdf')
        dycco = {'community label': rcomms, 'rank': rank, 'timeslot appearance': timeSlotApp,  # 'text': commTwText,
                 'persistence:': tempcommRanking[rcomms][0], 'total score': commRanking[rcomms], 'topic': topic,
                 'stability': tempcommRanking[rcomms][1], 'community centrality': tempcommRanking[rcomms][2],
                 'community size per slot': uniCommIdsEvol[rcomms][2], 'users:centrality per timeslot': tmslUsrs,
                 'popTags': popTags, 'popUrls': popUrls}
        jsondycco = dycco.copy()
        # dyccos.insert(dycco)
        jsondata["ranked_communities"].append(jsondycco)
    twitterDataFile.write(json.dumps(jsondata, sort_keys=True))  # , ensure_ascii=False).replace("\u200f", "")
    twitterDataFile.close()
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2])
        blank_image.paste(timeimage, (tmptime * width, (position + 2) * height))
    imsize = blank_image.size
    blank_image = blank_image.resize((round(imsize[0] / 2), round(imsize[1] / 2)), Image.ANTIALIAS)
    blank_image.save(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '_collage.pdf', quality=50)
    makefigures(commSizeHeatData, flux, self.fileTitle, self.day_month, commRanking, numTopComms, timeslots,
                uniCommIdsEvol, rankedCommunities, self.commPerTmslt, self.uniCommIds, prevTimeslots,
                self.dataset_path, self.xLablNum)
    return rankedCommunitiesFinal
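# Minimal sketch of the per-community score that the ranking loop above
# multiplies together (persistence * stability * community centrality).
# comm_score and the sample values are hypothetical; np.prod stands in for
# the module-level product().
import numpy as np

def comm_score(appearances, centralities, timeslots):
    uniq = len(set(appearances))
    persistence = uniq / timeslots
    stability = (np.sum(np.diff(sorted(set(appearances))) == 1) + 1) / (timeslots + 1)
    centrality = np.prod([c + 1 for c in centralities]) / uniq
    return persistence * stability * centrality

print(comm_score([0, 1, 3], [0.2, 0.5, 0.1], timeslots=5))  # ~0.13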
# Variant of commRanking that reads and writes the plain /data/... paths
# instead of /data/nonadaptive/...; otherwise identical to the version above.
def commRanking(self, numTopComms, prevTimeslots, xLablNum):
    import itertools, tfidf
    # from pymongo import MongoClient
    from pytagcloud.lang.stopwords import StopWords
    # from nltk.corpus import stopwords
    from wordcloud import make_wordcloud
    from PIL import Image

    '''Detect the evolving communities'''
    uniCommIdsEvol = self.uniCommIdsEvol
    timeslots = self.timeslots
    tempcommRanking = {}
    # structure: tempcommRanking={Id:[persistence,stability,commCentrality,degreeness]}
    commRanking, fluctuation, lifetime = {}, {}, 0
    for Id in self.uniCommIds:
        uniqueTimeSlLen = len(set(uniCommIdsEvol[Id][0]))
        timeSlLen = len(uniCommIdsEvol[Id][0])
        tempcommRanking[Id] = []
        tempcommRanking[Id].append(uniqueTimeSlLen / timeslots)  # persistence
        tempcommRanking[Id].append((sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1) + 1) / (timeslots + 1))  # stability
        tempcommRanking[Id].append(product([x + 1 for x in uniCommIdsEvol[Id][1]]) / uniqueTimeSlLen)  # commCentrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][4]) / timeslots)  # Degreeness
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][5])/timeSlLen)  # degree centrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][6])/timeSlLen)  # betweeness centrality
        # '''Checking Theseus Ship'''
        # theseus = 1 + len(list(set(uniCommIdsEvol[Id][3][0]) & set(uniCommIdsEvol[Id][3][-1]))) / len(set(np.append(uniCommIdsEvol[Id][3][0], uniCommIdsEvol[Id][3][-1])))
        # tempcommRanking[Id].append(theseus)
        commRanking[Id] = np.prod(tempcommRanking[Id])
        # Construct average jaccardian between timeslots for each dyn comm
        if timeSlLen not in fluctuation:
            fluctuation[timeSlLen] = [(sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1))]  # [1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)]
        else:
            fluctuation[timeSlLen].append((sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1)))  # 1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1))
        lifetime = max(lifetime, timeSlLen)

    '''All the communities ranked in order of importance'''
    rankedCommunities = sorted(commRanking, key=commRanking.get, reverse=True)
    if numTopComms > len(rankedCommunities):
        numTopComms = len(rankedCommunities)

    '''Jaccardian for lifespans which appear only once are discarded (outliers)'''
    flux = []
    for lifeT in range(lifetime + 1):
        if lifeT in fluctuation and len(fluctuation[lifeT]) > 1:
            flux.append(sum(fluctuation[lifeT]) / len(fluctuation[lifeT]))
        else:
            flux.append(0)

    '''Constructing community size heatmap data'''
    commSizeHeatData = np.zeros([numTopComms, timeslots])
    for rCIdx, comms in enumerate(rankedCommunities[0:numTopComms]):
        for sizeIdx, timesteps in enumerate(uniCommIdsEvol[comms][0]):
            if commSizeHeatData[rCIdx, timesteps] != 0:
                commSizeHeatData[rCIdx, timesteps] = max(np.log(uniCommIdsEvol[comms][2][sizeIdx]), commSizeHeatData[rCIdx, timesteps])
            else:
                commSizeHeatData[rCIdx, timesteps] = np.log(uniCommIdsEvol[comms][2][sizeIdx])
    normedHeatdata = commSizeHeatData / commSizeHeatData.max()

    '''Writing ranked communities to json files + MongoDB'''
    dataset_name = self.dataset_path.split('/')
    dataset_name = dataset_name[-1]
    # Mongo--------------------
    # client = MongoClient()
    # db = client[dataset_name]
    # dyccos = db.dyccos
    # -------------------------
    rankedCommunitiesFinal = {}
    twitterDataFile = open(self.dataset_path + '/data/results/rankedCommunities.json', "w")  # , encoding="utf-8-sig")
    jsondata = dict()
    jsondata["ranked_communities"] = []

    '''Create corpus and stopwords'''
    # stop = stopwords.words('english')
    stop = []
    # grstopwords = pickle.load(open("./greek_stopwords.pck", 'rb'))
    # stop.extend(grstopwords)
    definiteStop = ['gt', 'amp', 'rt', 'via']
    stop.extend(definiteStop)
    if not os.path.exists(self.dataset_path + "/data/tmp/datasetCorpus.pck"):
        idf = self.corpusExtraction(rankedCommunities[:numTopComms])
    else:
        idf = pickle.load(open(self.dataset_path + "/data/tmp/datasetCorpus.pck", 'rb'))
        print('loaded corpus from file')
    # -------------------------
    regex1 = re.compile("(?:\@|#|https?\://)\S+", re.UNICODE)
    regex2 = re.compile("\w+'?\w", re.UNICODE)
    width, height = 400, 200
    blank_image = Image.new("RGB", (timeslots * width, (numTopComms * 2 + 2) * height), (255, 255, 255))  # blank canvas for the collage
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2], width=width, height=height)
        blank_image.paste(timeimage, (tmptime * width, height))
    for rank, rcomms in enumerate(rankedCommunities[:numTopComms]):
        tmslUsrs, tmpTags, tmptweetids, commTwText, tmpUrls, topic, tmpkeywrds = [], [], [], [], [], [], []
        strRank = '{0}'.format(str(rank).zfill(2))
        rankedCommunitiesFinal[strRank] = [rcomms]
        rankedCommunitiesFinal[strRank].append(commRanking[rcomms])
        rankedCommunitiesFinal[strRank].append(uniCommIdsEvol[rcomms][3])
        timeSlotApp = [self.timeLimit[x] for x in uniCommIdsEvol[rcomms][0]]
        '''make and save wordclouds'''
        if not os.path.exists(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '/' + str(rank)):
            os.makedirs(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '/' + str(rank))
        for tmsl, users in enumerate(uniCommIdsEvol[rcomms][3]):
            uscentr, tmptweetText = [], []
            for us in users:
                uscentr.append([us, self.userPgRnkBag[uniCommIdsEvol[rcomms][0][tmsl]][us]])
                # uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                if us in self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpTags.extend(self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpUrls.append(self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetids.extend(self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetText.extend(self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
            uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
            tmslUsrs.append({str(uniCommIdsEvol[rcomms][0][tmsl]): uscentr})
            tmptweetText = [i.replace("\n", "").replace('\t', ' ') for i in tmptweetText]
            seen = set()
            seen_add = seen.add
            tmptweetText2 = [x for x in tmptweetText if x not in seen and not seen_add(x)]
            commTwText.append({timeSlotApp[tmsl]: tmptweetText2})
            # topic extraction
            topicList = " ".join(tmptweetText2)
            topicList = topicList.lower()
            topicList = regex1.sub('', topicList)
            topicList = regex2.findall(topicList)
            s = StopWords()
            s.load_language(s.guess(topicList))
            topicList = collections.Counter(topicList)
            tmpkeys = topicList.keys()
            if len(topicList) > 5:
                for i in list(tmpkeys):
                    if not i or i in stop or i.startswith(('htt', '(@', 't.co')) or len(i) <= 2 or s.is_stop_word(i):
                        del topicList[i]
            else:
                for i in list(tmpkeys):
                    if i in definiteStop or not i:
                        del topicList[i]
            timeSlLen = len(uniCommIdsEvol[Id][0])  # note: Id is left over from the ranking loop above; this value is not used below
            tmpTopic = tfidf.comm_tfidf(topicList, idf, 10)
            topic.append({timeSlotApp[tmsl]: tmpTopic})
            # tmpTopic = [x[0] for x in tmpTopic]
            '''wordcloud image'''
            popkeys = [x[0] for x in tmpTopic]
            popvals = [x[1] for x in tmpTopic]
            if len(popvals) < 2:
                try:
                    if popvals[0] < 1:
                        popvals[0] = 1
                except:
                    pass
            '''Create intermediate image'''
            position = (rank + 1) * 2
            backgroundcolor = int((1 - (normedHeatdata[rank, uniCommIdsEvol[rcomms][0][tmsl]])) * 255)
            locimage = make_wordcloud(popkeys, popvals, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '/' + timeSlotApp[tmsl] + '.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, position * height))
            popusers = [x[0] for x in uscentr[:10]]
            popcentr = [x[1] * 100 for x in uscentr[:10]]
            locimage = make_wordcloud(popusers, popcentr, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '/' + timeSlotApp[tmsl] + 'usrs.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, (position + 1) * height))
            # tmpkeywrds.extend(tmpTopic)
        if tmpTags:
            popTags = [x.lower() for x in list(itertools.chain.from_iterable(tmpTags))]
            popTags = collections.Counter(popTags)
            popTags = popTags.most_common(10)
        else:
            popTags = []
        if tmpUrls:
            if tmpUrls[0]:
                tmpUrls = [x.lower() for x in list(itertools.chain.from_iterable(tmpUrls)) if x]
                popUrls = collections.Counter(tmpUrls)
                popUrls = popUrls.most_common(10)
            else:
                popUrls = []
        else:
            popUrls = []
        commTweetIds = list(set(tmptweetids))
        # popKeywords = collections.Counter(tmpkeywrds)
        # popKeywords = popKeywords.most_common(10)
        # popkeys = [x[0] for x in popKeywords]
        # popvals = [x[1] for x in popKeywords]
        # make_wordcloud(popkeys, popvals, self.dataset_path + '/data/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '.pdf')
        dycco = {'community label': rcomms, 'rank': rank, 'timeslot appearance': timeSlotApp,  # 'text': commTwText,
                 'persistence:': tempcommRanking[rcomms][0], 'total score': commRanking[rcomms], 'topic': topic,
                 'stability': tempcommRanking[rcomms][1], 'community centrality': tempcommRanking[rcomms][2],
                 'community size per slot': uniCommIdsEvol[rcomms][2], 'users:centrality per timeslot': tmslUsrs,
                 'popTags': popTags, 'popUrls': popUrls}
        jsondycco = dycco.copy()
        # dyccos.insert(dycco)
        jsondata["ranked_communities"].append(jsondycco)
    twitterDataFile.write(json.dumps(jsondata, sort_keys=True))  # , ensure_ascii=False).replace("\u200f", "")
    twitterDataFile.close()
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2])
        blank_image.paste(timeimage, (tmptime * width, (position + 2) * height))
    imsize = blank_image.size
    blank_image = blank_image.resize((round(imsize[0] / 2), round(imsize[1] / 2)), Image.ANTIALIAS)
    blank_image.save(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '_collage.pdf', quality=50)
    makefigures(commSizeHeatData, flux, self.fileTitle, self.day_month, commRanking, numTopComms, timeslots,
                uniCommIdsEvol, rankedCommunities, self.commPerTmslt, self.uniCommIds, prevTimeslots,
                self.dataset_path, self.xLablNum)
    return rankedCommunitiesFinal
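# One plausible shape for the local tfidf.comm_tfidf helper, inferred only
# from its call site above (a Counter of terms, an idf mapping, and a top-k
# limit in; (term, score) pairs out). The body is a hypothetical sketch,
# not the project's actual code.
def comm_tfidf(term_counts, idf, k):
    total = sum(term_counts.values()) or 1
    scored = {t: (n / total) * idf.get(t, 1.0) for t, n in term_counts.items()}
    return sorted(scored.items(), key=lambda kv: kv[1], reverse=True)[:k]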