# Python 2 variant. re, itemgetter and StopWords below are shared by all of
# the get_tag_counts variants in this section.
import re
from operator import itemgetter

from pytagcloud.lang.stopwords import StopWords


def get_tag_counts(text1, text2):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words1 = map(lambda x: x.lower(), re.findall(r'\w+', text1, re.UNICODE))
    words2 = map(lambda x: x.lower(), re.findall(r'\w+', text2, re.UNICODE))
    s = StopWords()
    s.load_language(s.guess(words1))
    s.load_language(s.guess(words2))
    counted = {}
    for word in words1:
        if not s.is_stop_word(word) and len(word) > 1:
            if counted.has_key(word):
                counted[word] += 1
            else:
                counted[word] = 1
    for word in words2:
        if not s.is_stop_word(word) and len(word) > 1:
            if counted.has_key(word):
                counted[word] -= 1
            else:
                counted[word] = -1
    return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)
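# A minimal usage sketch for the two-text variant above (Python 2); the
# sample strings and the printed result are hypothetical. Words from text1
# accumulate positive counts and words from text2 negative ones, so the
# sorted output contrasts the two texts.
diff = get_tag_counts(u"python draws tag clouds, clouds everywhere",
                      u"java also draws tag clouds")
print diff  # e.g. [(u'clouds', 1), (u'python', 1), ..., (u'java', -1)]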
# Python 2 variant that decodes UTF-8 input before tokenizing.
def get_tag_counts(text):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    text = unicode(text, 'UTF-8')
    words = map(lambda x: x.lower(), re.findall(r'\w+', text, re.UNICODE))
    # Workaround because the regex mangled the unicode
    # _PUNCTUATION = u'.,<>;:/?\\\"\'-_+=!@\#$%^&*()'
    # words = text.split()
    # words = map(lambda x: x.lower(), words)
    # words = map(lambda x: unicode(x, 'UTF-8'), words)
    # words = map(lambda x: x.translate(dict.fromkeys(map(ord, _PUNCTUATION))),
    #             words)
    s = StopWords()
    s.load_language(s.guess(words))
    counted = {}
    for word in words:
        if not s.is_stop_word(word) and len(word) > 1:
            if counted.has_key(word):
                counted[word] += 1
            else:
                counted[word] = 1
    if len(counted.keys()) == 0:
        return None
    else:
        return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)
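# Note on the unicode(text, 'UTF-8') call above (Python 2): this variant
# expects UTF-8-encoded byte strings. The sample below is hypothetical;
# passing an already-decoded unicode object would raise TypeError.
raw = 'caf\xc3\xa9 con leche'          # UTF-8 bytes for u'café con leche'
print get_tag_counts(raw)              # decodes first, then tokenizes
# get_tag_counts(u'café con leche')    # TypeError: decoding Unicode is not supported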
# Python 2 variant: defaultdict(int) replaces the has_key bookkeeping, and
# the token regex keeps apostrophes ("don't" stays one word).
from collections import defaultdict


def get_tag_counts(text):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x: x.lower(), re.findall(r"[\w']+", text, re.UNICODE))
    s = StopWords()
    s.load_language(s.guess(words))
    counted = defaultdict(int)
    for word in words:
        if not s.is_stop_word(word) and len(word) > 1:
            counted[word] += 1
    return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)
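# Design note: defaultdict(int) here does the same job as collections.Counter
# (missing keys start at zero). A small sketch on a hypothetical sentence;
# Counter shows the raw counts before the function's stop-word filtering.
from collections import Counter
sample = "tag clouds count tags and tag clouds draw tags"
print get_tag_counts(sample)[:2]  # e.g. [('tags', 2), ('tag', 2)]
print Counter(w.lower() for w in re.findall(r"[\w']+", sample)).most_common(2)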
# Python 3 port of the defaultdict variant.
def get_tag_counts(text):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x: x.lower(), re.findall(r"[\w']+", text, re.UNICODE))
    # words above is an iterator, which would be consumed by guess(words),
    # so convert it to a list, as it would have been in Python 2.
    words = [word for word in words]
    s = StopWords()
    s.load_language(s.guess(words))
    counted = defaultdict(int)
    for word in words:
        if not s.is_stop_word(word) and len(word) > 1:
            counted[word] += 1
    return sorted(counted.items(), key=itemgetter(1), reverse=True)
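# Why the list() conversion above matters in Python 3: map() returns a
# one-shot iterator, so guess(words) would exhaust it and the counting loop
# would then see no words at all. A minimal demonstration:
words = map(str.lower, ["Tag", "Cloud"])
print(sum(1 for _ in words))  # 2 -- the first pass consumes the iterator
print(sum(1 for _ in words))  # 0 -- nothing left for a second pass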
# Python 2. HEADER_DICT (the column layout of a tweet row) and get_words
# (the tokenizer) are defined elsewhere in the same script.
import re
import webbrowser
from collections import Counter

from pytagcloud import create_tag_image, make_tags
from pytagcloud.colors import COLOR_SCHEMES
from pytagcloud.lang.stopwords import StopWords


def word_frequency(tweets):
    c = Counter()
    hash_c = Counter()
    at_c = Counter()
    s = StopWords()
    s.load_language("english")
    for tweet in tweets:
        for word in get_words(tweet[HEADER_DICT['text']]):
            if not s.is_stop_word(word):
                if c.has_key(word):
                    c[word] += 1
                else:
                    c[word] = 1
        for word in re.findall('@\w+', tweet[HEADER_DICT['text']]):
            at_c[word.lower()] += 1
        for word in re.findall('\#[\d\w]+', tweet[HEADER_DICT['text']]):
            hash_c[word.lower()] += 1
    print c.most_common(50)
    print hash_c.most_common(50)
    print at_c.most_common(50)
    # Making word clouds for your most common words, most common @replies
    # and most common #hashtags.
    ctags = make_tags(c.most_common(100), maxsize=90,
                      colors=COLOR_SCHEMES['audacity'])
    create_tag_image(ctags, 'c_most_common.png', size=(900, 600),
                     fontname='Lobster')
    webbrowser.open('c_most_common.png')
    hash_ctags = make_tags(hash_c.most_common(100), maxsize=100,
                           colors=COLOR_SCHEMES['citrus'])
    create_tag_image(hash_ctags, 'hash_c_most_common.png', size=(900, 600),
                     fontname='Cuprum')
    webbrowser.open('hash_c_most_common.png')
    at_ctags = make_tags(at_c.most_common(100), maxsize=90)
    create_tag_image(at_ctags, 'at_c_most_common.png', size=(900, 600),
                     fontname='Yanone Kaffeesatz')
    webbrowser.open('at_c_most_common.png')
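# Hedged driver sketch for word_frequency (Python 2). HEADER_DICT and
# get_words live elsewhere in the original script; the stand-ins and the
# tweet rows below are hypothetical.
HEADER_DICT = {'text': 0}                            # assumed column layout
get_words = lambda t: re.findall(r'\w+', t.lower())  # stand-in tokenizer
tweets = [["Drawing #wordclouds with @pytagcloud today"],
          ["More #wordclouds tips via @pytagcloud"]]
word_frequency(tweets)  # prints top words, #hashtags and @mentions, then
                        # renders and opens the three tag-cloud PNGs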
def get_tag_counts(text):
    """
    Search tags in a given text. The language detection is based on
    stop lists. This implementation is inspired by
    https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x: x.lower(), re.findall(r'\w+', text, re.UNICODE))
    words = list(words)  # ** Added this
    s = StopWords()
    s.load_language(s.guess(words))
    counted = {}
    for word in words:
        print(word)
        if not s.is_stop_word(word) and len(word) > 1:
            if word in counted:  # ** if counted.has_key(word):
                counted[word] += 1
            else:
                counted[word] = 1
    # ** iteritems to items
    return sorted(counted.items(), key=itemgetter(1), reverse=True)
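# The has_key -> in rewrite above is the standard Python 3 change; the whole
# if/else can also collapse to dict.get, as this small sketch shows:
counted = {}
for word in ["tag", "cloud", "tag"]:
    counted[word] = counted.get(word, 0) + 1
print(counted)  # {'tag': 2, 'cloud': 1}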
# Python 3. np (numpy), product, re, os, pickle, collections, json,
# itemgetter and makefigures come from module-level imports in the
# original file.
def commRanking(self, numTopComms, prevTimeslots, xLablNum):
    import itertools, tfidf
    # from pymongo import MongoClient
    from pytagcloud.lang.stopwords import StopWords
    # from nltk.corpus import stopwords
    from wordcloud import make_wordcloud
    from PIL import Image

    '''Detect the evolving communities'''
    uniCommIdsEvol = self.uniCommIdsEvol
    timeslots = self.timeslots
    tempcommRanking = {}
    # structure: tempcommRanking={Id:[persistence,stability,commCentrality,degreeness]}
    commRanking, fluctuation, lifetime = {}, {}, 0
    for Id in self.uniCommIds:
        uniqueTimeSlLen = len(set(uniCommIdsEvol[Id][0]))
        timeSlLen = len(uniCommIdsEvol[Id][0])
        tempcommRanking[Id] = []
        tempcommRanking[Id].append(uniqueTimeSlLen / timeslots)  # persistence
        tempcommRanking[Id].append((sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1) + 1) / (timeslots + 1))  # stability
        tempcommRanking[Id].append(product([x + 1 for x in uniCommIdsEvol[Id][1]]) / uniqueTimeSlLen)  # commCentrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][4]) / timeslots)  # Degreeness
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][5])/timeSlLen)  # degree centrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][6])/timeSlLen)  # betweeness centrality
        # '''Checking Theseus Ship'''
        # theseus = 1 + len(list(set(uniCommIdsEvol[Id][3][0]) & set(uniCommIdsEvol[Id][3][-1]))) / len(set(np.append(uniCommIdsEvol[Id][3][0], uniCommIdsEvol[Id][3][-1])))
        # tempcommRanking[Id].append(theseus)
        commRanking[Id] = np.prod(tempcommRanking[Id])
        # Construct average jaccardian between timeslots for each dyn comm
        if timeSlLen not in fluctuation:
            fluctuation[timeSlLen] = [(sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1))]  # [1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)]
        else:
            fluctuation[timeSlLen].append((sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1)))  # 1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1))
        lifetime = max(lifetime, timeSlLen)

    '''All the communities ranked in order of importance'''
    rankedCommunities = sorted(commRanking, key=commRanking.get, reverse=True)
    if numTopComms > len(rankedCommunities):
        numTopComms = len(rankedCommunities)

    '''Jaccardian for lifespans which appear only once are discarded (outliers)'''
    flux = []
    for lifeT in range(lifetime + 1):
        if lifeT in fluctuation and len(fluctuation[lifeT]) > 1:
            flux.append(sum(fluctuation[lifeT]) / len(fluctuation[lifeT]))
        else:
            flux.append(0)

    '''Constructing community size heatmap data'''
    commSizeHeatData = np.zeros([numTopComms, timeslots])
    for rCIdx, comms in enumerate(rankedCommunities[0:numTopComms]):
        for sizeIdx, timesteps in enumerate(uniCommIdsEvol[comms][0]):
            if commSizeHeatData[rCIdx, timesteps] != 0:
                commSizeHeatData[rCIdx, timesteps] = max(np.log(uniCommIdsEvol[comms][2][sizeIdx]), commSizeHeatData[rCIdx, timesteps])
            else:
                commSizeHeatData[rCIdx, timesteps] = np.log(uniCommIdsEvol[comms][2][sizeIdx])
    normedHeatdata = commSizeHeatData / commSizeHeatData.max()

    '''Writing ranked communities to json files + MongoDB'''
    dataset_name = self.dataset_path.split('/')
    dataset_name = dataset_name[-1]
    # Mongo--------------------
    # client = MongoClient()
    # db = client[dataset_name]
    # dyccos = db.dyccos
    # -------------------------
    rankedCommunitiesFinal = {}
    twitterDataFile = open(self.dataset_path + '/data/nonadaptive/results/rankedCommunities.json', "w")  # , encoding="utf-8-sig")
    jsondata = dict()
    jsondata["ranked_communities"] = []

    '''Create corpus and stopwords'''
    # stop = stopwords.words('english')
    stop = []
    # grstopwords = pickle.load(open("./greek_stopwords.pck", 'rb'))
    # stop.extend(grstopwords)
    definiteStop = ['gt', 'amp', 'rt', 'via']
    stop.extend(definiteStop)
    if not os.path.exists(self.dataset_path + "/data/nonadaptive/tmp/datasetCorpus.pck"):
        idf = self.corpusExtraction(rankedCommunities[:numTopComms])
    else:
        idf = pickle.load(open(self.dataset_path + "/data/nonadaptive/tmp/datasetCorpus.pck", 'rb'))
        print('loaded corpus from file')
    # -------------------------
    regex1 = re.compile("(?:\@|#|https?\://)\S+", re.UNICODE)
    regex2 = re.compile("\w+'?\w", re.UNICODE)
    width, height = 400, 200
    blank_image = Image.new("RGB", (timeslots * width, (numTopComms * 2 + 2) * height), (255, 255, 255))  # blank canvas for the collage
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2], width=width, height=height)
        blank_image.paste(timeimage, (tmptime * width, height))
    for rank, rcomms in enumerate(rankedCommunities[:numTopComms]):
        tmslUsrs, tmpTags, tmptweetids, commTwText, tmpUrls, topic, tmpkeywrds = [], [], [], [], [], [], []
        strRank = '{0}'.format(str(rank).zfill(2))
        rankedCommunitiesFinal[strRank] = [rcomms]
        rankedCommunitiesFinal[strRank].append(commRanking[rcomms])
        rankedCommunitiesFinal[strRank].append(uniCommIdsEvol[rcomms][3])
        timeSlotApp = [self.timeLimit[x] for x in uniCommIdsEvol[rcomms][0]]
        '''make and save wordclouds'''
        if not os.path.exists(self.dataset_path + "/data/nonadaptive/results/wordclouds/" + self.fileTitle + '/' + str(rank)):
            os.makedirs(self.dataset_path + "/data/nonadaptive/results/wordclouds/" + self.fileTitle + '/' + str(rank))
        for tmsl, users in enumerate(uniCommIdsEvol[rcomms][3]):
            uscentr, tmptweetText = [], []
            for us in users:
                uscentr.append([us, self.userPgRnkBag[uniCommIdsEvol[rcomms][0][tmsl]][us]])
                # uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                if us in self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpTags.extend(self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpUrls.append(self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetids.extend(self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetText.extend(self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
            uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
            tmslUsrs.append({str(uniCommIdsEvol[rcomms][0][tmsl]): uscentr})
            tmptweetText = [i.replace("\n", "").replace('\t', ' ') for i in tmptweetText]
            seen = set()
            seen_add = seen.add
            tmptweetText2 = [x for x in tmptweetText if x not in seen and not seen_add(x)]
            commTwText.append({timeSlotApp[tmsl]: tmptweetText2})
            # topic extraction
            topicList = " ".join(tmptweetText2)
            topicList = topicList.lower()
            topicList = regex1.sub('', topicList)
            topicList = regex2.findall(topicList)
            s = StopWords()
            s.load_language(s.guess(topicList))
            topicList = collections.Counter(topicList)
            tmpkeys = topicList.keys()
            if len(topicList) > 5:
                for i in list(tmpkeys):
                    if not i or i in stop or i.startswith(('htt', '(@', 't.co')) or len(i) <= 2 or s.is_stop_word(i):
                        del topicList[i]
            else:
                for i in list(tmpkeys):
                    if i in definiteStop or not i:
                        del topicList[i]
            timeSlLen = len(uniCommIdsEvol[Id][0])  # note: Id is left over from the ranking loop above; this value is not used below
            tmpTopic = tfidf.comm_tfidf(topicList, idf, 10)
            topic.append({timeSlotApp[tmsl]: tmpTopic})
            # tmpTopic = [x[0] for x in tmpTopic]
            '''wordcloud image'''
            popkeys = [x[0] for x in tmpTopic]
            popvals = [x[1] for x in tmpTopic]
            if len(popvals) < 2:
                try:
                    if popvals[0] < 1:
                        popvals[0] = 1
                except:
                    pass
            '''Create intermediate image'''
            position = (rank + 1) * 2
            backgroundcolor = int((1 - (normedHeatdata[rank, uniCommIdsEvol[rcomms][0][tmsl]])) * 255)
            locimage = make_wordcloud(popkeys, popvals, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/nonadaptive/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '/' + timeSlotApp[tmsl] + '.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, position * height))
            popusers = [x[0] for x in uscentr[:10]]
            popcentr = [x[1] * 100 for x in uscentr[:10]]
            locimage = make_wordcloud(popusers, popcentr, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/nonadaptive/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '/' + timeSlotApp[tmsl] + 'usrs.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, (position + 1) * height))
            # tmpkeywrds.extend(tmpTopic)
        if tmpTags:
            popTags = [x.lower() for x in list(itertools.chain.from_iterable(tmpTags))]
            popTags = collections.Counter(popTags)
            popTags = popTags.most_common(10)
        else:
            popTags = []
        if tmpUrls:
            if tmpUrls[0]:
                tmpUrls = [x.lower() for x in list(itertools.chain.from_iterable(tmpUrls)) if x]
                popUrls = collections.Counter(tmpUrls)
                popUrls = popUrls.most_common(10)
            else:
                popUrls = []
        else:
            popUrls = []
        commTweetIds = list(set(tmptweetids))
        # popKeywords = collections.Counter(tmpkeywrds)
        # popKeywords = popKeywords.most_common(10)
        # popkeys = [x[0] for x in popKeywords]
        # popvals = [x[1] for x in popKeywords]
        # make_wordcloud(popkeys, popvals, self.dataset_path + '/data/nonadaptive/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '.pdf')
        dycco = {'community label': rcomms, 'rank': rank, 'timeslot appearance': timeSlotApp,  # 'text': commTwText,
                 'persistence:': tempcommRanking[rcomms][0], 'total score': commRanking[rcomms], 'topic': topic,
                 'stability': tempcommRanking[rcomms][1], 'community centrality': tempcommRanking[rcomms][2],
                 'community size per slot': uniCommIdsEvol[rcomms][2], 'users:centrality per timeslot': tmslUsrs,
                 'popTags': popTags, 'popUrls': popUrls}
        jsondycco = dycco.copy()
        # dyccos.insert(dycco)
        jsondata["ranked_communities"].append(jsondycco)
    twitterDataFile.write(json.dumps(jsondata, sort_keys=True))  # , ensure_ascii=False).replace("\u200f", "")
    twitterDataFile.close()
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2])
        blank_image.paste(timeimage, (tmptime * width, (position + 2) * height))
    imsize = blank_image.size
    blank_image = blank_image.resize((round(imsize[0] / 2), round(imsize[1] / 2)), Image.ANTIALIAS)
    blank_image.save(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '_collage.pdf', quality=50)
    makefigures(commSizeHeatData, flux, self.fileTitle, self.day_month, commRanking, numTopComms, timeslots,
                uniCommIdsEvol, rankedCommunities, self.commPerTmslt, self.uniCommIds, prevTimeslots,
                self.dataset_path, self.xLablNum)
    return rankedCommunitiesFinal
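# Minimal sketch of the per-community score that the ranking loop above
# multiplies together (persistence * stability * community centrality).
# comm_score and the sample values are hypothetical; np.prod stands in for
# the module-level product().
import numpy as np

def comm_score(appearances, centralities, timeslots):
    uniq = len(set(appearances))
    persistence = uniq / timeslots
    stability = (np.sum(np.diff(sorted(set(appearances))) == 1) + 1) / (timeslots + 1)
    centrality = np.prod([c + 1 for c in centralities]) / uniq
    return persistence * stability * centrality

print(comm_score([0, 1, 3], [0.2, 0.5, 0.1], timeslots=5))  # ~0.13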
# Variant of commRanking that reads and writes the plain /data/... paths
# instead of /data/nonadaptive/...; otherwise identical to the version above.
def commRanking(self, numTopComms, prevTimeslots, xLablNum):
    import itertools, tfidf
    # from pymongo import MongoClient
    from pytagcloud.lang.stopwords import StopWords
    # from nltk.corpus import stopwords
    from wordcloud import make_wordcloud
    from PIL import Image

    '''Detect the evolving communities'''
    uniCommIdsEvol = self.uniCommIdsEvol
    timeslots = self.timeslots
    tempcommRanking = {}
    # structure: tempcommRanking={Id:[persistence,stability,commCentrality,degreeness]}
    commRanking, fluctuation, lifetime = {}, {}, 0
    for Id in self.uniCommIds:
        uniqueTimeSlLen = len(set(uniCommIdsEvol[Id][0]))
        timeSlLen = len(uniCommIdsEvol[Id][0])
        tempcommRanking[Id] = []
        tempcommRanking[Id].append(uniqueTimeSlLen / timeslots)  # persistence
        tempcommRanking[Id].append((sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1) + 1) / (timeslots + 1))  # stability
        tempcommRanking[Id].append(product([x + 1 for x in uniCommIdsEvol[Id][1]]) / uniqueTimeSlLen)  # commCentrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][4]) / timeslots)  # Degreeness
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][5])/timeSlLen)  # degree centrality
        # tempcommRanking[Id].append(sum(uniCommIdsEvol[Id][6])/timeSlLen)  # betweeness centrality
        # '''Checking Theseus Ship'''
        # theseus = 1 + len(list(set(uniCommIdsEvol[Id][3][0]) & set(uniCommIdsEvol[Id][3][-1]))) / len(set(np.append(uniCommIdsEvol[Id][3][0], uniCommIdsEvol[Id][3][-1])))
        # tempcommRanking[Id].append(theseus)
        commRanking[Id] = np.prod(tempcommRanking[Id])
        # Construct average jaccardian between timeslots for each dyn comm
        if timeSlLen not in fluctuation:
            fluctuation[timeSlLen] = [(sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1))]  # [1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1)]
        else:
            fluctuation[timeSlLen].append((sum(uniCommIdsEvol[Id][7]) / (timeSlLen - 1)))  # 1-sum(np.diff(list(set(uniCommIdsEvol[Id][0]))) == 1)/(lifetime-1))
        lifetime = max(lifetime, timeSlLen)

    '''All the communities ranked in order of importance'''
    rankedCommunities = sorted(commRanking, key=commRanking.get, reverse=True)
    if numTopComms > len(rankedCommunities):
        numTopComms = len(rankedCommunities)

    '''Jaccardian for lifespans which appear only once are discarded (outliers)'''
    flux = []
    for lifeT in range(lifetime + 1):
        if lifeT in fluctuation and len(fluctuation[lifeT]) > 1:
            flux.append(sum(fluctuation[lifeT]) / len(fluctuation[lifeT]))
        else:
            flux.append(0)

    '''Constructing community size heatmap data'''
    commSizeHeatData = np.zeros([numTopComms, timeslots])
    for rCIdx, comms in enumerate(rankedCommunities[0:numTopComms]):
        for sizeIdx, timesteps in enumerate(uniCommIdsEvol[comms][0]):
            if commSizeHeatData[rCIdx, timesteps] != 0:
                commSizeHeatData[rCIdx, timesteps] = max(np.log(uniCommIdsEvol[comms][2][sizeIdx]), commSizeHeatData[rCIdx, timesteps])
            else:
                commSizeHeatData[rCIdx, timesteps] = np.log(uniCommIdsEvol[comms][2][sizeIdx])
    normedHeatdata = commSizeHeatData / commSizeHeatData.max()

    '''Writing ranked communities to json files + MongoDB'''
    dataset_name = self.dataset_path.split('/')
    dataset_name = dataset_name[-1]
    # Mongo--------------------
    # client = MongoClient()
    # db = client[dataset_name]
    # dyccos = db.dyccos
    # -------------------------
    rankedCommunitiesFinal = {}
    twitterDataFile = open(self.dataset_path + '/data/results/rankedCommunities.json', "w")  # , encoding="utf-8-sig")
    jsondata = dict()
    jsondata["ranked_communities"] = []

    '''Create corpus and stopwords'''
    # stop = stopwords.words('english')
    stop = []
    # grstopwords = pickle.load(open("./greek_stopwords.pck", 'rb'))
    # stop.extend(grstopwords)
    definiteStop = ['gt', 'amp', 'rt', 'via']
    stop.extend(definiteStop)
    if not os.path.exists(self.dataset_path + "/data/tmp/datasetCorpus.pck"):
        idf = self.corpusExtraction(rankedCommunities[:numTopComms])
    else:
        idf = pickle.load(open(self.dataset_path + "/data/tmp/datasetCorpus.pck", 'rb'))
        print('loaded corpus from file')
    # -------------------------
    regex1 = re.compile("(?:\@|#|https?\://)\S+", re.UNICODE)
    regex2 = re.compile("\w+'?\w", re.UNICODE)
    width, height = 400, 200
    blank_image = Image.new("RGB", (timeslots * width, (numTopComms * 2 + 2) * height), (255, 255, 255))  # blank canvas for the collage
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2], width=width, height=height)
        blank_image.paste(timeimage, (tmptime * width, height))
    for rank, rcomms in enumerate(rankedCommunities[:numTopComms]):
        tmslUsrs, tmpTags, tmptweetids, commTwText, tmpUrls, topic, tmpkeywrds = [], [], [], [], [], [], []
        strRank = '{0}'.format(str(rank).zfill(2))
        rankedCommunitiesFinal[strRank] = [rcomms]
        rankedCommunitiesFinal[strRank].append(commRanking[rcomms])
        rankedCommunitiesFinal[strRank].append(uniCommIdsEvol[rcomms][3])
        timeSlotApp = [self.timeLimit[x] for x in uniCommIdsEvol[rcomms][0]]
        '''make and save wordclouds'''
        if not os.path.exists(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '/' + str(rank)):
            os.makedirs(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '/' + str(rank))
        for tmsl, users in enumerate(uniCommIdsEvol[rcomms][3]):
            uscentr, tmptweetText = [], []
            for us in users:
                uscentr.append([us, self.userPgRnkBag[uniCommIdsEvol[rcomms][0][tmsl]][us]])
                # uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
                if us in self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpTags.extend(self.tagBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmpUrls.append(self.urlBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetids.extend(self.tweetIdBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
                if us in self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]]:
                    tmptweetText.extend(self.tweetTextBag[uniCommIdsEvol[rcomms][0][tmsl]][us])
            uscentr = sorted(uscentr, key=itemgetter(1), reverse=True)
            tmslUsrs.append({str(uniCommIdsEvol[rcomms][0][tmsl]): uscentr})
            tmptweetText = [i.replace("\n", "").replace('\t', ' ') for i in tmptweetText]
            seen = set()
            seen_add = seen.add
            tmptweetText2 = [x for x in tmptweetText if x not in seen and not seen_add(x)]
            commTwText.append({timeSlotApp[tmsl]: tmptweetText2})
            # topic extraction
            topicList = " ".join(tmptweetText2)
            topicList = topicList.lower()
            topicList = regex1.sub('', topicList)
            topicList = regex2.findall(topicList)
            s = StopWords()
            s.load_language(s.guess(topicList))
            topicList = collections.Counter(topicList)
            tmpkeys = topicList.keys()
            if len(topicList) > 5:
                for i in list(tmpkeys):
                    if not i or i in stop or i.startswith(('htt', '(@', 't.co')) or len(i) <= 2 or s.is_stop_word(i):
                        del topicList[i]
            else:
                for i in list(tmpkeys):
                    if i in definiteStop or not i:
                        del topicList[i]
            timeSlLen = len(uniCommIdsEvol[Id][0])  # note: Id is left over from the ranking loop above; this value is not used below
            tmpTopic = tfidf.comm_tfidf(topicList, idf, 10)
            topic.append({timeSlotApp[tmsl]: tmpTopic})
            # tmpTopic = [x[0] for x in tmpTopic]
            '''wordcloud image'''
            popkeys = [x[0] for x in tmpTopic]
            popvals = [x[1] for x in tmpTopic]
            if len(popvals) < 2:
                try:
                    if popvals[0] < 1:
                        popvals[0] = 1
                except:
                    pass
            '''Create intermediate image'''
            position = (rank + 1) * 2
            backgroundcolor = int((1 - (normedHeatdata[rank, uniCommIdsEvol[rcomms][0][tmsl]])) * 255)
            locimage = make_wordcloud(popkeys, popvals, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '/' + timeSlotApp[tmsl] + '.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, position * height))
            popusers = [x[0] for x in uscentr[:10]]
            popcentr = [x[1] * 100 for x in uscentr[:10]]
            locimage = make_wordcloud(popusers, popcentr, width=width, height=height, backgroundweight=backgroundcolor)
            # , fname=self.dataset_path + '/data/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '/' + timeSlotApp[tmsl] + 'usrs.pdf'
            blank_image.paste(locimage, (uniCommIdsEvol[rcomms][0][tmsl] * width, (position + 1) * height))
            # tmpkeywrds.extend(tmpTopic)
        if tmpTags:
            popTags = [x.lower() for x in list(itertools.chain.from_iterable(tmpTags))]
            popTags = collections.Counter(popTags)
            popTags = popTags.most_common(10)
        else:
            popTags = []
        if tmpUrls:
            if tmpUrls[0]:
                tmpUrls = [x.lower() for x in list(itertools.chain.from_iterable(tmpUrls)) if x]
                popUrls = collections.Counter(tmpUrls)
                popUrls = popUrls.most_common(10)
            else:
                popUrls = []
        else:
            popUrls = []
        commTweetIds = list(set(tmptweetids))
        # popKeywords = collections.Counter(tmpkeywrds)
        # popKeywords = popKeywords.most_common(10)
        # popkeys = [x[0] for x in popKeywords]
        # popvals = [x[1] for x in popKeywords]
        # make_wordcloud(popkeys, popvals, self.dataset_path + '/data/results/wordclouds/' + self.fileTitle + '/' + str(rank) + '.pdf')
        dycco = {'community label': rcomms, 'rank': rank, 'timeslot appearance': timeSlotApp,  # 'text': commTwText,
                 'persistence:': tempcommRanking[rcomms][0], 'total score': commRanking[rcomms], 'topic': topic,
                 'stability': tempcommRanking[rcomms][1], 'community centrality': tempcommRanking[rcomms][2],
                 'community size per slot': uniCommIdsEvol[rcomms][2], 'users:centrality per timeslot': tmslUsrs,
                 'popTags': popTags, 'popUrls': popUrls}
        jsondycco = dycco.copy()
        # dyccos.insert(dycco)
        jsondata["ranked_communities"].append(jsondycco)
    twitterDataFile.write(json.dumps(jsondata, sort_keys=True))  # , ensure_ascii=False).replace("\u200f", "")
    twitterDataFile.close()
    for tmptime in range(timeslots):
        timeimage = make_wordcloud([self.timeLimit[tmptime], 'the date'], [10, 2])
        blank_image.paste(timeimage, (tmptime * width, (position + 2) * height))
    imsize = blank_image.size
    blank_image = blank_image.resize((round(imsize[0] / 2), round(imsize[1] / 2)), Image.ANTIALIAS)
    blank_image.save(self.dataset_path + "/data/results/wordclouds/" + self.fileTitle + '_collage.pdf', quality=50)
    makefigures(commSizeHeatData, flux, self.fileTitle, self.day_month, commRanking, numTopComms, timeslots,
                uniCommIdsEvol, rankedCommunities, self.commPerTmslt, self.uniCommIds, prevTimeslots,
                self.dataset_path, self.xLablNum)
    return rankedCommunitiesFinal
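# One plausible shape for the local tfidf.comm_tfidf helper, inferred only
# from its call site above (a Counter of terms, an idf mapping, and a top-k
# limit in; (term, score) pairs out). The body is a hypothetical sketch,
# not the project's actual code.
def comm_tfidf(term_counts, idf, k):
    total = sum(term_counts.values()) or 1
    scored = {t: (n / total) * idf.get(t, 1.0) for t, n in term_counts.items()}
    return sorted(scored.items(), key=lambda kv: kv[1], reverse=True)[:k]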