Code example #1
File: LDASampler.py  Project: ChangUk/TempRepo
 def __init__(self, ntopics, userlist = None):
     self.TOPICS = ntopics                                       # NUMBER OF TOPICS
     self.documents = {}                                         # TRAINING DATA: {DocID: [WordID1, WordID1, WordID2, ...]}
     self.indD = {}                                              # MAP DOCUMENT INTO INDEX: self.indD = {DocID: INDEX}
     self.indV = {}                                              # MAP WORD INTO INDEX: self.indV = {VocabID: INDEX}
     self.DOCS = 0                                               # NUMBER OF DOCUMENTS
     self.VOCABS = 0                                             # NUMBER OF VOCABULARIES
     self.alpha = np.ones(self.TOPICS)
     for i in range(self.TOPICS):
         self.alpha[i] *= 0.01                                   # np.random.gamma(0.1, 1)
     self.beta = 0.01                                            # np.random.gamma(0.1, 1)
     
     # DBAdapter
     self.dbAdapter = DBAdapter("/home/changuk/data/CNN/TwitterData.sqlite")
     
     # Load data from SQLite database
     if userlist is None:
         self.loadData()
     else:
         self.loadData(userlist)
     
     for doc in self.documents:
         random.shuffle(self.documents[doc])                     # SHUFFLE WORDS IN EACH DOCUMENT
     self.theta = np.zeros((self.DOCS, self.TOPICS))             # SPACE FOR THETA MATRIX WITH 0s
     self.phi = np.zeros((self.TOPICS, self.VOCABS))             # SPACE FOR PHI MATRIX WITH 0s
Code example #2
File: experiment.py  Project: ChangUk/TempRepo
 def __init__(self):
     self.dbAdapter = DBAdapter(StaticVariable.ROOTPATH + "TwitterData.sqlite")
     self.tool = MyTools()
     
     # Get valid user list
     self.validUsers = self.dbAdapter.getValidUserList()
     print("Valid users: " + str(len(self.validUsers)))
     
     # Add index on valid users
     self.indU = {}      # UserID : index
     USERS = 0
     for user in self.validUsers:
         self.indU[user] = USERS
         USERS += 1
         
     self.getSeedUsers()
     self.getFriendList()
     self.filterSmallEgoNetwork()
     
     self.getLikeVectors()
     self.getAuthorshipOnLikedTweets()
     self.getLikeCount()
     self.getMentionCount()
     self.getMutualFriendsCount()
Code example #3
from TradeEvaluator import TradeEvaluator
from DBAdapter import DBAdapter
from datetime import datetime, timedelta
import time
import sys
from Utils import *
from ConsultarMarquetBalance_sql import ConultarOnline
from threading import *
import os

path = os.getcwd()
ROOT_path = os.sep.join(path.split(os.sep)[:-2])
sdbInstance = 'sqlite:///' + ROOT_path + r'\BBDD\krakenTrader.db'
DBA = DBAdapter(sdbInstance)

def EnviromentSetup():
    '''Set up or create the elements the robot needs in order to run'''
    LogsFolder = ROOT_path + "/LOGs"
    SetupLogsFolder(LogsFolder)

def WorkerConsultarMarquet():
    ConultarOnline(DBA)

def WorkerTradeEvaluator():
    lock = Lock()   # Lock comes from 'from threading import *'; the bare 'threading' name is never imported
    espera = 60
    TE = TradeEvaluator()
    while True:
        while not lock.acquire(blocking=False):
            LogEvent('Could not acquire lock. Waiting 1s...')
            time.sleep(1)
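
The snippet above is cut off inside WorkerTradeEvaluator and never shows how the two workers are started. Below is a minimal sketch of one plausible launcher, assuming the rest of WorkerTradeEvaluator completes normally; this driver is not part of the original file:

if __name__ == '__main__':
    # Hypothetical entry point: run the market-query worker and the trade
    # evaluator in parallel threads (Thread comes from 'from threading import *').
    EnviromentSetup()
    t_market = Thread(target=WorkerConsultarMarquet)
    t_trader = Thread(target=WorkerTradeEvaluator)
    t_market.start()
    t_trader.start()
    t_market.join()
    t_trader.join()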
Code example #4
File: LDASampler.py  Project: ChangUk/TempRepo
class LDASampler(object):
    def __init__(self, ntopics, userlist = None):
        self.TOPICS = ntopics                                       # NUMBER OF TOPICS
        self.documents = {}                                         # TRAINING DATA: {DocID: [WordID1, WordID1, WordID2, ...]}
        self.indD = {}                                              # MAP DOCUMENT INTO INDEX: self.indD = {DocID: INDEX}
        self.indV = {}                                              # MAP WORD INTO INDEX: self.indV = {VocabID: INDEX}
        self.DOCS = 0                                               # NUMBER OF DOCUMENTS
        self.VOCABS = 0                                             # NUMBER OF VOCABULARIES
        self.alpha = np.ones(self.TOPICS)
        for i in range(self.TOPICS):
            self.alpha[i] *= 0.01                                   # np.random.gamma(0.1, 1)
        self.beta = 0.01                                            # np.random.gamma(0.1, 1)
        
        # DBAdapter
        self.dbAdapter = DBAdapter("/home/changuk/data/CNN/TwitterData.sqlite")
        
        # Load data from SQLite database
        if userlist is None:
            self.loadData()
        else:
            self.loadData(userlist)
        
        for doc in self.documents:
            random.shuffle(self.documents[doc])                     # SHUFFLE WORDS IN EACH DOCUMENT
        self.theta = np.zeros((self.DOCS, self.TOPICS))             # SPACE FOR THETA MATRIX WITH 0s
        self.phi = np.zeros((self.TOPICS, self.VOCABS))             # SPACE FOR PHI MATRIX WITH 0s
        
    def loadData(self, userlist = None):
        # Load user list of the ego network
        if userlist is None:
            self.completeUsers = self.dbAdapter.getCompleteUserList()
            users = self.completeUsers
        else:
            users = userlist
        
        # Load tweets for each user
        cnt = 0
        for user in users:
            print("Load users... " + str(cnt) + "/" + str(len(users)))
            cnt += 1
            
            if user not in self.documents:
                self.documents[user] = []
                self.indD[user] = self.DOCS
                self.DOCS += 1
            
            try:
                cursor = self.dbAdapter.getConnection().cursor()
                cursor.execute("SELECT text FROM tweet WHERE author = ? AND isMention = 0", (user,))
            except:
                continue
            
            while True:
                try:
                    tweet = cursor.fetchone()
                    if not tweet:
                        break
                except:
                    continue
                
                texts = nltk.word_tokenize(tweet[0])
                tokens = nltk.pos_tag(texts)
                for token in tokens:
                    if token[1].startswith("N"):
                        word = WordNetLemmatizer().lemmatize(token[0], 'n')
                    elif token[1].startswith("V"):
                        word = WordNetLemmatizer().lemmatize(token[0], 'v')
                    elif token[1].startswith("J"):
                        word = WordNetLemmatizer().lemmatize(token[0], 'a')
                    else:
                        continue
                    
                    self.documents[user].append(word)
                    if word not in self.indV:
                        self.indV[word] = self.VOCABS
                        self.VOCABS += 1
                        
            cursor.close()
        
    def assignTopics(self, doc, word, pos):                         # DRAW TOPIC SAMPLE FROM FULL-CONDITIONAL DISTRIBUTION
        d = self.indD[doc]
        w = self.indV[word]
        z = self.topicAssignments[d][pos]                           # TOPIC ASSIGNMENT OF WORDS FOR EACH DOCUMENT
        self.cntTW[z, w] -= 1
        self.cntDT[d, z] -= 1
        self.cntT[z] -= 1
        self.lenD[d] -= 1
        
        # FULL-CONDITIONAL DISTRIBUTION
        prL = (self.cntDT[d] + self.alpha) / (self.lenD[d] + np.sum(self.alpha))
        prR = (self.cntTW[:,w] + self.beta) / (self.cntT + self.beta * self.VOCABS)
        prFullCond = prL * prR                                      # FULL-CONDITIONAL DISTRIBUTION
        prFullCond /= np.sum(prFullCond)                            # TO OBTAIN PROBABILITY
        # NOTE: 'prFullCond' is MULTINOMIAL DISTRIBUTION WITH THE LENGTH, NUMBER OF TOPICS, NOT A VALUE
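        # The product above is the standard collapsed-Gibbs full conditional
        #   p(z_i = k | z_-i, w) ∝ (n_dk + alpha_k) * (n_kw + beta) / (n_k + V * beta)
        # where n_dk, n_kw and n_k are the decremented counts held in cntDT, cntTW and cntT.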
        new_z = np.random.multinomial(1, prFullCond).argmax()       # RANDOM SAMPLING FROM FULL-CONDITIONAL DISTRIBUTION
        self.topicAssignments[d][pos] = new_z
        self.cntTW[new_z, w] += 1
        self.cntDT[d, new_z] += 1
        self.cntT[new_z] += 1
        self.lenD[d] += 1
        
    def LogLikelihood(self):                                        # FIND (JOINT) LOG-LIKELIHOOD VALUE
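        # Collapsed Dirichlet-multinomial log-likelihood, matching the terms below (lnG = gammaln):
        #   log p(w|z,beta) = sum_k [ lnG(V*beta) - V*lnG(beta)
        #                             + sum_w lnG(n_kw + beta) - lnG(n_k + V*beta) ]
        #   log p(z|alpha)  = sum_d [ lnG(sum_k alpha_k) - sum_k lnG(alpha_k)
        #                             + sum_k lnG(n_dk + alpha_k) - lnG(n_d + sum_k alpha_k) ]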
        l = 0
        for z in range(self.TOPICS):                                # log p(w|z,\beta)
            l += gammaln(self.VOCABS * self.beta)
            l -= self.VOCABS * gammaln(self.beta)
            l += np.sum(gammaln(self.cntTW[z] + self.beta))
            l -= gammaln(np.sum(self.cntTW[z] + self.beta))
        for doc in self.documents:                                  # log p(z|\alpha)
            d = self.indD[doc]
            l += gammaln(np.sum(self.alpha))
            l -= np.sum(gammaln(self.alpha))
            l += np.sum(gammaln(self.cntDT[d] + self.alpha))
            l -= gammaln(np.sum(self.cntDT[d] + self.alpha))
        return l
        
    def findAlphaBeta(self):
        # ADJUST ALPHA AND BETA BY USING MINKA'S FIXED-POINT ITERATION
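        # Fixed-point updates in the form implemented below (psi = digamma):
        #   alpha_k <- alpha_k * sum_d [psi(n_dk + alpha_k) - psi(alpha_k)]
        #                      / sum_d [psi(n_d + sum_j alpha_j) - psi(sum_j alpha_j)]
        #   beta    <- beta * sum_{k,w} [psi(n_kw + beta) - psi(beta)]
        #                   / (V * sum_k [psi(n_k + V*beta) - psi(V*beta)])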
        numerator = 0
        denominator = 0
        for d in range(self.DOCS):
            numerator += psi(self.cntDT[d] + self.alpha) - psi(self.alpha)
            denominator += psi(np.sum(self.cntDT[d] + self.alpha)) - psi(np.sum(self.alpha))
        self.alpha *= numerator / denominator                                   # UPDATE ALPHA
        numerator = 0
        denominator = 0
        for z in range(self.TOPICS):
            numerator += np.sum(psi(self.cntTW[z] + self.beta) - psi(self.beta))
            denominator += psi(np.sum(self.cntTW[z] + self.beta)) - psi(self.VOCABS * self.beta)
        self.beta = (self.beta * numerator) / (self.VOCABS * denominator)       # UPDATE BETA
        
    def findThetaPhi(self):
        th = np.zeros((self.DOCS, self.TOPICS))                     # SPACE FOR THETA
        ph = np.zeros((self.TOPICS, self.VOCABS))                   # SPACE FOR PHI
        for d in range(self.DOCS):
            for z in range(self.TOPICS):
                th[d][z] = (self.cntDT[d][z] + self.alpha[z]) / (self.lenD[d] + np.sum(self.alpha))
        for z in range(self.TOPICS):
            for w in range(self.VOCABS):
                ph[z][w] = (self.cntTW[z][w] + self.beta) / (self.cntT[z] + self.beta * self.VOCABS)
        return ph, th
        
    def run(self, nsamples, burnin, interval):                   # GIBBS SAMPLER KERNEL
        if nsamples <= burnin:                                      # BURNIN CHECK
            print("ERROR: BURN-IN POINT EXCEEDS THE NUMBER OF SAMPLES")
            sys.exit(0)
        print("# of DOCS:", self.DOCS)                              # PRINT TRAINING DATA INFORMATION
        print("# of TOPICS:", self.TOPICS)
        print("# of VOCABS:", self.VOCABS)
        
        # MAKE SPACE FOR TOPIC-ASSIGNMENT MATRICES WITH 0s
        self.topicAssignments = {}                                  # {INDEX OF DOC: [TOPIC ASSIGNMENT]}
        for doc in self.documents:
            d = self.indD[doc]
            self.topicAssignments[d] = [0 for word in self.documents[doc]]
        self.cntTW = np.zeros((self.TOPICS, self.VOCABS))           # NUMBER OF TOPICS ASSIGNED TO A WORD
        self.cntDT = np.zeros((self.DOCS, self.TOPICS))             # NUMBER OF TOPICS ASSIGNED IN A DOCUMENT
        self.cntT = np.zeros(self.TOPICS)                           # ASSIGNMENT COUNT FOR EACH TOPIC
        self.lenD = np.zeros(self.DOCS)                             # ASSIGNMENT COUNT FOR EACH DOCUMENT = LENGTH OF DOCUMENT
        
        # RANDOMLY ASSIGN TOPIC TO EACH WORD
        for doc in self.documents:
            for i, word in enumerate(self.documents[doc]):
                d = self.indD[doc]
                w = self.indV[word]
                rt = random.randint(0, self.TOPICS-1)               # RANDOM TOPIC ASSIGNMENT
                self.topicAssignments[d][i] = rt                    # RANDOMLY ASSIGN TOPIC TO EACH WORD
                self.cntTW[rt, w] += 1
                self.cntDT[d, rt] += 1
                self.cntT[rt] += 1
                self.lenD[d] += 1
                
        # COLLAPSED GIBBS SAMPLING
        print("INITIAL STATE")
        print("\tLikelihood:", self.LogLikelihood())               # FIND (JOINT) LOG-LIKELIHOOD
#         print("\tAlpha:", end="")
#         for i in range(self.TOPICS):
#             print(" %.5f" % self.alpha[i], end="")
#         print("\n\tBeta: %.5f" % self.beta)
        SAMPLES = 0
        for s in range(nsamples):
            for doc in self.documents:
                for i, word in enumerate(self.documents[doc]):
                    self.assignTopics(doc, word, i)                 # DRAW TOPIC SAMPLE FROM FULL-CONDITIONAL DISTRIBUTION
            self.findAlphaBeta()                                    # UPDATE ALPHA AND BETA VALUES
            lik = self.LogLikelihood()
            print("SAMPLE #" + str(s))
            print("\tLikelihood:", lik)
#             print("\tAlpha:", end="")
#             for i in range(self.TOPICS):
#                 print(" %.5f" % self.alpha[i], end="")
#             print("\n\tBeta: %.5f" % self.beta)
            if s > burnin and s % interval == 0:                    # FIND PHI AND THETA AFTER BURN-IN POINT
                ph, th = self.findThetaPhi()
                self.theta += th
                self.phi += ph
                SAMPLES += 1
        self.theta /= SAMPLES                                       # AVERAGING GIBBS SAMPLES OF THETA
        self.phi /= SAMPLES                                         # AVERAGING GIBBS SAMPLES OF PHI
        return lik
    
    def getTopicVectors(self):
        topicVectors = {}
        for user in self.indD.keys():
            pos = self.indD[user]
            topicVectors[user] = self.theta[pos].tolist()
        return topicVectors
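
A minimal sketch of how this sampler might be driven end to end, assuming the imports the class relies on (numpy as np, random, sys, nltk with WordNetLemmatizer, gammaln/psi from scipy.special) and the DBAdapter database are available. The parameter values mirror the Gibbs settings used in experiment.py; the driver itself is hypothetical:

if __name__ == "__main__":
    # Hypothetical driver, not part of the original file.
    sampler = LDASampler(ntopics=30)                       # documents are loaded from SQLite in __init__
    loglik = sampler.run(nsamples=100, burnin=50, interval=2)
    print("Final log-likelihood:", loglik)
    topic_vectors = sampler.getTopicVectors()              # {DocID/UserID: averaged theta row}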
Code example #5
File: experiment.py  Project: ChangUk/TempRepo
class Experiment:
    def __init__(self):
        self.dbAdapter = DBAdapter(StaticVariable.ROOTPATH + "TwitterData.sqlite")
        self.tool = MyTools()
        
        # Get valid user list
        self.validUsers = self.dbAdapter.getValidUserList()
        print("Valid users: " + str(len(self.validUsers)))
        
        # Add index on valid users
        self.indU = {}      # UserID : index
        USERS = 0
        for user in self.validUsers:
            self.indU[user] = USERS
            USERS += 1
            
        self.getSeedUsers()
        self.getFriendList()
        self.filterSmallEgoNetwork()
        
        self.getLikeVectors()
        self.getAuthorshipOnLikedTweets()
        self.getLikeCount()
        self.getMentionCount()
        self.getMutualFriendsCount()
        
    def getSeedUsers(self):
        # Path to save dump file
        FILE_EGOUSERS       = StaticVariable.ROOTPATH + "egousers.pickle"
        
        # Get seed user list
        if os.path.exists(FILE_EGOUSERS):
            file_egousers = open(FILE_EGOUSERS, "rb")
            self.egousers = pickle.load(file_egousers)
            file_egousers.close()
        else:
            self.egousers = []                  # INTEGER ARRAY
            seedCandidates = self.dbAdapter.getSeedUserList()
            for seed in seedCandidates:
                if seed not in self.validUsers:
                    continue
                if seed not in self.egousers:
                    self.egousers.append(seed)
            file_egousers = open(FILE_EGOUSERS, "wb")
            pickle.dump(self.egousers, file_egousers)
            file_egousers.close()
            
    def getFriendList(self):
        # Path to save dump file
        FILE_FRIENDLIST     = StaticVariable.ROOTPATH + "friendlist.pickle"
        
        # Get FriendList
        print("Getting friend list...")
        if os.path.exists(FILE_FRIENDLIST):
            file_friendlist = open(FILE_FRIENDLIST, "rb")
            self.friendList = pickle.load(file_friendlist)
            file_friendlist.close()
        else:
            self.friendList = {}                # KEY: INTEGER / VALUES INTEGER ARRAY
            for validUser in self.validUsers:
                tmpFriendList = self.dbAdapter.getFriendship(validUser)
                validFriends = []
                for friend in tmpFriendList:
                    if friend in self.validUsers and friend not in validFriends:
                        validFriends.append(friend)
                self.friendList[validUser] = validFriends
            file_friendlist = open(FILE_FRIENDLIST, "wb")
            pickle.dump(self.friendList, file_friendlist)
            file_friendlist.close()
            
    def filterSmallEgoNetwork(self):
        # Filter small ego network
        print("Filtering ego users...")
        print("\t" + str(len(self.egousers)) + " users => ", end="")
        tmp = []
        for egouser in self.egousers:
            if len(self.friendList[egouser]) > 100:
                tmp.append(egouser)
        self.egousers = tmp
        print(str(len(self.egousers)) + " users")
        allUsers = {}
        for egouser in self.egousers:
            allUsers[egouser] = None
            for friend in self.friendList[egouser]:
                allUsers[friend] = None  
        print("\t" + str(len(allUsers)) + " users are included in " + str(len(self.egousers)) + " ego networks")
        print()
        
        # Average number of members for each ego network
        nFriends = 0
        for egouser in self.egousers:
            nFriends += len(self.friendList[egouser]) + 1
        print("Average number of network members: " + str(nFriends / len(self.egousers)))
    
    def getTopicVectors(self):
        # Path to save dump file
        FILE_TOPICVECTORS   = StaticVariable.ROOTPATH + "topic_vectors.pickle"
        
        # Gibbs sampling setting
        NTOPICS             = 30
        GIBBS_SAMPLES       = 100
        BURNIN_POINT        = 50
        SAMPLING_INTERVAL   = 2
        
        print("Getting topic vectors...")
        if os.path.exists(FILE_TOPICVECTORS):
            file_topic_vectors = open(FILE_TOPICVECTORS, "rb")
            self.topic_vectors = pickle.load(file_topic_vectors)
            file_topic_vectors.close()
        else:
            self.topic_vectors = {}
            userlist = []       # list, since users are appended below (dict() has no append)
            for egouser in self.egousers:
                if egouser not in userlist:
                    userlist.append(egouser)
                for friend in self.friendList[egouser]:
                    if friend not in userlist:
                        userlist.append(friend)
                        
            sampler = LDASampler(NTOPICS, userlist)
            sampler.run(GIBBS_SAMPLES, BURNIN_POINT, SAMPLING_INTERVAL)
            self.topic_vectors = sampler.getTopicVectors()
            
            file_topic_vectors = open(FILE_TOPICVECTORS, "wb")
            pickle.dump(self.topic_vectors, file_topic_vectors)
            file_topic_vectors.close()
        print("\tCalculated topic vectors of " + str(len(self.topic_vectors.keys())) + " users")
    
    def getLikeVectors(self):
        # Path to save dump file
        FILE_LIKEVECTORS = StaticVariable.ROOTPATH + "like_vectors.pickle"
        
        print("Getting like vectors...")
        if os.path.exists(FILE_LIKEVECTORS):
            file_like_vectors = open(FILE_LIKEVECTORS, "rb")
            self.like_vectors = pickle.load(file_like_vectors)
            file_like_vectors.close()
        else:
            self.like_vectors = {}      # {user: [tweet, ...], ...}
            userlist = []
            for egouser in self.egousers:
                if egouser not in userlist:
                    userlist.append(egouser)
                for friend in self.friendList[egouser]:
                    if friend not in userlist:
                        userlist.append(friend)
                        
            for user in userlist:
                self.like_vectors[user] = self.dbAdapter.getLikingTweets(user)
            
            file_like_vectors = open(FILE_LIKEVECTORS, "wb")
            pickle.dump(self.like_vectors, file_like_vectors)
            file_like_vectors.close()
        print("\tCalculated Like vectors of " + str(len(self.like_vectors.keys())) + " users")
        
    def getAuthorshipOnLikedTweets(self):
        # Path to save dump file
        FILE_AUTHORSHIP_ON_LIKEDTWEET = StaticVariable.ROOTPATH + "authorship_on_likedtweets.pickle"
        
        print("Getting authorship on liked tweets...")
        if os.path.exists(FILE_AUTHORSHIP_ON_LIKEDTWEET):
            file_authorship_on_likedtweets = open(FILE_AUTHORSHIP_ON_LIKEDTWEET, "rb")
            self.authorship_on_likedtweets = pickle.load(file_authorship_on_likedtweets)
            file_authorship_on_likedtweets.close()
        else:
            self.authorship_on_likedtweets = {}     # {egouser: {member: [tweet, ...], ...}, ...}
            for egouser in self.egousers:
                membersInEgoNetwork = {}
                membersInEgoNetwork[egouser] = []
                for friend in self.friendList[egouser]:
                    membersInEgoNetwork[friend] = []
                
                # Find liked tweets in ego network
                likedTweets = {}
                for member in membersInEgoNetwork:
                    for tweet in self.like_vectors[member]:
                        likedTweets[tweet] = None
                        
                # Find tweet list for each member of ego network
                tweetList = self.dbAdapter.getTweetListByAuthor(membersInEgoNetwork)
                
                for member in membersInEgoNetwork:
                    for tweet in tweetList[member]:
                        if tweet in likedTweets:
                            membersInEgoNetwork[member].append(tweet)
                
                self.authorship_on_likedtweets[egouser] = membersInEgoNetwork
            file_authorship_on_likedtweets = open(FILE_AUTHORSHIP_ON_LIKEDTWEET, "wb")
            pickle.dump(self.authorship_on_likedtweets, file_authorship_on_likedtweets)
            file_authorship_on_likedtweets.close()
        print("\tFound authorship on liked tweets for each ego network")
        
    def getMentionCount(self):
        # Path to save dump file
        FILE_MENTIONCOUNT = StaticVariable.ROOTPATH + "mention_count.pickle"
        print("Getting mention count...")
        if os.path.exists(FILE_MENTIONCOUNT):
            file_mentioncount = open(FILE_MENTIONCOUNT, "rb")
            self.mention_count = pickle.load(file_mentioncount)
            file_mentioncount.close()
        else:
            self.mention_count = {}
            for egouser in self.egousers:
                mentioncounts = {}
                for friend in self.friendList[egouser]:
                    if friend in self.mention_count and egouser in self.mention_count[friend]:
                        mentioncounts[friend] = self.mention_count[friend][egouser]
                    else:
                        mentioncounts[friend] = self.dbAdapter.getMentionCount(egouser, friend)
                self.mention_count[egouser] = mentioncounts
            file_mentioncount = open(FILE_MENTIONCOUNT, "wb")
            pickle.dump(self.mention_count, file_mentioncount)
            file_mentioncount.close()
        print("\tCalculated Mention counts: " + str(sum([len(self.mention_count[egouser]) for egouser in self.egousers])) + " records")
        
    def getLikeCount(self):
        # Path to save dump file
        FILE_LIKECOUNT = StaticVariable.ROOTPATH + "like_count.pickle"
        
        print("Getting like count for a user...")
        if os.path.exists(FILE_LIKECOUNT):
            file_likecount = open(FILE_LIKECOUNT, "rb")
            self.like_count = pickle.load(file_likecount)
            file_likecount.close()
        else:
            self.like_count = {}
            for egouser in self.egousers:
                likecounts = {}
                for friend in self.friendList[egouser]:
                    likecounts[friend] = self.dbAdapter.getLikeCount(egouser, friend)
                self.like_count[egouser] = likecounts
            file_likecount = open(FILE_LIKECOUNT, "wb")
            pickle.dump(self.like_count, file_likecount)
            file_likecount.close()
        print("\tCalculated Like counts: " + str(sum([len(self.like_count[egouser]) for egouser in self.egousers])) + " records")
        
    def getMutualFriendsCount(self):
        # Path to save dump file
        FILE_MUTUALFRIENDS = StaticVariable.ROOTPATH + "mutual_friends_count.pickle"
        
        print("Getting mutual friends count for a user...")
        if os.path.exists(FILE_MUTUALFRIENDS):
            file_mutualfriends = open(FILE_MUTUALFRIENDS, "rb")
            self.mutual_friends_count = pickle.load(file_mutualfriends)
            file_mutualfriends.close()
        else:
            self.mutual_friends_count = {}
            for egouser in self.egousers:
                mutualFriendsCount = {}
                for friend in self.friendList[egouser]:
                    if friend in self.mutual_friends_count and egouser in self.mutual_friends_count[friend]:
                        mutualFriendsCount[friend] = self.mutual_friends_count[friend][egouser]
                    else:
                        mutualFriendsCount[friend] = self.dbAdapter.getMutualFriendsCount(egouser, friend, self.friendList[egouser], self.friendList[friend])
                self.mutual_friends_count[egouser] = mutualFriendsCount
            file_mutualfriends = open(FILE_MUTUALFRIENDS, "wb")
            pickle.dump(self.mutual_friends_count, file_mutualfriends)
            file_mutualfriends.close()
        print("\tCalculated Mutual friends counts: " + str(sum([len(self.mutual_friends_count[egouser]) for egouser in self.egousers])) + " records")
        
    def getID(self, index):
        if index < 0 or index >= len(self.validUsers):
            return -1
        return self.validUsers[index]
    
    def loadClusters(self):
        # Path to save dump file
        FILE_CLUSTERS    = StaticVariable.ROOTPATH + "clusters.pickle"
        
        if os.path.exists(FILE_CLUSTERS):
            dumpfile_cluster = open(FILE_CLUSTERS, "rb")
            self.clusters = pickle.load(dumpfile_cluster)
            dumpfile_cluster.close()
        else:
            self.clusters = {}           # {1: [[2, 3, 4], [6, 7, 8, 9]], 2: [[1, 3, 4], [5, 6, 7, 8]]}
            
            clusterFiles = glob.glob(StaticVariable.ROOTPATH + "FastModularity/clusters/*")
            if len(clusterFiles) == 0:
                # Check if there is input files for clustering
                networkFiles = glob.glob(StaticVariable.ROOTPATH + "FastModularity/*.pairs")
                if len(networkFiles) == 0:
                    print("Making input files... ")
                    # Make network file(*.pairs) from self.friendList{}
                    for egouser in self.egousers:
                        users = []
#                         users.append(egouser)
                        for friend in self.friendList[egouser]:
                            users.append(friend)
                        
                        file = open(StaticVariable.ROOTPATH + "FastModularity/" + str(self.indU[egouser]) + ".pairs", "w")
                        for user in users:
                            for friend in self.friendList[user]:
                                file.write(str(self.indU[user]) + "\t" + str(self.indU[friend]) + "\n")
                        file.close()
                    print("\tdone!")
                
                # Do clustering by executing shell script
                print("Clustering... ")
#                 subprocess.Popen([StaticVariable.ROOTPATH + "FastModularity/doClustering.sh"]).communicate()
                subprocess.Popen(['for file in ' + StaticVariable.ROOTPATH + 'FastModularity/*.pairs; '
                                  'do ' + StaticVariable.ROOTPATH + 'FastModularity/FastCommunityMH -f "$file"; done'],
                                 shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                print("\tdone!")
            
            clusterFiles = glob.glob(StaticVariable.ROOTPATH + "FastModularity/clusters/*")
            # Load clustering information into memory
            for filepath in clusterFiles:
                ind = int(os.path.basename(filepath))
                egouser = self.getID(ind)
                file_cluster = open(filepath, "r")
                while True:
                    line = file_cluster.readline()
                    if not line:
                        break
                    tokens = line.split("\t")
                    memberset = []
                    for token in tokens:
                        try:
                            memberID = self.getID(int(token))
                            if memberID != egouser:
                                memberset.append(memberID)
                        except:
                            continue
                    if egouser not in self.clusters.keys():
                        self.clusters[egouser] = []
                    self.clusters[egouser].append(memberset)
                file_cluster.close()
            
            dumpfile_cluster = open(FILE_CLUSTERS, "wb")
            pickle.dump(self.clusters, dumpfile_cluster)
            dumpfile_cluster.close()
            
        nClusters = 0
        for egouser in self.clusters:
            nClusters += len(self.clusters[egouser])
        
        print("Loading clusters...")
        print("\t" + str(nClusters) + " clusters of " + str(len(self.egousers)) + " ego networks are loaded.")
        print()
    
    def show(self):
        plt.show()
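
A minimal sketch of how the Experiment class might be driven, assuming StaticVariable.ROOTPATH points at the data directory and the pickled caches either already exist or can be rebuilt from the database; the call order simply follows the methods defined above, and the driver itself is hypothetical:

if __name__ == "__main__":
    # Hypothetical driver: __init__ already loads users, friend lists and count features.
    exp = Experiment()
    exp.getTopicVectors()   # LDA topic vectors, cached in topic_vectors.pickle
    exp.loadClusters()      # FastModularity clusters for each ego network
    exp.show()              # plt.show() for any figures built elsewhere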