コード例 #1
0
ファイル: experiment.py プロジェクト: ChangUk/TempRepo
class Experiment:
    def __init__(self):
        """Open the database, index the valid users and run the loading steps.

        After the valid users are fetched and indexed, the constructor calls
        the seed-user, friend-list and ego-network-filter steps, followed by
        the like-vector, authorship, like-count, mention-count and
        mutual-friend-count steps, in that order.
        """
        databasePath = StaticVariable.ROOTPATH + "TwitterData.sqlite"
        self.dbAdapter = DBAdapter(databasePath)
        self.tool = MyTools()

        # Get valid user list
        self.validUsers = self.dbAdapter.getValidUserList()
        print("Valid users: " + str(len(self.validUsers)))

        # Add index on valid users (UserID : position in self.validUsers)
        self.indU = {user: position for position, user in enumerate(self.validUsers)}

        # Network construction has to finish before the per-network statistics.
        networkSteps = (self.getSeedUsers, self.getFriendList, self.filterSmallEgoNetwork)
        for step in networkSteps:
            step()

        statisticSteps = (
            self.getLikeVectors,
            self.getAuthorshipOnLikedTweets,
            self.getLikeCount,
            self.getMentionCount,
            self.getMutualFriendsCount,
        )
        for step in statisticSteps:
            step()
        
    def getSeedUsers(self):
        """Load the ego (seed) users into ``self.egousers``.

        When ``egousers.pickle`` exists under ``StaticVariable.ROOTPATH`` the
        list is read from it. Otherwise the list is built from
        ``self.dbAdapter.getSeedUserList()``: only seeds that are in
        ``self.validUsers`` are kept, duplicates are dropped while keeping
        first-seen order, and the result is dumped to that file.
        """
        # Path to save dump file
        FILE_EGOUSERS = StaticVariable.ROOTPATH + "egousers.pickle"

        # Get seed user list
        if os.path.exists(FILE_EGOUSERS):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_EGOUSERS, "rb") as file_egousers:
                self.egousers = pickle.load(file_egousers)
            return

        # Set lookups replace the repeated list scans of the original loop.
        # Assumes user IDs are hashable (the original comments call them integers).
        validUserSet = set(self.validUsers)
        alreadyAdded = set()
        self.egousers = []                  # INTEGER ARRAY
        for seed in self.dbAdapter.getSeedUserList():
            if seed in validUserSet and seed not in alreadyAdded:
                alreadyAdded.add(seed)
                self.egousers.append(seed)

        # "with" closes the file even if pickling fails midway.
        with open(FILE_EGOUSERS, "wb") as file_egousers:
            pickle.dump(self.egousers, file_egousers)
            
    def getFriendList(self):
        """Load the friend list of every valid user into ``self.friendList``.

        ``self.friendList`` maps a user ID to the list of that user's friends
        that are themselves valid users, without duplicates and in the order
        returned by ``self.dbAdapter.getFriendship``. The mapping is read from
        ``friendlist.pickle`` under ``StaticVariable.ROOTPATH`` when the file
        exists; otherwise it is computed and dumped there.
        """
        # Path to save dump file
        FILE_FRIENDLIST = StaticVariable.ROOTPATH + "friendlist.pickle"

        # Get FriendList
        print("Getting friend list...")
        if os.path.exists(FILE_FRIENDLIST):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_FRIENDLIST, "rb") as file_friendlist:
                self.friendList = pickle.load(file_friendlist)
            return

        # Built once so each membership test is O(1) instead of a list scan.
        # Assumes user IDs are hashable (the original comments call them integers).
        validUserSet = set(self.validUsers)

        self.friendList = {}                # KEY: INTEGER / VALUES INTEGER ARRAY
        for validUser in self.validUsers:
            alreadyAdded = set()
            validFriends = []
            for friend in self.dbAdapter.getFriendship(validUser):
                if friend in validUserSet and friend not in alreadyAdded:
                    alreadyAdded.add(friend)
                    validFriends.append(friend)
            self.friendList[validUser] = validFriends

        # "with" closes the file even if pickling fails midway.
        with open(FILE_FRIENDLIST, "wb") as file_friendlist:
            pickle.dump(self.friendList, file_friendlist)
            
    def filterSmallEgoNetwork(self):
        # Filter small ego network
        print("Filtering ego users...")
        print("\t" + str(len(self.egousers)) + " users => ", end="")
        tmp = []
        for egouser in self.egousers:
            if len(self.friendList[egouser]) > 100:
                tmp.append(egouser)
        self.egousers = tmp
        print(str(len(self.egousers)) + " users")
        allUsers = {}
        for egouser in self.egousers:
            allUsers[egouser] = None
            for friend in self.friendList[egouser]:
                allUsers[friend] = None  
        print("\t" + str(len(allUsers)) + " users are being in " + str(len(self.egousers)) + " ego networks")
        print()
        
        # Average number of members for each ego network
        nFriends = 0
        for egouser in self.egousers:
            nFriends += len(self.friendList[egouser]) + 1
        print("Average number of network members: " + str(nFriends / len(self.egousers)))
    
    def getTopicVectors(self):
        """Load per-user topic vectors into ``self.topic_vectors``.

        The vectors are read from ``topic_vectors.pickle`` under
        ``StaticVariable.ROOTPATH`` when the file exists. Otherwise an
        ``LDASampler`` is run over every ego user and friend (each user once,
        ego user first, then its friends) and the result is dumped there.
        """
        # Path to save dump file
        FILE_TOPICVECTORS = StaticVariable.ROOTPATH + "topic_vectors.pickle"

        # Gibbs sampling setting
        NTOPICS = 30
        GIBBS_SAMPLES = 100
        BURNIN_POINT = 50
        SAMPLING_INTERVAL = 2

        print("Getting topic vectors...")
        if os.path.exists(FILE_TOPICVECTORS):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_TOPICVECTORS, "rb") as file_topic_vectors:
                self.topic_vectors = pickle.load(file_topic_vectors)
        else:
            # The original created a dict here and then called .append() on it,
            # which raises AttributeError. A list matches the .append() calls;
            # NOTE(review): confirm LDASampler accepts a list of user IDs.
            userlist = []
            alreadyAdded = set()
            for egouser in self.egousers:
                members = [egouser]
                members.extend(self.friendList[egouser])
                for member in members:
                    if member not in alreadyAdded:
                        alreadyAdded.add(member)
                        userlist.append(member)

            sampler = LDASampler(NTOPICS, userlist)
            sampler.run(GIBBS_SAMPLES, BURNIN_POINT, SAMPLING_INTERVAL)
            self.topic_vectors = sampler.getTopicVectors()

            # "with" closes the file even if pickling fails midway.
            with open(FILE_TOPICVECTORS, "wb") as file_topic_vectors:
                pickle.dump(self.topic_vectors, file_topic_vectors)
        print("\tCalculated topic vectors of " + str(len(self.topic_vectors.keys())) + " users")
    
    def getLikeVectors(self):
        """Load the liked tweets of every network member into ``self.like_vectors``.

        ``self.like_vectors`` maps a user to the value returned by
        ``self.dbAdapter.getLikingTweets(user)`` for every ego user and every
        friend of an ego user. The mapping is read from ``like_vectors.pickle``
        under ``StaticVariable.ROOTPATH`` when the file exists; otherwise it is
        computed and dumped there.
        """
        # Path to save dump file
        FILE_LIKEVECTORS = StaticVariable.ROOTPATH + "like_vectors.pickle"

        print("Getting like vectors...")
        if os.path.exists(FILE_LIKEVECTORS):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_LIKEVECTORS, "rb") as file_like_vectors:
                self.like_vectors = pickle.load(file_like_vectors)
        else:
            # Collect each member once, ego user first and then its friends.
            # The set replaces the repeated list scans of the original loop.
            userlist = []
            alreadyAdded = set()
            for egouser in self.egousers:
                members = [egouser]
                members.extend(self.friendList[egouser])
                for member in members:
                    if member not in alreadyAdded:
                        alreadyAdded.add(member)
                        userlist.append(member)

            self.like_vectors = {}      # {user: [tweet, ...], ...}
            for user in userlist:
                self.like_vectors[user] = self.dbAdapter.getLikingTweets(user)

            # "with" closes the file even if pickling fails midway.
            with open(FILE_LIKEVECTORS, "wb") as file_like_vectors:
                pickle.dump(self.like_vectors, file_like_vectors)
        print("\tCalculated Like vectors of " + str(len(self.like_vectors.keys())) + " users")
        
    def getAuthorshipOnLikedTweets(self):
        """Find, per ego network, which liked tweets each member authored.

        ``self.authorship_on_likedtweets`` maps an ego user to a dict
        ``{member: [tweet, ...]}`` holding, for the ego user and each of its
        friends, the tweets written by that member that at least one member of
        the same ego network liked (according to ``self.like_vectors``). The
        result is read from ``authorship_on_likedtweets.pickle`` under
        ``StaticVariable.ROOTPATH`` when the file exists; otherwise it is
        computed and dumped there.
        """
        # Path to save dump file
        FILE_AUTHORSHIP_ON_LIKEDTWEET = StaticVariable.ROOTPATH + "authorship_on_likedtweets.pickle"

        print("Getting authorship on liked tweets...")
        if os.path.exists(FILE_AUTHORSHIP_ON_LIKEDTWEET):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_AUTHORSHIP_ON_LIKEDTWEET, "rb") as file_authorship_on_likedtweets:
                self.authorship_on_likedtweets = pickle.load(file_authorship_on_likedtweets)
        else:
            self.authorship_on_likedtweets = {}     # {egouser: {member: [tweet, ...], ...}, ...}
            for egouser in self.egousers:
                membersInEgoNetwork = {egouser: []}
                for friend in self.friendList[egouser]:
                    membersInEgoNetwork[friend] = []

                # Find liked tweets in ego network
                likedTweets = set()
                for member in membersInEgoNetwork:
                    likedTweets.update(self.like_vectors[member])

                # Find tweet list for each member of ego network
                # (the adapter receives the member dict while its lists are still empty)
                tweetList = self.dbAdapter.getTweetListByAuthor(membersInEgoNetwork)

                for member, authoredAndLiked in membersInEgoNetwork.items():
                    authoredAndLiked.extend(
                        tweet for tweet in tweetList[member] if tweet in likedTweets
                    )

                self.authorship_on_likedtweets[egouser] = membersInEgoNetwork

            # "with" closes the file even if pickling fails midway.
            with open(FILE_AUTHORSHIP_ON_LIKEDTWEET, "wb") as file_authorship_on_likedtweets:
                pickle.dump(self.authorship_on_likedtweets, file_authorship_on_likedtweets)
        print("\tFound authorship on liked tweets for each ego network")
        
    def getMentionCount(self):
        """Load mention counts between ego users and friends into ``self.mention_count``.

        ``self.mention_count`` maps an ego user to ``{friend: count}``. A count
        comes from ``self.dbAdapter.getMentionCount(egouser, friend)`` unless
        the reverse pair (friend as ego user, ego user as friend) was already
        stored, in which case that stored value is reused. The result is read
        from ``mention_count.pickle`` under ``StaticVariable.ROOTPATH`` when
        the file exists; otherwise it is computed and dumped there.
        """
        # Path to save dump file
        FILE_MENTIONCOUNT = StaticVariable.ROOTPATH + "mention_count.pickle"
        print("Getting mention count...")
        if os.path.exists(FILE_MENTIONCOUNT):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_MENTIONCOUNT, "rb") as file_mentioncount:
                self.mention_count = pickle.load(file_mentioncount)
        else:
            self.mention_count = {}
            for egouser in self.egousers:
                mentioncounts = {}
                for friend in self.friendList[egouser]:
                    # Reuse the value stored for the reverse pair instead of querying again.
                    reverseCounts = self.mention_count.get(friend)
                    if reverseCounts is not None and egouser in reverseCounts:
                        mentioncounts[friend] = reverseCounts[egouser]
                    else:
                        mentioncounts[friend] = self.dbAdapter.getMentionCount(egouser, friend)
                self.mention_count[egouser] = mentioncounts

            # "with" closes the file even if pickling fails midway.
            with open(FILE_MENTIONCOUNT, "wb") as file_mentioncount:
                pickle.dump(self.mention_count, file_mentioncount)
        nRecords = sum(len(self.mention_count[egouser]) for egouser in self.egousers)
        print("\tCalculated Mention counts: " + str(nRecords) + " records")
        
    def getLikeCount(self):
        """Load like counts between ego users and friends into ``self.like_count``.

        ``self.like_count`` maps an ego user to ``{friend: count}``, where each
        count is the value returned by
        ``self.dbAdapter.getLikeCount(egouser, friend)``. The result is read
        from ``like_count.pickle`` under ``StaticVariable.ROOTPATH`` when the
        file exists; otherwise it is computed and dumped there.
        """
        # Path to save dump file
        FILE_LIKECOUNT = StaticVariable.ROOTPATH + "like_count.pickle"

        print("Getting like count for a user...")
        if os.path.exists(FILE_LIKECOUNT):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_LIKECOUNT, "rb") as file_likecount:
                self.like_count = pickle.load(file_likecount)
        else:
            self.like_count = {}
            for egouser in self.egousers:
                self.like_count[egouser] = {
                    friend: self.dbAdapter.getLikeCount(egouser, friend)
                    for friend in self.friendList[egouser]
                }

            # "with" closes the file even if pickling fails midway.
            with open(FILE_LIKECOUNT, "wb") as file_likecount:
                pickle.dump(self.like_count, file_likecount)
        nRecords = sum(len(self.like_count[egouser]) for egouser in self.egousers)
        print("\tCalculated Like counts: " + str(nRecords) + " records")
        
    def getMutualFriendsCount(self):
        """Load mutual-friend counts into ``self.mutual_friends_count``.

        ``self.mutual_friends_count`` maps an ego user to ``{friend: count}``.
        A count comes from ``self.dbAdapter.getMutualFriendsCount`` (called
        with both users and both friend lists) unless the reverse pair was
        already stored, in which case that stored value is reused. The result
        is read from ``mutual_friends_count.pickle`` under
        ``StaticVariable.ROOTPATH`` when the file exists; otherwise it is
        computed and dumped there.
        """
        # Path to save dump file
        FILE_MUTUALFRIENDS = StaticVariable.ROOTPATH + "mutual_friends_count.pickle"

        print("Getting mutual friends count for a user...")
        if os.path.exists(FILE_MUTUALFRIENDS):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_MUTUALFRIENDS, "rb") as file_mutualfriends:
                self.mutual_friends_count = pickle.load(file_mutualfriends)
        else:
            self.mutual_friends_count = {}
            for egouser in self.egousers:
                egoFriends = self.friendList[egouser]
                mutualFriendsCount = {}
                for friend in egoFriends:
                    # Reuse the value stored for the reverse pair instead of recomputing it.
                    reverseCounts = self.mutual_friends_count.get(friend)
                    if reverseCounts is not None and egouser in reverseCounts:
                        mutualFriendsCount[friend] = reverseCounts[egouser]
                    else:
                        mutualFriendsCount[friend] = self.dbAdapter.getMutualFriendsCount(
                            egouser, friend, egoFriends, self.friendList[friend])
                self.mutual_friends_count[egouser] = mutualFriendsCount

            # "with" closes the file even if pickling fails midway.
            with open(FILE_MUTUALFRIENDS, "wb") as file_mutualfriends:
                pickle.dump(self.mutual_friends_count, file_mutualfriends)
        nRecords = sum(len(self.mutual_friends_count[egouser]) for egouser in self.egousers)
        print("\tCalculated Mutual friends counts: " + str(nRecords) + " records")
        
    def getID(self, index):
        if index < 0 or index >= len(self.validUsers):
            return -1
        return self.validUsers[index]
    
    def loadClusters(self):
        """Load the clusters of every ego network into ``self.clusters``.

        ``self.clusters`` maps an ego user ID to a list of clusters, each
        cluster being a list of member IDs without the ego user. It is read
        from ``clusters.pickle`` under ``StaticVariable.ROOTPATH`` when that
        file exists. Otherwise:

        1. If ``FastModularity/clusters/`` holds no files, ``*.pairs`` network
           files are written from ``self.friendList`` (unless some already
           exist) and the ``FastCommunityMH`` executable is run on each of
           them through a shell loop.
        2. Every file in ``FastModularity/clusters/`` is parsed: the file name
           is the index of the ego user, each line is one cluster of
           tab-separated user indices.
        3. The result is dumped to ``clusters.pickle``.

        Raises:
            ValueError: If a file in the clusters directory has a name that is
                not an integer.
        """
        # Path to save dump file
        FILE_CLUSTERS = StaticVariable.ROOTPATH + "clusters.pickle"
        modularityDir = StaticVariable.ROOTPATH + "FastModularity/"

        if os.path.exists(FILE_CLUSTERS):
            # NOTE: unpickling is only safe because this dump is written by this method itself.
            with open(FILE_CLUSTERS, "rb") as dumpfile_cluster:
                self.clusters = pickle.load(dumpfile_cluster)
        else:
            self.clusters = {}           # {1: [[2, 3, 4], [6, 7, 8, 9]], 2: [[1, 3, 4], [5, 6, 7, 8]]}

            if not glob.glob(modularityDir + "clusters/*"):
                # Check if there is input files for clustering
                if not glob.glob(modularityDir + "*.pairs"):
                    print("Making input files... ")
                    # Make network file(*.pairs) from self.friendList{}
                    for egouser in self.egousers:
                        pairsPath = modularityDir + str(self.indU[egouser]) + ".pairs"
                        with open(pairsPath, "w") as pairsFile:
                            # Only the friends' own edges are written; the ego user is not
                            # used as a source node (that step was commented out originally).
                            for user in self.friendList[egouser]:
                                for friend in self.friendList[user]:
                                    pairsFile.write(str(self.indU[user]) + "\t" + str(self.indU[friend]) + "\n")
                    print("\tdone!")

                # Do clustering by executing shell script
                print("Clustering... ")
                # NOTE(review): the command line is built by string concatenation and run
                # with shell=True, so a ROOTPATH containing spaces or shell
                # metacharacters would break it. Kept as-is to preserve behavior.
                subprocess.Popen(['for file in ' + modularityDir + '*.pairs; '
                                  'do ' + modularityDir + 'FastCommunityMH -f "$file"; done'],
                                 shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                print("\tdone!")

            # Load clustering information into memory
            for filepath in glob.glob(modularityDir + "clusters/*"):
                ind = int(os.path.basename(filepath))
                egouser = self.getID(ind)
                with open(filepath, "r") as file_cluster:
                    for line in file_cluster:
                        memberset = []
                        for token in line.split("\t"):
                            # Skip tokens that are not integers (e.g. empty fields) instead
                            # of hiding every possible error behind a bare except.
                            try:
                                memberIndex = int(token)
                            except ValueError:
                                continue
                            memberID = self.getID(memberIndex)
                            if memberID != egouser:
                                memberset.append(memberID)
                        self.clusters.setdefault(egouser, []).append(memberset)

            # "with" closes the file even if pickling fails midway.
            with open(FILE_CLUSTERS, "wb") as dumpfile_cluster:
                pickle.dump(self.clusters, dumpfile_cluster)

        nClusters = sum(len(clusterList) for clusterList in self.clusters.values())

        print("Loading clusters...")
        print("\t" + str(nClusters) + " clusters of " + str(len(self.egousers)) + " ego networks are loaded.")
        print()
    
    def show(self):
        """Call ``plt.show()``.

        NOTE(review): ``plt`` is presumably ``matplotlib.pyplot``; its import
        is not visible in this part of the file -- confirm.
        """
        plt.show()