# Collapsed Gibbs sampler for LDA over per-user tweet documents stored in SQLite.
import random
import sys

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from scipy.special import gammaln, psi

from DBAdapter import DBAdapter  # PROJECT-LOCAL SQLITE WRAPPER (IMPORT PATH ASSUMED)


class LDASampler(object):

    def __init__(self, ntopics, userlist=None):
        self.TOPICS = ntopics   # NUMBER OF TOPICS
        self.documents = {}     # TRAINING DATA: {DocID: [WordID1, WordID1, WordID2, ...]}
        self.indD = {}          # MAP DOCUMENT INTO INDEX: self.indD = {DocID: INDEX}
        self.indV = {}          # MAP WORD INTO INDEX: self.indV = {VocabID: INDEX}
        self.DOCS = 0           # NUMBER OF DOCUMENTS
        self.VOCABS = 0         # NUMBER OF VOCABULARY WORDS
        self.alpha = np.full(self.TOPICS, 0.01)  # SYMMETRIC PRIOR; ALTERNATIVE: np.random.gamma(0.1, 1)
        self.beta = 0.01                         # SYMMETRIC PRIOR; ALTERNATIVE: np.random.gamma(0.1, 1)
        # DBAdapter
        self.dbAdapter = DBAdapter("/home/changuk/data/CNN/TwitterData.sqlite")
        # LOAD DATA FROM SQLITE DATABASE
        if userlist is None:
            self.loadData()
        else:
            self.loadData(userlist)
        for doc in self.documents:
            random.shuffle(self.documents[doc])  # SHUFFLE WORDS IN EACH DOCUMENT
        self.theta = np.zeros((self.DOCS, self.TOPICS))  # SPACE FOR THETA MATRIX WITH 0s
        self.phi = np.zeros((self.TOPICS, self.VOCABS))  # SPACE FOR PHI MATRIX WITH 0s

    def loadData(self, userlist=None):
        # LOAD USER LIST OF THE EGO NETWORK
        if userlist is None:
            self.completeUsers = self.dbAdapter.getCompleteUserList()
            users = self.completeUsers
        else:
            users = userlist
        lemmatizer = WordNetLemmatizer()
        # LOAD TWEETS FOR EACH USER
        cnt = 0
        for user in users:
            print("Load users... " + str(cnt) + "/" + str(len(users)))
            cnt += 1
            if user not in self.documents:
                self.documents[user] = []
                self.indD[user] = self.DOCS
                self.DOCS += 1
            try:
                cursor = self.dbAdapter.getConnection().cursor()
                cursor.execute("SELECT text FROM tweet WHERE author = ? AND isMention = 0", (user,))
            except Exception:
                continue
            while True:
                try:
                    tweet = cursor.fetchone()
                    if not tweet:
                        break
                except Exception:
                    continue
                # KEEP ONLY NOUNS, VERBS AND ADJECTIVES, LEMMATIZED ACCORDING TO POS TAG
                texts = nltk.word_tokenize(tweet[0])
                tokens = nltk.pos_tag(texts)
                for token in tokens:
                    if token[1].startswith("N"):
                        word = lemmatizer.lemmatize(token[0], 'n')
                    elif token[1].startswith("V"):
                        word = lemmatizer.lemmatize(token[0], 'v')
                    elif token[1].startswith("J"):
                        word = lemmatizer.lemmatize(token[0], 'a')
                    else:
                        continue
                    self.documents[user].append(word)
                    if word not in self.indV:
                        self.indV[word] = self.VOCABS
                        self.VOCABS += 1
            cursor.close()

    def assignTopics(self, doc, word, pos):
        # DRAW A TOPIC SAMPLE FROM THE FULL-CONDITIONAL DISTRIBUTION
        d = self.indD[doc]
        w = self.indV[word]
        z = self.topicAssignments[d][pos]  # CURRENT TOPIC ASSIGNMENT OF THIS WORD
        # REMOVE THE CURRENT ASSIGNMENT FROM THE COUNT MATRICES
        self.cntTW[z, w] -= 1
        self.cntDT[d, z] -= 1
        self.cntT[z] -= 1
        self.lenD[d] -= 1
        # FULL-CONDITIONAL DISTRIBUTION:
        # p(z=k | rest) IS PROPORTIONAL TO (n_dk + alpha_k) * (n_kw + beta) / (n_k + V*beta)
        prL = (self.cntDT[d] + self.alpha) / (self.lenD[d] + np.sum(self.alpha))
        prR = (self.cntTW[:, w] + self.beta) / (self.cntT + self.beta * self.VOCABS)
        prFullCond = prL * prR
        prFullCond /= np.sum(prFullCond)  # NORMALIZE TO OBTAIN A PROBABILITY DISTRIBUTION
        # NOTE: 'prFullCond' IS A DISTRIBUTION OVER ALL TOPICS (LENGTH = NUMBER OF TOPICS), NOT A SINGLE VALUE
        new_z = np.random.multinomial(1, prFullCond).argmax()  # SAMPLE A NEW TOPIC FROM THE FULL-CONDITIONAL
        # ADD THE NEW ASSIGNMENT BACK TO THE COUNT MATRICES
        self.topicAssignments[d][pos] = new_z
        self.cntTW[new_z, w] += 1
        self.cntDT[d, new_z] += 1
        self.cntT[new_z] += 1
        self.lenD[d] += 1

    def LogLikelihood(self):
        # COMPUTE THE (JOINT) LOG-LIKELIHOOD VALUE
        l = 0
        for z in range(self.TOPICS):  # log p(w|z,\beta)
            l += gammaln(self.VOCABS * self.beta)
            l -= self.VOCABS * gammaln(self.beta)
            l += np.sum(gammaln(self.cntTW[z] + self.beta))
            l -= gammaln(np.sum(self.cntTW[z] + self.beta))
        for doc in self.documents:    # log p(z|\alpha)
            d = self.indD[doc]
            l += gammaln(np.sum(self.alpha))
            l -= np.sum(gammaln(self.alpha))
            l += np.sum(gammaln(self.cntDT[d] + self.alpha))
            l -= gammaln(np.sum(self.cntDT[d] + self.alpha))
        return l

    def findAlphaBeta(self):
        # ADJUST ALPHA AND BETA BY USING MINKA'S FIXED-POINT ITERATION
        numerator = 0
        denominator = 0
        for d in range(self.DOCS):
            numerator += psi(self.cntDT[d] + self.alpha) - psi(self.alpha)
            denominator += psi(np.sum(self.cntDT[d] + self.alpha)) - psi(np.sum(self.alpha))
        self.alpha *= numerator / denominator  # UPDATE ALPHA
        numerator = 0
        denominator = 0
        for z in range(self.TOPICS):
            numerator += np.sum(psi(self.cntTW[z] + self.beta) - psi(self.beta))
            denominator += psi(np.sum(self.cntTW[z] + self.beta)) - psi(self.VOCABS * self.beta)
        self.beta = (self.beta * numerator) / (self.VOCABS * denominator)  # UPDATE BETA

    def findThetaPhi(self):
        # POINT ESTIMATES OF THETA AND PHI FROM THE CURRENT COUNT MATRICES
        th = np.zeros((self.DOCS, self.TOPICS))    # SPACE FOR THETA
        ph = np.zeros((self.TOPICS, self.VOCABS))  # SPACE FOR PHI
        for d in range(self.DOCS):
            for z in range(self.TOPICS):
                th[d][z] = (self.cntDT[d][z] + self.alpha[z]) / (self.lenD[d] + np.sum(self.alpha))
        for z in range(self.TOPICS):
            for w in range(self.VOCABS):
                ph[z][w] = (self.cntTW[z][w] + self.beta) / (self.cntT[z] + self.beta * self.VOCABS)
        return ph, th

    def run(self, nsamples, burnin, interval):
        # GIBBS SAMPLER KERNEL
        if nsamples <= burnin:  # BURN-IN CHECK
            print("ERROR: BURN-IN POINT EXCEEDS THE NUMBER OF SAMPLES")
            sys.exit(1)
        print("# of DOCS:", self.DOCS)  # PRINT TRAINING DATA INFORMATION
        print("# of TOPICS:", self.TOPICS)
        print("# of VOCABS:", self.VOCABS)
        # MAKE SPACE FOR TOPIC-ASSIGNMENT AND COUNT MATRICES, INITIALIZED WITH 0s
        self.topicAssignments = {}  # {INDEX OF DOC: [TOPIC ASSIGNMENT]}
        for doc in self.documents:
            d = self.indD[doc]
            self.topicAssignments[d] = [0 for word in self.documents[doc]]
        self.cntTW = np.zeros((self.TOPICS, self.VOCABS))  # NUMBER OF TIMES EACH TOPIC IS ASSIGNED TO EACH WORD
        self.cntDT = np.zeros((self.DOCS, self.TOPICS))    # NUMBER OF TIMES EACH TOPIC IS ASSIGNED IN EACH DOCUMENT
        self.cntT = np.zeros(self.TOPICS)                  # ASSIGNMENT COUNT FOR EACH TOPIC
        self.lenD = np.zeros(self.DOCS)                    # ASSIGNMENT COUNT FOR EACH DOCUMENT = LENGTH OF DOCUMENT
        # RANDOMLY ASSIGN A TOPIC TO EACH WORD
        for doc in self.documents:
            for i, word in enumerate(self.documents[doc]):
                d = self.indD[doc]
                w = self.indV[word]
                rt = random.randint(0, self.TOPICS - 1)  # RANDOM TOPIC ASSIGNMENT
                self.topicAssignments[d][i] = rt
                self.cntTW[rt, w] += 1
                self.cntDT[d, rt] += 1
                self.cntT[rt] += 1
                self.lenD[d] += 1
        # COLLAPSED GIBBS SAMPLING
        print("INITIAL STATE")
        print("\tLikelihood:", self.LogLikelihood())  # (JOINT) LOG-LIKELIHOOD
        # print("\tAlpha:", end="")
        # for i in range(self.TOPICS):
        #     print(" %.5f" % self.alpha[i], end="")
        # print("\n\tBeta: %.5f" % self.beta)
        SAMPLES = 0
        for s in range(nsamples):
            for doc in self.documents:
                for i, word in enumerate(self.documents[doc]):
                    self.assignTopics(doc, word, i)  # DRAW A TOPIC SAMPLE FROM THE FULL-CONDITIONAL DISTRIBUTION
            self.findAlphaBeta()  # UPDATE ALPHA AND BETA VALUES
            lik = self.LogLikelihood()
            print("SAMPLE #" + str(s))
            print("\tLikelihood:", lik)
            # print("\tAlpha:", end="")
            # for i in range(self.TOPICS):
            #     print(" %.5f" % self.alpha[i], end="")
            # print("\n\tBeta: %.5f" % self.beta)
            if s > burnin and s % interval == 0:  # ACCUMULATE PHI AND THETA AFTER THE BURN-IN POINT
                ph, th = self.findThetaPhi()
                self.theta += th
                self.phi += ph
                SAMPLES += 1
        self.theta /= SAMPLES  # AVERAGE THE GIBBS SAMPLES OF THETA
        self.phi /= SAMPLES    # AVERAGE THE GIBBS SAMPLES OF PHI
        return lik

    def getTopicVectors(self):
        # RETURN THE AVERAGED PER-DOCUMENT TOPIC DISTRIBUTIONS AS {DocID: [THETA VALUES]}
        topicVectors = {}
        for user in self.indD.keys():
            pos = self.indD[user]
            topicVectors[user] = self.theta[pos].tolist()
        return topicVectors
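

# EXAMPLE USAGE (A MINIMAL SKETCH, NOT PART OF THE ORIGINAL MODULE): THE TOPIC COUNT
# AND SAMPLER SETTINGS BELOW ARE ARBITRARY PLACEHOLDER VALUES, AND THE SQLITE DATABASE
# WRAPPED BY DBAdapter IS ASSUMED TO EXIST AT THE PATH HARD-CODED IN __init__.
if __name__ == "__main__":
    sampler = LDASampler(ntopics=20)                                # BUILD DOCUMENTS FROM ALL COMPLETE USERS
    likelihood = sampler.run(nsamples=200, burnin=50, interval=10)  # 200 SWEEPS, DISCARD FIRST 50, KEEP EVERY 10TH
    print("FINAL LOG-LIKELIHOOD:", likelihood)
    topicVectors = sampler.getTopicVectors()                        # {DocID: AVERAGED THETA ROW}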