from TradeEvaluator import TradeEvaluator
from DBAdapter import DBAdapter
from datetime import datetime, timedelta
import time
import sys
from Utils import *
from ConsultarMarquetBalance_sql import ConultarOnline
import threading  # 'threading.Lock()' below needs the module itself, not 'from threading import *'
import os

path = os.getcwd()
ROOT_path = os.sep.join(path.split(os.sep)[:-2])
# Build the SQLite URL portably instead of hard-coding Windows backslashes
sdbInstance = 'sqlite:///' + os.path.join(ROOT_path, 'BBDD', 'krakenTrader.db')
DBA = DBAdapter(sdbInstance)


def EnviromentSetup():
    '''Sets up or creates the elements the robot needs to run.'''
    LogsFolder = ROOT_path + "/LOGs"
    SetupLogsFolder(LogsFolder)


def WorkerConsultarMarquet():
    ConultarOnline(DBA)


def WorkerTradeEvaluator():
    lock = threading.Lock()  # renamed from 'Lock' to avoid shadowing the threading.Lock class
    espera = 60  # wait interval, in seconds
    TE = TradeEvaluator()
    while True:
        # Non-blocking acquire: the default blocking acquire() never returns False,
        # so the original 'while not Lock.acquire():' could never enter this loop
        while not lock.acquire(blocking=False):
            LogEvent('Cannot acquire the lock. Waiting 1 s...')
            time.sleep(1)
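# --- Usage sketch (hypothetical, not part of the original script) ---
# A minimal entry point showing how the two workers above could be launched
# after the environment is prepared. The function names match the definitions
# above; the '__main__' guard and the threading layout are assumptions.
if __name__ == "__main__":
    EnviromentSetup()
    threading.Thread(target=WorkerConsultarMarquet, daemon=True).start()
    WorkerTradeEvaluator()  # run the evaluator loop in the main thread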
import random
import sys

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from scipy.special import gammaln, psi

from DBAdapter import DBAdapter


class LDASampler(object):
    def __init__(self, ntopics, userlist=None):
        self.TOPICS = ntopics  # NUMBER OF TOPICS
        self.documents = {}    # TRAINING DATA: {DocID: [WordID1, WordID1, WordID2, ...]}
        self.indD = {}         # MAP DOCUMENT INTO INDEX: self.indD = {DocID: INDEX}
        self.indV = {}         # MAP WORD INTO INDEX: self.indV = {VocabID: INDEX}
        self.DOCS = 0          # NUMBER OF DOCUMENTS
        self.VOCABS = 0        # NUMBER OF VOCABULARIES
        self.alpha = np.full(self.TOPICS, 0.01)  # SYMMETRIC DIRICHLET PRIOR; ALTERNATIVE: np.random.gamma(0.1, 1)
        self.beta = 0.01                         # SYMMETRIC DIRICHLET PRIOR; ALTERNATIVE: np.random.gamma(0.1, 1)
        # DBAdapter: LOAD DATA FROM THE SQLITE DATABASE
        self.dbAdapter = DBAdapter("/home/changuk/data/CNN/TwitterData.sqlite")
        if userlist is None:
            self.loadData()
        else:
            self.loadData(userlist)
        for doc in self.documents:
            random.shuffle(self.documents[doc])  # SHUFFLE WORDS IN EACH DOCUMENT
        self.theta = np.zeros((self.DOCS, self.TOPICS))  # SPACE FOR THETA MATRIX, FILLED WITH 0s
        self.phi = np.zeros((self.TOPICS, self.VOCABS))  # SPACE FOR PHI MATRIX, FILLED WITH 0s

    def loadData(self, userlist=None):
        # Load the user list of the ego network
        if userlist is None:
            self.completeUsers = self.dbAdapter.getCompleteUserList()
            users = self.completeUsers
        else:
            users = userlist
        # Load tweets for each user
        lemmatizer = WordNetLemmatizer()  # one shared instance instead of one per token
        for cnt, user in enumerate(users):
            print("Load users... " + str(cnt) + "/" + str(len(users)))
            if user not in self.documents:
                self.documents[user] = []
                self.indD[user] = self.DOCS
                self.DOCS += 1
            try:
                cursor = self.dbAdapter.getConnection().cursor()
                # Parameterized query instead of string concatenation
                cursor.execute("SELECT text FROM tweet WHERE author = ? AND isMention = 0", (user,))
            except Exception:
                continue
            while True:
                try:
                    tweet = cursor.fetchone()
                except Exception:
                    break  # a bare 'continue' here could spin forever on a failing cursor
                if not tweet:
                    break
                # Keep only nouns, verbs, and adjectives, lemmatized by POS tag
                tokens = nltk.pos_tag(nltk.word_tokenize(tweet[0]))
                for token in tokens:
                    if token[1].startswith("N"):
                        word = lemmatizer.lemmatize(token[0], 'n')
                    elif token[1].startswith("V"):
                        word = lemmatizer.lemmatize(token[0], 'v')
                    elif token[1].startswith("J"):
                        word = lemmatizer.lemmatize(token[0], 'a')
                    else:
                        continue
                    self.documents[user].append(word)
                    if word not in self.indV:
                        self.indV[word] = self.VOCABS
                        self.VOCABS += 1
            cursor.close()

    def assignTopics(self, doc, word, pos):
        # DRAW A TOPIC SAMPLE FROM THE FULL-CONDITIONAL DISTRIBUTION
        d = self.indD[doc]
        w = self.indV[word]
        z = self.topicAssignments[d][pos]  # CURRENT TOPIC ASSIGNMENT OF THIS WORD
        # REMOVE THE CURRENT ASSIGNMENT FROM THE COUNT MATRICES
        self.cntTW[z, w] -= 1
        self.cntDT[d, z] -= 1
        self.cntT[z] -= 1
        self.lenD[d] -= 1
        # FULL-CONDITIONAL DISTRIBUTION
        prL = (self.cntDT[d] + self.alpha) / (self.lenD[d] + np.sum(self.alpha))
        prR = (self.cntTW[:, w] + self.beta) / (self.cntT + self.beta * self.VOCABS)
        prFullCond = prL * prR
        prFullCond /= np.sum(prFullCond)  # NORMALIZE TO OBTAIN A PROBABILITY DISTRIBUTION
        # NOTE: 'prFullCond' IS A MULTINOMIAL DISTRIBUTION OVER ALL TOPICS, NOT A SINGLE VALUE
        new_z = np.random.multinomial(1, prFullCond).argmax()  # RANDOM SAMPLE FROM THE FULL-CONDITIONAL
        self.topicAssignments[d][pos] = new_z
        self.cntTW[new_z, w] += 1
        self.cntDT[d, new_z] += 1
        self.cntT[new_z] += 1
        self.lenD[d] += 1

    def LogLikelihood(self):
        # COMPUTE THE JOINT LOG-LIKELIHOOD
        l = 0
        for z in range(self.TOPICS):  # log p(w|z, beta)
            l += gammaln(self.VOCABS * self.beta)
            l -= self.VOCABS * gammaln(self.beta)
            l += np.sum(gammaln(self.cntTW[z] + self.beta))
            l -= gammaln(np.sum(self.cntTW[z] + self.beta))
        for doc in self.documents:  # log p(z|alpha)
            d = self.indD[doc]
            l += gammaln(np.sum(self.alpha))
            l -= np.sum(gammaln(self.alpha))
            l += np.sum(gammaln(self.cntDT[d] + self.alpha))
            l -= gammaln(np.sum(self.cntDT[d] + self.alpha))
        return l

    def findAlphaBeta(self):
        # ADJUST ALPHA AND BETA BY MINKA'S FIXED-POINT ITERATION
        numerator = 0
        denominator = 0
        for d in range(self.DOCS):
            numerator += psi(self.cntDT[d] + self.alpha) - psi(self.alpha)
            denominator += psi(np.sum(self.cntDT[d] + self.alpha)) - psi(np.sum(self.alpha))
        self.alpha *= numerator / denominator  # UPDATE ALPHA
        numerator = 0
        denominator = 0
        for z in range(self.TOPICS):
            numerator += np.sum(psi(self.cntTW[z] + self.beta) - psi(self.beta))
            denominator += psi(np.sum(self.cntTW[z] + self.beta)) - psi(self.VOCABS * self.beta)
        self.beta = (self.beta * numerator) / (self.VOCABS * denominator)  # UPDATE BETA

    def findThetaPhi(self):
        th = np.zeros((self.DOCS, self.TOPICS))    # SPACE FOR THETA
        ph = np.zeros((self.TOPICS, self.VOCABS))  # SPACE FOR PHI
        for d in range(self.DOCS):
            for z in range(self.TOPICS):
                th[d][z] = (self.cntDT[d][z] + self.alpha[z]) / (self.lenD[d] + np.sum(self.alpha))
        for z in range(self.TOPICS):
            for w in range(self.VOCABS):
                ph[z][w] = (self.cntTW[z][w] + self.beta) / (self.cntT[z] + self.beta * self.VOCABS)
        return ph, th

    def run(self, nsamples, burnin, interval):
        # GIBBS SAMPLER KERNEL
        if nsamples <= burnin:  # BURN-IN CHECK
            print("ERROR: BURN-IN POINT EXCEEDS THE NUMBER OF SAMPLES")
            sys.exit(0)
        # PRINT TRAINING DATA INFORMATION
        print("# of DOCS:", self.DOCS)
        print("# of TOPICS:", self.TOPICS)
        print("# of VOCABS:", self.VOCABS)
        # MAKE SPACE FOR TOPIC-ASSIGNMENT AND COUNT MATRICES, FILLED WITH 0s
        self.topicAssignments = {}  # {INDEX OF DOC: [TOPIC ASSIGNMENT]}
        for doc in self.documents:
            d = self.indD[doc]
            self.topicAssignments[d] = [0 for word in self.documents[doc]]
        self.cntTW = np.zeros((self.TOPICS, self.VOCABS))  # NUMBER OF TIMES A WORD IS ASSIGNED TO A TOPIC
        self.cntDT = np.zeros((self.DOCS, self.TOPICS))    # NUMBER OF TOPIC ASSIGNMENTS IN A DOCUMENT
        self.cntT = np.zeros(self.TOPICS)                  # ASSIGNMENT COUNT FOR EACH TOPIC
        self.lenD = np.zeros(self.DOCS)                    # ASSIGNMENT COUNT FOR EACH DOCUMENT = DOCUMENT LENGTH
        # RANDOMLY ASSIGN A TOPIC TO EACH WORD
        for doc in self.documents:
            for i, word in enumerate(self.documents[doc]):
                d = self.indD[doc]
                w = self.indV[word]
                rt = random.randint(0, self.TOPICS - 1)  # RANDOM TOPIC ASSIGNMENT
                self.topicAssignments[d][i] = rt
                self.cntTW[rt, w] += 1
                self.cntDT[d, rt] += 1
                self.cntT[rt] += 1
                self.lenD[d] += 1
        # COLLAPSED GIBBS SAMPLING
        print("INITIAL STATE")
        print("\tLikelihood:", self.LogLikelihood())  # JOINT LOG-LIKELIHOOD
        # print("\tAlpha:", end="")
        # for i in range(self.TOPICS):
        #     print(" %.5f" % self.alpha[i], end="")
        # print("\n\tBeta: %.5f" % self.beta)
        SAMPLES = 0
        for s in range(nsamples):
            for doc in self.documents:
                for i, word in enumerate(self.documents[doc]):
                    self.assignTopics(doc, word, i)  # DRAW A TOPIC SAMPLE FROM THE FULL-CONDITIONAL
            self.findAlphaBeta()  # UPDATE ALPHA AND BETA VALUES
            lik = self.LogLikelihood()
            print("SAMPLE #" + str(s))
            print("\tLikelihood:", lik)
            # print("\tAlpha:", end="")
            # for i in range(self.TOPICS):
            #     print(" %.5f" % self.alpha[i], end="")
            # print("\n\tBeta: %.5f" % self.beta)
            if s > burnin and s % interval == 0:  # COLLECT PHI AND THETA AFTER THE BURN-IN POINT
                ph, th = self.findThetaPhi()
                self.theta += th
                self.phi += ph
                SAMPLES += 1
        self.theta /= SAMPLES  # AVERAGE THE GIBBS SAMPLES OF THETA
        self.phi /= SAMPLES    # AVERAGE THE GIBBS SAMPLES OF PHI
        return lik

    def getTopicVectors(self):
        topicVectors = {}
        for user in self.indD.keys():
            pos = self.indD[user]
            topicVectors[user] = self.theta[pos].tolist()
        return topicVectors
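# --- Reference: the full-conditional used in assignTopics() ---
# Writing n_{d,k} = cntDT, n_{k,w} = cntTW, n_k = cntT, n_d = lenD and
# V = VOCABS, all counted AFTER removing token i's current assignment,
# the collapsed Gibbs sampler draws each new topic from
#
#   P(z_i = k | z_-i, w)  ∝  (n_{d,k} + alpha_k) / (n_d + sum_j alpha_j)
#                          * (n_{k,w} + beta)   / (n_k + V * beta)
#
# which is exactly prL * prR above, renormalized before sampling.

# --- Usage sketch (hypothetical) ---
# A minimal run under the assumption that the SQLite dump referenced in
# __init__ exists; 'some_user_ids' is an illustrative placeholder, not a
# real ID list. The constants mirror Experiment.getTopicVectors().
if __name__ == "__main__":
    some_user_ids = [1001, 1002, 1003]       # hypothetical Twitter user IDs
    sampler = LDASampler(30, some_user_ids)  # 30 topics
    sampler.run(nsamples=100, burnin=50, interval=2)
    vectors = sampler.getTopicVectors()      # {user: averaged theta row}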
# Imports assumed from sibling modules in this repository
import glob
import os
import pickle
import subprocess

import matplotlib.pyplot as plt

import StaticVariable
from DBAdapter import DBAdapter
from LDASampler import LDASampler
from MyTools import MyTools


class Experiment:
    def __init__(self):
        self.dbAdapter = DBAdapter(StaticVariable.ROOTPATH + "TwitterData.sqlite")
        self.tool = MyTools()
        # Get valid user list
        self.validUsers = self.dbAdapter.getValidUserList()
        print("Valid users: " + str(len(self.validUsers)))
        # Add index on valid users
        self.indU = {}  # UserID : index
        USERS = 0
        for user in self.validUsers:
            self.indU[user] = USERS
            USERS += 1
        self.getSeedUsers()
        self.getFriendList()
        self.filterSmallEgoNetwork()
        self.getLikeVectors()
        self.getAuthorshipOnLikedTweets()
        self.getLikeCount()
        self.getMentionCount()
        self.getMutualFriendsCount()

    def getSeedUsers(self):
        # Path to save dump file
        FILE_EGOUSERS = StaticVariable.ROOTPATH + "egousers.pickle"
        # Get seed user list
        if os.path.exists(FILE_EGOUSERS):
            file_egousers = open(FILE_EGOUSERS, "rb")
            self.egousers = pickle.load(file_egousers)
            file_egousers.close()
        else:
            self.egousers = []  # INTEGER ARRAY
            seedCandidates = self.dbAdapter.getSeedUserList()
            for seed in seedCandidates:
                if seed not in self.validUsers:
                    continue
                if seed not in self.egousers:
                    self.egousers.append(seed)
            file_egousers = open(FILE_EGOUSERS, "wb")
            pickle.dump(self.egousers, file_egousers)
            file_egousers.close()

    def getFriendList(self):
        # Path to save dump file
        FILE_FRIENDLIST = StaticVariable.ROOTPATH + "friendlist.pickle"
        # Get friend list
        print("Getting friend list...")
        if os.path.exists(FILE_FRIENDLIST):
            file_friendlist = open(FILE_FRIENDLIST, "rb")
            self.friendList = pickle.load(file_friendlist)
            file_friendlist.close()
        else:
            self.friendList = {}  # KEY: INTEGER / VALUES: INTEGER ARRAY
            for validUser in self.validUsers:
                tmpFriendList = self.dbAdapter.getFriendship(validUser)
                validFriends = []
                for friend in tmpFriendList:
                    if friend in self.validUsers and friend not in validFriends:
                        validFriends.append(friend)
                self.friendList[validUser] = validFriends
            file_friendlist = open(FILE_FRIENDLIST, "wb")
            pickle.dump(self.friendList, file_friendlist)
            file_friendlist.close()

    def filterSmallEgoNetwork(self):
        # Filter out small ego networks
        print("Filtering ego users...")
        print("\t" + str(len(self.egousers)) + " users => ", end="")
        tmp = []
        for egouser in self.egousers:
            if len(self.friendList[egouser]) > 100:
                tmp.append(egouser)
        self.egousers = tmp
        print(str(len(self.egousers)) + " users")
        allUsers = {}
        for egouser in self.egousers:
            allUsers[egouser] = None
            for friend in self.friendList[egouser]:
                allUsers[friend] = None
        print("\t" + str(len(allUsers)) + " users belong to " + str(len(self.egousers)) + " ego networks")
        print()
        # Average number of members for each ego network
        nFriends = 0
        for egouser in self.egousers:
            nFriends += len(self.friendList[egouser]) + 1
        print("Average number of network members: " + str(nFriends / len(self.egousers)))

    def getTopicVectors(self):
        # Path to save dump file
        FILE_TOPICVECTORS = StaticVariable.ROOTPATH + "topic_vectors.pickle"
        # Gibbs sampling settings
        NTOPICS = 30
        GIBBS_SAMPLES = 100
        BURNIN_POINT = 50
        SAMPLING_INTERVAL = 2
        print("Getting topic vectors...")
        if os.path.exists(FILE_TOPICVECTORS):
            file_topic_vectors = open(FILE_TOPICVECTORS, "rb")
            self.topic_vectors = pickle.load(file_topic_vectors)
            file_topic_vectors.close()
        else:
            self.topic_vectors = {}
            userlist = []  # the original initialized this as dict() but used list methods below
            for egouser in self.egousers:
                if egouser not in userlist:
                    userlist.append(egouser)
                for friend in self.friendList[egouser]:
                    if friend not in userlist:
                        userlist.append(friend)
            sampler = LDASampler(NTOPICS, userlist)
            sampler.run(GIBBS_SAMPLES, BURNIN_POINT, SAMPLING_INTERVAL)
            self.topic_vectors = sampler.getTopicVectors()
            file_topic_vectors = open(FILE_TOPICVECTORS, "wb")
            pickle.dump(self.topic_vectors, file_topic_vectors)
            file_topic_vectors.close()
        print("\tCalculated topic vectors of " + str(len(self.topic_vectors.keys())) + " users")

    def getLikeVectors(self):
        # Path to save dump file
        FILE_LIKEVECTORS = StaticVariable.ROOTPATH + "like_vectors.pickle"
        print("Getting like vectors...")
        if os.path.exists(FILE_LIKEVECTORS):
            file_like_vectors = open(FILE_LIKEVECTORS, "rb")
            self.like_vectors = pickle.load(file_like_vectors)
            file_like_vectors.close()
        else:
            self.like_vectors = {}  # {user: [tweet, ...], ...}
            userlist = []
            for egouser in self.egousers:
                if egouser not in userlist:
                    userlist.append(egouser)
                for friend in self.friendList[egouser]:
                    if friend not in userlist:
                        userlist.append(friend)
            for user in userlist:
                self.like_vectors[user] = self.dbAdapter.getLikingTweets(user)
            file_like_vectors = open(FILE_LIKEVECTORS, "wb")
            pickle.dump(self.like_vectors, file_like_vectors)
            file_like_vectors.close()
        print("\tCalculated Like vectors of " + str(len(self.like_vectors.keys())) + " users")

    def getAuthorshipOnLikedTweets(self):
        # Path to save dump file
        FILE_AUTHORSHIP_ON_LIKEDTWEET = StaticVariable.ROOTPATH + "authorship_on_likedtweets.pickle"
        print("Getting authorship on liked tweets...")
        if os.path.exists(FILE_AUTHORSHIP_ON_LIKEDTWEET):
            file_authorship_on_likedtweets = open(FILE_AUTHORSHIP_ON_LIKEDTWEET, "rb")
            self.authorship_on_likedtweets = pickle.load(file_authorship_on_likedtweets)
            file_authorship_on_likedtweets.close()
        else:
            self.authorship_on_likedtweets = {}  # {egouser: {member: [tweet, ...], ...}, ...}
            for egouser in self.egousers:
                membersInEgoNetwork = {}
                membersInEgoNetwork[egouser] = []
                for friend in self.friendList[egouser]:
                    membersInEgoNetwork[friend] = []
                # Find liked tweets in the ego network
                likedTweets = {}
                for member in membersInEgoNetwork:
                    for tweet in self.like_vectors[member]:
                        likedTweets[tweet] = None
                # Find the tweet list for each member of the ego network
                tweetList = self.dbAdapter.getTweetListByAuthor(membersInEgoNetwork)
                for member in membersInEgoNetwork:
                    for tweet in tweetList[member]:
                        if tweet in likedTweets:
                            membersInEgoNetwork[member].append(tweet)
                self.authorship_on_likedtweets[egouser] = membersInEgoNetwork
            file_authorship_on_likedtweets = open(FILE_AUTHORSHIP_ON_LIKEDTWEET, "wb")
            pickle.dump(self.authorship_on_likedtweets, file_authorship_on_likedtweets)
            file_authorship_on_likedtweets.close()
        print("\tFound authorship on liked tweets for each ego network")

    def getMentionCount(self):
        # Path to save dump file
        FILE_MENTIONCOUNT = StaticVariable.ROOTPATH + "mention_count.pickle"
        print("Getting mention count...")
        if os.path.exists(FILE_MENTIONCOUNT):
            file_mentioncount = open(FILE_MENTIONCOUNT, "rb")
            self.mention_count = pickle.load(file_mentioncount)
            file_mentioncount.close()
        else:
            self.mention_count = {}
            for egouser in self.egousers:
                mentioncounts = {}
                for friend in self.friendList[egouser]:
                    # Reuse the symmetric count if it was already computed
                    if friend in self.mention_count and egouser in self.mention_count[friend]:
                        mentioncounts[friend] = self.mention_count[friend][egouser]
                    else:
                        mentioncounts[friend] = self.dbAdapter.getMentionCount(egouser, friend)
                self.mention_count[egouser] = mentioncounts
            file_mentioncount = open(FILE_MENTIONCOUNT, "wb")
            pickle.dump(self.mention_count, file_mentioncount)
            file_mentioncount.close()
        print("\tCalculated Mention counts: " + str(sum([len(self.mention_count[egouser]) for egouser in self.egousers])) + " records")

    def getLikeCount(self):
        # Path to save dump file
        FILE_LIKECOUNT = StaticVariable.ROOTPATH + "like_count.pickle"
        print("Getting like count for a user...")
        if os.path.exists(FILE_LIKECOUNT):
            file_likecount = open(FILE_LIKECOUNT, "rb")
            self.like_count = pickle.load(file_likecount)
            file_likecount.close()
        else:
            self.like_count = {}
            for egouser in self.egousers:
                likecounts = {}
                for friend in self.friendList[egouser]:
                    likecounts[friend] = self.dbAdapter.getLikeCount(egouser, friend)
                self.like_count[egouser] = likecounts
            file_likecount = open(FILE_LIKECOUNT, "wb")
            pickle.dump(self.like_count, file_likecount)
            file_likecount.close()
        print("\tCalculated Like counts: " + str(sum([len(self.like_count[egouser]) for egouser in self.egousers])) + " records")

    def getMutualFriendsCount(self):
        # Path to save dump file
        FILE_MUTUALFRIENDS = StaticVariable.ROOTPATH + "mutual_friends_count.pickle"
        print("Getting mutual friends count for a user...")
        if os.path.exists(FILE_MUTUALFRIENDS):
            file_mutualfriends = open(FILE_MUTUALFRIENDS, "rb")
            self.mutual_friends_count = pickle.load(file_mutualfriends)
            file_mutualfriends.close()
        else:
            self.mutual_friends_count = {}
            for egouser in self.egousers:
                mutualFriendsCount = {}
                for friend in self.friendList[egouser]:
                    # Reuse the symmetric count if it was already computed
                    if friend in self.mutual_friends_count and egouser in self.mutual_friends_count[friend]:
                        mutualFriendsCount[friend] = self.mutual_friends_count[friend][egouser]
                    else:
                        mutualFriendsCount[friend] = self.dbAdapter.getMutualFriendsCount(
                            egouser, friend, self.friendList[egouser], self.friendList[friend])
                self.mutual_friends_count[egouser] = mutualFriendsCount
            file_mutualfriends = open(FILE_MUTUALFRIENDS, "wb")
            pickle.dump(self.mutual_friends_count, file_mutualfriends)
            file_mutualfriends.close()
        print("\tCalculated Mutual friends counts: " + str(sum([len(self.mutual_friends_count[egouser]) for egouser in self.egousers])) + " records")

    def getID(self, index):
        if index < 0 or index >= len(self.validUsers):
            return -1
        return self.validUsers[index]

    def loadClusters(self):
        # Path to save dump file
        FILE_CLUSTERS = StaticVariable.ROOTPATH + "clusters.pickle"
        if os.path.exists(FILE_CLUSTERS):
            dumpfile_cluster = open(FILE_CLUSTERS, "rb")
            self.clusters = pickle.load(dumpfile_cluster)
            dumpfile_cluster.close()
        else:
            self.clusters = {}  # e.g. {1: [[2, 3, 4], [6, 7, 8, 9]], 2: [[1, 3, 4], [5, 6, 7, 8]]}
            clusterFiles = glob.glob(StaticVariable.ROOTPATH + "FastModularity/clusters/*")
            if len(clusterFiles) == 0:
                # Check whether there are input files for clustering
                networkFiles = glob.glob(StaticVariable.ROOTPATH + "FastModularity/*.pairs")
                if len(networkFiles) == 0:
                    print("Making input files... ")
                    # Make a network file (*.pairs) from self.friendList{}
                    for egouser in self.egousers:
                        users = []
                        # users.append(egouser)
                        for friend in self.friendList[egouser]:
                            users.append(friend)
                        file = open(StaticVariable.ROOTPATH + "FastModularity/" + str(self.indU[egouser]) + ".pairs", "w")
                        for user in users:
                            for friend in self.friendList[user]:
                                file.write(str(self.indU[user]) + "\t" + str(self.indU[friend]) + "\n")
                        file.close()
                    print("\tdone!")
                # Run the clustering via a shell loop
                print("Clustering... ")
                # subprocess.Popen([StaticVariable.ROOTPATH + "FastModularity/doClustering.sh"]).communicate()
                subprocess.Popen(['for file in ' + StaticVariable.ROOTPATH + 'FastModularity/*.pairs; '
                                  'do ' + StaticVariable.ROOTPATH + 'FastModularity/FastCommunityMH -f "$file"; done'],
                                 shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                print("\tdone!")
                clusterFiles = glob.glob(StaticVariable.ROOTPATH + "FastModularity/clusters/*")
            # Load clustering information into memory
            for filepath in clusterFiles:
                ind = int(os.path.basename(filepath))
                egouser = self.getID(ind)
                file_cluster = open(filepath, "r")
                while True:
                    line = file_cluster.readline()
                    if not line:
                        break
                    tokens = line.split("\t")
                    memberset = []
                    for token in tokens:
                        try:
                            memberID = self.getID(int(token))
                            if memberID != egouser:
                                memberset.append(memberID)
                        except ValueError:
                            continue
                    if egouser not in self.clusters.keys():
                        self.clusters[egouser] = []
                    self.clusters[egouser].append(memberset)
                file_cluster.close()
            dumpfile_cluster = open(FILE_CLUSTERS, "wb")
            pickle.dump(self.clusters, dumpfile_cluster)
            dumpfile_cluster.close()
        nClusters = 0
        for egouser in self.clusters:
            nClusters += len(self.clusters[egouser])
        print("Loading clusters...")
        print("\t" + str(nClusters) + " clusters of " + str(len(self.egousers)) + " ego networks are loaded.")
        print()

    def show(self):
        plt.show()