def addUserData(username, db):
    """Fetch a user's anime ratings from MAL and store them in a MalDB.

    Arguments:
        username -- the MAL username to import
        db -- the MalDB instance that receives the data
    """
    # Network order matters: fetch the list first, then the id,
    # exactly as the import pipeline expects.
    ratings = WebGrab.getAnimeList(username)
    uid = WebGrab.getUserId(username)
    addAnimeList(db, uid, username, ratings)
def post(self):
    """Fill the extraction queue with the most recently online MAL users.

    Scrapes the recent-online user list and enqueues one /extract task per
    username on the "user-extract" queue. Writes a diagnostic message when
    the scrape returns nothing.
    """
    self.response.headers['Content-Type'] = 'text/plain'
    # Get the next batch of usernames from the recent-online page.
    usernamelist = WebGrab.getRecentOnlineUsernames()
    if not usernamelist:
        # NOTE(review): '<br>' is HTML but Content-Type is text/plain —
        # confirm whether the tag is intentional.
        self.response.out.write('Webgrab got 0 results<br>')
        logging.debug('Webgrab got 0 results')
    # Create a task queue item for each user.
    for username in usernamelist:
        # NOTE(review): task names only allow [a-zA-Z0-9_-]; this assumes
        # MAL usernames fit that charset — TODO confirm. The second-granular
        # timestamp also means re-queueing the same user within one second
        # raises TaskAlreadyExistsError.
        taskqueue.add(url='/extract',
                      params={'username': username},
                      name="user_extract-%s-%s" % (username, int(time.time())),
                      queue_name="user-extract")
def post(self):
    """Validate a single username and enqueue it for extraction.

    Reads ``username`` from the request, verifies the MAL profile exists,
    and adds an /extract task to the "user-extract" queue. Writes an error
    message and returns early when the profile cannot be found.
    """
    self.response.headers["Content-Type"] = "text/html"
    username = self.request.get("username")
    logging.debug("Got request to queue %s" % cgi.escape(username))
    # Verify the user profile is real. The returned id is not needed here,
    # so the call is used purely for validation. Both failure modes get the
    # same user-facing message, so one handler covers both exception types.
    try:
        WebGrab.getUserId(username)
    except (urllib2.URLError, WebGrab.UnknownUser):
        self.response.out.write("Could not find user %s" % cgi.escape(username))
        return
    # Enter the user into the taskqueue.
    taskqueue.add(
        url="/extract",
        params={"username": username},
        name="user_extract-%s-%s" % (username, int(time.time())),
        queue_name="user-extract",
    )
def post(self):
    """Extract a user's ratings and fold them into the topic model.

    Fetches the user's anime list, z-score-normalizes the ratings, computes
    per-topic weights for this user, then nudges each topic's anime weights
    one gradient-descent step toward reducing this user's prediction error.
    Finally persists the updated anime and topic entities in one batch.
    """
    self.response.headers["Content-Type"] = "text/html"
    username = self.request.get("username")
    self.response.out.write("Getting %s" % username)
    logging.debug("Getting %s" % username)
    # Get the users animelist and id
    animelist = WebGrab.getAnimeList(username)
    # Limit the number of animes to use
    if len(animelist) > MAX_ANIMES_TO_USE:
        animelist = random.sample(animelist, MAX_ANIMES_TO_USE)
    # Go through each rating in the new list and create a map from
    # id to rating
    ratingMap = {}    # str(anime id) -> normalized rating
    nameMap = {}      # str(anime id) -> anime title
    ratingSum = 0.0
    ratingSumSquares = 0.0
    trueCount = 0     # count of entries with an actual (non-zero) score
    for anime in animelist:
        animeid = anime["id"]
        rating = anime["score"]
        # NOTE: unrated entries (score 0) are included in the sums here but
        # excluded from trueCount, so they do pull the computed mean down.
        ratingSum += rating
        ratingSumSquares += rating * rating
        nameMap[str(animeid)] = anime["title"]
        if rating != 0:
            trueCount += 1
    if trueCount != 0:
        # Mean and population stddev over the rated entries (E[x^2] - E[x]^2).
        mean = ratingSum / trueCount
        stddev = math.sqrt((ratingSumSquares / trueCount) - mean * mean)
    else:
        mean = 0
        stddev = 0
    # Normalize all ratings
    if stddev < 0.1:
        # Standard deviation seems to indicate no variance, so set
        # all the animes to the average
        for anime in animelist:
            ratingMap[str(anime["id"])] = 0.0
    else:
        for anime in animelist:
            rating = anime["score"]
            animeid = str(anime["id"])
            if rating == 0:
                # No rating, default to average
                ratingMap[animeid] = 0.0
            else:
                # z-score: signed distance from the user's mean in stddevs.
                ratingMap[animeid] = (rating - mean) / stddev
    # Get anime objects, creating new ones if necessary
    animes = self.getAnimeObjects(nameMap)
    # Get all topic objects, making new ones as needed
    topics = self.getTopicObjects(ratingMap.keys(), animes)
    # Deserialize the topic maps
    # SECURITY(review): eval() on a stored blob executes arbitrary code if
    # the datastore is ever tainted — ast.literal_eval would be safer.
    topicMaps = [0] * len(topics)
    for i, topic in enumerate(topics):
        topicMaps[i] = eval(str(topic.animes))
    # Get the topic weights for this user
    # Each weight starts at a small 0.1 prior, then accumulates the dot
    # product of the topic's anime weights with this user's ratings.
    topicWeights = [0.1] * len(topics)
    for i, topic in enumerate(topics):
        for animeid in ratingMap:
            if animeid in topicMaps[i]:
                topicWeights[i] += topicMaps[i][animeid] * ratingMap[animeid]
    # Normalize by averaging over all ratings
    # NOTE(review): divides by zero if the user has an empty anime list —
    # presumably upstream guarantees at least one entry; confirm.
    for i, weight in enumerate(topicWeights):
        topicWeights[i] /= len(ratingMap)
    # Now using the user weights, calculate error predictions from all
    # ratings
    # ratingErrors[id] = predicted rating minus actual normalized rating.
    ratingErrors = {}
    for animeid in ratingMap:
        ratingSum = 0.0  # reuses the earlier accumulator name as a fresh sum
        for i, weight in enumerate(topicWeights):
            if animeid in topicMaps[i]:
                ratingSum += weight * topicMaps[i][animeid]
        ratingErrors[animeid] = ratingSum - ratingMap[animeid]
    # Move the topic->anime weights using gradient descent
    for i, topic in enumerate(topics):
        # Iterate over a separate key set so deleting from topicMaps[i]
        # below is safe during the walk.
        key_union = set(ratingErrors.keys()) | set(topicMaps[i].keys())
        for animeid in key_union:
            # Missing entries default to 0 on either side; note the 0.0
            # error entries added here persist into later topics' unions,
            # where they contribute only the regularization term.
            if animeid not in topicMaps[i]:
                topicMaps[i][animeid] = 0.0
            if animeid not in ratingErrors:
                ratingErrors[animeid] = 0.0
            # Step against the error gradient, plus L2-style weight decay.
            topicMaps[i][animeid] -= LEARNING_RATE * (
                ratingErrors[animeid] * topicWeights[i]
                + REGULARIZATION_FACTOR * topicMaps[i][animeid]
            )
            # Make sure the weight meets the threshold for keeping it
            if abs(topicMaps[i][animeid]) < THRESHOLD_WEIGHT:
                del topicMaps[i][animeid]
        # Write the final map
        # Serialized via repr(); round-trips with the eval() above.
        topic.animes = db.Blob(str(topicMaps[i]))
    # Batch update everything
    db.put(animes + topics)