def pruneProducers(self, socgr, gumap, pddb, cutoff=1):
    """
    Removes producers with no more than the given number of photos.

    @param socgr: graph of users
    @param gumap: map of {group:[user]}
    @param pddb: an open database of {producer:[photo]}
    @param cutoff: producers with this many photos or less will be pruned (default 1)
    """
    # TODO NORM maybe also prune groups with >n users
    # FIXME HIGH if we prune users, then we also need to prune groups that
    # point to this user
    delu = []
    #for u in socgr.vs[NID]:
    #    if u in pddb:
    #        if len(pddb[u]) > cutoff:
    #            continue
    #        del pddb[u]
    #    delu.append(u)
    delg = []
    for g in gumap:
        if g in pddb:
            # keep groups that still have more than [cutoff] photos
            if len(pddb[g]) > cutoff:
                continue
            del pddb[g]
        delg.append(g)
    #socgr.delete_vertices([v.index for v in socgr.vs.select(id_in=set(delu))])
    for g in delg:
        del gumap[g]
    LOG.info("producer db: pruned %s users, %s groups" % (len(delu), len(delg)))
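# Illustrative sketch only (not part of the scraper): it replays the same
# pruning rule on plain dicts standing in for the shelve-backed pddb
# ({producer:[photo]}) and gumap ({group:[user]}); the names below are made up
# for the demo.
def _prune_groups_sketch(gumap, pddb, cutoff=1):
    # drop groups that have no photo record, or at most [cutoff] photos
    delg = []
    for g in list(gumap):
        if g in pddb:
            if len(pddb[g]) > cutoff:
                continue  # enough photos, keep this group
            del pddb[g]   # too few photos, drop its record
        delg.append(g)
    for g in delg:
        del gumap[g]
    return delg

# expected behaviour: "g1" survives, "g2" (1 photo) and "g3" (no record) are pruned
#_gumap = {"g1": ["u1"], "g2": ["u2"], "g3": ["u3"]}
#_pddb = {"g1": ["p1", "p2"], "g2": ["p3"]}
#assert sorted(_prune_groups_sketch(_gumap, _pddb)) == ["g2", "g3"]
#assert list(_gumap) == ["g1"]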
def commitUserPhotos(self, users, pddb):
    """
    Gets the photos of the given users and saves these to a database

    @param users: a list of user ids
    @param pddb: an open database of {producer:[photo]}
    """
    if type(users) != set and len(users) > 16:
        users = set(users)  # efficient membership test

    def run(nsid):
        # OPT HIGH decide whether we want this many, or whether "faves" only will do
        stream = list(self.data_walker(self.people_getPublicPhotos, user_id=nsid, per_page=500))
        faves = list(p for p in self.data_walker(self.favorites_getPublicList,
          user_id=nsid, per_page=500) if p.get("owner") in users)
        return stream, faves

    def post(nsid, i, (stream, faves)):
        photos = [p.get(NID) for p in chain(stream, faves)]
        if len(photos) >= 4096:
            LOG.info("producer db (user): got %s photos for user %s" % (len(photos), nsid))
        pddb[nsid] = photos
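# Illustrative sketch only: the executor that pairs each run() with its post()
# callback is not shown in this excerpt, so this minimal sequential stand-in is
# an assumption (the helper name run_all_sketch and the skip-if-committed check
# are made up for illustration).
def run_all_sketch(keys, db, run, post):
    # fetch data for each key not yet in the database, then commit it via post()
    for i, key in enumerate(k for k in keys if k not in db):
        post(key, i, run(key))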
def scrapeIDs(self, seed, size):
    """
    Scrapes a breadth-first sample of ids, starting from the seed and following
    each node's outgoing links, until the sample reaches the given size.
    """
    if type(size) != int:
        raise TypeError

    def next(ss, qq):
        id = qq.popleft()
        if id in ss:
            return None
        node = self.makeID(id)
        qq.extend(node.out.keys())
        ss.add_node(node)
        return id

    s = NodeSample()
    q = deque([self.getNSID(seed)])
    while len(s) < size:
        id = next(s, q)
        if id is not None:
            LOG.info("id sample: %s/%s (added %s)" % (len(s), size, id))
    s.build()
    return s
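# Self-contained sketch (not part of the scraper) of the same breadth-first
# "snowball" walk over a toy adjacency dict; a plain set and deque stand in for
# NodeSample and the id queue, and sample_ids_sketch is a made-up name.
from collections import deque

def sample_ids_sketch(graph, seed, size):
    seen, queue = set(), deque([seed])
    while len(seen) < size and queue:
        id = queue.popleft()
        if id in seen:
            continue  # already sampled, skip
        seen.add(id)
        queue.extend(graph.get(id, ()))  # enqueue outgoing links
    return seen

# expected behaviour on a toy graph: starting from "a", the first 3 ids sampled
# are a, b, c (breadth-first order)
#assert sample_ids_sketch({"a": ["b", "c"], "b": ["d"], "c": [], "d": []}, "a", 3) == set(["a", "b", "c"])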
def post(gid, i, photos):
    if len(photos) >= 4096:
        LOG.info("producer db (group): got %s photos for group %s" % (len(photos), gid))
    pddb[gid] = [p.get(NID) for p in photos]
""" if vkdb.writeback is not True: raise ValueError("[vkdb] must have writeback=True") def syncer(i, (key, items)): vkdb.sync() for i, (key, items) in enumerate_cb(kvdb.iteritems(), syncer, every=0x10000): for item in items: if item in vkdb: vkdb[item].append(key) else: vkdb[item] = [key] vkdb.sync() LOG.info("%s db: inverted %s keys to %s items" % (name, len(kvdb), len(vkdb))) def commitTagClusters(self, tags, tcdb): """ Gets the clusters of all the given tags and saves these to a database @param tags: a list of tags @param tcdb: an open database of {tag:[cluster]} """ def run(tag): try: # FIXME HIGH verify that this does the right thing for unicode tags # atm all evidence points to flickr not doing clustering anaylses for them... clusters = self.tags_getClusters(tag=tag).getchildren()[0].getchildren() except FlickrError, e: