def writeTGraphs(self, dir):
    """
    Write each producer's TGraph to its own object file under *dir*.

    Files are named FMT_EXT % nsid; producers whose file already exists
    are skipped (resume-friendly via the exists-predicate).

    @param dir: target directory for the object files
    """
    def path_for(nsid):
        # one place for the filename convention, shared by worker and predicate
        return os.path.join(dir, FMT_EXT % nsid)
    def write_one(nsid):
        producer = self.pgdb[nsid]
        graph = producer.createTGraph(self.totalsize, self.pgdb)
        graph.write(path_for(nsid))
    exec_unique(self.pgdb.iterkeys(),
      lambda nsid: os.path.exists(path_for(nsid)),
      write_one, None, "tgraphs db: object files", LOG.info)
def writeIndexes(self, dir):
    """
    Write each producer's index to its own object file under *dir*.

    Files are named FMT_EXT % nsid; producers whose file already exists
    are skipped (resume-friendly via the exists-predicate).

    @param dir: target directory for the object files
    """
    def path_for(nsid):
        # one place for the filename convention, shared by worker and predicate
        return os.path.join(dir, FMT_EXT % nsid)
    def write_one(nsid):
        producer = self.phdb[nsid]
        index = producer.createIndex()
        index.write(path_for(nsid))
    exec_unique(self.phdb.iterkeys(),
      lambda nsid: os.path.exists(path_for(nsid)),
      write_one, None, "indexes db: object files", LOG.info)
def generateIndexes(self):
    """
    Build the index database (self.phdb) and the producer graph (self.prodgr).

    Three stages, the first two resumable through the state db self.phsb:
      1. create a Producer per nsid in self.pddb and score/rep its content;
      2. infer inter-producer relations and attach them as arcs;
      3. assemble an igraph Graph over all producers.
    """
    name = "indexes"
    # --- stage 1: generate Producer objects -------------------------------
    def run_p(nsid):
        prod = Producer(nsid)
        prod.initContent(self.pddb[nsid], self.dtdb)
        prod.inferScores()
        prod.repDoc()
        prod.repTag()
        self.phdb[nsid] = prod
        # record progress so exec_unique can skip this nsid on a re-run
        self.phsb[nsid] = prod.state
    exec_unique(self.pddb.iterkeys(), self.phsb, run_p, None, "%s db: producers" % name, LOG.info)
    # --- stage 2: generate content arcs between producers -----------------
    def run_r(nsid):
        prod = self.phdb[nsid]
        if prod.state != P_ARC:
            # relation map: related-nsid -> ProducerRelation(attr, arc)
            # NOTE(review): here inferProdArc's result is passed as a single
            # value, whereas generateTGraphs star-unpacks it (show_tag=True
            # presumably changes the return arity) — confirm against Producer.
            rels = self.inferRelProds(prod)
            pmap = dict((rnsid, ProducerRelation(rattr,
              self.inferProdArc(prod, self.phdb[rnsid])))
              for rnsid, rattr in rels.iteritems())
            prod.initProdArcs(pmap)
            self.phdb[nsid] = prod
        # refresh state unconditionally so the done-predicate below holds
        self.phsb[nsid] = prod.state
    exec_unique(self.phdb.iterkeys(), lambda nsid: self.phsb[nsid] >= P_ARC,
      run_r, None, "%s db: relations" % name, LOG.info, steps=0x10000)
    # --- stage 3: generate producer graph ---------------------------------
    total = len(self.phdb)
    # vertex labels ("nsid (size)\n<top rep tags>", literal backslash-n for
    # graphviz) and an nsid -> vertex-index map, built in one pass;
    # the conditional guards the zip(*...) unpack against an empty phdb
    lab_p, id_p = zip(*(("%s (%s)\\n%s" % (nsid, prod.size(), '\\n'.join(prod.rep_t[0:4])), (nsid, i))
      for i, (nsid, prod) in enumerate(self.phdb.iteritems()))) if self.phdb else ([], [])
    id_p = dict(id_p)
    # edge_array presumably returns two endpoint arrays, an edge view over
    # them, and a score array with typecode 'd' — TODO confirm
    arc_s, arc_t, edges, score = edge_array(total, 'd')
    for i, prod in enumerate(self.phdb.itervalues()):
        # each producer-range vertex in the doc graph contributes one arc
        # from this producer to the producer identified by the vertex's NID
        for vx in prod.docgr.vs.select(prod.prange()):
            arc_s.append(i)
            arc_t.append(id_p[vx[NID]])
            score.append(vx[NAA])
    # vertex weight: log-damped producer size
    sz = [log(1+prod.size()) for prod in self.phdb.itervalues()]
    v_attr = {NID: list(self.phdb.iterkeys()), "label": lab_p, NAT: sz, AAT: sz}
    self.prodgr = Graph(total, edges=list(edges), directed=True,
      vertex_attrs=v_attr, edge_attrs={AAT: score})
    LOG.info("%s db: generated producer graph" % name)
def commitPhotoTags(self, photos, dtdb):
    """
    Gets the tags of the given photos and saves these to a database

    @param photos: a list of photo ids
    @param dtdb: an open database of {photo:[tag]}
    """
    def fetch(phid):
        # descend the API response: rsp -> photo -> tags -> [tag elements]
        rsp = self.tags_getListPhoto(photo_id=phid)
        photo = rsp.getchildren()[0]
        tags = photo.getchildren()[0]
        return tags.getchildren()
    def store(phid, i, tags):
        # filter out "machine-tags"
        kept = []
        for tag in tags:
            text = tag.text
            if text and ":" not in text:
                kept.append(intern_force(text))
        dtdb[phid] = kept
    exec_unique(photos, dtdb, fetch, store, "photo-tag db", LOG.info, workers=True)
def scrapeGroups(self, users):
    """
    Scrapes all groups of the given users.

    @param users: an iterable of user nsids
    @return: {group:[user]}
    """
    gumap = {}
    def run(nsid):
        # rsp -> groups element -> [group elements]
        groups = self.people_getPublicGroups(user_id=nsid).getchildren()[0].getchildren()
        return groups
    def post(nsid, i, groups):
        for g in groups:
            gid = g.get("nsid")
            # setdefault replaces the membership-test/append/create branch:
            # one lookup instead of two, same resulting mapping
            gumap.setdefault(gid, []).append(nsid)
    exec_unique(users, gumap, run, post, "gid sample db", LOG.info, workers=True)
    return gumap
def generateTGraphs(self):
    """
    Build the tgraph database (self.pgdb) and the super-producer graph
    (self.sprdgr) from the communities in self.comm.

    Each community becomes one "super producer" (nsid "%04d" % index) whose
    docset is the union of the photo sets of its member producers. Stages
    are resumable through the state db self.pgsb.
    """
    name = "tgraphs"
    tot_s = len(self.comm)
    # community index <-> zero-padded nsid ("0000", "0001", ...)
    id_p = dict(("%04d" % i, i) for i in xrange(0, tot_s))
    # --- stage 1: generate docsets for new producers ----------------------
    def run_p(nsid):
        prod = Producer(nsid)
        # union of the photo lists of every member producer of this
        # community (members are vertex ids into self.prodgr)
        prod.initContent(set(chain(*(self.pddb[self.prodgr.vs[p][NID]]
          for p in self.comm[id_p[nsid]]))), self.dtdb, True)
        prod.inferScores()
        prod.repTag(cover=0) # TWEAK
        self.pgdb[nsid] = prod
        # record progress so exec_unique can skip this nsid on a re-run
        self.pgsb[nsid] = prod.state
    exec_unique(id_p, self.pgsb, run_p, None, "%s db: producers" % name, LOG.info)
    tot_p = len(self.prodgr.vs)
    # arcs between communities; ratio grows with log of producer count
    edges, arc_a = infer_arcs(self.comm, tot_p, ratio=2*log(1+tot_p)) # TWEAK # relax for tgraphs
    self.sprdgr = Graph(tot_s, list(edges), directed=True,
      vertex_attrs={NID: list("%04d" % i for i in xrange(0, tot_s)),
      "label": [len(com) for com in self.comm]})
    g = self.sprdgr
    LOG.info("%s db: generated producer graph" % name)
    # --- stage 2: generate content arcs between producers -----------------
    def run_r(nsid):
        prod = self.pgdb[nsid]
        if prod.state != P_ARC:
            # related producers = successors in the super-producer graph
            rprod = g.vs.select(g.successors(id_p[nsid]))[NID]
            # NOTE(review): inferProdArc's result is star-unpacked here
            # (show_tag=True presumably returns a tuple), unlike in
            # generateIndexes — confirm against Producer.
            pmap = dict((rnsid, ProducerRelation(None,
              *self.inferProdArc(prod, self.pgdb[rnsid], show_tag=True)))
              for rnsid in rprod)
            prod.initProdArcs(pmap, has_tags=True)
            self.pgdb[nsid] = prod
        # refresh state unconditionally so the done-predicate below holds
        self.pgsb[nsid] = prod.state
    exec_unique(self.pgdb.iterkeys(), lambda nsid: self.pgsb[nsid] >= P_ARC,
      run_r, None, "%s db: relations" % name, LOG.info, steps=0x10000)
@param pddb: an open database of {producer:[photo]} """ if type(users) != set and len(users) > 16: users = set(users) # efficient membership test def run(nsid): # OPT HIGH decide whether we want this many, or whether "faves" only will do stream = list(self.data_walker(self.people_getPublicPhotos, user_id=nsid, per_page=500)) faves = list(p for p in self.data_walker(self.favorites_getPublicList, user_id=nsid, per_page=500) if p.get("owner") in users) return stream, faves def post(nsid, i, (stream, faves)): photos = [p.get(NID) for p in chain(stream, faves)] if len(photos) >= 4096: LOG.info("producer db (user): got %s photos for user %s" % (len(photos), nsid)) pddb[nsid] = photos exec_unique(users, pddb, run, post, "producer db (user)", LOG.info, workers=True) def commitGroupPhotos(self, gumap, pddb): """ Gets the photos of the given pools and saves these to a database @param gumap: a map of {group:[user]} @param pddb: an open database of {producer:[photo]} """ def run(gid): try: userphotos = (self.data_walker(self.groups_pools_getPhotos, group_id=gid, user_id=nsid, per_page=500, code_ignore=[2]) for nsid in gumap[gid]) photos = list(chain(*userphotos)) except FlickrError, e: if FlickrError_code(e) == 2: