Beispiel #1
0
	def writeTGraphs(self, dir):
		"""
		Write the graph built by createTGraph() for every producer in self.pgdb.

		Each graph is written to FMT_EXT % nsid inside the given directory. The
		file-existence test is handed to exec_unique as its second argument
		(presumably so that producers already written out are skipped - confirm
		against exec_unique).

		@param dir: directory that receives the output files
		"""
		def target(nsid):
			# output path for one producer
			return os.path.join(dir, FMT_EXT % nsid)

		def is_written(nsid):
			return os.path.exists(target(nsid))

		def emit(nsid):
			graph = self.pgdb[nsid].createTGraph(self.totalsize, self.pgdb)
			graph.write(target(nsid))

		exec_unique(self.pgdb.iterkeys(), is_written, emit, None,
		  "tgraphs db: object files", LOG.info)
Beispiel #2
0
	def writeIndexes(self, dir):
		"""
		Write the object built by createIndex() for every producer in self.phdb.

		Each one is written to FMT_EXT % nsid inside the given directory. The
		file-existence test is handed to exec_unique as its second argument
		(presumably so that producers already written out are skipped - confirm
		against exec_unique).

		@param dir: directory that receives the output files
		"""
		def outpath(nsid):
			# output path for one producer
			return os.path.join(dir, FMT_EXT % nsid)

		def on_disk(nsid):
			return os.path.exists(outpath(nsid))

		def dump(nsid):
			index = self.phdb[nsid].createIndex()
			index.write(outpath(nsid))

		exec_unique(self.phdb.iterkeys(), on_disk, dump, None,
		  "indexes db: object files", LOG.info)
Beispiel #3
0
	def generateIndexes(self):
		"""
		Build the per-producer index objects and the graph that links them.

		Runs in three stages:
		 1. for every key of self.pddb, create a Producer, initialise it from
		    self.pddb / self.dtdb and store it in self.phdb (state in self.phsb)
		 2. for every producer in self.phdb, infer its related producers and
		    attach the resulting ProducerRelation map
		 3. assemble self.prodgr, a directed Graph with one vertex per producer

		Reads self.pddb and self.dtdb; writes self.phdb, self.phsb and self.prodgr.
		"""
		name = "indexes"

		# generate Producer objects
		def run_p(nsid):
			prod = Producer(nsid)
			prod.initContent(self.pddb[nsid], self.dtdb)
			prod.inferScores()
			prod.repDoc()
			prod.repTag()
			self.phdb[nsid] = prod
			# the state is tracked separately in phsb, which is also what
			# exec_unique receives as its second argument below
			self.phsb[nsid] = prod.state
		exec_unique(self.pddb.iterkeys(), self.phsb, run_p, None, "%s db: producers" % name, LOG.info)

		# generate content arcs between producers
		def run_r(nsid):
			prod = self.phdb[nsid]
			if prod.state != P_ARC:
				rels = self.inferRelProds(prod)
				# map each related producer id to a ProducerRelation built from its
				# relation attribute and the arc inferred between the two producers
				pmap = dict((rnsid, ProducerRelation(rattr, self.inferProdArc(prod,
				  self.phdb[rnsid]))) for rnsid, rattr in rels.iteritems())
				prod.initProdArcs(pmap)
				# store the modified producer again
				# NOTE(review): presumably required because phdb is a persistent
				# store that does not see in-place mutation - confirm
				self.phdb[nsid] = prod
			self.phsb[nsid] = prod.state
		exec_unique(self.phdb.iterkeys(), lambda nsid: self.phsb[nsid] >= P_ARC, run_r, None,
		  "%s db: relations" % name, LOG.info, steps=0x10000)

		total = len(self.phdb)
		# Build two parallel sequences in phdb iteration order: a label per producer
		# ("nsid (size)" followed by the first four entries of rep_t, separated by a
		# literal backslash-n, not a newline) and (nsid, position) pairs.
		# The empty case is handled explicitly because zip(*...) over nothing yields
		# an empty result that cannot be unpacked into two names.
		lab_p, id_p = zip(*(("%s (%s)\\n%s" % (nsid, prod.size(), '\\n'.join(prod.rep_t[0:4])),
		  (nsid, i)) for i, (nsid, prod) in enumerate(self.phdb.iteritems()))) if self.phdb else ([], [])
		# nsid -> vertex position
		id_p = dict(id_p)

		# generate producer graph
		arc_s, arc_t, edges, score = edge_array(total, 'd')
		# NOTE(review): positions here must match those in id_p, so this assumes
		# phdb iterates in the same order on every pass - confirm
		for i, prod in enumerate(self.phdb.itervalues()):
			# one arc per selected vertex of the producer's docgr: from this producer
			# to the producer named by the vertex's NID, scored by the vertex's NAA
			for vx in prod.docgr.vs.select(prod.prange()):
				arc_s.append(i)
				arc_t.append(id_p[vx[NID]])
				score.append(vx[NAA])

		# log-scaled producer size, used for both the NAT and AAT vertex attributes
		sz = [log(1+prod.size()) for prod in self.phdb.itervalues()]
		v_attr = {NID: list(self.phdb.iterkeys()), "label": lab_p, NAT: sz, AAT: sz}

		# NOTE(review): edges is never appended to directly; presumably edge_array
		# returns it as a view over arc_s / arc_t - confirm
		self.prodgr = Graph(total, edges=list(edges), directed=True, vertex_attrs=v_attr, edge_attrs={AAT: score})
		LOG.info("%s db: generated producer graph" % name)
Beispiel #4
0
	def commitPhotoTags(self, photos, dtdb):
		"""
		Fetch the tag list of every given photo and store it in a database.

		Tags with empty text and tags whose text contains a colon
		("machine-tags") are left out of the stored list.

		@param photos: a list of photo ids
		@param dtdb: an open database of {photo:[tag]}
		"""
		def fetch(phid):
			# the tag elements are the children of the first child of the
			# response's first child
			rsp = self.tags_getListPhoto(photo_id=phid)
			outer = rsp.getchildren()[0]
			inner = outer.getchildren()[0]
			return inner.getchildren()

		def store(phid, i, tags):
			kept = []
			for tag in tags:
				text = tag.text
				# drop empty tags and "machine-tags"
				if not text or ":" in text:
					continue
				kept.append(intern_force(text))
			dtdb[phid] = kept

		exec_unique(photos, dtdb, fetch, store, "photo-tag db", LOG.info, workers=True)
Beispiel #5
0
	def scrapeGroups(self, users):
		"""
		Look up the public groups of each given user and invert the result.

		@param users: the user ids to look up
		@return: {group:[user]}
		"""
		gumap = {}

		def fetch(nsid):
			# the group elements are the children of the response's first child
			rsp = self.people_getPublicGroups(user_id=nsid)
			return rsp.getchildren()[0].getchildren()

		def collect(nsid, i, groups):
			# file this user under every group id it belongs to
			for grp in groups:
				gumap.setdefault(grp.get("nsid"), []).append(nsid)

		# NOTE(review): gumap is keyed by group id but is passed as exec_unique's
		# second argument next to an iterable of user ids - confirm this is intended
		exec_unique(users, gumap, fetch, collect, "gid sample db", LOG.info, workers=True)
		return gumap
Beispiel #6
0
	def generateTGraphs(self):
		"""
		Build one Producer per entry of self.comm and the graph that links them.

		Runs in three stages:
		 1. for every entry of self.comm, create a Producer whose content is the
		    union of the self.pddb entries of the self.prodgr vertices listed in
		    that entry, and store it in self.pgdb (state in self.pgsb)
		 2. build self.sprdgr, a directed Graph with one vertex per entry of
		    self.comm, from the edges returned by infer_arcs
		 3. for every producer in self.pgdb, attach a ProducerRelation map to the
		    producers that are its successors in self.sprdgr

		Reads self.comm, self.prodgr, self.pddb and self.dtdb; writes self.pgdb,
		self.pgsb and self.sprdgr.
		"""
		name = "tgraphs"

		tot_s = len(self.comm)
		# producer id (zero-padded 4-digit index, as a string) -> index into self.comm
		id_p = dict(("%04d" % i, i) for i in xrange(0, tot_s))

		# generate docsets for new producers
		def run_p(nsid):
			prod = Producer(nsid)
			# each p in the comm entry is a vertex index into self.prodgr; the vertex's
			# NID is used as the key into pddb, and all their entries are merged into a set
			prod.initContent(set(chain(*(self.pddb[self.prodgr.vs[p][NID]] for p in self.comm[id_p[nsid]]))), self.dtdb, True)
			prod.inferScores()
			prod.repTag(cover=0) # TWEAK
			self.pgdb[nsid] = prod
			# the state is tracked separately in pgsb, which is also what
			# exec_unique receives as its second argument below
			self.pgsb[nsid] = prod.state
		exec_unique(id_p, self.pgsb, run_p, None, "%s db: producers" % name, LOG.info)

		tot_p = len(self.prodgr.vs)
		# NOTE(review): arc_a is not used anywhere in this method
		edges, arc_a = infer_arcs(self.comm, tot_p, ratio=2*log(1+tot_p)) # TWEAK # relax for tgraphs
		# vertex i carries the same "%04d" id used as key in id_p / pgdb, and is
		# labelled with the number of members of self.comm[i]
		self.sprdgr = Graph(tot_s, list(edges), directed=True,
		  vertex_attrs={NID:list("%04d" % i for i in xrange(0, tot_s)), "label":[len(com) for com in self.comm]})
		# short alias captured by run_r below
		g = self.sprdgr
		LOG.info("%s db: generated producer graph" % name)

		# generate content arcs between producers
		def run_r(nsid):
			prod = self.pgdb[nsid]
			if prod.state != P_ARC:
				# NIDs of the vertices this producer points to in the graph
				rprod = g.vs.select(g.successors(id_p[nsid]))[NID]
				# the result of inferProdArc is unpacked into ProducerRelation's
				# remaining positional arguments; the first argument is None
				pmap = dict((rnsid, ProducerRelation(None,
				  *self.inferProdArc(prod, self.pgdb[rnsid], show_tag=True))) for rnsid in rprod)
				prod.initProdArcs(pmap, has_tags=True)
				# store the modified producer again
				# NOTE(review): presumably required because pgdb is a persistent
				# store that does not see in-place mutation - confirm
				self.pgdb[nsid] = prod
			self.pgsb[nsid] = prod.state
		exec_unique(self.pgdb.iterkeys(), lambda nsid: self.pgsb[nsid] >= P_ARC, run_r, None,
		  "%s db: relations" % name, LOG.info, steps=0x10000)
Beispiel #7
0
		@param pddb: an open database of {producer:[photo]}
		"""
		if type(users) != set and len(users) > 16: users = set(users) # efficient membership test
		def run(nsid):
			# OPT HIGH decide whether we want this many, or whether "faves" only will do
			stream = list(self.data_walker(self.people_getPublicPhotos, user_id=nsid, per_page=500))
			faves = list(p for p in self.data_walker(self.favorites_getPublicList, user_id=nsid, per_page=500) if p.get("owner") in users)
			return stream, faves

		def post(nsid, i, (stream, faves)):
			photos = [p.get(NID) for p in chain(stream, faves)]
			if len(photos) >= 4096:
				LOG.info("producer db (user): got %s photos for user %s" % (len(photos), nsid))
			pddb[nsid] = photos

		exec_unique(users, pddb, run, post, "producer db (user)", LOG.info, workers=True)


	def commitGroupPhotos(self, gumap, pddb):
		"""
		Gets the photos of the given pools and saves these to a database

		@param gumap: a map of {group:[user]}
		@param pddb: an open database of {producer:[photo]}
		"""
		def run(gid):
			try:
				userphotos = (self.data_walker(self.groups_pools_getPhotos, group_id=gid, user_id=nsid, per_page=500, code_ignore=[2]) for nsid in gumap[gid])
				photos = list(chain(*userphotos))
			except FlickrError, e:
				if FlickrError_code(e) == 2: