Ejemplo n.º 1
0
		def scoreDoc(id, k):
			eseq = g.es.select(g.adjacent(id, IN))
			try:
				return union_ind(k*e[AAT]/sc_t[e.source-self.base_t] for e in eseq)
			except IndexError:
				print list(e.source for e in eseq)
				raise
Ejemplo n.º 2
0
	def createTGraph(self, totalsize, pgdb, display=False, node_attr=None):
		"""
		Creates a graph representing this producer as a tgraph.

		@param totalsize: total number of documents in the entire world
		@param pgdb: an open database of {prid:Producer} (for tgraphs)
		@param display: whether to generate for display (adds attributes to
		       pretty up the graph)
		@param node_attr: {attr:(tag,prod)} node attributes for graphviz; each
		       attribute should be mapped to a (tag,prod) pair that holds the
		       attribute value for the respective type of node; this only has
		       an effect if <display> is True
		"""
		# default moved out of the signature: a mutable default dict is shared
		# across all calls and could be mutated globally by a caller
		if node_attr is None:
			node_attr = {
			  "style": ("filled", "filled"),
			  "fillcolor": ("firebrick1", "limegreen"),
			  "shape": ("ellipse", "doublecircle"),
			}

		# estimate total size from producer's own perspective
		# the formula is pulled out of my ass but should give passable results
		# - neighbours are not independent => total lower than this
		# - neighbours are not entire network => total higher than this
		total = union_ind(chain([self.size()], (pgdb[self.docgr.vs[pid]["id"]].size() for pid in self.prange())), totalsize)
		# print "producer %s (%s): total size of network estimated to be %s (actual %s)" % (self.nsid, self.size(), total, totalsize)

		gg = self.docgr.copy()
		del gg.vs[NAA]
		gg["base_t"] = 0
		gg["base_g"] = self.base_p - self.base_t

		# node-attrs for prange
		gg.vs[self.base_p:][NAT] = [pgdb[gg.vs[pid][NID]].size()/float(total) for pid in self.prange()]

		# infer arcs between tags
		mem = [filter(lambda id: id in self.drange(), gg.successors(tid)) for tid in self.trange()]
		edges, arc_a = infer_arcs(mem, total)

		gg.delete_vertices(self.drange())
		gg.add_edges(edges)
		#assert gg.es[-len(edges):][AAT] == [None] * len(edges)
		gg.es[-len(edges):][AAT] = arc_a

		if display:
			gg.vs["label"] = gg.vs[NID]
			del gg.vs[NID]
			# doc vertices were deleted above, so the remaining vertices are
			# tags followed by producers; each attribute value is a
			# (tag, prod) pair, indexed accordingly (the old code indexed
			# val[2] over drange/trange/prange, which raised IndexError with
			# the default 2-tuples and mismatched the post-deletion order)
			for attr, val in node_attr.iteritems():
				gg.vs[attr] = [val[0] for i in self.trange()] + [val[1] for i in self.prange()]

		return gg
Ejemplo n.º 3
0
	def selectTagsFromClusters(self, tset_s, tset_t):
		"""
		Selects tags from the intersection between each cluster for a source
		tag, and the target tagset. The representatives of the cluster are also
		selected, if the intersection is large enough.

		@param tset_s: source tag-set
		@param tset_t: target tag-set
		@return: (rtags, htags), where rtags = {rtag:[tag]} associates tags on
		         the target side to related tags on the source side, and htags
		         = {htag:e_attr} associates "high-level" tags (which might not
		         exist on the target side) to appropriate arc-attributes.
		"""
		#LOG.debug("III enter selectTagsFromClusters: %s %s" % (len(tset_s), len(tset_t)))
		rtags = {}
		htags = {}
		# isinstance, not type-equality, so set subclasses are accepted as-is
		if not isinstance(tset_t, set):
			tset_t = set(tset_t)

		for tag in tset_s:
			for cluster in self.tcdb[tag]:
				tset_x = tset_t.intersection(cluster)

				# add intersection to rtags
				for rtag in tset_x:
					rtags.setdefault(rtag, []).append(tag)

				# if intersection is big enough, add "representative" tags of
				# this cluster to htags
				if 3*len(tset_x) > len(cluster): # TWEAK
					# on flickr, this is the first 3 tags
					attr = len(tset_x)/float(len(cluster))
					for rtag in cluster[0:3]:
						htags.setdefault(rtag, []).append(attr)
						rtags.setdefault(rtag, []).append(tag)

		#LOG.debug("XXX exit selectTagsFromClusters: %s %s" % (len(tset_s), len(tset_t)))
		return rtags, dict((htag, union_ind(attrs)) for htag, attrs in htags.iteritems())
Ejemplo n.º 4
0
	def inferScores(self, init=0.5):
		"""
		Infer scores for docs and tags.

		DOCUMENT more detail
		"""
		g = self.docgr

		# doc-tag weight is P(t|d)
		# tags and docs are considered as bags of meaning
		# a producer = union of tags = union of docs

		# Infer P(t|this) = union_ind(P(t|d) over all d attached to t)
		#
		# Justification: roughly, 1 match out of any is satisfactory. We have
		# no further information so assume P(t|d) independent over d.
		sc_t = list(union_ind(g.es.select(g.adjacent(id, OUT))[AAT]) for id in self.trange())

		# Infer P(d|this) = union_ind(P(d|t) over all t attached to d)
		#
		# We assume that P(d|this) = P(d). This is NOT theoretically sound, but
		# it doesn't matter because this heuristic is only used within this
		# producer, to rank documents. (In reality, P(d|this) >> P(d).)
		#
		# We rewrite P(d|t) in terms of P(t|d); this results in a formula with
		# P(d) on both sides; we use iterconverge to find a non-zero solution.
		#
		# Special case: if there is only 1 tag, its weight is 1.0, and its arc
		# weight is 1.0, then iteration will always return the inital value.
		# So we'll arbitrarily choose init=0.5 by default.
		sc_d = []
		def scoreDoc(id, k):
			eseq = g.es.select(g.adjacent(id, IN))
			try:
				return union_ind(k*e[AAT]/sc_t[e.source-self.base_t] for e in eseq)
			except IndexError:
				print list(e.source for e in eseq)
				raise
		for id in self.drange():
			sc_d.append(iterconverge(partial(scoreDoc, id), (0,1), init, eps=2**-32, maxsteps=0x40))

		self.docgr.vs[NAA] = sc_d + sc_t