Example #1
    def slice_and_return_siblings(self, query):
        """For each queried attribute, swap in every sibling of the queried value
        (keeping the other attributes fixed), slice the records accordingly, and
        return the matching DocIDs keyed by a cell-name string."""
        result_contexts = {}

        for attr, val in query.items():
            hier = self.hiers[attr]
            val_ori_id = hier.nid[val]                       # id of the queried value
            sibling_ids = hier.idd[hier.ipd[val_ori_id][0]]  # children of its parent
            for sid in sibling_ids:
                if sid == val_ori_id:
                    continue                                 # skip the queried value itself
                s_name = hier.ind[sid]
                sliced = self.records
                cell_name = ''
                for attr2, val2 in query.items():
                    if attr2 == attr:
                        val_id, value = sid, s_name          # substitute the sibling
                    else:
                        val_id, value = self.hiers[attr2].nid[val2], val2
                    cell_name += attr2 + "|" + value + ";"
                    legal_vals = get_all_legal_vals(self.hiers[attr2], val_id)
                    sliced = sliced.loc[sliced[attr2].isin(legal_vals)]

                if len(sliced['DocID']) > 0:
                    result_contexts[cell_name] = sliced['DocID']

        return result_contexts
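The sibling lookup above, hier.idd[hier.ipd[val_ori_id][0]], reads: take the value's first parent and list that parent's children. A self-contained toy illustration with invented ids (ipd maps id -> parent ids, idd maps id -> child ids; the names only mirror how the hier object is used above):

# Toy data, invented for illustration only.
ipd = {1: [0], 2: [0], 3: [0]}   # id -> parent ids: nodes 1, 2, 3 share parent 0
idd = {0: [1, 2, 3]}             # id -> child ids

val_ori_id = 2
sibling_ids = idd[ipd[val_ori_id][0]]                       # children of the first parent
print([sid for sid in sibling_ids if sid != val_ori_id])    # -> [1, 3]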
Example #2
    def slice(self, args):
        """Keep only the records whose value for each queried attribute lies
        under the queried node in that attribute's hierarchy."""
        sliced = self.records

        for attr, val in args.items():
            if attr == "Date":
                print("Date is not supported for now")
            else:
                val_id = self.hiers[attr].nid[val]
                legal_vals = get_all_legal_vals(self.hiers[attr], val_id)
                sliced = sliced.loc[sliced[attr].isin(legal_vals)]

        return sliced.copy()
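These examples also rely on get_all_legal_vals (and, in the last example, get_all_ancestors and get_direct_parent), which are not part of this listing. The sketch below is only a guess at their behavior, inferred from how the hierarchy object is accessed (nid: name -> id, ind: id -> name, ipd: id -> parent ids, idd: id -> child ids); the Hier container and the function bodies are assumptions, not the project's own implementations.

from collections import namedtuple

# Assumed shape of a hierarchy object, inferred from the attribute accesses above.
Hier = namedtuple('Hier', ['nid', 'ind', 'ipd', 'idd'])

def get_all_legal_vals(hier, val_id):
    # val_id plus all of its descendants (iterative depth-first walk)
    vals, stack = [], [val_id]
    while stack:
        node = stack.pop()
        vals.append(node)
        stack.extend(hier.idd.get(node, []))
    return vals

def get_all_ancestors(hier, node):
    # node plus every ancestor up to the root (the root has no parent)
    chain = [node]
    while hier.ipd.get(node):
        node = hier.ipd[node][0]
        chain.append(node)
    return chain

def get_direct_parent(hier, node):
    # immediate parent id, or None for the root
    parents = hier.ipd.get(node, [])
    return parents[0] if parents else None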
Example #3
    def slice_and_return_parents(self, query):
        """For each queried attribute, drop it, slice the records by the remaining
        attributes, and return the matching DocIDs keyed by a cell-name string.
        A single-attribute query returns the sentinel {'all': [-1]}."""
        result_contexts = {}
        if len(query) == 1:
            return {'all': [-1]}
        for attr_2, val_2 in query.items():
            sliced = self.records
            cell_name = ''
            for attr, val in query.items():
                if attr_2 != attr:
                    cell_name += attr + "_" + str(val) + "_"
                    val_id = self.hiers[attr].nid[val]
                    legal_vals = get_all_legal_vals(self.hiers[attr], val_id)
                    sliced = sliced.loc[sliced[attr].isin(legal_vals)]
            result_contexts[cell_name] = sliced['DocID']

        return result_contexts
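The keys of the returned dict are the attr_value_ pairs of the attributes that remain after one is dropped. A self-contained toy run of just that key construction (the attribute names and values are invented):

# Invented query; each pass drops one attribute and names the resulting parent cell.
query = {'Location': 'Illinois', 'Topic': 'Sports'}
for dropped in query:
    cell_name = ''.join(attr + '_' + str(val) + '_'
                        for attr, val in query.items() if attr != dropped)
    print(dropped, '->', cell_name)
# Location -> Topic_Sports_
# Topic -> Location_Illinois_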
Example #4
import itertools
import math
import pickle

# get_all_legal_vals, get_all_ancestors, get_direct_parent and TextCube are
# helpers/classes defined elsewhere in the surrounding project.


def count_ne_cells(dt, freq_data):

	def get_ne_siblings_cnt(cell):
		"""Return how many non-empty sibling cells `cell` has (exactly one
		hierarchy token replaced by a sibling id), plus the list of those cells."""
		tokens = cell.split('_')
		count = 0
		sib_list = []
		for index, hier_name in enumerate(hier_names):
			tokens_copy = list(tokens)
			hier_obj = hiers[hier_name]
			val_ori_id = int(tokens[index])
			if val_ori_id == 0:
				sibling_ids = []  # the root node has no siblings
			else:
				sibling_ids = hier_obj.idd[hier_obj.ipd[val_ori_id][0]]
			for sid in sibling_ids:
				if sid == val_ori_id:
					continue  # skip the cell itself
				tokens_copy[index] = sid
				tmp_str = '_'.join(str(x) for x in tokens_copy)
				if tmp_str in non_empty_cells:
					count += 1
					sib_list.append(tmp_str)
		return count, sib_list


	# cell_str -> {'phrases': set, 'docN': int, 'sibN': int, 'siblings': list, 'phraseN': int}
	non_empty_cells = {}
	count = dt.records.shape[0]
	hiers = dt.hiers
	hier_names = list(hiers.keys())
	hier_values = {}
	total_uni_phrase = 0

	for hier_name, hier_obj in hiers.items():
		allnodes = get_all_legal_vals(hier_obj, 0)
		node_dict = {}	# the ancestor dict
		for node in allnodes:
			node_list = get_all_ancestors(hier_obj, node)
			node_dict[node] = node_list
		hier_values[hier_name] = node_dict


	children_dict = {}          # parent cell -> set of its non-empty child cells
	children_dict_by_dim = {}   # parent cell -> {dimension name: [child cells]}

	for i in range(count):
		if i % 1000 == 0:
			print(i)  # progress indicator
		total_uni_phrase += len(freq_data[i])

		record = dt.records.loc[i]
		tmp_valid_lists = []  # ancestor id lists, one per dimension
		is_valid_doc = True
		for hier_name in hier_names:
			hier_value = record[hier_name]
			if hier_value not in hier_values[hier_name]:
				print(record)  # value missing from the hierarchy: skip this document
				is_valid_doc = False
				break
			else:
				tmp_valid_lists.append(hier_values[hier_name][hier_value])

		if is_valid_doc:
			# every combination of ancestors across dimensions is a cell this document falls into
			for element in itertools.product(*tmp_valid_lists):
				tmp_str = '_'.join(str(x) for x in element)
				if tmp_str not in non_empty_cells:
					non_empty_cells[tmp_str] = {'phrases': set(), 'docN': 0}
				non_empty_cells[tmp_str]['docN'] += 1
				non_empty_cells[tmp_str]['phrases'].update(freq_data[i].keys())

	for cell_str in non_empty_cells:
		tokens = cell_str.split('_')
		if cell_str not in children_dict:
			children_dict[cell_str] = set()
			children_dict_by_dim[cell_str] = {}

		for idx, hier_name in enumerate(hier_names):
			hier_obj = hiers[hier_name]
			parent = get_direct_parent(hier_obj, int(tokens[idx]))
			if parent is not None:
				# the coarser cell along this dimension records cell_str as a child
				tokens_copy = list(tokens)
				tokens_copy[idx] = parent
				tmp_str = '_'.join(str(x) for x in tokens_copy)
				if tmp_str not in children_dict:
					children_dict[tmp_str] = set()
					children_dict_by_dim[tmp_str] = {}
				children_dict[tmp_str].add(cell_str)
				if hier_name not in children_dict_by_dim[tmp_str]:
					children_dict_by_dim[tmp_str][hier_name] = []
				children_dict_by_dim[tmp_str][hier_name].append(cell_str)


	print('computing average sibling number')
	for cell in non_empty_cells:
		num_sib, sib_list = get_ne_siblings_cnt(cell)
		non_empty_cells[cell]['sibN'] = num_sib
		non_empty_cells[cell]['siblings'] = sib_list
		non_empty_cells[cell]['phraseN'] = len(non_empty_cells[cell]['phrases'])


	# Build the cuboid groups: cells connected through the sibling relation form one cuboid.
	ne_cell_strs = set(non_empty_cells.keys())

	cuboids = []

	while len(ne_cell_strs) > 0:
		# flood-fill one cuboid: start from any cell and follow sibling links
		cell_str = ne_cell_strs.pop()
		cuboid_set = set()
		queue = set([cell_str])
		while queue:
			cell_str = queue.pop()
			cuboid_set.add(cell_str)
			for sib in non_empty_cells[cell_str]['siblings']:
				if sib not in cuboid_set:
					queue.add(sib)

		for cell_str in cuboid_set:
			if cell_str in ne_cell_strs:
				ne_cell_strs.remove(cell_str)

		cuboids.append((cuboid_set, set()))

	for cuboid_pair in cuboids:
		cuboid = cuboid_pair[0]
		child_cells = cuboid_pair[1]
		for cell_str in cuboid:
			child_cells.update(children_dict[cell_str])

	sorted_cuboids = []

	# Order the cuboids so that each appears only after all of its child cells have been handled.
	while len(cuboids) > 0:
		cuboids = sorted(cuboids, key=lambda x: len(x[1]))
		if len(cuboids[0][1]) > 0:
			print('error: no cuboid with an empty child set')
		target_cuboid = cuboids[0][0]
		sorted_cuboids.append(target_cuboid)
		cuboids.pop(0)
		for cuboid_pair in cuboids:
			child_cells = cuboid_pair[1]
			for cell_str in target_cuboid:
				if cell_str in child_cells:
					child_cells.remove(cell_str)

	print('Total cell number: %s' % len(non_empty_cells))
	print('Average unique phrase count per doc: %s' % (total_uni_phrase / float(count)))

	l = [non_empty_cells[x]['phraseN'] for x in non_empty_cells]
	print('Average phrase count: %.4f' % (sum(l) / float(len(l))))

	l = [non_empty_cells[x]['sibN'] for x in non_empty_cells]
	print('Average sibling count: %.4f' % (sum(l) / float(len(l))))

	# Cost model: raw_cost grows with a cell's document count; merge_cost sums the
	# unique-phrase counts of the cell's children along each dimension.
	raw_cost = {}
	merge_cost = {}
	lmda = 38.11  # cost-model weight
	for cell_str in non_empty_cells:
		docN = non_empty_cells[cell_str]['docN']
		raw_cost[cell_str] = lmda * docN * math.log(docN, 2)
		merge_cost[cell_str] = {}
		for hier_name in children_dict_by_dim[cell_str]:
			m_cost = 0
			for subcell in children_dict_by_dim[cell_str][hier_name]:
				m_cost += non_empty_cells[subcell]['phraseN']
			merge_cost[cell_str][hier_name] = m_cost

	textcube = TextCube(dt, non_empty_cells, sorted_cuboids, children_dict, 
		children_dict_by_dim, hier_names, raw_cost, merge_cost)

	with open('1d_cube.dump', 'wb') as f:
		pickle.dump(textcube, f)
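The cell enumeration inside count_ne_cells produces one cell key per combination of ancestors across dimensions. A self-contained toy walk-through of just that step (both hierarchies and the record are invented):

import itertools

# Toy hierarchies: ipd maps id -> parent ids; 0 is the root of each dimension.
ipd_location = {0: [], 1: [0], 3: [1]}   # node 3 under 1 under the root
ipd_topic = {0: [], 4: [0]}              # node 4 directly under the root

def ancestors(ipd, node):
    chain = [node]
    while ipd[node]:
        node = ipd[node][0]
        chain.append(node)
    return chain

record = {'Location': 3, 'Topic': 4}     # one invented document
dims = [ancestors(ipd_location, record['Location']),
        ancestors(ipd_topic, record['Topic'])]
cells = ['_'.join(str(x) for x in combo) for combo in itertools.product(*dims)]
print(cells)   # ['3_4', '3_0', '1_4', '1_0', '0_4', '0_0']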