def slice_and_return_siblings(self, query):
    """Collect the DocID columns of every non-empty sibling cell of ``query``.

    For each attribute in ``query`` the queried node is swapped, one at a
    time, for every other node listed under the same parent in that
    attribute's hierarchy, while all remaining attributes keep their queried
    value.  The records are filtered with that modified query and, when
    anything is left, the ``'DocID'`` column is stored in the result.

    Args:
        query: mapping of attribute name -> node name.  Node names are
            concatenated into the cell key, so they are expected to be
            strings.

    Returns:
        dict mapping a cell key of the form ``"attr|value;attr|value;"`` to
        the ``'DocID'`` column of the matching records.  Sibling cells with
        no matching record are omitted.
    """
    # NOTE(review): the hierarchy attributes look like nid = name -> id,
    # ind = id -> name, ipd = id -> parent ids, idd = id -> child ids;
    # confirm against the hierarchy class.
    records = self.records
    contexts = {}
    query_snapshot = query.copy()

    for pivot_attr, pivot_val in query.items():
        pivot_hier = self.hiers[pivot_attr]
        pivot_id = pivot_hier.nid[pivot_val]
        parent_id = pivot_hier.ipd[pivot_id][0]
        # Diagnostic output kept from the original implementation.
        print(pivot_id)

        for sibling_id in pivot_hier.idd[parent_id]:
            sibling_name = pivot_hier.ind[sibling_id]
            if sibling_id == pivot_id:
                # The queried node itself is not one of its siblings.
                continue

            subset = records
            name_parts = []
            for attr, val in query_snapshot.items():
                if attr == pivot_attr:
                    node_id, label = sibling_id, sibling_name
                else:
                    node_id, label = self.hiers[attr].nid[val], val
                name_parts.append(attr + "|" + label + ";")
                allowed = get_all_legal_vals(self.hiers[attr], node_id)
                subset = subset.loc[subset[attr].isin(allowed)]

            doc_ids = subset['DocID']
            if len(doc_ids) > 0:
                contexts["".join(name_parts)] = doc_ids

    return contexts
def slice_and_return_siblings(self, query):
    """Return the DocID columns of all non-empty sibling cells of ``query``.

    One attribute at a time, the queried node is replaced by each other node
    found under the same parent in that attribute's hierarchy; the other
    attributes keep their queried values.  The records are sliced with each
    such variant and non-empty slices contribute their ``'DocID'`` column.

    Args:
        query: mapping of attribute name -> node name (node names are
            concatenated into the cell key, so strings are expected).

    Returns:
        dict keyed by ``"attr|value;attr|value;"`` strings whose values are
        the ``'DocID'`` columns of the matching records.  Variants that match
        no record are left out.
    """
    # NOTE(review): nid / ind / ipd / idd appear to be name->id, id->name,
    # id->parents and id->children lookups; verify in the hierarchy class.
    all_records = self.records
    found = {}
    frozen_query = query.copy()

    for swap_attr, swap_val in query.items():
        hierarchy = self.hiers[swap_attr]
        original_id = hierarchy.nid[swap_val]
        first_parent = hierarchy.ipd[original_id][0]
        # Diagnostic output kept from the original implementation.
        print(original_id)

        for candidate_id in hierarchy.idd[first_parent]:
            candidate_name = hierarchy.ind[candidate_id]
            if candidate_id == original_id:
                # Skip the queried node; only its siblings are wanted.
                continue

            remaining = all_records
            key_pieces = []
            for attr, val in frozen_query.items():
                if attr == swap_attr:
                    node_id, shown = candidate_id, candidate_name
                else:
                    node_id, shown = self.hiers[attr].nid[val], val
                key_pieces.append(attr + "|" + shown + ";")
                permitted = get_all_legal_vals(self.hiers[attr], node_id)
                remaining = remaining.loc[remaining[attr].isin(permitted)]

            doc_column = remaining['DocID']
            if len(doc_column) > 0:
                found["".join(key_pieces)] = doc_column

    return found
def slice(self, args):
    """Return a copy of the records restricted to the cell described by ``args``.

    Each ``attribute -> node name`` pair narrows the records to rows whose
    value for that attribute is among the values returned by
    ``get_all_legal_vals`` for the node.  A ``"Date"`` attribute is not
    supported: a message is printed and the pair is ignored.

    Args:
        args: mapping of attribute name -> node name.

    Returns:
        A copy of the filtered records.
    """
    subset = self.records
    for attr, val in args.items():
        if attr == "Date":
            print("do not support date for now")
            continue
        hier = self.hiers[attr]
        allowed = get_all_legal_vals(hier, hier.nid[val])
        subset = subset.loc[subset[attr].isin(allowed)]
    return subset.copy()
def slice_and_return_parents(self, query):
    """Collect the DocID columns of the cells obtained by dropping one attribute.

    For every attribute in ``query`` the records are filtered by all the
    *other* attribute constraints, i.e. the query with that one dimension
    left unconstrained.

    Args:
        query: mapping of attribute name -> node name.

    Returns:
        ``{'all': [-1]}`` when the query has exactly one attribute (dropping
        it would leave no constraint at all).  Otherwise a dict keyed by
        ``"attr_value_attr_value_"`` strings built from the remaining
        attributes, with the ``'DocID'`` column of the matching records as
        value.  An empty query yields an empty dict.
    """
    records = self.records
    if len(query.items()) == 1:
        return {'all': [-1]}

    contexts = {}
    query_snapshot = query.copy()
    for dropped_attr, _ in query.items():
        subset = records
        name_parts = []
        for attr, val in query_snapshot.items():
            if attr == dropped_attr:
                continue
            name_parts.append(attr + "_" + str(val) + "_")
            hier = self.hiers[attr]
            allowed = get_all_legal_vals(hier, hier.nid[val])
            subset = subset.loc[subset[attr].isin(allowed)]
        contexts["".join(name_parts)] = subset['DocID']
    return contexts
def slice_and_return_parents(self, query):
    """Return the DocID columns of every cell that omits one query attribute.

    Each attribute of ``query`` is left out in turn and the records are
    sliced using only the remaining attribute constraints.

    Args:
        query: mapping of attribute name -> node name.

    Returns:
        ``{'all': [-1]}`` for a single-attribute query.  Otherwise a dict
        whose keys are ``"attr_value_attr_value_"`` strings made from the
        attributes that were kept and whose values are the ``'DocID'``
        columns of the matching records.  An empty query gives ``{}``.
    """
    all_records = self.records
    if len(query.items()) == 1:
        # Removing the only constraint leaves nothing to slice on.
        return {'all': [-1]}

    found = {}
    frozen_query = query.copy()
    for skipped_attr, _ in query.items():
        remaining = all_records
        key_pieces = []
        for attr, val in frozen_query.items():
            if attr == skipped_attr:
                continue
            key_pieces.append(attr + "_" + str(val) + "_")
            hierarchy = self.hiers[attr]
            permitted = get_all_legal_vals(hierarchy, hierarchy.nid[val])
            remaining = remaining.loc[remaining[attr].isin(permitted)]
        found["".join(key_pieces)] = remaining['DocID']
    return found
def count_ne_cells(dt, freq_data):
    """Enumerate the non-empty cells of ``dt`` and pickle a ``TextCube``.

    A cell is identified by a string of node ids joined with ``'_'``, one id
    per hierarchy in the iteration order of ``dt.hiers``.  Every record
    contributes to each cell formed by combining, per hierarchy, the nodes
    returned by ``get_all_ancestors`` for the record's value.

    The function then
      * links each cell to its direct parent cells (overall and per
        hierarchy),
      * attaches sibling statistics to each cell,
      * groups cells connected through the sibling relation into cuboids and
        orders the cuboids so that one whose child cells have all been
        emitted comes first,
      * computes a raw cost and per-hierarchy merge costs for every cell,
      * writes the resulting ``TextCube`` to ``'1d_cube.dump'``.

    Args:
        dt: object exposing ``records`` (indexed with ``.loc[i]`` for
            ``i in range(records.shape[0])``) and ``hiers`` (mapping of
            hierarchy name -> hierarchy object).
        freq_data: indexable by record position; each entry is a mapping
            whose keys are the phrases of that record.

    Returns:
        None.  Progress and summary statistics are printed; the cube is
        pickled to ``'1d_cube.dump'`` in the current working directory.
    """

    def get_ne_siblings_cnt(cell):
        """Return ``(count, sib_list)`` of the non-empty cells obtained from
        ``cell`` by replacing one dimension's node with a node under the same
        parent."""
        tokens = cell.split('_')
        count = 0
        sib_list = []
        for index, hier_name in enumerate(hier_names):
            tokens_copy = list(tokens)
            hier_obj = hiers[hier_name]
            val_ori_id = int(tokens[index])
            if val_ori_id == 0:
                # Node 0 is treated as having no siblings (presumably the
                # hierarchy root -- confirm against the hierarchy class).
                sibling_ids = []
            else:
                sibling_ids = hier_obj.idd[hier_obj.ipd[val_ori_id][0]]
            for sid in sibling_ids:
                # NOTE(review): the original had `if sid == val_ori_id: pass`
                # here, a no-op, so a cell is counted among its own siblings.
                # A `continue` was probably intended; behaviour is kept
                # unchanged because sibN / siblings are stored in the cube.
                tokens_copy[index] = sid
                tmp_str = '_'.join(str(x) for x in tokens_copy)
                if tmp_str in non_empty_cells:
                    count += 1
                    sib_list.append(tmp_str)
        return count, sib_list

    def _ratio(total, n):
        """Return ``total / n`` as a float, or 0.0 when ``n`` is zero."""
        return total / float(n) if n else 0.0

    # {cell_str: {'phrases': Set, 'docN': int, 'sibN': int,
    #             'siblings': [cell_str], 'phraseN': int}}
    non_empty_cells = {}
    count = dt.records.shape[0]
    hiers = dt.hiers
    hier_names = hiers.keys()
    total_uni_phrase = 0

    # Per hierarchy: node -> list returned by get_all_ancestors for it.
    hier_values = {}
    for hier_name, hier_obj in hiers.items():
        node_dict = {}
        for node in get_all_legal_vals(hier_obj, 0):
            node_dict[node] = get_all_ancestors(hier_obj, node)
        hier_values[hier_name] = node_dict

    # Populate the cells from the records.
    for i in range(count):
        if i % 1000 == 0:
            print(i)
        total_uni_phrase += len(freq_data[i])
        record = dt.records.loc[i]

        tmp_valid_lists = []  # ancestor lists, one per dimension
        is_valid_doc = True
        for hier_name in hier_names:
            hier_value = record[hier_name]
            if hier_value not in hier_values[hier_name]:
                # Unknown node: report the record and leave it out.
                print(record)
                is_valid_doc = False
                break
            tmp_valid_lists.append(hier_values[hier_name][hier_value])

        if is_valid_doc:
            phrases = freq_data[i].keys()
            for element in itertools.product(*tmp_valid_lists):
                tmp_str = '_'.join(str(x) for x in element)
                if tmp_str not in non_empty_cells:
                    non_empty_cells[tmp_str] = {'phrases': Set(), 'docN': 0}
                cell_info = non_empty_cells[tmp_str]
                cell_info['docN'] += 1
                cell_info['phrases'].update(phrases)

    # Link every cell to its direct parent cell along each dimension.
    children_dict = {}
    children_dict_by_dim = {}
    for cell_str in non_empty_cells:
        tokens = cell_str.split('_')
        if cell_str not in children_dict:
            children_dict[cell_str] = Set()
            children_dict_by_dim[cell_str] = {}
        for idx, hier_name in enumerate(hier_names):
            parent = get_direct_parent(hiers[hier_name], int(tokens[idx]))
            if parent is None:
                continue
            tokens_copy = list(tokens)
            tokens_copy[idx] = parent
            tmp_str = '_'.join(str(x) for x in tokens_copy)
            if tmp_str not in children_dict:
                children_dict[tmp_str] = Set()
                children_dict_by_dim[tmp_str] = {}
            children_dict[tmp_str].add(cell_str)
            if hier_name not in children_dict_by_dim[tmp_str]:
                children_dict_by_dim[tmp_str][hier_name] = []
            children_dict_by_dim[tmp_str][hier_name].append(cell_str)

    print('computing average sibling number')
    for cell in non_empty_cells:
        num_sib, sib_list = get_ne_siblings_cnt(cell)
        cell_info = non_empty_cells[cell]
        cell_info['sibN'] = num_sib
        cell_info['siblings'] = sib_list
        cell_info['phraseN'] = len(cell_info['phrases'])

    # A cuboid is a connected component of the sibling relation.
    ne_cell_strs = Set(non_empty_cells.keys())
    cuboids = []
    while len(ne_cell_strs) > 0:
        cuboid_set = Set()
        queue = Set([ne_cell_strs.pop()])
        while len(queue) > 0:
            cell_str = queue.pop()
            cuboid_set.add(cell_str)
            for sib in non_empty_cells[cell_str]['siblings']:
                if sib not in cuboid_set:
                    queue.add(sib)
        for cell_str in cuboid_set:
            ne_cell_strs.discard(cell_str)
        # Second element: child cells of this cuboid not yet emitted.
        cuboids.append((cuboid_set, Set()))

    for cuboid, child_cells in cuboids:
        for cell_str in cuboid:
            child_cells.update(children_dict[cell_str])

    # Repeatedly emit the cuboid with the fewest outstanding child cells and
    # strike its cells from the outstanding sets of the remaining cuboids.
    sorted_cuboids = []
    while len(cuboids) > 0:
        cuboids = sorted(cuboids, key=lambda pair: len(pair[1]))
        target_cuboid, outstanding = cuboids.pop(0)
        if len(outstanding) > 0:
            # Every remaining cuboid still waits on a child cell.  The
            # original dropped into an ipdb breakpoint here, which blocks or
            # crashes non-interactive runs; only the diagnostic is kept.
            print('error')
        sorted_cuboids.append(target_cuboid)
        for _, child_cells in cuboids:
            for cell_str in target_cuboid:
                child_cells.discard(cell_str)

    cell_total = len(non_empty_cells)
    print('Total cell number: %s' % cell_total)
    print('Average Unique Phrase Count in Each Doc %s'
          % _ratio(total_uni_phrase, count))
    phrase_total = sum(info['phraseN'] for info in non_empty_cells.values())
    print('Average Phrase Count: %.4f' % _ratio(phrase_total, cell_total))
    sibling_total = sum(info['sibN'] for info in non_empty_cells.values())
    print('Average Sibling Count: %.4f' % _ratio(sibling_total, cell_total))

    raw_cost = {}
    merge_cost = {}
    # Weight applied to the raw cost; its origin is not documented in the
    # code visible here.
    lmda = 38.11
    for cell_str in non_empty_cells:
        docN = non_empty_cells[cell_str]['docN']
        raw_cost[cell_str] = lmda * docN * math.log(docN, 2)
        merge_cost[cell_str] = {}
        for hier_name, subcells in children_dict_by_dim[cell_str].items():
            merge_cost[cell_str][hier_name] = sum(
                non_empty_cells[subcell]['phraseN'] for subcell in subcells)

    textcube = TextCube(dt, non_empty_cells, sorted_cuboids, children_dict,
                        children_dict_by_dim, hier_names, raw_cost, merge_cost)
    # Close the dump file deterministically instead of leaking the handle.
    with open('1d_cube.dump', 'wb') as dump_file:
        pickle.dump(textcube, dump_file)