def __init__(self, rel, node1, node2, edge_num):
    """Set up a relation from its configuration mapping.

    rel      - mapping that must contain 'label', 'source', 'target', 'in'
               and 'out'; 'middle', 'community' and 'attr' are optional.
    node1    - passed to get_distribution() together with rel['out'].
    node2    - passed to get_distribution() together with rel['in'].
    edge_num - passed to get_distribution() for both sides.

    The has_middle / has_community / has_attr flags record which optional
    sections were present. The attributes belonging to 'middle' and
    'community' are only created when the section exists.
    """
    # NOTE: a JudgeLegal.legal_relation(rel) validation step used to sit
    # here; it is disabled (commented out) in the original code.
    self.label = rel['label']
    self.source = rel['source']
    self.target = rel['target']
    self.rel = rel
    self.node1 = node1
    self.node2 = node2
    self.edge_num = edge_num

    # Both distributions are built from the counts as passed in, i.e.
    # before any extension is applied below.
    self.in_distribution = get_distribution(rel['in'], node2, edge_num)
    self.out_distribution = get_distribution(rel['out'], node1, edge_num)
    self.special_case = self.in_distribution.is_special()

    # A positive need_extend() value enlarges the matching node count.
    self.extend_i = self.in_distribution.need_extend()
    self.extend_o = self.out_distribution.need_extend()
    if self.extend_o > 0:
        self.node1 += self.extend_o
    if self.extend_i > 0:
        self.node2 += self.extend_i

    # Optional 'middle' section.
    if 'middle' in rel:
        self.middle = rel['middle']
        self.has_middle = True
    else:
        self.has_middle = False

    # Optional 'community' section.
    if 'community' in rel:
        community_cfg = rel['community']
        self.com_amount = community_cfg['amount']
        noise_cfg = community_cfg['noise']
        self.noise_threshold = noise_cfg['threshold']
        self.noise_param_c = noise_cfg['param-c']
        self.overlap = community_cfg['overlap']
        self.has_community = True
    else:
        self.has_community = False

    # Optional 'attr' section; defaults to an empty dict.
    if 'attr' in rel:
        self.has_attr = True
        self.attr = rel['attr']
    else:
        self.has_attr = False
        self.attr = {}
def mine_c45(table, result):
    """
    Entry point of the C45 algorithm.

    _table_ - a dict describing the data table, one entry per column:
        {
            "<column name>": [<column values>],
            "<column name>": [<column values>],
            ...
        }
    _result_ - name of the column that holds the outcome.

    Returns a list of branches. Every branch is a list that starts with
    "<column>=<value>" and continues with either "<result>=<value>" or the
    branches mined recursively from the remaining columns. When _table_ has
    a single column left, each branch is
    ["probability=<p>", "<result>=<value>"] instead.
    """
    # Mixed strategy: only one column is left, so report the share of
    # each outcome instead of splitting further.
    if len(table) == 1:
        distr = get_distribution(table[result])
        return [
            ['probability=%f' % (distr[1][outcome] / distr[0]),
             '%s=%s' % (result, outcome)]
            for outcome in distr[1].keys()
        ]

    # General case: split on the non-result column with the highest gain
    # (the first such column wins on ties).
    candidates = [name for name in table.keys() if name != result]
    col = max(candidates, key=lambda name: gain(table, name, result))

    tree = []
    for subtable in get_subtables(table, col):
        branch = '%s=%s' % (col, subtable[col][0])
        if is_mono(subtable[result]):
            # Every row agrees on the outcome: emit a leaf.
            tree.append([branch, '%s=%s' % (result, subtable[result][0])])
            continue
        # The split column is used up; recurse on the remaining columns.
        del subtable[col]
        tree.append([branch] + mine_c45(subtable, result))
    return tree
def mine_c45(table, result):
    """
    Entry point of the C45 algorithm.

    _table_ - a dict describing the data table, one entry per column:
        {
            "<column name>": [<column values>],
            "<column name>": [<column values>],
            ...
        }
    _result_ - name of the column that holds the outcome.

    Returns a list of branches. Every branch is a list that starts with
    "<column>=<value>" and continues with either "<result>=<value>" or the
    branches mined recursively from the remaining columns. When _table_ has
    a single column left, each branch is
    ["probability=<p>", "<result>=<value>"] instead.
    """
    # Mixed strategy: only one column is left, so report the share of
    # each outcome instead of splitting further.
    if len(table) == 1:
        distr = get_distribution(table[result])
        return [
            ['probability=%f' % (distr[1][outcome] / distr[0]),
             '%s=%s' % (result, outcome)]
            for outcome in distr[1].keys()
        ]

    # General case: split on the non-result column with the highest gain
    # (the first such column wins on ties).
    candidates = [name for name in table.keys() if name != result]
    col = max(candidates, key=lambda name: gain(table, name, result))

    tree = []
    for subtable in get_subtables(table, col):
        branch = '%s=%s' % (col, subtable[col][0])
        if is_mono(subtable[result]):
            # Every row agrees on the outcome: emit a leaf.
            tree.append([branch, '%s=%s' % (result, subtable[result][0])])
            continue
        # The split column is used up; recurse on the remaining columns.
        del subtable[col]
        tree.append([branch] + mine_c45(subtable, result))
    return tree
def generate_with_com(self):
    """
    Generate rows community by community, yielding one row at a time.

    Each yielded item is a two-element list [a_i, a_line_set]:
        a_i        - int, the row index (start_i + row), counted across
                     all communities processed so far
        a_line_set - set of column indices collected for that row

    Nothing is yielded when self.has_community is false.

    :return: a generator (the function itself returns no value)
    """
    # all_e = 0
    if not self.has_community:
        return
    com_cnt = self.com_amount
    out_max_d = self.rel['out']['max-d']
    # in_max_d = self.rel['in']['max-d']
    param_c = self.noise_param_c
    over_lap = self.overlap
    # rows whose drawn out-degree exceeds this value get extra "noise" columns
    out_threshold = round(out_max_d * self.noise_threshold)
    # per-community sizes along the row (node1) and column (node2) axes
    row_axis = get_community_size(com_cnt, self.node1)
    col_axis = get_community_size(com_cnt, self.node2)
    # constants of the d_extra formula below: with y in [0, 1) the value
    # -param_c * log(const_a - y * const_b) runs from about 1 (y = 0)
    # towards out_max_d (y -> 1)
    const_a = math.exp(-1 / param_c)
    const_b = const_a - math.exp(-out_max_d / param_c)
    # offset of the current community block on the row / column axis
    start_i, start_j = 0, 0
    # sizes of the overlap regions with the previous (upper left, "ul")
    # and next (lower right, "lr") community
    ul_col, ul_row, lr_col, lr_row = 0, 0, 0, 0
    for i in range(com_cnt):
        out_pl = get_distribution(self.rel['out'], row_axis[i], -1)
        in_pl = get_distribution(self.rel['in'], col_axis[i], -1)
        # NOTE(review): despite the name, this flag is True when the block
        # width is odd - confirm against what transform() expects
        block_is_even = col_axis[i] % 2 == 1
        # upper left: overlap with the previous community, sized as a
        # fraction (over_lap) of that community
        if over_lap > 0 and i > 0:
            ul_col = int(col_axis[i - 1] * over_lap)
            ul_row = int(row_axis[i - 1] * over_lap)
            ul_in_pl = get_distribution(self.rel['in'], ul_col, -1)
            ul_out_pl = get_distribution(self.rel['out'], ul_row, -1)
            ul_is_even = ul_col % 2 == 1
        # lower right: overlap with the next community, sized as a
        # fraction (over_lap) of that community
        if over_lap > 0 and i < com_cnt - 1:
            lr_col = int(col_axis[i + 1] * over_lap)
            lr_row = int(row_axis[i + 1] * over_lap)
            lr_in_pl = get_distribution(self.rel['in'], lr_col, -1)
            lr_out_pl = get_distribution(self.rel['out'], lr_row, -1)
            lr_is_even = lr_col % 2 == 1
        for row in range(row_axis[i]):
            a_line_set = set()
            d_out = out_pl.get_d()
            a_i = start_i + row
            # columns inside the community's own block; a set is used, so
            # repeated draws of the same column collapse into one entry
            for _ in range(d_out):
                j = in_pl.get_j()
                j = transform(block_is_even, j, col_axis[i] - 1)
                a_j = start_j + j
                a_line_set.add(a_j)
            # when d_out > threshold, add noise
            if d_out > out_threshold:
                y = random.random()
                d_extra = int(-param_c * math.log(const_a - y * const_b))
                for _ in range(d_extra):
                    j = in_pl.get_j()
                    j = transform(block_is_even, j, col_axis[i] - 1)
                    # presumably rescales j from [0, col_axis[i] - 1] onto the
                    # columns that remain once this block is removed -
                    # confirm against scale()
                    j = scale(j, 0, col_axis[i] - 1, 0, self.node2 - col_axis[i] - 1)
                    # shift past the current block
                    # NOTE(review): j == start_j is not shifted and so stays
                    # on the block's first column - confirm '>' vs '>='
                    if j > start_j:
                        j += col_axis[i]
                    a_line_set.add(j)
            # add overlap community in upper left: only the first ul_row
            # rows of the block take part
            if over_lap > 0 and i > 0 and row < ul_row:
                d_over_ul = ul_out_pl.get_d()
                for _ in range(d_over_ul):
                    j = ul_in_pl.get_j()
                    j = transform(ul_is_even, j, ul_col - 1)
                    # counted backwards from the block's first column
                    # NOTE(review): j == 0 gives start_j itself, which is
                    # inside the current block - confirm
                    a_j = start_j - j
                    a_line_set.add(a_j)
            # add overlap community in lower right
            # NOTE(review): 'row > row_axis[i] - lr_row' selects lr_row - 1
            # rows, whereas the upper-left test selects ul_row rows - confirm
            if over_lap > 0 and i < com_cnt - 1 and row > (row_axis[i] - lr_row):
                d_over_lr = lr_out_pl.get_d()
                for _ in range(d_over_lr):
                    j = lr_in_pl.get_j()
                    j = transform(lr_is_even, j, lr_col - 1)
                    # counted forwards from the first column after the block
                    a_j = start_j + col_axis[i] + j
                    a_line_set.add(a_j)
            # all_e += len(a_line_set)
            yield [a_i, a_line_set]
        # move the offsets to the next community block
        start_i += row_axis[i]
        start_j += col_axis[i]