Esempio n. 1
0
 def __init__(self, rel, node1, node2, edge_num):
     """Set up a relation from its configuration mapping.

     :param rel: mapping describing the relation. The keys 'label',
                 'source', 'target', 'in' and 'out' are read
                 unconditionally; 'middle', 'community' and 'attr' are
                 optional and only read when present.
     :param node1: passed to the 'out' distribution; stored as
                   ``self.node1`` and increased by ``self.extend_o`` when
                   that value is positive.
     :param node2: passed to the 'in' distribution; stored as
                   ``self.node2`` and increased by ``self.extend_i`` when
                   that value is positive.
     :param edge_num: passed to both distributions and stored unchanged.
     """
     # NOTE: validation through JudgeLegal.legal_relation(rel) is not
     # performed here (it was disabled in the original code).
     self.label, self.source, self.target = (
         rel['label'], rel['source'], rel['target'])
     self.rel = rel
     self.node1, self.node2, self.edge_num = node1, node2, edge_num

     # The 'in' distribution is created before the 'out' distribution.
     in_dist = get_distribution(rel['in'], node2, edge_num)
     self.in_distribution = in_dist
     out_dist = get_distribution(rel['out'], node1, edge_num)
     self.out_distribution = out_dist

     self.special_case = in_dist.is_special()
     self.extend_i = in_dist.need_extend()
     self.extend_o = out_dist.need_extend()

     # Enlarge each side by the amount its distribution asks for.
     out_extra, in_extra = self.extend_o, self.extend_i
     if out_extra > 0:
         self.node1 += out_extra
     if in_extra > 0:
         self.node2 += in_extra

     # Optional 'middle' entry; self.middle only exists when configured.
     self.has_middle = 'middle' in rel
     if self.has_middle:
         self.middle = rel['middle']

     # Optional community settings; the com_*/noise_*/overlap attributes
     # only exist when a 'community' entry is configured.
     self.has_community = False
     if 'community' in rel:
         community = rel['community']
         self.com_amount = community['amount']
         noise = community['noise']
         self.noise_threshold = noise['threshold']
         self.noise_param_c = noise['param-c']
         self.overlap = community['overlap']
         self.has_community = True

     # Optional attributes; default to an empty dict.
     self.has_attr = 'attr' in rel
     self.attr = rel['attr'] if self.has_attr else {}
Esempio n. 2
0
def mine_c45(table, result):
    """Entry point of the C45 algorithm.

    Args:
        table: dict representing the data table, in the form
            ``{"<column name>": [<column values>], ...}``.
        result: name of the column that holds the result.

    Returns:
        A list of branches. Each branch is a list that starts with a
        ``'<column>=<value>'`` string and continues either with a
        ``'<result>=<value>'`` string or with the nested branches of the
        sub-tree. When only one column is left, each branch is
        ``['probability=<p>', '<result>=<value>']`` instead.
    """
    # Special case when there is a mixed strategy: only one column is
    # left, so report how the result values are distributed.
    if len(table) == 1:
        distribution = get_distribution(table[result])
        return [
            ['probability=%f' % (distribution[1][value] / distribution[0]),
             '%s=%s' % (result, value)]
            for value in distribution[1].keys()
        ]

    # All other cases: split on the non-result column with the highest
    # gain (the first such column wins on ties).
    candidates = [name for name in table if name != result]
    col = max(candidates, key=lambda name: gain(table, name, result))

    branches = []
    for subtable in get_subtables(table, col):
        condition = '%s=%s' % (col, subtable[col][0])
        outcomes = subtable[result]
        if not is_mono(outcomes):
            # Mixed results: drop the used column and recurse.
            del subtable[col]
            branches.append([condition] + mine_c45(subtable, result))
            continue
        branches.append([condition, '%s=%s' % (result, outcomes[0])])

    return branches
Esempio n. 3
0
def mine_c45(table, result):
    """Run the C45 algorithm on a table and return the resulting tree.

    :param table: a dict representing the data table in the following
        format::

            {
                "<column name>": [<column values>],
                "<column name>": [<column values>],
                ...
            }

    :param result: a string naming the column that indicates the result.
    :return: a list with one entry per branch; every entry is a list
        whose first item is a ``'<name>=<value>'`` (or
        ``'probability=<p>'``) string.
    """
    tree = []

    # Special case when there is a mixed strategy (a single column left).
    if len(table) == 1:
        distr = get_distribution(table[result])
        for outcome in distr[1].keys():
            share = distr[1][outcome] / distr[0]
            tree.append(['probability=%f' % share,
                         '%s=%s' % (result, outcome)])
        return tree

    # All other cases: score every non-result column once, then split on
    # the best one (insertion order decides ties).
    gains = {name: gain(table, name, result)
             for name in table if name != result}
    col = max(gains, key=gains.get)

    for subt in get_subtables(table, col):
        label = '%s=%s' % (col, subt[col][0])
        if is_mono(subt[result]):
            # Every row agrees on the result: this branch is a leaf.
            rest = ['%s=%s' % (result, subt[result][0])]
        else:
            # Remove the column just used and mine the remainder.
            del subt[col]
            rest = mine_c45(subt, result)
        tree.append([label] + rest)

    return tree
Esempio n. 4
0
 def generate_with_com(self):
     """
     Generate the rows of this relation, one community block at a time.

     This is a generator: it yields one line every time.
     format: [row_i, col_j]
             type(row_i) = int  (row index, offset by the rows of all
                                 previously processed communities)
             type(col_j) = set  (column indices collected for that row)

     For each row, column indices can come from up to four sources:
     the community's own block, extra "noise" columns when the row's
     d_out exceeds the noise threshold, and the overlap regions shared
     with the previous (upper left) and next (lower right) community.

     Nothing is yielded when ``self.has_community`` is false.

     :return: None (results are delivered through ``yield``)
     """
     # all_e = 0
     if not self.has_community:
         return
     com_cnt = self.com_amount
     out_max_d = self.rel['out']['max-d']
     # in_max_d = self.rel['in']['max-d']
     param_c = self.noise_param_c
     over_lap = self.overlap
     # a row whose d_out is above this value gets extra noise columns
     out_threshold = round(out_max_d * self.noise_threshold)
     # indexed per community below: row_axis[i] / col_axis[i] are used as
     # the row / column extent of community i
     row_axis = get_community_size(com_cnt, self.node1)
     col_axis = get_community_size(com_cnt, self.node2)
     # constants of the d_extra formula below: for y in [0, 1) the value
     # -param_c * log(const_a - y * const_b) lies between 1 and out_max_d
     # (assuming param_c > 0 and out_max_d >= 1 -- TODO confirm)
     const_a = math.exp(-1 / param_c)
     const_b = const_a - math.exp(-out_max_d / param_c)
     # global index of the first row / column of the current community
     start_i, start_j = 0, 0
     # sizes of the overlap regions with the previous (ul) / next (lr)
     # community
     ul_col, ul_row, lr_col, lr_row = 0, 0, 0, 0
     for i in range(com_cnt):
         # fresh distributions sized to this community's block
         out_pl = get_distribution(self.rel['out'], row_axis[i], -1)
         in_pl = get_distribution(self.rel['in'], col_axis[i], -1)
         # NOTE(review): despite the name, this flag is True when
         # col_axis[i] is odd -- confirm against what transform() expects
         block_is_even = col_axis[i] % 2 == 1
         # upper left
         # overlap with the previous community: a fraction `over_lap` of
         # that community's size
         if over_lap > 0 and i > 0:
             ul_col = int(col_axis[i - 1] * over_lap)
             ul_row = int(row_axis[i - 1] * over_lap)
             ul_in_pl = get_distribution(self.rel['in'], ul_col, -1)
             ul_out_pl = get_distribution(self.rel['out'], ul_row, -1)
             ul_is_even = ul_col % 2 == 1
         # lower right
         # overlap with the next community: a fraction `over_lap` of that
         # community's size
         if over_lap > 0 and i < com_cnt - 1:
             lr_col = int(col_axis[i + 1] * over_lap)
             lr_row = int(row_axis[i + 1] * over_lap)
             lr_in_pl = get_distribution(self.rel['in'], lr_col, -1)
             lr_out_pl = get_distribution(self.rel['out'], lr_row, -1)
             lr_is_even = lr_col % 2 == 1
         for row in range(row_axis[i]):
             a_line_set = set()
             d_out = out_pl.get_d()
             # global row index
             a_i = start_i + row
             # columns inside the community's own block; the set drops
             # duplicates, so fewer than d_out columns may remain
             for _ in range(d_out):
                 j = in_pl.get_j()
                 j = transform(block_is_even, j, col_axis[i] - 1)
                 a_j = start_j + j
                 a_line_set.add(a_j)
             # when d_out > threshold, add noise
             if d_out > out_threshold:
                 y = random.random()
                 d_extra = int(-param_c * math.log(const_a - y * const_b))
                 for _ in range(d_extra):
                     j = in_pl.get_j()
                     j = transform(block_is_even, j, col_axis[i] - 1)
                     # presumably rescales j from the block's column range
                     # to the range of all other columns -- verify scale()
                     j = scale(j, 0, col_axis[i] - 1, 0,
                               self.node2 - col_axis[i] - 1)
                     # shift indices past the block start by the block
                     # width, presumably to skip the community's own
                     # columns. NOTE(review): j == start_j is not shifted
                     # -- confirm the strict comparison is intended
                     if j > start_j:
                         j += col_axis[i]
                     a_line_set.add(j)
             # add overlap community in upper left
             # (only the first ul_row rows; columns lie at or before the
             # block's first column)
             if over_lap > 0 and i > 0 and row < ul_row:
                 d_over_ul = ul_out_pl.get_d()
                 for _ in range(d_over_ul):
                     j = ul_in_pl.get_j()
                     j = transform(ul_is_even, j, ul_col - 1)
                     a_j = start_j - j
                     a_line_set.add(a_j)
             # add overlap community in lower right
             # (only the last rows of the block; columns lie after the
             # block's last column). NOTE(review): the strict `>` selects
             # lr_row - 1 rows, not lr_row -- confirm
             if over_lap > 0 and i < com_cnt - 1 and row > (row_axis[i] -
                                                            lr_row):
                 d_over_lr = lr_out_pl.get_d()
                 for _ in range(d_over_lr):
                     j = lr_in_pl.get_j()
                     j = transform(lr_is_even, j, lr_col - 1)
                     a_j = start_j + col_axis[i] + j
                     a_line_set.add(a_j)
             # all_e += len(a_line_set)
             yield [a_i, a_line_set]
         # advance the offsets to the next community's block
         start_i += row_axis[i]
         start_j += col_axis[i]