def mine_c45(table, result):
    """
    An entry point for C45 algorithm.

    _table_ - a dict representing data table in the following format:
    {
        "<column name>": [<column values>],
        "<column name>": [<column values>],
        ...
    }

    _result_: a string representing a name of column indicating a result.

    Returns a list of branches. Every branch is a list whose first item is
    a "<column>=<value>" label; it is followed either by one
    "<result>=<value>" leaf string or by the nested branches built from
    the remaining columns.

    When no column other than _result_ is left to split on, the branches
    are ["probability=<share>", "<result>=<value>"] pairs, one per distinct
    result value.
    """
    candidates = [k for k in table if k != result]

    # Nothing left to split on: max() over an empty sequence would raise
    # ValueError, so describe the remaining outcomes by their shares.
    if not candidates:
        from collections import Counter

        outcomes = table[result]
        # float() keeps the division exact under Python 2 as well.
        total = float(len(outcomes))
        return [['probability=%f' % (count / total),
                 '%s=%s' % (result, value)]
                for value, count in Counter(outcomes).items()]

    # Split on the column with the highest gain; the first one wins a tie.
    col = max(candidates, key=lambda k: gain(table, k, result))

    tree = []
    for subt in get_subtables(table, col):
        # The first row's value labels the branch (presumably every row of
        # a subtable shares it -- confirm against get_subtables).
        v = subt[col][0]
        if is_mono(subt[result]):
            tree.append(['%s=%s' % (col, v),
                         '%s=%s' % (result, subt[result][0])])
        else:
            # The column is used up; recurse on the remaining ones.
            del subt[col]
            tree.append(['%s=%s' % (col, v)] + mine_c45(subt, result))
    return tree
def infox(table, col, res_col):
    """
    Calculates the entropy of _table_ after it has been divided into
    subtables by column _col_.

    Every subtable contributes info(subtable, res_col) weighted by the
    share of the table's rows it holds.
    """
    return sum(
        (float(len(part[col])) / len(table[col])) * info(part, res_col)
        for part in utils.get_subtables(table, col))
def test_get_subtables(self):
    # Splitting the fixture table on 'arg1' is expected to give the
    # 'left' rows first and the 'right' rows second.
    left_rows = {
        'result': ['yes', 'no'],
        'arg1': ['left', 'left'],
        'arg2': ['down', 'up'],
        'arg3': ['no', 'yes'],
    }
    right_rows = {
        'result': ['yes', 'no'],
        'arg1': ['right', 'right'],
        'arg2': ['down', 'down'],
        'arg3': ['yes', 'no'],
    }
    actual = utils.get_subtables(self.table, 'arg1')
    self.assertEqual(actual, [left_rows, right_rows])
def test_get_subtables(self):
    # Expected outcome of splitting the fixture table on 'arg1': one
    # subtable for 'left' followed by one for 'right'.
    expected = []
    expected.append({
        'result': ['yes', 'no'],
        'arg1': ['left', 'left'],
        'arg2': ['down', 'up'],
        'arg3': ['no', 'yes'],
    })
    expected.append({
        'result': ['yes', 'no'],
        'arg1': ['right', 'right'],
        'arg2': ['down', 'down'],
        'arg3': ['yes', 'no'],
    })
    subtables = utils.get_subtables(self.table, 'arg1')
    self.assertEqual(subtables, expected)
def mine_c45(table, result):
    """
    Builds a decision tree for _table_ by splitting on the column with the
    highest gain and recursing into every subtable that is not yet
    uniform in its result.

    _table_ - a dict mapping a column name to the list of that column's
    values.

    _result_ - the name of the column holding the result.

    Returns a list of branches. Every branch is a list whose first item is
    a "<column>=<value>" label; it is followed either by one
    "<result>=<value>" leaf string or by the nested branches built from
    the remaining columns. When only the result column is left, the
    branches are ["probability=<share>", "<result>=<value>"] pairs, one
    per distinct result value.
    """
    split_columns = [k for k in table if k != result]

    if not split_columns:
        # Without this guard max() below receives an empty sequence and
        # raises ValueError as soon as rows with identical attributes
        # disagree on the result.
        from collections import Counter

        outcomes = table[result]
        total = float(len(outcomes))  # true division under Python 2 too
        return [['probability=%f' % (count / total),
                 '%s=%s' % (result, value)]
                for value, count in Counter(outcomes).items()]

    # The first column with the maximal gain is chosen.
    col = max(split_columns, key=lambda k: gain(table, k, result))

    tree = []
    for subt in get_subtables(table, col):
        # Branch label comes from the first row of the subtable
        # (presumably all its rows share that value -- confirm against
        # get_subtables).
        v = subt[col][0]
        if is_mono(subt[result]):
            tree.append(['%s=%s' % (col, v),
                         '%s=%s' % (result, subt[result][0])])
        else:
            # Drop the consumed column before recursing.
            del subt[col]
            tree.append(['%s=%s' % (col, v)] + mine_c45(subt, result))
    return tree
def mine_c45(table, result):
    """
    Mines a decision tree out of _table_.

    _table_ - a dict mapping a column name to the list of that column's
    values.

    _result_ - the name of the column holding the result.

    The table is split on the column with the highest gain. A subtable
    whose result column is uniform becomes a
    ["<column>=<value>", "<result>=<value>"] leaf branch; any other
    subtable is mined recursively without the used column, and its
    branches are appended after the "<column>=<value>" label.

    If no column besides _result_ is available, the returned branches are
    ["probability=<share>", "<result>=<value>"] pairs, one per distinct
    result value.
    """
    attributes = [k for k in table if k != result]

    # Guard against max() on an empty sequence (ValueError): this happens
    # when the recursion has consumed every attribute column.
    if not attributes:
        from collections import Counter

        outcomes = table[result]
        # float() so that the share is not floored under Python 2.
        total = float(len(outcomes))
        tree = []
        for value, count in Counter(outcomes).items():
            tree.append(['probability=%f' % (count / total),
                         '%s=%s' % (result, value)])
        return tree

    # Highest gain wins; on a tie the first such column is taken.
    col = max(attributes, key=lambda k: gain(table, k, result))

    tree = []
    for subt in get_subtables(table, col):
        # NOTE(review): the label uses the first row only, so every row of
        # a subtable is assumed to share the value of col -- confirm.
        v = subt[col][0]
        if is_mono(subt[result]):
            tree.append(
                ['%s=%s' % (col, v), '%s=%s' % (result, subt[result][0])])
        else:
            # The split column must not be considered again further down.
            del subt[col]
            tree.append(['%s=%s' % (col, v)] + mine_c45(subt, result))
    return tree
def mine_c45(table, result):
    """
    An entry point for C45 algorithm.

    _table_ - a dict representing data table in the following format:
    {
        "<column name>": [<column values>],
        "<column name>": [<column values>],
        ...
    }

    _result_: a string representing a name of column indicating a result.

    Returns a list of branches. Every branch is a list whose first item is
    a "<column>=<value>" label (or a "probability=<share>" label in the
    mixed-strategy case); it is followed either by one "<result>=<value>"
    leaf string or by the nested branches built from the remaining
    columns.
    """
    tree = []

    # Special case when there is a mixed strategy: the table holds a
    # single column, so there is nothing left to split on.
    if len(table) == 1:
        key_distr = get_distribution(table[result])
        # NOTE(review): key_distr[0] is used as the denominator and
        # key_distr[1] as a mapping from result value to numerator --
        # presumably the total and the per-value counts; confirm against
        # get_distribution.
        for k, count in key_distr[1].items():
            tree.append([
                # float() keeps this a true division under Python 2, where
                # int / int would floor every share below 1 down to 0.
                'probability=%f' % (float(count) / key_distr[0]),
                '%s=%s' % (result, k)
            ])
        return tree

    # All other cases: split on the column with the highest gain (the
    # first one wins a tie).
    col = max((k for k in table if k != result),
              key=lambda k: gain(table, k, result))

    for subt in get_subtables(table, col):
        # The first row's value labels the branch (presumably every row of
        # a subtable shares it -- confirm against get_subtables).
        v = subt[col][0]
        if is_mono(subt[result]):
            tree.append(
                ['%s=%s' % (col, v), '%s=%s' % (result, subt[result][0])])
        else:
            # The column is used up; recurse on the remaining ones.
            del subt[col]
            tree.append(['%s=%s' % (col, v)] + mine_c45(subt, result))
    return tree
def mine_c45(table, result):
    """
    An entry point for C45 algorithm.

    _table_ - a dict representing data table in the following format:
    {
        "<column name>": [<column values>],
        "<column name>": [<column values>],
        ...
    }

    _result_: a string representing a name of column indicating a result.

    Returns a list of branches. Every branch is a list whose first item is
    a "<column>=<value>" label; it is followed either by one
    "<result>=<value>" leaf string or by the nested branches built from
    the remaining columns.

    If _table_ holds no column besides _result_, the branches are
    ["probability=<share>", "<result>=<value>"] pairs instead, one per
    distinct result value.
    """
    remaining = [k for k in table if k != result]

    if not remaining:
        # max() below would raise ValueError on an empty sequence; this
        # state is reached when rows that agree on every attribute still
        # disagree on the result.
        from collections import Counter

        outcomes = table[result]
        total = float(len(outcomes))  # avoids floor division on Python 2
        return [['probability=%f' % (count / total),
                 '%s=%s' % (result, value)]
                for value, count in Counter(outcomes).items()]

    # Column with the highest gain; ties go to the first one encountered.
    col = max(remaining, key=lambda k: gain(table, k, result))

    tree = []
    for subt in get_subtables(table, col):
        # NOTE(review): assumes all rows of a subtable share the value of
        # col, so the first row can label the branch -- confirm.
        v = subt[col][0]
        if is_mono(subt[result]):
            tree.append(['%s=%s' % (col, v),
                         '%s=%s' % (result, subt[result][0])])
        else:
            # Remove the consumed column so the recursion cannot pick it.
            del subt[col]
            tree.append(['%s=%s' % (col, v)] + mine_c45(subt, result))
    return tree
def mine_c45(table, result):
    """
    An entry point for C45 algorithm.

    _table_ - a dict representing data table in the following format:
    {
        "<column name>": [<column values>],
        "<column name>": [<column values>],
        ...
    }

    _result_: a string representing a name of column indicating a result.

    Returns a list of branches. A branch is a list that starts with a
    "<column>=<value>" label (or with "probability=<share>" in the
    mixed-strategy case) and continues with either a single
    "<result>=<value>" leaf string or the nested branches built from the
    remaining columns.
    """
    tree = []

    # Special case when there is a mixed strategy: only one column is
    # left in the table, so no further split is possible.
    if len(table) == 1:
        key_distr = get_distribution(table[result])
        # NOTE(review): presumably key_distr is (total, {value: count});
        # only the indexing below is visible here -- confirm against
        # get_distribution.
        denominator = key_distr[0]
        for k, numerator in key_distr[1].items():
            # Force a float so Python 2 does not floor int / int to 0.
            share = float(numerator) / denominator
            tree.append(['probability=%f' % share, '%s=%s' % (result, k)])
        return tree

    # All other cases: pick the column with the highest gain; the first
    # such column wins a tie.
    col = max((k for k in table if k != result),
              key=lambda k: gain(table, k, result))

    for subt in get_subtables(table, col):
        # Branch label is taken from the first row of the subtable
        # (presumably all rows share that value -- confirm).
        v = subt[col][0]
        if is_mono(subt[result]):
            tree.append(['%s=%s' % (col, v),
                         '%s=%s' % (result, subt[result][0])])
        else:
            # Drop the consumed column before recursing.
            del subt[col]
            tree.append(['%s=%s' % (col, v)] + mine_c45(subt, result))
    return tree
def info_x(table, col, res_col):
    """
    Return the weighted sum of info(subtable, res_col) over the subtables
    obtained by splitting *table* on *col*.

    Each subtable is weighted by its number of rows divided by the number
    of rows in *table*.
    """
    total = 0
    for part in utils.get_subtables(table, col):
        weight = len(part[col]) / len(table[col])
        total += weight * info(part, res_col)
    return total
def test_get_subtables(self):
    # Splitting the fixture table on "arg1" is expected to give the
    # "left" rows first and the "right" rows second.
    expected = [
        {
            "result": ["yes", "no"],
            "arg1": ["left", "left"],
            "arg2": ["down", "up"],
            "arg3": ["no", "yes"],
        },
        {
            "result": ["yes", "no"],
            "arg1": ["right", "right"],
            "arg2": ["down", "down"],
            "arg3": ["yes", "no"],
        },
    ]
    # assertEqual, not the deprecated assertEquals alias, which no longer
    # exists in Python 3.12+.
    self.assertEqual(utils.get_subtables(self.table, "arg1"), expected)