Example #1
# Assumed to be defined elsewhere in this project: Condition, continuous,
# and entropy_from_counter.
from collections import Counter
from functools import partial

def generate_conditions(data, attr, cls, default_conditions=None):
    # Build the candidate split conditions for one attribute of a
    # column-oriented dataset. None replaces the mutable {} default so
    # state is never shared between calls.
    default_conditions = default_conditions or {}
    if not data[attr]:
        return []
    if attr in default_conditions:
        return default_conditions[attr]
    if not continuous(data[attr][0]):
        # Categorical attribute: one equality condition per distinct value.
        # partial binds each value eagerly, sidestepping Python's
        # late-binding closures.
        return [Condition('=="%s"' % value, partial(lambda v, x: x == v, value))
                for value in set(data[attr])]
    # Continuous attribute: try every distinct value as a threshold and
    # keep the one whose binary split has the lowest weighted entropy.
    result = {}
    for value in set(data[attr]):
        result[value] = {
            '<=': Counter(),
            '>': Counter(),
        }
    count = 0.0
    for i, value in enumerate(data[attr]):
        cls_value = data[cls][i]
        for base in result:
            if value <= base:
                result[base]['<='][cls_value] += 1
            else:
                result[base]['>'][cls_value] += 1
        count += 1.0
    for base in result:
        # Weighted entropy of the two partitions induced by this threshold.
        sv1 = sum(result[base]['<='].values())
        sv2 = sum(result[base]['>'].values())
        result[base]['<='] = sv1 * entropy_from_counter(result[base]['<=']) / count
        result[base]['>'] = sv2 * entropy_from_counter(result[base]['>']) / count
        result[base] = result[base]['<='] + result[base]['>']
    number = min(result, key=result.get)
    # number is already fixed here, so the closures below are safe.
    return [
        Condition('<=' + str(number), lambda x: x <= number),
        Condition('>' + str(number), lambda x: x > number)
    ]
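A hedged usage sketch, not part of the original source: the data below is hypothetical, and Condition, continuous, and entropy_from_counter must exist as noted above. For a continuous attribute the function returns the binary split at the lowest-entropy threshold.

data = {
    'temp': [64, 72, 81, 68],
    'play': ['no', 'yes', 'yes', 'no'],
}
for cond in generate_conditions(data, 'temp', 'play'):
    print(cond)  # the '<=threshold' and '>threshold' Condition pair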
Example #2
def extract_comp(tup):
    # Derive one comparison function per element of a sample tuple:
    # a mismatch indicator (0 if equal, else 1) for categorical elements,
    # a signed difference for continuous ones. Neither lambda captures the
    # loop variable, so no late-binding workaround is needed here.
    result = []
    for element in tup:
        if not continuous(element):
            result.append(lambda x, y: 0 if x == y else 1)
        else:
            result.append(lambda x, y: x - y)
    return result
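A hedged usage sketch: the sample tuple and the squared-sum aggregation below are illustrative assumptions, but they show how the per-element comparators can be combined into a simple mixed-type distance.

comps = extract_comp((3.5, 'red'))
a, b = (3.5, 'red'), (1.5, 'blue')
dist = sum(c(x, y) ** 2 for c, x, y in zip(comps, a, b))
print(dist)  # (3.5 - 1.5)**2 + 1**2 == 5.0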
Example #3
def extract_normalizer(data, cls):
    # Build one normalization function per non-class attribute: identity
    # for categorical values, min-max scaling for continuous ones.
    result = {}
    for attr, values in data.items():
        if attr == cls:
            continue
        if not continuous(values[0]):
            result[attr] = lambda x: x
        else:
            mi, ma = float(min(values)), float(max(values))
            # Bind mi and ma as default arguments so each lambda keeps its
            # own attribute's range; a bare closure would see only the
            # values from the last loop iteration. Dividing by the range
            # (ma - mi) scales the attribute onto [0, 1].
            result[attr] = lambda x, mi=mi, ma=ma: (float(x) - mi) / (ma - mi)
    return result
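A hedged usage sketch with made-up columns: each continuous attribute gets its own scaler, and categorical attributes pass through unchanged.

data = {'age': [20, 40, 60], 'color': ['r', 'g', 'b'], 'cls': [0, 1, 0]}
normalizers = extract_normalizer(data, 'cls')
print(normalizers['age'](40))     # 0.5, the midpoint of the 20..60 range
print(normalizers['color']('g'))  # 'g', identity for categorical values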
from collections import Counter

# Assumed to be defined elsewhere in this project: continuous,
# count_x_and_y, p_continuous, and p_categorical.
def naive_bayes(data, cls, new_tup, adjust=False):
    counter = Counter(data[cls])
    result = Counter()
    for ci, count in counter.items():
        # Prior P(ci): the fraction of training rows with this class label.
        pci = float(count) / float(len(data[cls]))
        mult = 1.0

        # First pass: choose a likelihood estimator per attribute and,
        # when adjust is set, flag zero-count combinations for smoothing.
        base = {x: 0.0 for x in new_tup}
        p = {}
        for x, x_value in new_tup.items():
            countx_ci = count_x_and_y(data[x], data[cls], x_value, ci)
            p[x] = p_continuous if continuous(x_value) else p_categorical
            if adjust and not countx_ci:
                base[x] = 1.0

        # Second pass: multiply the conditional likelihoods P(x | ci).
        for x, x_value in new_tup.items():
            px_ci = p[x](data[x], data[cls], x_value, ci, count, base[x])
            mult *= px_ci
        result[ci] = mult * pci
    # Return the class with the highest posterior score.
    return result.most_common(1)[0][0]
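A hedged usage sketch with hypothetical data; running it requires the count_x_and_y, p_continuous, and p_categorical helpers noted above. naive_bayes takes the training columns, the class column name, and a new tuple of attribute values, and returns the most probable class label.

data = {
    'outlook': ['sunny', 'rain', 'sunny', 'overcast'],
    'humidity': [70.0, 95.0, 85.0, 65.0],
    'play': ['yes', 'no', 'no', 'yes'],
}
query = {'outlook': 'sunny', 'humidity': 70.0}
print(naive_bayes(data, 'play', query, adjust=True))  # e.g. 'yes'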