def EntropyDiscretizer(c, accept_strategy=mdlp_accept, min_size=100):
    """Collect cut indices for ``c`` by repeatedly splitting index ranges.

    A stack of ``[start, end]`` index pairs is processed, beginning with
    ``[0, c.shape[0]]``. For each pair, the slice ``c[start:end]`` is skipped
    when ``ent`` reports 0 for it; otherwise ``optimal_cut(c, start, end)``
    proposes a cut index together with a second value that is handed to
    ``accept_strategy``. An accepted cut is recorded and both halves of the
    range are pushed back on the stack for further splitting.

    NOTE(review): the previous docstring mentioned ``c = c.reindex(f.index)``,
    presumably a preparation step expected from the caller -- confirm.

    c: sliceable object exposing ``shape[0]``.
    accept_strategy: callable invoked as ``accept_strategy(c, e, t)``; a
        truthy result keeps the cut.
    min_size: a cut ``t`` is only considered when it lies strictly between
        ``start + min_size`` and ``end - min_size``.

    return: list of accepted cut indices, in the order they were accepted
    """
    breakpoints = []
    pending = [[0, c.shape[0]]]

    while pending:
        lo, hi = pending.pop()

        # Nothing to split when the slice has zero entropy.
        if ent(c[lo:hi]) == 0:
            continue

        cut, score = optimal_cut(c, lo, hi)

        # The size check comes first so accept_strategy is only consulted
        # for cuts that leave more than min_size items on each side.
        if not (lo + min_size < cut < hi - min_size
                and accept_strategy(c, score, cut)):
            continue

        breakpoints.append(cut)
        # Push the right half first so the left half is popped next.
        pending.extend(([cut, hi], [lo, cut]))

    return breakpoints
def symmetricalUncertainty(x, y):
    """Return ``2 * infoGain(x, y)`` divided by ``ent(x) + ent(y)``.

    No guard is applied for the case where both ``ent`` values sum to zero;
    the division result (or error) is whatever the operand types produce.
    """
    doubled_gain = 2 * infoGain(x, y)
    entropy_sum = ent(x) + ent(y)
    return doubled_gain / entropy_sum
def symmetricalUncertainty(x, y):
    """Compute twice ``infoGain(x, y)`` over the sum of ``ent(x)`` and ``ent(y)``.

    The denominator is used as-is, so a zero entropy sum is not handled
    specially here.
    """
    numerator = 2 * infoGain(x, y)
    denominator = ent(x) + ent(y)
    return numerator / denominator
'NAT Destination Port', 'Action', 'Bytes', 'Bytes Sent', 'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent', 'pkts_received' ] # removes every row in skip. faster loadtime data = pd.read_csv(file, skiprows=skip, header=0, names=col_names) return data port_DATA = sample() print("length: %s", len(port_DATA)) # Some of the values are reading as infinite. Replace with NaN port_DATA.replace([np.inf, -np.inf], np.nan, inplace=True) # Drop the Rows with NaN values port_DATA.dropna(inplace=True) port_DATA = binning.binned(port_DATA) entropy.ent(port_DATA) dimension.dim(port_DATA) print("Euclidean Distances: \n") euclidean.euc(port_DATA) decisiontree.decision_tree(port_DATA)
def slice_ent(s):
    """Return ``ent`` of the value frequencies in ``s`` and the count of non-zero ones.

    ``np.bincount(s)`` tallies the occurrences of each value in ``s``; the
    tallies are divided by ``s.shape[0]`` to obtain relative frequencies.

    return: tuple ``(ent(frequencies), number of non-zero frequencies)``
    """
    total = s.shape[0]
    frequencies = np.bincount(s) / total
    present = (frequencies != 0).sum()
    return ent(frequencies), present