def _gain(self, X, Y, attr):
    """
    :param X: numpy 1D array, data samples for a single attribute
    :param Y: numpy 1D array, class labels
    :param attr: index of the attribute being evaluated
    :return: tuple (information gain, split value); the split value is None
             for discrete attributes

    This method computes the information gain of splitting on the attribute.
    """
    entropy_total = entropy(Y)
    unique_values = np.unique(X)
    if len(unique_values) == 1:
        return 0, None

    if self.attr_dtypes[attr] == int:
        entropy_subsets = 0
        for value in unique_values:
            subset = Y[np.where(X == value)[0]]
            entropy_subsets += (subset.size / Y.size) * entropy(subset)
        return entropy_total - entropy_subsets, None
    elif self.attr_dtypes[attr] == float:
        best_gain = 0
        split_value = None
        for value in unique_values[1:]:
            left_subset = Y[np.where(X < value)[0]]
            right_subset = Y[np.where(X >= value)[0]]
            entropy_subsets = (left_subset.size / Y.size) * entropy(left_subset) + \
                              (right_subset.size / Y.size) * entropy(right_subset)
            if entropy_total - entropy_subsets > best_gain:
                best_gain = entropy_total - entropy_subsets
                split_value = value
        return best_gain, split_value

def select_feature(self, X, y, indice):
    dataset = np.c_[X, y]
    baseEntropy = entropy(dataset)
    choose_infoGain = 0.0
    bestFeature = -1
    bestValue = None
    for i in indice:
        vals = [example[i] for example in X]
        univals = sorted(set(vals))
        newEntropy = 0.0
        # Try a fixed number of randomly chosen split values per feature.
        c = 0
        while c < 10:
            c += 1
            value = random.choice(univals)
            subX1, subX2, subY1, subY2 = partition_classes(X, y, i, value)
            p1 = len(subY1) / float(len(X))
            p2 = len(subY2) / float(len(X))
            subdataset1 = np.c_[subX1, subY1]
            subdataset2 = np.c_[subX2, subY2]
            newEntropy = p1 * entropy(subdataset1) + p2 * entropy(subdataset2)
            infoGain = baseEntropy - newEntropy
            if infoGain >= choose_infoGain:
                choose_infoGain = infoGain
                bestFeature = i
                bestValue = value
    return bestFeature, bestValue

def mutual_info(self, X, y):
    res = entropy(y)
    val, counts = np.unique(X, return_counts=True)
    freqs = counts.astype('float') / len(X)
    # We calculate a weighted average of the entropy
    for p, v in zip(freqs, val):
        res -= p * entropy(y[X == v])
    return res

def learn(self, X, y):
    # TODO: train decision tree and store it in self.tree
    d = X.shape[1]
    valueSet = self.possibleValues(X)
    rootNode = []
    for i in range(d):
        rootNode.append([valueSet[i][np.argmin(valueSet[i])],
                         valueSet[i][np.argmax(valueSet[i])]])
    rootNode.append(0)
    node_num = 0
    nodeList = [(node_num, rootNode)]
    while len(nodeList) != 0:
        current_num, node = nodeList.pop()
        total_list = self.domain(X, node)
        # if len(total_list) == 0:
        #     continue
        if entropy(y[total_list]) >= 0.1:
            attr, split, child_node1, child_node2 = self.findBest(
                X[total_list, :], y[total_list], node)
            if child_node1 == node or child_node2 == node:
                self.leaves.append(node)
                self.tree[current_num] = [-1, -1, -1, -1, node[9]]
                continue
            nodeList.append((node_num + 1, child_node1))
            nodeList.append((node_num + 2, child_node2))
            self.tree[current_num] = [attr, split, node_num + 1, node_num + 2, node[9]]
            node_num = node_num + 2
        else:
            self.leaves.append(node)
            self.tree[current_num] = [-1, -1, -1, -1, node[9]]

def learn(self, X, y):
    # TODO: Train the decision tree (self.tree) using the sample X and labels y
    # You will have to make use of the functions in utils.py to train the tree
    # One possible way of implementing the tree:
    # Each node in self.tree could be in the form of a dictionary:
    # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
    # For example, a non-leaf node with two children can have a 'left' key and a
    # 'right' key. You can add more keys which might help in classification
    # (eg. split attribute and split value)
    gain_list = {}
    if entropy(y) != 0:
        for attr in range(len(X[0])):
            values = list(set([item[attr] for item in X]))
            for val in values:
                X_left, X_right, y_left, y_right = partition_classes(X, y, attr, val)
                gain_list[(attr, val)] = information_gain(y, [y_left, y_right])
        sp_attr, sp_val = list(gain_list.keys())[
            list(gain_list.values()).index(max(gain_list.values()))]
        X_left, X_right, y_left, y_right = partition_classes(X, y, sp_attr, sp_val)
        self.tree['split_attribute'] = sp_attr
        self.tree['split_val'] = sp_val
        self.tree['left'] = (X_left, y_left)
        self.tree['right'] = (X_right, y_right)

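# Several of the learn() snippets above and below call helpers from a utils.py
# module (entropy, information_gain, partition_classes) that is not shown here.
# The sketch below is only an illustration of what such helpers might look like:
# the signatures are inferred from the call sites, but the split conventions
# ("<=" for numeric values, "==" for strings) and the log base are assumptions,
# not the original implementation. Note also that other snippets in this
# collection use "entropy" with different meanings (probability vectors, random
# byte strings), so this sketch is not a drop-in for all of them.
import numpy as np


def entropy(class_y):
    """Shannon entropy (base 2) of a list/array of class labels."""
    _, counts = np.unique(class_y, return_counts=True)
    probs = counts / counts.sum()
    return float(-np.sum(probs * np.log2(probs)))


def information_gain(previous_y, current_y):
    """Entropy of the parent labels minus the weighted entropy of the child splits."""
    total = len(previous_y)
    remainder = sum(len(part) / total * entropy(part) for part in current_y if len(part))
    return entropy(previous_y) - remainder


def partition_classes(X, y, split_attribute, split_val):
    """Split rows on one attribute: '<=' for numeric values, '==' for strings (assumed)."""
    X_left, X_right, y_left, y_right = [], [], [], []
    for row, label in zip(X, y):
        value = row[split_attribute]
        if (value == split_val) if isinstance(value, str) else (value <= split_val):
            X_left.append(row)
            y_left.append(label)
        else:
            X_right.append(row)
            y_right.append(label)
    return X_left, X_right, y_left, y_right
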
def wordRank(seq, text):
    """
    The flexibility of a word is judged from its left-neighbor set and
    right-neighbor set.
    """
    LeftSet, RightSet = [], []
    cur = text.find(seq)
    wl = len(seq)
    while cur != -1:
        if cur != 0:
            LeftSet.append(text[cur - 1:cur])
        RightSet.append(text[cur + wl:cur + wl + 1])
        cur = text.find(seq, cur + len(seq))
    entr = min(entropy(LeftSet), entropy(RightSet))
    if entr == 0:
        return 0
    return 1 / entr

def expected_information_gain(likelihood, prior):
    """
    Returns the expected information gain for each feature;
    expected_post_entropy has shape (n_feature,).
    """
    n_concept, n_feature, n_y = likelihood.shape
    full_post = full_posterior(likelihood, prior)
    prior_predictive = predictive(likelihood, prior)
    full_post_entropy = np.zeros([n_feature, n_y])
    for ind_x in range(n_feature):
        for ind_y in range(n_y):
            full_post_entropy[ind_x, ind_y] = entropy(full_post[:, ind_x, ind_y])
    expected_post_entropy = np.sum(full_post_entropy * prior_predictive, axis=1)
    prior_entropy = entropy(prior)
    return prior_entropy - expected_post_entropy

def learnlearn(X, y):
    if entropy(y) == 0:  # all the same label -> end of tree
        return {'label': y[0]}

    best_split = {}  # split_attr, split_value, left, right
    max_IG = -1
    current_split = None
    for attribute in range(len(X[0])):  # attribute = column indices
        unique_value = np.unique([x[attribute] for x in X])
        for value in unique_value:
            X_left, X_right, y_left, y_right = partition_classes(X, y, attribute, value)
            IG = information_gain(y, [y_left, y_right])
            if IG > max_IG:
                max_IG = IG
                current_split = [attribute, value]

    if max_IG == 0:  # just couldn't split better -> end of tree
        cnt_0_1 = np.bincount(y)
        return {'label': [1, 0][cnt_0_1[0] > cnt_0_1[1]]}

    # record and split
    best_split["split_attr"] = current_split[0]
    best_split["split_value"] = current_split[1]
    X_left, X_right, y_left, y_right = partition_classes(
        X, y, current_split[0], current_split[1])

    # next level
    best_split['left'] = learnlearn(X_left, y_left)
    best_split['right'] = learnlearn(X_right, y_right)
    return best_split

def learn(self, X, y):
    # TODO: Train the decision tree (self.tree) using the sample X and labels y
    # You will have to make use of the functions in utils.py to train the tree
    # One possible way of implementing the tree:
    # Each node in self.tree could be in the form of a dictionary:
    # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
    # For example, a non-leaf node with two children can have a 'left' key and a
    # 'right' key. You can add more keys which might help in classification
    # (eg. split attribute and split value)
    max_info_gain, max_attribute, max_value = -1, 0, 0
    X_left = []
    X_right = []
    y_left = []
    y_right = []
    # Initialise the depth only if the parent has not already set it;
    # otherwise the depth cap below would never trigger.
    if not hasattr(self, 'tree_depth'):
        self.tree_depth = 0
    self.id = None
    if self.tree_depth > 10 or entropy(y) <= 0:
        self.id = y[0]
        return
    for i in range(0, len(X[0])):
        values = [X[j][i] for j in range(0, len(X))]
        # choose the split value according to its average in order to reduce the running time
        if isinstance(values[0], str):
            split_avg = values[0]
        else:
            split_avg = sum(values) / len(values)
        xLeft, xRight, yLeft, yRight = partition_classes(X, y, i, split_avg)
        current = []
        current.append(yLeft)
        current.append(yRight)
        temp = information_gain(y, current)
        if temp > max_info_gain:
            max_attribute = i
            max_value = split_avg
            max_info_gain = temp
            X_left = xLeft
            X_right = xRight
            y_left = yLeft
            y_right = yRight
    # build tree
    self.tree['max_attribute'], self.tree['max_value'] = max_attribute, max_value
    self.tree['L'], self.tree['R'] = DecisionTree(), DecisionTree()
    # grow tree: pass the depth down before recursing so the depth cap applies
    self.tree['L'].tree_depth = self.tree_depth + 1
    self.tree['R'].tree_depth = self.tree_depth + 1
    self.tree['L'].learn(X_left, y_left)
    self.tree['R'].learn(X_right, y_right)

def test_entropy_depenency_on_divisor(self):
    dd_entropy = self.election.entropy()
    ds_entropy = entropy(self.votes, self.election.results, sainte_lague_gen)

    self.rules["primary_divider"] = "sainte-lague"
    self.rules["adj_determine_divider"] = "sainte-lague"
    self.rules["adj_alloc_divider"] = "sainte-lague"
    self.sl_election = Election(self.rules, self.votes)
    self.sl_election.run()
    ss_entropy = self.sl_election.entropy()
    sd_entropy = entropy(self.votes, self.sl_election.results, dhondt_gen)

    self.assertNotEqual(ds_entropy, dd_entropy)
    self.assertNotEqual(ss_entropy, dd_entropy)
    self.assertNotEqual(ss_entropy, sd_entropy)
    self.assertNotEqual(ds_entropy, sd_entropy)
    self.assertEqual(round(dd_entropy, 2), 42.95)
    self.assertEqual(round(ds_entropy, 2), 41.22)
    self.assertEqual(round(ss_entropy, 2), 41.22)
    self.assertEqual(round(sd_entropy, 2), 42.95)

def learn(self, X, y):
    # TODO: Train the decision tree (self.tree) using the sample X and labels y
    # You will have to make use of the functions in utils.py to train the tree
    # One possible way of implementing the tree:
    # Each node in self.tree could be in the form of a dictionary:
    # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
    # For example, a non-leaf node with two children can have a 'left' key and a
    # 'right' key. You can add more keys which might help in classification
    # (eg. split attribute and split value)
    root = {
        'attribute_index': None,
        'value': None,
        'left': None,
        'right': None
    }
    if entropy(y) < .2:
        # Reasonable bound/cut off based on entropy to prevent overfitting
        return sp.stats.mode(y)[0][0]
    else:
        info_gain = 0
        split_attribute_index = 0
        split_value = X[0][split_attribute_index]
        # Base attribute and split value on the combination that maximizes info gain
        for item in X:
            for i in range(len(item)):
                _, _, y_left, y_right = partition_classes(X, y, i, item[i])
                candidate_gain = information_gain(y, [y_left, y_right])
                if candidate_gain > info_gain:
                    info_gain = candidate_gain
                    split_attribute_index = i
                    split_value = item[i]
        root['attribute_index'] = split_attribute_index
        root['value'] = split_value
        X_left, X_right, y_left, y_right = partition_classes(
            X, y, split_attribute_index, split_value)
        root['left'] = self.learn(X_left, y_left)
        root['right'] = self.learn(X_right, y_right)
        self.tree.insert(0, root)
        return root

def _grow_tree(self, X, Y, attributes, depth, value=None):
    """
    :param X: numpy 2D array, data samples
    :param Y: numpy 1D array, class labels
    :param attributes: list of attributes in the data
    :param depth: maximum depth of the tree
    :param value: possible value of the attribute for the current node/branch
    :return: a constructed node which has branches pointing to further nodes

    This method grows a decision tree by recursively calling itself.
    """
    # Construct a node
    node = Node()
    node.probs = probabilities(Y)
    node.class_, _ = get_best_class_prob(node.probs)
    node.branch_value = value

    # Stop criteria
    if depth == 0 or entropy(Y) == 0 or len(attributes) == 0:
        return node

    # Find the best attribute
    attr, node.gain, split_value = self._best_attribute(X, Y, attributes)
    node.next_attr = attr
    if node.gain == 0:
        return node
    if depth:
        depth -= 1

    # Recurse to construct child nodes. The branches are built based on the type
    # of the attribute.
    # If the attribute is continuous, only two branches are formed (left-br, right-br).
    # If the attribute is discrete, a branch is created for each possible value of
    # the attribute.
    if self.attr_dtypes[attr] == int:
        attributes.remove(attr)
        values = np.unique(X[:, attr])
        for val in values:
            subset_indices = np.where(X[:, attr] == val)[0]
            node.branches[val] = self._grow_tree(X[subset_indices, :], Y[subset_indices],
                                                 attributes, depth, val)
    elif self.attr_dtypes[attr] == float:
        node.split_value = split_value
        left_subset = np.where(X[:, attr] < split_value)[0]
        node.branches[L_BRANCH] = self._grow_tree(X[left_subset, :], Y[left_subset],
                                                  attributes, depth, None)
        right_subset = np.where(X[:, attr] >= split_value)[0]
        node.branches[R_BRANCH] = self._grow_tree(X[right_subset, :], Y[right_subset],
                                                  attributes, depth, None)
    return node

def learn(self, X, y):
    # TODO: Train the decision tree (self.tree) using the sample X and labels y
    # You will have to make use of the functions in utils.py to train the tree
    # One possible way of implementing the tree:
    # Each node in self.tree could be in the form of a dictionary:
    # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
    # For example, a non-leaf node with two children can have a 'left' key and a
    # 'right' key. You can add more keys which might help in classification
    # (eg. split attribute and split value)
    # Initialise the depth only if the parent has not already set it;
    # otherwise the depth cap below would never trigger.
    if not hasattr(self, 'depth'):
        self.depth = 0
    self.group = None
    y_entropy = entropy(y)
    max_info_gain = -1
    split_attribute = -1
    split_val = ''
    x_left = []
    x_right = []
    y_left = []
    y_right = []
    if self.depth < 15 and y_entropy > 0:
        for column in range(len(X[0])):
            col_vals = [row[column] for row in X]
            trial_split_val = sum(col_vals) / (len(col_vals) * 1.0)
            x_l, x_r, y_l, y_r = partition_classes(X, y, column, trial_split_val)
            current_y = [y_l, y_r]
            info_gain = information_gain(y, current_y)
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                split_attribute = column
                split_val = trial_split_val
                x_left = x_l
                x_right = x_r
                y_left = y_l
                y_right = y_r
        self.tree['left'] = DecisionTree()
        self.tree['right'] = DecisionTree()
        self.tree['split_attribute'] = split_attribute
        self.tree['split_val'] = split_val
        # Pass the depth down before recursing so the depth cap applies.
        self.tree['left'].depth = self.depth + 1
        self.tree['right'].depth = self.depth + 1
        self.tree['left'].learn(x_left, y_left)  # create tree within tree
        self.tree['right'].learn(x_right, y_right)
    else:
        self.group = y[0]
        return

def send_find_node(self, address, nid=None):
    nid = get_neighbor(nid, self.nid) if nid else self.nid
    tid = entropy(TID_LENGTH)
    msg = {
        "t": tid,
        "y": "q",
        "q": "find_node",
        "a": {
            "id": nid,
            "target": random_id()
        }
    }
    self.send_krpc(msg, address)

def Build_tree(self, X, y):
    entropy_y = entropy(y)
    if entropy_y == 0:
        return np.atleast_2d(['Leaf', y[0], 'NA', 'NA'])
    else:
        best_info = 0
        best_i = -1
        best_j = 0
        for i in range(0, len(X[0])):
            split_range = [item[i] for item in X]
            type_x = split_range[0]
            if isinstance(type_x, int) or isinstance(type_x, float):
                unique = np.unique(split_range)[0:-1]
            else:
                unique = np.unique(split_range)
                if len(unique) == 1:
                    unique = unique[0:-1]
            for j in unique:
                [X_left, X_right, y_left, y_right] = partition_classes(X, y, i, j)
                current_y = list()
                current_y.append(y_left)
                current_y.append(y_right)
                info_gain = information_gain(y, current_y)
                if info_gain > best_info:
                    best_info = info_gain
                    best_i = i
                    best_j = j
        if best_i == -1:
            counts = np.bincount(y)
            return np.atleast_2d(['Leaf', np.argmax(counts), 'NA', 'NA'])
        else:
            [X_left, X_right, y_left, y_right] = partition_classes(X, y, best_i, best_j)
            lefttree = self.Build_tree(X_left, y_left)
            righttree = self.Build_tree(X_right, y_right)
            root = np.atleast_2d(
                [best_i, best_j, 1, np.atleast_2d(lefttree).shape[0] + 1])
            root_left = np.append(root, lefttree, axis=0)
            root_right = np.append(root_left, righttree, axis=0)
            return root_right

def send_find_node(self, address, nid=None):
    logging.debug("send find node to : " + str(address))
    nid = self.get_neighbor(nid, self.nid) if nid else self.nid
    tid = entropy(TID_LENGTH)
    msg = {
        "t": tid,
        "y": "q",
        "q": "find_node",
        "a": {
            "id": nid,
            "target": self.random_id()
        }
    }
    self.send_krpc(msg, address)

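# In the two send_find_node() snippets above, and in random_id() further down,
# "entropy" is not Shannon entropy: it is a helper that returns a random string
# of a given length, used for KRPC transaction IDs and node IDs. Below is a
# minimal sketch of such a helper; the character set and the TID_LENGTH value
# shown here are assumptions for illustration, not taken from the original code.
from random import choice
from string import ascii_letters, digits

TID_LENGTH = 2  # assumed length of a KRPC transaction id


def entropy(length):
    """Return a random string of `length` characters."""
    return "".join(choice(ascii_letters + digits) for _ in range(length))
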
def select_feature(cls, y, X, possible_features, weights=None):
    """
    Select the best feature to split in the decision tree.
    """
    best_info_gain = -1
    split_feat = -1
    for feat in possible_features:
        info_gain = entropy(y, weights) - conditional_entropy(y, X[:, feat], weights)
        # print(feat, info_gain)
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            split_feat = feat
    return split_feat

def _learn(X, y):
    Y_entropy = entropy(y)
    if Y_entropy == 0:
        return [-1, y[0], None, None]

    cur_max_gain = 0
    best_attr = []
    best_index = -1
    for index in data_length:
        is_str = type(X[0][index]) == str
        if is_str:
            attr_X = np.unique([X[i][index] for i in range(len(X))])
            for attr in attr_X:
                X_left, X_right, y_left, y_right = partition_classes(X, y, index, attr)
                gain = information_gain(y, [y_left, y_right])
                if gain > cur_max_gain:
                    cur_max_gain = gain
                    best_index = index
                    best_attr = attr
                    best_X_left, best_X_right = X_left, X_right
                    best_y_left, best_y_right = y_left, y_right
        else:
            attr_X = np.mean([X[i][index] for i in range(len(X))])
            X_left, X_right, y_left, y_right = partition_classes(X, y, index, attr_X)
            gain = information_gain(y, [y_left, y_right])
            if gain > cur_max_gain:
                cur_max_gain = gain
                best_index = index
                best_attr = attr_X
                best_X_left, best_X_right = X_left, X_right
                best_y_left, best_y_right = y_left, y_right

    if cur_max_gain <= 0:
        return [-1, np.argmax(np.bincount(y)), None, None]

    left = _learn(best_X_left, best_y_left)
    right = _learn(best_X_right, best_y_right)
    return [best_index, best_attr, left, right]

def learn(self, X, y):
    # TODO: Train the decision tree (self.tree) using the sample X and labels y
    # You will have to make use of the functions in utils.py to train the tree
    # One possible way of implementing the tree:
    # Each node in self.tree could be in the form of a dictionary:
    # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
    # For example, a non-leaf node with two children can have a 'left' key and a
    # 'right' key. You can add more keys which might help in classification
    # (eg. split attribute and split value)
    self.group = None
    # Initialise the depth only if the parent has not already set it;
    # otherwise the depth cap below would never trigger.
    if not hasattr(self, 'depth'):
        self.depth = 0
    max_info_gain = -float("inf")
    split_attribute = -1
    x_l, x_r, y_l, y_r = [], [], [], []
    if self.depth < 15 and entropy(y) > 0:
        for col in range(len(X[0])):
            values = [row[col] for row in X]
            cur_split_val = sum(values) / len(values)
            X_left, X_right, y_left, y_right = partition_classes(X, y, col, cur_split_val)
            cur_y = [y_left, y_right]
            cur_info_gain = information_gain(y, cur_y)
            if max_info_gain < cur_info_gain:
                max_info_gain = cur_info_gain
                x_l, x_r, y_l, y_r = X_left, X_right, y_left, y_right
                split_attribute = col
                split_val = cur_split_val
        self.tree['left'], self.tree['right'] = DecisionTree(), DecisionTree()
        self.tree['split_attribute'] = split_attribute
        self.tree['split_val'] = split_val
        # Pass the depth down before recursing so the depth cap applies.
        self.tree['left'].depth = self.depth + 1
        self.tree['right'].depth = self.depth + 1
        self.tree['left'].learn(x_l, y_l)
        self.tree['right'].learn(x_r, y_r)
    else:
        self.group = y[0]
        return

def learn(self, X, y):
    if X.min() == X.max():
        self.info = np.round(y.mean())
        self.isTree = False
    elif entropy(y) == 0:
        # print(y)
        self.info = y[0]
        # print(self.info)
        self.isTree = False
        # print("this is a leaf")
    else:
        best_attribute, best_val = findBestSplit(X, y)
        self.info = [best_attribute, best_val]
        X_left, X_right, y_left, y_right = partition_classes(X, y, best_attribute, best_val)
        self.tree["left"] = DecisionTree()
        self.tree["left"].learn(X_left, y_left)
        self.tree["right"] = DecisionTree()
        self.tree["right"].learn(X_right, y_right)

def learn(self, X, y):
    # TODO: Train the decision tree (self.tree) using the sample X and labels y
    # You will have to make use of the functions in utils.py to train the tree
    # One possible way of implementing the tree:
    # Each node in self.tree could be in the form of a dictionary:
    # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
    # For example, a non-leaf node with two children can have a 'left' key and a
    # 'right' key. You can add more keys which might help in classification
    # (eg. split attribute and split value)
    trees = {}
    if entropy(y) != 0:
        info = {}
        for i in range(len(X[0])):
            unique = list(set([x[i] for x in X]))
            for j in unique:
                X_left, X_right, y_left, y_right = partition_classes(X, y, i, j)
                info[(i, j)] = information_gain(y, [y_left, y_right])
        vals = list(info.values())
        max_vals = max(vals)
        split_vals = list(info.keys())
        split_attr, split_vals2 = split_vals[vals.index(max_vals)]
        X_left, X_right, y_left, y_right = partition_classes(X, y, split_attr, split_vals2)
        trees['split_attr'] = split_attr
        trees['split_vals'] = split_vals2
        trees['left_tree'] = (X_left, y_left)
        trees['right_tree'] = (X_right, y_right)
    else:
        trees['leaf'] = 1
        trees['label'] = y[0]
    self.tree = trees

def forward(self, quant_pred, target_wav):
    """
    quant_pred:
    target_wav: B,
    """
    # Loss per embedding vector
    com_loss_embeds = self.bn.min_dist * self.bn.gamma

    log_pred = self.logsoftmax(quant_pred)
    log_pred_target = torch.gather(log_pred, 1, target_wav.long().unsqueeze(1))
    rec_loss_ts = -log_pred_target

    # total_loss = rec_loss_ts.sum() + com_loss_embeds.sum()
    # total_loss = rec_loss_ts.sum()
    total_loss = com_loss_embeds.sum()
    # total_loss = com_loss_embeds.sum() * 0.0

    nh = self.bn.ind_hist / self.bn.ind_hist.sum()

    self.metrics = {
        'rec': rec_loss_ts.mean(),
        'com': com_loss_embeds.mean(),
        'min_ze': self.bn.ze_norm.min(),
        'max_ze': self.bn.ze_norm.max(),
        'min_emb': self.bn.emb_norm.min(),
        'max_emb': self.bn.emb_norm.max(),
        'hst_ent': util.entropy(self.bn.ind_hist, True),
        # 'hst_100': util.entropy(util.int_hist(self.bn.circ_inds, -1), True),
        'nunq': self.bn.uniq.nelement(),
        'pk_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
        'pk_nuq': log_pred.max(dim=1)[1].unique().nelement(),
        'pk_sd': log_pred.max(dim=1)[0].to(torch.float).std()
    }
    return total_loss

def random_id():
    h = sha1()
    h.update(entropy(20))
    return h.digest()

def get_entropy():
    entropy = util.entropy()
    return json.dumps({"entropy": entropy})

def simulate(rumor, step_mode = 'time', step = 10, limit = 2400):
    rumor_edges = rumor['edges']
    rumor_statuses = rumor['statuses']
    trend_onset = rumor['trend_onset']
    # Figure
    plt.figure()
    # Time series
    max_sizes = []
    total_sizes = []
    component_nums = []
    entropies = []
    max_component_ratios = []
    timestamps = []
    min_time = min([ edge[2] for edge in rumor_edges ])
    if step_mode == 'time':
        next_time = min_time
    max_pos = limit
    print 'time\t\teid\t\tpos\t\t|C_max|\t\tN(C)\t\ttime-trend_onset'
    components = {}
    node_to_component_id = {}
    adj = {}
    # Set to keep track of statuses that gain many inbound edges at the same
    # time. This happens when a user follows lots of people that have mentioned
    # the topic, then tweets about the topic gets all of those followees as
    # parents, causing a sharp spike in the component growth
    # spikeset = set()
    for eid, edge in enumerate(rumor_edges):
        # print edge
        # print components
        # print node_to_component_id
        # Update adjacency list
        if edge[0] in adj:
            adj[edge[0]].append(edge[1])
        else:
            adj[edge[0]] = [edge[1]]
        # Update components
        if edge[0] not in node_to_component_id and edge[1] not in node_to_component_id:
            # Create new component with id edge[0] (i.e. first node belonging to
            # that component)
            component_id = edge[0]
            # print 'Creating new component ', component_id, ' from ', edge[0], ' and ', edge[1]
            members = set([edge[0], edge[1]])
            components[edge[0]] = members
            node_to_component_id[edge[0]] = component_id
            node_to_component_id[edge[1]] = component_id
        elif edge[0] not in node_to_component_id:
            c1 = node_to_component_id[edge[1]]
            # print 'Adding ', edge[0], ' to ', c1, ': ', components[c1]
            # raw_input('')
            components[c1].add(edge[0])
            node_to_component_id[edge[0]] = c1
        elif edge[1] not in node_to_component_id:
            c0 = node_to_component_id[edge[0]]
            # print 'Adding ', edge[1], ' to ', c0, ': ', components[c0]
            # raw_input('')
            components[c0].add(edge[1])
            node_to_component_id[edge[1]] = c0
        else:
            c0 = node_to_component_id[edge[0]]
            c1 = node_to_component_id[edge[1]]
            if c0 != c1:
                # Merge components.
                members = components[c1]
                # print 'Merging\n', c0, ': ', components[c0], '\ninto\n', c1, ': ', components[c1], '\n'
                # raw_input('')
                for member in components[c0]:
                    members.add(member)
                    node_to_component_id[member] = c1
                components.pop(c0)
        """
        # Pause when you have some number of repeat statuses in a row (meaning that
        # lots of edges that terminate in that status suddenly got created)
        repeat_num = 2
        status_id = rumor_statuses[rumor_edges[eid][1]][0]
        if eid > repeat_num and \
           last_k_statuses_equal(status_id, rumor_statuses, rumor_edges, eid, repeat_num) and \
           status_id not in spikeset:
            print (rumor_statuses[rumor_edges[eid][0]], \
                   rumor_statuses[rumor_edges[eid][1]])
            spikeset.add(status_id)
            raw_input()
        """
        if step_mode == 'index':
            pos = eid
        elif step_mode == 'time':
            pos = edge[2] - min_time
        if pos > limit:
            break
        if step_mode == 'index' and eid % step:
            continue
        if step_mode == 'time':
            if edge[2] < next_time:
                continue
            else:
                next_time = edge[2] + step
        component_sizes = []
        # raw_input('======================================================'
        for cid, members in components.items():
            component_sizes.append(len(members))
            # print 'component ', cid, ' size: ', len(members)
            # raw_input('-------------------')
        time_after_onset = None
        if trend_onset is not None:
            time_after_onset = edge[2] - trend_onset
        print edge[2] - min_time, '\t\t', eid, '\t\t', pos, '/', limit, '\t\t', max(component_sizes), '\t\t', len(components), '\t\t', time_after_onset
        # Print largest adjacency list sizes.
        neighbor_counts = [ len(adj[k]) for k in adj ]
        sorted_idx = range(len(neighbor_counts))
        sorted_idx.sort(lambda x, y: neighbor_counts[y] - neighbor_counts[x])
        for itop in xrange(10):
            if itop >= len(sorted_idx):
                break
            print adj.keys()[sorted_idx[itop]], ':', neighbor_counts[sorted_idx[itop]]
        raw_input()
        # Desc sort of component sizes
        component_sizes.sort()
        component_sizes.reverse()
        # Append to timeseries
        max_sizes.append(max(component_sizes))
        total_sizes.append(sum(component_sizes))
        component_nums.append(len(component_sizes))
        entropies.append(util.entropy(component_sizes))
        if trend_onset is None:
            trend_onset = 0
        timestamps.append((edge[2] - trend_onset) / (60 * 60))
        max_component_ratios.append(float(max(component_sizes)) / sum(component_sizes))
        shifted_ind = np.linspace(1, 1 + len(component_sizes), len(component_sizes))
        if eid > 0:
            color = util.step_to_color(pos, max_pos)
            plt.subplot(331)
            plt.loglog(shifted_ind, component_sizes, color = color, hold = 'on')
            plt.title('Loglog desc component sizes')
            plt.subplot(332)
            plt.semilogy(timestamps[-1], max_sizes[-1], 'ro', color = color, hold = 'on')
            plt.title('Max component size')
            plt.xlabel('time (hours)')
            plt.subplot(333)
            plt.semilogy(timestamps[-1], total_sizes[-1], 'ro', color = color, hold = 'on')
            plt.title('Total network size')
            plt.xlabel('time (hours)')
            plt.subplot(334)
            plt.plot(timestamps[-1], entropies[-1], 'go', color = color, hold = 'on')
            plt.title('Entropy of desc component sizes')
            plt.xlabel('time (hours)')
            plt.subplot(335)
            plt.semilogy(timestamps[-1], component_nums[-1], 'ko', color = color, hold = 'on')
            plt.title('Number of components')
            plt.xlabel('time (hours)')
            plt.subplot(336)
            plt.loglog(shifted_ind, np.cumsum(component_sizes), color = color, hold = 'on')
            plt.title('Cum. sum. of desc component sizes')
            plt.subplot(337)
            plt.plot(timestamps[-1], max_component_ratios[-1], 'ko', color = color, hold = 'on')
            plt.title('Max comp size / Total network Size')
            plt.xlabel('time (hours)')
            # plt.hist(component_sizes, np.linspace(0.5, 15.5, 15))
            # plt.plot(np.cumsum(np.histogram(component_sizes,
            #     bins = np.linspace(0.5, 15.5, 15))[0]), hold = 'on')
        if not eid % 15*step:
            pass  # plt.pause(0.001)
    plt.show()
    return components

def forward(self, quant_pred, target_wav):
    """
    quant_pred:
    target_wav: B,
    """
    # Loss per embedding vector
    l2_loss_embeds = self.l2(self.bn.sg(self.bn.ze), self.bn.emb)
    # l2_loss_embeds = scaled_l2_norm(self.bn.sg(self.bn.ze), self.bn.emb)
    com_loss_embeds = self.bn.min_dist * self.bn.gamma

    log_pred = self.logsoftmax(quant_pred)
    log_pred_target = torch.gather(log_pred, 1, target_wav.long().unsqueeze(1))

    # Loss per timestep
    # !!! We don't need a 'loss per timestep'. We only need
    # to adjust the l2 and com losses by usage weight of each
    # code. (The codes at the two ends of the window will be
    # used less)
    rec_loss_ts = -log_pred_target

    # Use only a subset of the overlapping windows
    # sl = slice(0, 1)
    # rec_loss_sel = rec_loss_ts[..., sl]
    # l2_loss_sel = l2_loss_ts[..., sl]
    # com_loss_sel = com_loss_ts[..., sl]

    # total_loss_sel = rec_loss_sel + l2_loss_sel + com_loss_sel
    # total_loss_ts = l2_loss_ts
    # total_loss_ts = com_loss_ts
    # total_loss_ts = com_loss_ts + l2_loss_ts
    # total_loss_ts = log_pred_loss_ts + l2_loss_ts
    # total_loss_ts = log_pred_loss_ts
    # total_loss_ts = com_loss_ts - com_loss_ts
    # total_loss = total_loss_sel.mean()

    # We use sum here for each of the three loss terms because each element
    # should affect the total loss equally. For a typical WaveNet
    # architecture, there will be only one l2 loss term (or com_loss term)
    # per 320 rec_loss terms, due to upsampling. We could adjust for that.
    # Implicitly, com_loss is already adjusted by gamma. Perhaps l2_loss
    # should also be adjusted, but at the moment it is not.
    total_loss = rec_loss_ts.sum() + l2_loss_embeds.sum() + com_loss_embeds.sum()

    nh = self.bn.ind_hist / self.bn.ind_hist.sum()

    self.metrics = {
        'rec': rec_loss_ts.mean(),
        'l2': l2_loss_embeds.mean(),
        'com': com_loss_embeds.mean(),
        # 'ze_rng': self.bn.ze.max() - self.bn.ze.min(),
        # 'emb_rng': self.bn.emb.max() - self.bn.emb.min(),
        'min_ze': self.bn.ze_norm.min(),
        'max_ze': self.bn.ze_norm.max(),
        'min_emb': self.bn.emb_norm.min(),
        'max_emb': self.bn.emb_norm.max(),
        'hst_ent': util.entropy(self.bn.ind_hist, True),
        'hst_100': util.entropy(util.int_hist(self.bn.circ_inds, -1), True),
        # 'p_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
        # 'p_sd': log_pred.max(dim=1)[0].to(torch.float).std(),
        'nunq': self.bn.uniq.nelement(),
        'pk_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
        'pk_nuq': log_pred.max(dim=1)[1].unique().nelement(),
        # 'peak_unq': log_pred.max(dim=1)[1].unique(),
        'pk_sd': log_pred.max(dim=1)[0].to(torch.float).std(),
        # 'unq': self.bn.uniq,
        # 'm_ze': self.bn.ze_norm.max(),
        # 'm_emb': self.bn.emb_norm.max()
        # emb0 = emb - emb.mean(dim=0)
        # chan_var = (emb0 ** 2).sum(dim=0)
        # chan_covar = torch.matmul(emb0.transpose(1, 0), emb0) - torch.diag(chan_var)
    }
    # netmisc.print_metrics(losses, 10000000)
    return total_loss

# poll_stats.py
# Jonah Smith
# Storytelling with Streaming Data, Spring 2016
#
# This file, in an infinite loop, uses the functions in the util file to
# calculate entropy and rate based on the state of the Redis db. It takes no
# input, and emits a JSON string with the entropy and rate to stdout. These
# messages are monitored by find-anomalies.py to, maybe not surprisingly, find
# anomalies.
import json
from sys import stdout
from time import sleep
# util has our functions for calculating the entropy and rate.
import util

# Repeat the entropy and rate calculations indefinitely.
while 1:
    # Use our utility functions to calculate entropy and rate.
    entropy = util.entropy()
    rate = util.rate()

    # Dump the entropy and rate to stdout and flush the stdout so we don't end
    # up with a buffer.
    print(json.dumps({'entropy': entropy, 'rate': rate}))
    stdout.flush()

    # Rest for one second. This will give us a nice smooth function for the
    # rate and entropy values.
    sleep(1)

def forward(self, quant_pred, target_wav):
    # Loss per embedding vector
    l2_loss_embeds = self.l2(self.bn.ze, self.bn.emb)
    com_loss_embeds = self.bn.l2norm_min * self.bn.gamma
    # l2_loss_embeds = self.l2(self.bn.ze, self.bn.emb).sqrt()
    # com_loss_embeds = self.bn.l2norm_min.sqrt() * self.bn.gamma

    log_pred = self.logsoftmax(quant_pred)
    log_pred_target = torch.gather(log_pred, 1, target_wav.unsqueeze(1))

    # Loss per timestep
    rec_loss_ts = -log_pred_target
    l2_loss_ts = self.combine(l2_loss_embeds.unsqueeze(1))[..., :-1]
    com_loss_ts = self.combine(com_loss_embeds.unsqueeze(1))[..., :-1]

    # Use only a subset of the overlapping windows
    sl = slice(0, 1)
    rec_loss_sel = rec_loss_ts[..., sl]
    l2_loss_sel = l2_loss_ts[..., sl]
    com_loss_sel = com_loss_ts[..., sl]

    total_loss_sel = rec_loss_sel + l2_loss_sel + com_loss_sel
    # total_loss_ts = l2_loss_ts
    # total_loss_ts = com_loss_ts
    # total_loss_ts = com_loss_ts + l2_loss_ts
    # total_loss_ts = log_pred_loss_ts + l2_loss_ts
    # total_loss_ts = log_pred_loss_ts
    # total_loss_ts = com_loss_ts - com_loss_ts
    total_loss = total_loss_sel.mean()

    nh = self.bn.ind_hist / self.bn.ind_hist.sum()

    self.metrics = {
        'rec': rec_loss_sel.mean(),
        'l2': l2_loss_sel.mean(),
        'com': com_loss_sel.mean(),
        # 'ze_rng': self.bn.ze.max() - self.bn.ze.min(),
        # 'emb_rng': self.bn.emb.max() - self.bn.emb.min(),
        'min_ze': self.bn.ze_norm.min(),
        'max_ze': self.bn.ze_norm.max(),
        'min_emb': self.bn.emb_norm.min(),
        'max_emb': self.bn.emb_norm.max(),
        'hst_ent': util.entropy(self.bn.ind_hist, True),
        'hst_100': util.entropy(util.int_hist(self.bn.circ_inds, -1), True),
        # 'p_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
        # 'p_sd': log_pred.max(dim=1)[0].to(torch.float).std(),
        'nunq': self.bn.uniq.nelement(),
        'pk_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
        'pk_nuq': log_pred.max(dim=1)[1].unique().nelement(),
        # 'peak_unq': log_pred.max(dim=1)[1].unique(),
        'pk_sd': log_pred.max(dim=1)[0].to(torch.float).std(),
        # 'unq': self.bn.uniq,
        # 'm_ze': self.bn.ze_norm.max(),
        # 'm_emb': self.bn.emb_norm.max()
        # emb0 = emb - emb.mean(dim=0)
        # chan_var = (emb0 ** 2).sum(dim=0)
        # chan_covar = torch.matmul(emb0.transpose(1, 0), emb0) - torch.diag(chan_var)
    }
    # netmisc.print_metrics(losses, 10000000)
    return total_loss

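# The three forward() snippets above log a codebook-usage statistic via
# util.entropy(self.bn.ind_hist, True). That util module is not shown here;
# the sketch below is only an illustration of what such a helper might do,
# assuming the first argument is a 1-D tensor of index counts and the boolean
# flag asks for the counts to be normalized before the entropy (in nats) is
# computed. The name and flag semantics are assumptions, not the original API.
import torch


def entropy(hist, normalize=False):
    """Entropy (in nats) of a histogram tensor; optionally normalize counts first."""
    p = hist.float()
    if normalize:
        p = p / p.sum()
    p = p[p > 0]  # ignore empty bins so log() stays finite
    return -(p * p.log()).sum()
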
def get_entropy():
    entropy = util.entropy()
    return json.dumps({'entropy': entropy})

def learn(self, X, y):
    # TODO: Train the decision tree (self.tree) using the sample X and labels y
    # You will have to make use of the functions in utils.py to train the tree
    # One possible way of implementing the tree:
    # Each node in self.tree could be in the form of a dictionary:
    # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
    # For example, a non-leaf node with two children can have a 'left' key and a
    # 'right' key. You can add more keys which might help in classification
    # (eg. split attribute and split value)
    nrows = len(X)
    try:
        ncols = len(X[0])
    except:
        print(X)

    ent = entropy(y)
    self.tree['entropy'] = ent
    if ent < 0.1 or len(y) <= 100:
        if len(y) == 0:
            return self
        self.tree['class'] = scipy.stats.mode(y).mode[0]
        self.tree['split_attr'] = 'null'
        self.tree['split_val'] = 'null'
        self.tree['left_child'] = None
        self.tree['right_child'] = None
        return self

    info_gain = []
    split_val = []
    for idx in range(ncols - 1):
        best_val_for_column = 0
        best_gain_for_column = 0
        series = [row[idx] for row in X]
        steps = np.linspace(start=np.min(series), stop=np.max(series), num=5)[1:4]
        for val in steps:
            X_left, X_right, y_left, y_right = partition_classes(X, y, idx, val)
            gain = information_gain(y, [y_left, y_right])
            if gain > best_gain_for_column:
                best_gain_for_column = gain
                best_val_for_column = val
        info_gain.append(best_gain_for_column)
        split_val.append(best_val_for_column)

    best_split_col = np.argmax(info_gain)
    best_split_value = split_val[best_split_col]
    X_left, X_right, y_left, y_right = partition_classes(X, y, best_split_col, best_split_value)
    self.tree['class'] = 'parent'
    self.tree['split_attr'] = best_split_col
    self.tree['split_val'] = best_split_value
    self.tree['left_child'] = DecisionTree()
    self.tree['left_child'].learn(X_left, y_left)
    self.tree['right_child'] = DecisionTree()
    self.tree['right_child'].learn(X_right, y_right)

def learn(self, X, y, par_node={}, depth=0):
    # TODO: Train the decision tree (self.tree) using the sample X and labels y
    # You will have to make use of the functions in utils.py to train the tree
    # Use the function best_split in util.py to get the best split and
    # data corresponding to left and right child nodes
    # One possible way of implementing the tree:
    # Each node in self.tree could be in the form of a dictionary:
    # https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
    # For example, a non-leaf node with two children can have a 'left' key and a
    # 'right' key. You can add more keys which might help in classification
    # (eg. split attribute and split value)
    ### Implement your code here
    #############################################
    entropy_y = entropy(y)
    if len(X) == 0 or len(y) == 0:
        self.tree['state'] = 'leaf'
        self.tree['result'] = 0
        return
    if len(set(y)) == 1:  # if all labels in y are the same
        self.tree['state'] = 'leaf'
        self.tree['result'] = y[0]
        return

    # Find the majority label in y.
    y_dist = {}
    for yi in y:
        if yi in y_dist.keys():
            y_dist[yi] += 1
        else:
            y_dist[yi] = 1
    y_max_val = 0
    y_max_count = 0
    for k, v in y_dist.items():
        if v > y_max_count:
            y_max_val = k
            y_max_count = v

    # Stop if every sample is identical or the maximum depth is reached.
    all_same = True
    for i in range(1, len(X)):
        if X[i] == X[i - 1]:
            continue
        else:
            all_same = False
            break
    if all_same or depth == self.max_depth:
        self.tree['state'] = 'leaf'
        self.tree['result'] = y_max_val
        return

    split_column, split_value, X_left, X_right, y_left, y_right = best_split(X, y)
    self.tree['state'] = "parent"
    self.tree['result'] = "null"
    self.tree['split_attr'] = split_column
    self.tree['split_val'] = split_value
    self.tree['left'] = DecisionTree()
    self.tree['left'].learn(X_left, y_left, self, depth + 1)
    self.tree['right'] = DecisionTree()
    self.tree['right'].learn(X_right, y_right, self, depth + 1)