Example No. 1
    def _gain(self, X, Y, attr):
        """
        :param X: numpy 1D array, data samples
        :param Y: numpy 1D array, class labels
        :return: computed information gain

        This method computes information gain
        """
        entropy_total = entropy(Y)

        unique_values = np.unique(X)
        if len(unique_values) == 1:
            return 0, None
        if self.attr_dtypes[attr] == int:
            entropy_subsets = 0
            for value in unique_values:
                subset = Y[np.where(X == value)[0]]
                entropy_subsets += (subset.size / Y.size) * entropy(subset)
            return entropy_total - entropy_subsets, None
        elif self.attr_dtypes[attr] == float:
            best_gain = 0
            split_value = None
            for value in unique_values[1:]:
                left_subset = Y[np.where(X < value)[0]]
                right_subset = Y[np.where(X >= value)[0]]
                entropy_subsets = (left_subset.size / Y.size) * entropy(left_subset) + \
                                  (right_subset.size / Y.size) * entropy(right_subset)
                if entropy_total - entropy_subsets > best_gain:
                    best_gain = entropy_total - entropy_subsets
                    split_value = value
            return best_gain, split_value
    def select_feature(self, X, y, indice):
        dataset = np.c_[X, y]
        baseEntropy = entropy(dataset)
        choose_infoGain = 0.0
        bestFeature = -1
        bestValue = None

        for i in indice:
            vals = [example[i] for example in X]
            univals = sorted(set(vals))
            newEntropy = 0.0
            # Randomly sample up to 10 candidate split values from the unique values.
            c = 0
            while c < 10:
                c += 1
                value = random.choice(univals)
                subX1, subX2, subY1, subY2 = partition_classes(X, y, i, value)
                p1 = len(subY1) / float(len(X))
                p2 = len(subY2) / float(len(X))
                subdataset1 = np.c_[subX1, subY1]
                subdataset2 = np.c_[subX2, subY2]
                newEntropy = p1 * entropy(subdataset1) + p2 * entropy(
                    subdataset2)
                infoGain = baseEntropy - newEntropy
                if infoGain >= choose_infoGain:
                    choose_infoGain = infoGain
                    bestFeature = i
                    bestValue = value

        return bestFeature, bestValue
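Note: every decision-tree snippet on this page calls an entropy helper imported from a local utils module that is not shown. A minimal sketch of such a helper (assuming Shannon entropy in bits over a 1-D array of class labels; the original modules may differ in log base or input handling) could be:

import numpy as np

def entropy(labels):
    # Shannon entropy (base 2) of the empirical distribution of the labels.
    labels = np.asarray(labels)
    if labels.size == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / counts.sum()
    return float(-(probs * np.log2(probs)).sum())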
Example No. 3
 def mutual_info(self, X, y):
     res = entropy(y)
     val, counts = np.unique(X, return_counts=True)
     freqs = counts.astype('float') / len(X)
     # We calculate a weighted average of the entropy
     for p, v in zip(freqs, val):
         res -= p * entropy(y[X == v])
     return res
    def learn(self, X, y):
        # TODO: train decision tree and store it in self.tree
        d = X.shape[1]
        valueSet = self.possibleValues(X)
        rootNode = []
        for i in range(d):
            rootNode.append([
                valueSet[i][np.argmin(valueSet[i])],
                valueSet[i][np.argmax(valueSet[i])]
            ])
        rootNode.append(0)
        node_num = 0
        nodeList = [(node_num, rootNode)]

        while (len(nodeList) != 0):
            current_num, node = nodeList.pop()
            total_list = self.domain(X, node)
            #if(len(total_list)==0):
            #	continue;
            if (entropy(y[total_list]) >= 0.1):
                attr, split, child_node1, child_node2 = self.findBest(
                    X[total_list, :], y[total_list], node)
                if (child_node1 == node or child_node2 == node):
                    self.leaves.append(node)
                    self.tree[current_num] = [-1, -1, -1, -1, node[9]]
                    continue
                nodeList.append((node_num + 1, child_node1))
                nodeList.append((node_num + 2, child_node2))
                self.tree[current_num] = [
                    attr, split, node_num + 1, node_num + 2, node[9]
                ]
                node_num = node_num + 2
            else:
                self.leaves.append(node)
                self.tree[current_num] = [-1, -1, -1, -1, node[9]]
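Many of the learn methods below also rely on partition_classes and information_gain from the course's utils.py, which is not reproduced here. Judging only from the call sites (split the rows on one column at a value, then score the resulting label lists against the parent labels), plausible sketches are shown below; the real utils.py may use a different comparison rule for string attributes:

def partition_classes(X, y, split_attribute, split_val):
    # Split rows of X (and matching labels y) on one column.
    # Assumed rule: numeric columns go left when value <= split_val,
    # string columns go left when value == split_val.
    X_left, X_right, y_left, y_right = [], [], [], []
    for row, label in zip(X, y):
        value = row[split_attribute]
        goes_left = (value == split_val) if isinstance(value, str) else (value <= split_val)
        if goes_left:
            X_left.append(row)
            y_left.append(label)
        else:
            X_right.append(row)
            y_right.append(label)
    return X_left, X_right, y_left, y_right

def information_gain(previous_y, current_y):
    # Parent entropy minus the size-weighted entropy of the child label lists,
    # reusing the entropy sketch above.
    total = len(previous_y)
    remainder = sum(len(part) / total * entropy(part) for part in current_y if part)
    return entropy(previous_y) - remainder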
Example No. 5
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        gain_list = {}
        if entropy(y) != 0:
            for attr in range(len(X[0])):
                values = list(set([item[attr] for item in X]))
                for val in values:
                    X_left, X_right, y_left, y_right = partition_classes(
                        X, y, attr, val)
                    gain_list[(attr,
                               val)] = information_gain(y, [y_left, y_right])
            sp_attr, sp_val = list(gain_list.keys())[list(
                gain_list.values()).index(max(gain_list.values()))]
            X_left, X_right, y_left, y_right = partition_classes(
                X, y, sp_attr, sp_val)
            self.tree['split_attribute'] = sp_attr
            self.tree['split_val'] = sp_val
            self.tree['left'] = (X_left, y_left)
            self.tree['right'] = (X_right, y_right)
        pass
Example No. 6
def wordRank(seq, text):
    """
    A word's flexibility is judged by its left-neighbor set and right-neighbor set.
    """
    LeftSet, RightSet = [], []
    cur = text.find(seq)
    wl = len(seq)
    while cur != -1:
        if cur != 0:
            LeftSet.append(text[cur - 1:cur])
        RightSet.append(text[cur + wl:cur + wl + 1])
        cur = text.find(seq, cur + len(seq))
    entr = min(entropy(LeftSet), entropy(RightSet))
    if entr == 0:
        return 0
    return 1 / entr
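wordRank collects the characters immediately to the left and right of every occurrence of seq in text and returns the reciprocal of the smaller of the two neighbor entropies, so a word that combines freely with many different neighbors gets a low score. A hypothetical call, reusing the label-entropy sketch above (it accepts a list of single-character strings):

text = "machine learning makes machines learn from data"
score = wordRank("machine", text)
# -> 0 here: the left-neighbor set is just {' '}, its entropy is 0,
#    and the function returns 0 when either side has zero entropy.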
Example No. 7
def expected_information_gain(likelihood, prior):
    """
    likelihood: array of shape (n_concept, n_feature, n_y)
    prior: array of shape (n_concept,)
    Returns prior_entropy - expected_post_entropy, which has shape (n_feature,).
    """
    n_concept, n_feature, n_y = likelihood.shape
    full_post = full_posterior(likelihood, prior)
    prior_predictive = predictive(likelihood, prior)
    full_post_entropy = np.zeros([n_feature, n_y])
    for ind_x in range(n_feature):
        for ind_y in range(n_y):
            full_post_entropy[ind_x, ind_y] = entropy(full_post[:, ind_x,
                                                                ind_y])
    expected_post_entropy = np.sum(full_post_entropy * prior_predictive,
                                   axis=1)
    prior_entropy = entropy(prior)
    return prior_entropy - expected_post_entropy
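full_posterior and predictive are not shown. Assuming they compute the standard Bayes-rule quantities implied by the shapes above (a posterior over concepts for every feature/outcome pair, and the prior predictive over outcomes per feature), minimal sketches could look like the following, using scipy.stats.entropy as a stand-in for the entropy in scope:

import numpy as np
from scipy.stats import entropy

def predictive(likelihood, prior):
    # Prior predictive p(y | x), shape (n_feature, n_y);
    # likelihood has shape (n_concept, n_feature, n_y), prior has shape (n_concept,).
    return np.einsum('cfy,c->fy', likelihood, prior)

def full_posterior(likelihood, prior):
    # Posterior p(concept | x, y), shape (n_concept, n_feature, n_y).
    joint = likelihood * prior[:, None, None]
    norm = joint.sum(axis=0, keepdims=True)
    return joint / np.where(norm == 0, 1.0, norm)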
Example No. 8
 def learnlearn(X, y):
     if entropy(y) == 0:  # all the same label -> end of tree
         return {'label': y[0]}
     best_split = {}  # split_attr,split_value,left,right
     max_IG = -1
     current_split = None
     for attribute in range(len(X[0])):  # attribute = column indices
         unique_value = np.unique([x[attribute] for x in X])
         for value in unique_value:
             X_left, X_right, y_left, y_right = partition_classes(
                 X, y, attribute, value)
             IG = information_gain(y, [y_left, y_right])
             if IG > max_IG:
                 max_IG = IG
                 current_split = [attribute, value]
     if max_IG == 0:  # just couldn't split better -> end of tree
         cnt_0_1 = np.bincount(y)
         return {'label': [1, 0][cnt_0_1[0] > cnt_0_1[1]]}
     # record and split
     best_split["split_attr"] = current_split[0]
     best_split["split_value"] = current_split[1]
     X_left, X_right, y_left, y_right = partition_classes(
         X, y, current_split[0], current_split[1])
     # next level
     best_split['left'] = learnlearn(X_left, y_left)
     best_split['right'] = learnlearn(X_right, y_right)
     return best_split
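A hypothetical call to learnlearn on a tiny dataset, assuming numpy and the entropy / partition_classes / information_gain sketches above are in scope, shows the nested dictionary it builds:

X = [[1, 'a'], [2, 'a'], [3, 'b'], [4, 'b']]
y = [0, 0, 1, 1]
tree = learnlearn(X, y)
# With the sketched helpers this yields something like
# {'split_attr': 0, 'split_value': 2, 'left': {'label': 0}, 'right': {'label': 1}}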
Example No. 9
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        #pass

        max_info_gain, max_attribute, max_value = -1, 0, 0

        X_left = []
        X_right = []
        y_left = []
        y_right = []

        if not hasattr(self, 'tree_depth'):
            self.tree_depth = 0
        self.id = None

        if self.tree_depth > 10 or entropy(y) <= 0:
            self.id = y[0]
            return

        for i in range(0, len(X[0])):
            values = [X[j][i] for j in range(0, len(X))]
            # choose the split value according to its average in order to reduce the running time
            if isinstance(values[0], str):
                split_avg = values[0]
            else:
                split_avg = sum(values) / len(values)

            xLeft, xRight, yLeft, yRight = partition_classes(
                X, y, i, split_avg)
            current = []
            current.append(yLeft)
            current.append(yRight)
            temp = information_gain(y, current)
            if temp > max_info_gain:
                max_attribute = i
                max_value = split_avg
                max_info_gain = temp
                X_left = xLeft
                X_right = xRight
                y_left = yLeft
                y_right = yRight

        #build tree
        self.tree['max_attribute'], self.tree[
            'max_value'] = max_attribute, max_value
        self.tree['L'], self.tree['R'] = DecisionTree(), DecisionTree()

        #grow tree
        # Set depth before recursing so the depth cap above can take effect
        self.tree['L'].tree_depth = self.tree_depth + 1
        self.tree['R'].tree_depth = self.tree_depth + 1
        self.tree['L'].learn(X_left, y_left)
        self.tree['R'].learn(X_right, y_right)
Example No. 10
 def test_entropy_depenency_on_divisor(self):
     dd_entropy = self.election.entropy()
     ds_entropy = entropy(self.votes, self.election.results,
                          sainte_lague_gen)
     self.rules["primary_divider"] = "sainte-lague"
     self.rules["adj_determine_divider"] = "sainte-lague"
     self.rules["adj_alloc_divider"] = "sainte-lague"
     self.sl_election = Election(self.rules, self.votes)
     self.sl_election.run()
     ss_entropy = self.sl_election.entropy()
     sd_entropy = entropy(self.votes, self.sl_election.results, dhondt_gen)
     self.assertNotEqual(ds_entropy, dd_entropy)
     self.assertNotEqual(ss_entropy, dd_entropy)
     self.assertNotEqual(ss_entropy, sd_entropy)
     self.assertNotEqual(ds_entropy, sd_entropy)
     self.assertEqual(round(dd_entropy, 2), 42.95)
     self.assertEqual(round(ds_entropy, 2), 41.22)
     self.assertEqual(round(ss_entropy, 2), 41.22)
     self.assertEqual(round(sd_entropy, 2), 42.95)
Example No. 11
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)

        root = {
            'attribute_index': None,
            'value': None,
            'left': None,
            'right': None
        }

        # Reasonable cutoff on entropy to prevent overfitting
        if entropy(y) < 0.2:
            return sp.stats.mode(y)[0][0]

        else:
            info_gain = 0
            split_attribute_index = 0
            split_value = X[0][split_attribute_index]

            for item in X:
                for i in range(len(item)):
                    _, _, y_l, y_r = partition_classes(X, y, i, item[i])
                    gain = information_gain(y, [y_l, y_r])
                    # Keep the attribute/value combination that maximizes info gain
                    if gain > info_gain:
                        info_gain = gain
                        split_attribute_index = i
                        split_value = item[i]

            X_left, X_right, y_left, y_right = partition_classes(
                X, y, split_attribute_index, split_value)

            root['attribute_index'] = split_attribute_index
            root['value'] = split_value
            root['left'] = self.learn(X_left, y_left)
            root['right'] = self.learn(X_right, y_right)

            self.tree.insert(0, root)

            return root
Example No. 12
    def _grow_tree(self, X, Y, attributes, depth, value=None):
        """
        :param X: numpy 2D array, data samples
        :param Y: numpy 1D array, class labels
        :param attributes: list of attributes in the data
        :param depth: maximum depth of the tree
        :param value: possible value of the attribute for the current node/branch
        :return: a constructed node which has branches pointing further nodes

        This method grows a decision tree by recursively calling itself.
        """
        # Construct a node
        node = Node()
        node.probs = probabilities(Y)
        node.class_, _ = get_best_class_prob(node.probs)
        node.branch_value = value

        # Stop criteria
        if depth == 0 or entropy(Y) == 0 or len(attributes) == 0:
            return node

        # Find the best attribute
        attr, node.gain, split_value = self._best_attribute(X, Y, attributes)
        node.next_attr = attr

        if node.gain == 0:
            return node

        if depth:
            depth -= 1

        # Recurse to construct child nodes. The branches are built based on the type of the
        # attribute.
        # If the attribute is continuous, only two branches are formed (left-br,right-br)
        # If the attribute is discrete, a branch is created for each possible value of the
        # attribute.
        if self.attr_dtypes[attr] == int:
            # Copy rather than remove in place, so sibling branches keep their attribute list
            attributes = [a for a in attributes if a != attr]
            values = np.unique(X[:, attr])
            for val in values:
                subset_indices = np.where(X[:, attr] == val)[0]
                node.branches[val] = self._grow_tree(
                    X[subset_indices, :], Y[subset_indices], attributes, depth, val)
        elif self.attr_dtypes[attr] == float:
            node.split_value = split_value
            left_subset = np.where(X[:, attr] < split_value)[0]
            node.branches[L_BRANCH] = self._grow_tree(
                X[left_subset, :], Y[left_subset], attributes, depth, None)
            right_subset = np.where(X[:, attr] >= split_value)[0]
            node.branches[R_BRANCH] = self._grow_tree(
                X[right_subset, :], Y[right_subset], attributes, depth, None)
        return node
Example No. 13
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        if not hasattr(self, 'depth'):
            self.depth = 0
        self.group = None

        y_entropy = entropy(y)

        max_info_gain = -1
        split_attribute = -1
        split_val = ''
        x_left = []
        x_right = []
        y_left = []
        y_right = []

        if self.depth < 15 and y_entropy > 0:
            for column in range(len(X[0])):
                col_vals = [row[column] for row in X]
                trial_split_val = sum(col_vals) / (len(col_vals) * 1.0)
                x_l, x_r, y_l, y_r = partition_classes(X, y, column,
                                                       trial_split_val)
                current_y = [y_l, y_r]
                info_gain = information_gain(y, current_y)
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    split_attribute = column
                    split_val = trial_split_val
                    x_left = x_l
                    x_right = x_r
                    y_left = y_l
                    y_right = y_r

            self.tree['left'] = DecisionTree()
            self.tree['right'] = DecisionTree()
            self.tree['split_attribute'] = split_attribute
            self.tree['split_val'] = split_val
            self.tree['left'].depth = self.depth + 1
            self.tree['right'].depth = self.depth + 1
            self.tree['left'].learn(x_left, y_left)  # create tree within tree
            self.tree['right'].learn(x_right, y_right)

        else:
            self.group = y[0]
            return
Example No. 14
 def send_find_node(self, address, nid=None):
     nid = get_neighbor(nid, self.nid) if nid else self.nid
     tid = entropy(TID_LENGTH)
     msg = {
         "t": tid,
         "y": "q",
         "q": "find_node",
         "a": {
             "id": nid,
             "target": random_id()
         }
     }
     self.send_krpc(msg, address)
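In the DHT crawler snippets (this one, Example No. 16, and Example No. 23), entropy is not Shannon entropy: by convention in this kind of code it returns length random characters, used to build KRPC transaction IDs and to seed node IDs. A plausible sketch, with TID_LENGTH assumed to be a small constant such as 2:

from random import randint

TID_LENGTH = 2  # assumed value; mainline-DHT transaction IDs are typically 2 bytes

def entropy(length):
    # Return `length` random characters to use as a message/transaction ID.
    return "".join(chr(randint(0, 255)) for _ in range(length))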
Example No. 15
    def Build_tree(self, X, y):

        entropy_y = entropy(y)
        if entropy_y == 0:
            return np.atleast_2d(['Leaf', y[0], 'NA', 'NA'])

        else:
            best_info = 0
            best_i = -1
            best_j = 0
            for i in range(0, len(X[0])):
                split_range = [item[i] for item in X]
                type_x = split_range[0]
                if (isinstance(type_x, int) or isinstance(type_x, float)):
                    unique = np.unique(split_range)[0:-1]
                else:
                    unique = np.unique(split_range)
                    if len(unique) == 1:
                        unique = unique[0:-1]

                for j in unique:
                    [X_left, X_right, y_left,
                     y_right] = partition_classes(X, y, i, j)
                    current_y = list()
                    current_y.append(y_left)
                    current_y.append(y_right)
                    info_gain = information_gain(y, current_y)

                    if info_gain > best_info:
                        best_info = info_gain
                        best_i = i
                        best_j = j

            if (best_i == -1):
                counts = np.bincount(y)
                return np.atleast_2d(['Leaf', np.argmax(counts), 'NA', 'NA'])
            else:
                [X_left, X_right, y_left,
                 y_right] = partition_classes(X, y, best_i, best_j)
                lefttree = self.Build_tree(X_left, y_left)
                righttree = self.Build_tree(X_right, y_right)
                root = np.atleast_2d(
                    [best_i, best_j, 1,
                     np.atleast_2d(lefttree).shape[0] + 1])
                root_left = np.append(root, lefttree, axis=0)
                root_right = np.append(root_left, righttree, axis=0)

        return root_right
Example No. 16
    def send_find_node(self, address, nid=None):
        logging.debug("send find node to : " + str(address))

        nid = self.get_neighbor(nid, self.nid) if nid else self.nid
        tid = entropy(TID_LENGTH)
        msg = {
            "t": tid,
            "y": "q",
            "q": "find_node",
            "a": {
                "id": nid,
                "target": self.random_id()
            }
        }
        self.send_krpc(msg, address)
Example No. 17
 def select_feature(cls, y, X, possible_features, weights=None):
     """
     Select the best feature to split in the decision
     tree.
     """
     best_info_gain = -1
     split_feat = -1
     for feat in possible_features:
         info_gain = entropy(y, weights) - conditional_entropy(
             y, X[:, feat], weights)
         # print(feat, info_gain)
         if info_gain > best_info_gain:
             best_info_gain = info_gain
             split_feat = feat
     return split_feat
        def _learn(X, y):
            Y_entropy = entropy(y)
            if Y_entropy == 0:
                return [-1, y[0], None, None]

            cur_max_gain = 0
            best_attr = []
            best_index = -1

            for index in data_length:

                is_str = isinstance(X[0][index], str)
                if is_str:
                    attr_X = np.unique([X[i][index] for i in range(len(X))])

                    for attr in attr_X:
                        X_left, X_right, y_left, y_right = partition_classes(
                            X, y, index, attr)
                        gain = information_gain(y, [y_left, y_right])
                        if gain > cur_max_gain:
                            cur_max_gain = gain
                            best_index = index
                            best_attr = attr
                            best_X_left, best_X_right = X_left, X_right
                            best_y_left, best_y_right = y_left, y_right
                else:
                    attr_X = np.mean([X[i][index] for i in range(len(X))])
                    X_left, X_right, y_left, y_right = partition_classes(
                        X, y, index, attr_X)
                    gain = information_gain(y, [y_left, y_right])
                    if gain > cur_max_gain:
                        cur_max_gain = gain
                        best_index = index
                        best_attr = attr_X
                        best_X_left, best_X_right = X_left, X_right
                        best_y_left, best_y_right = y_left, y_right

            if cur_max_gain <= 0:
                return [-1, np.argmax(np.bincount(y)), None, None]

            left = _learn(best_X_left, best_y_left)
            right = _learn(best_X_right, best_y_right)

            return [best_index, best_attr, left, right]
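Example No. 17's select_feature scores each feature as entropy(y, weights) - conditional_entropy(y, X[:, feat], weights); neither helper is shown. Weighted sketches consistent with that call pattern (uniform weights when weights is None) might look like:

import numpy as np

def entropy(y, weights=None):
    # Weighted Shannon entropy (base 2) of the labels in y.
    y = np.asarray(y)
    w = np.ones(len(y)) if weights is None else np.asarray(weights, dtype=float)
    total = w.sum()
    ent = 0.0
    for label in np.unique(y):
        p = w[y == label].sum() / total
        if p > 0:
            ent -= p * np.log2(p)
    return ent

def conditional_entropy(y, x, weights=None):
    # Weighted conditional entropy H(y | x) for one feature column x.
    y, x = np.asarray(y), np.asarray(x)
    w = np.ones(len(y)) if weights is None else np.asarray(weights, dtype=float)
    total = w.sum()
    cond = 0.0
    for value in np.unique(x):
        mask = x == value
        cond += (w[mask].sum() / total) * entropy(y[mask], w[mask])
    return cond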
Example No. 19
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        # pass
        self.group = None
        if not hasattr(self, 'depth'):
            self.depth = 0
        max_info_gain = -float("inf")
        split_attribute = -1
        x_l, x_r, y_l, y_r = [], [], [], []

        if self.depth < 15 and entropy(y) > 0:

            for col in range(len(X[0])):
                values = [row[col] for row in X]
                cur_split_val = sum(values) / len(values)

                X_left, X_right, y_left, y_right = partition_classes(
                    X, y, col, cur_split_val)
                cur_y = [y_left, y_right]
                cur_info_gain = information_gain(y, cur_y)
                if max_info_gain < cur_info_gain:
                    max_info_gain = cur_info_gain
                    x_l, x_r, y_l, y_r = X_left, X_right, y_left, y_right
                    split_attribute = col
                    split_val = cur_split_val

            self.tree['left'], self.tree['right'] = DecisionTree(
            ), DecisionTree()
            self.tree['split_attribute'] = split_attribute
            self.tree['split_val'] = split_val
            self.tree['left'].depth = self.depth + 1
            self.tree['right'].depth = self.depth + 1
            self.tree['left'].learn(x_l, y_l)
            self.tree['right'].learn(x_r, y_r)

        else:
            self.group = y[0]
            return
    def learn(self, X, y):
        if X.min() == X.max():
            self.info = np.round(y.mean())
            self.isTree = False
        elif entropy(y) == 0:
            self.info = y[0]
            self.isTree = False
        else:
            best_attribute, best_val = findBestSplit(X, y)
            self.info = [best_attribute, best_val]
            X_left, X_right, y_left, y_right = partition_classes(X, y, best_attribute, best_val)

            self.tree["left"] = DecisionTree()
            self.tree["left"].learn(X_left, y_left)
            self.tree["right"] = DecisionTree()
            self.tree["right"].learn(X_right, y_right)
Example No. 21
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        trees = {}
        if entropy(y) != 0:
            info = {}
            for i in range(len(X[0])):
                unique = list(set([x[i] for x in X]))
                for j in unique:
                    X_left, X_right, y_left, y_right = partition_classes(
                        X, y, i, j)
                    info[(i, j)] = information_gain(y, [y_left, y_right])

            vals = list(info.values())
            max_vals = max(vals)
            split_vals = list(info.keys())
            split_attr, split_vals2 = split_vals[vals.index(max_vals)]

            X_left, X_right, y_left, y_right = partition_classes(
                X, y, split_attr, split_vals2)

            trees['split_attr'] = split_attr
            trees['split_vals'] = split_vals2
            trees['left_tree'] = (X_left, y_left)
            trees['right_tree'] = (X_right, y_right)
        else:
            trees['leaf'] = 1
            trees['label'] = y[0]

        self.tree = trees
Example No. 22
    def forward(self, quant_pred, target_wav):
        """
        quant_pred: 
        target_wav: B,  
        """
        # Loss per embedding vector
        com_loss_embeds = self.bn.min_dist * self.bn.gamma

        log_pred = self.logsoftmax(quant_pred)
        log_pred_target = torch.gather(log_pred, 1,
                                       target_wav.long().unsqueeze(1))

        rec_loss_ts = -log_pred_target
        # total_loss = rec_loss_ts.sum() + com_loss_embeds.sum()
        # total_loss = rec_loss_ts.sum()
        total_loss = com_loss_embeds.sum()
        # total_loss = com_loss_embeds.sum() * 0.0

        nh = self.bn.ind_hist / self.bn.ind_hist.sum()

        self.metrics = {
            'rec': rec_loss_ts.mean(),
            'com': com_loss_embeds.mean(),
            'min_ze': self.bn.ze_norm.min(),
            'max_ze': self.bn.ze_norm.max(),
            'min_emb': self.bn.emb_norm.min(),
            'max_emb': self.bn.emb_norm.max(),
            'hst_ent': util.entropy(self.bn.ind_hist, True),
            # 'hst_100': util.entropy(util.int_hist(self.bn.circ_inds, -1), True),
            'nunq': self.bn.uniq.nelement(),
            'pk_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
            'pk_nuq': log_pred.max(dim=1)[1].unique().nelement(),
            'pk_sd': log_pred.max(dim=1)[0].to(torch.float).std()
        }

        return total_loss
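Here util.entropy(self.bn.ind_hist, True) is only a monitoring metric over the histogram of codebook-index usage. Assuming the boolean flag means "the input is an unnormalized histogram, normalize it before computing entropy", a minimal torch sketch is:

import torch

def entropy(hist, is_histogram=False):
    # Shannon entropy (nats) of a distribution given as a 1-D tensor.
    # If is_histogram, `hist` holds raw counts and is normalized first.
    p = hist.float()
    if is_histogram:
        p = p / p.sum()
    p = p[p > 0]
    return -(p * p.log()).sum()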
Example No. 23
def random_id():
    h = sha1()
    h.update(entropy(20))
    return h.digest()
Example No. 24
def get_entropy():
    entropy = util.entropy()
    return json.dumps({"entropy": entropy})
Example No. 25
def simulate(rumor, step_mode = 'time', step = 10, limit = 2400):
  rumor_edges = rumor['edges']
  rumor_statuses = rumor['statuses']
  trend_onset = rumor['trend_onset']

  # Figure
  plt.figure()

  # Time series
  max_sizes = []
  total_sizes = []
  component_nums = []
  entropies = []
  max_component_ratios = []
  timestamps = []

  min_time = min([ edge[2] for edge in rumor_edges ])
  if step_mode == 'time':
    next_time = min_time
  max_pos = limit

  print('time\t\teid\t\tpos\t\t|C_max|\t\tN(C)\t\ttime-trend_onset')

  components = {}
  node_to_component_id = {}
  adj={}

  # Set to keep track of statuses that gain many inbound edges at the same
  # time. This happens when a user follows lots of people that have mentioned
  # the topic, then tweets about the topic gets all of those followees as
  # parents, causing a sharp spike in the component growth

  # spikeset = set()

  for eid, edge in enumerate(rumor_edges):
    # print edge
    # print components
    # print node_to_component_id

    # Update adjacency list
    if edge[0] in adj:
      adj[edge[0]].append(edge[1])
    else:
      adj[edge[0]]=[edge[1]]
    
    # Update components
    if edge[0] not in node_to_component_id and edge[1] not in \
        node_to_component_id:
      # Create new component with id edge[0] (i.e. first node belonging to that
      #  component)
      component_id = edge[0]
      # print 'Creating new component ', component_id, ' from ', edge[0], ' and
      # ', edge[1]
      members = set([edge[0], edge[1]])
      components[edge[0]] = members
      node_to_component_id[edge[0]] = component_id
      node_to_component_id[edge[1]] = component_id
    elif edge[0] not in node_to_component_id:
      c1 = node_to_component_id[edge[1]]
      # print 'Adding ', edge[0], ' to ', c1, ': ', components[c1]
      # raw_input('')
      components[c1].add(edge[0])
      node_to_component_id[edge[0]] = c1
    elif edge[1] not in node_to_component_id:
      c0 = node_to_component_id[edge[0]]
      # print 'Adding ', edge[1], ' to ', c0, ': ', components[c0]
      # raw_input('')
      components[c0].add(edge[1])
      node_to_component_id[edge[1]] = c0
    else:
      c0 = node_to_component_id[edge[0]]
      c1 = node_to_component_id[edge[1]]
      if c0 != c1:
        # Merge components.
        members = components[c1]
        # print 'Merging\n', c0, ': ', components[c0], '\ninto\n', c1, ': ',
        # components[c1], '\n' raw_input('')
        for member in components[c0]:
          members.add(member)
          node_to_component_id[member] = c1
        components.pop(c0)
    
    """
    # Pause when you have some number of repeat statuses in a row (meaning that
    # lots of edges that terminate in that status suddenly got created)
    repeat_num = 2
    status_id = rumor_statuses[rumor_edges[eid][1]][0]
    if eid > repeat_num and \ 
        last_k_statuses_equal(status_id, rumor_statuses,rumor_edges, eid, repeat_num) and \
        status_id not in spikeset:
      print (rumor_statuses[rumor_edges[eid][0]], \ 
        rumor_statuses[rumor_edges[eid][1]])
      spikeset.add(status_id)
      raw_input()
    """

    if step_mode == 'index':
      pos = eid
    elif step_mode == 'time':
      pos = edge[2] - min_time
        
    if pos > limit:
      break

    if step_mode == 'index' and eid % step:
      continue
    if step_mode == 'time':
      if edge[2] < next_time:
        continue
      else:
        next_time = edge[2] + step

    component_sizes = []
    # raw_input('======================================================'
    for cid, members in components.items():
      component_sizes.append(len(members))
      # print 'component ', cid, ' size: ', len(members)  
      # raw_input('-------------------')

    time_after_onset = None
    if trend_onset is not None:
      time_after_onset = edge[2] - trend_onset

    print(edge[2] - min_time, '\t\t', eid, '\t\t', pos, '/', limit, '\t\t', max(component_sizes), '\t\t', len(components), '\t\t', time_after_onset)
    # Print largest adjacency list sizes.
    neighbor_counts = [len(adj[k]) for k in adj]
    sorted_idx = sorted(range(len(neighbor_counts)),
                        key=lambda i: neighbor_counts[i], reverse=True)
    adj_keys = list(adj.keys())
    for itop in range(10):
      if itop >= len(sorted_idx):
        break
      print(adj_keys[sorted_idx[itop]], ':', neighbor_counts[sorted_idx[itop]])
    input()

    # Desc sort of component sizes
    component_sizes.sort()
    component_sizes.reverse()

    # Append to timeseries
    max_sizes.append(max(component_sizes))
    total_sizes.append(sum(component_sizes))
    component_nums.append(len(component_sizes))
    entropies.append(util.entropy(component_sizes))
    if trend_onset is None:
      trend_onset = 0
    timestamps.append((edge[2] - trend_onset) / (60 * 60))
    max_component_ratios.append(float(max(component_sizes))/sum(component_sizes))
    shifted_ind = np.linspace(1, 1 + len(component_sizes), len(component_sizes))

    if eid > 0:
      color = util.step_to_color(pos, max_pos)
      plt.subplot(331)
      plt.loglog(shifted_ind, component_sizes, color = color, hold = 'on')
      plt.title('Loglog desc component sizes')

      plt.subplot(332)
      plt.semilogy(timestamps[-1], max_sizes[-1], 'ro', color = color,
                   hold = 'on')
      plt.title('Max component size')
      plt.xlabel('time (hours)')

      plt.subplot(333)
      plt.semilogy(timestamps[-1], total_sizes[-1], 'ro', color = color,
                   hold = 'on')
      plt.title('Total network size')
      plt.xlabel('time (hours)')

      plt.subplot(334)
      plt.plot(timestamps[-1], entropies[-1], 'go', color = color, hold = 'on')
      plt.title('Entropy of desc component sizes')
      plt.xlabel('time (hours)')

      plt.subplot(335)
      plt.semilogy(timestamps[-1], component_nums[-1], 'ko', color = color,
                   hold = 'on')
      plt.title('Number of components')
      plt.xlabel('time (hours)')

      plt.subplot(336)
      plt.loglog(shifted_ind, np.cumsum(component_sizes), color = color,
                 hold = 'on')
      plt.title('Cum. sum. of desc component sizes')

      plt.subplot(337)
      plt.plot(timestamps[-1], max_component_ratios[-1], 'ko', color = color,
               hold = 'on')
      plt.title('Max comp size / Total network Size')
      plt.xlabel('time (hours)')

    # plt.hist(component_sizes, np.linspace(0.5, 15.5, 15))
    # plt.plot(np.cumsum(np.histogram(component_sizes, bins = np.linspace(0.5,
    # 15.5, 15))[0]), hold = 'on')
    if not eid % 15*step:
      pass#plt.pause(0.001)
  plt.show()
  return components
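simulate uses two helpers from its util module that are not shown: util.entropy(component_sizes), which here takes a list of component sizes and treats it as an unnormalized distribution, and util.step_to_color(pos, max_pos), which maps progress through the edge stream to a plot color. Hypothetical sketches:

import numpy as np
from matplotlib import cm

def entropy(sizes):
    # Shannon entropy (nats) of the distribution obtained by normalizing `sizes`.
    p = np.asarray(sizes, dtype=float)
    p = p / p.sum()
    p = p[p > 0]
    return float(-(p * np.log(p)).sum())

def step_to_color(pos, max_pos):
    # Map a position in [0, max_pos] onto a colormap (the choice of map is arbitrary here).
    return cm.viridis(min(pos, max_pos) / float(max_pos))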
Example No. 26
    def forward(self, quant_pred, target_wav):
        """
        quant_pred: 
        target_wav: B,  
        """
        # Loss per embedding vector 
        l2_loss_embeds = self.l2(self.bn.sg(self.bn.ze), self.bn.emb)
        # l2_loss_embeds = scaled_l2_norm(self.bn.sg(self.bn.ze), self.bn.emb)
        com_loss_embeds = self.bn.min_dist * self.bn.gamma

        log_pred = self.logsoftmax(quant_pred)
        log_pred_target = torch.gather(log_pred, 1,
                target_wav.long().unsqueeze(1))

        # Loss per timestep
        # !!! We don't need a 'loss per timestep'.  We only need
        # to adjust the l2 and com losses by usage weight of each
        # code.  (The codes at the two ends of the window will be
        # used less)
        rec_loss_ts = - log_pred_target

        # Use only a subset of the overlapping windows
        #sl = slice(0, 1)
        #rec_loss_sel = rec_loss_ts[...,sl]
        #l2_loss_sel = l2_loss_ts[...,sl]
        #com_loss_sel = com_loss_ts[...,sl]
        
        # total_loss_sel = rec_loss_sel + l2_loss_sel + com_loss_sel
        # total_loss_ts = l2_loss_ts
        # total_loss_ts = com_loss_ts
        # total_loss_ts = com_loss_ts + l2_loss_ts
        # total_loss_ts = log_pred_loss_ts + l2_loss_ts
        # total_loss_ts = log_pred_loss_ts 
        # total_loss_ts = com_loss_ts - com_loss_ts

        # total_loss = total_loss_sel.mean()

        # We use sum here for each of the three loss terms because each element
        # should affect the total loss equally.  For a typical WaveNet
        # architecture, there will be only one l2 loss term (or com_loss term)
        # per 320 rec_loss terms, due to upsampling.  We could adjust for that.
        # Implicitly, com_loss is already adjusted by gamma.  Perhaps l2_loss
        # should also be adjusted, but at the moment it is not.
        total_loss = rec_loss_ts.sum() + l2_loss_embeds.sum() + com_loss_embeds.sum()

        nh = self.bn.ind_hist / self.bn.ind_hist.sum()

        self.metrics = { 
                'rec': rec_loss_ts.mean(),
                'l2': l2_loss_embeds.mean(),
                'com': com_loss_embeds.mean(),
                #'ze_rng': self.bn.ze.max() - self.bn.ze.min(),
                #'emb_rng': self.bn.emb.max() - self.bn.emb.min(),
                'min_ze': self.bn.ze_norm.min(),
                'max_ze': self.bn.ze_norm.max(),
                'min_emb': self.bn.emb_norm.min(),
                'max_emb': self.bn.emb_norm.max(),
                'hst_ent': util.entropy(self.bn.ind_hist, True),
                'hst_100': util.entropy(util.int_hist(self.bn.circ_inds, -1), True),
                #'p_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
                #'p_sd': log_pred.max(dim=1)[0].to(torch.float).std(),
                'nunq': self.bn.uniq.nelement(),
                'pk_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
                'pk_nuq': log_pred.max(dim=1)[1].unique().nelement(),
                # 'peak_unq': log_pred.max(dim=1)[1].unique(),
                'pk_sd': log_pred.max(dim=1)[0].to(torch.float).std(),
                # 'unq': self.bn.uniq,
                #'m_ze': self.bn.ze_norm.max(),
                #'m_emb': self.bn.emb_norm.max()
                #emb0 = emb - emb.mean(dim=0)
                #chan_var = (emb0 ** 2).sum(dim=0)
                #chan_covar = torch.matmul(emb0.transpose(1, 0), emb0) - torch.diag(chan_var)
                }
        # netmisc.print_metrics(losses, 10000000)

        return total_loss
Example No. 27
# poll_stats.py
# Jonah Smith
# Storytelling with Streaming Data, Spring 2016
#
# This file, in an infinite loop, uses the functions in the util file to
# calculate entropy and rate based on the state of the Redis db. It takes no
# input, and emits a JSON string with the entropy and rate to stdout. These
# messages are monitored by find-anomalies.py to, maybe not surprisingly, find
# anomalies.

import json
from sys import stdout
from time import sleep
# util has our functions for calculating the entropy and rate.
import util

# Repeat the entropy and rate calculations indefinitely.
while 1:
    # Use our utility functions to calculate entropy and rate.
    entropy = util.entropy()
    rate = util.rate()

    # Dump the entropy and rate to stdout and flush so the output doesn't sit
    # in a buffer.
    print(json.dumps({'entropy': entropy, 'rate': rate}))
    stdout.flush()

    # Rest for one second. This gives us a nice smooth function for the rate
    # and entropy values.
    sleep(1)
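The util module behind poll_stats.py is not included. As an illustration only, with an assumed schema in which the stream processor keeps per-key counts in a Redis hash named 'counts' and an events-per-second figure in a key named 'rate', the two helpers could look like:

# hypothetical util.py; the key names and schema are assumptions
import math
import redis

r = redis.Redis(host='localhost', port=6379, db=0)

def entropy():
    # Shannon entropy (bits) of the current count distribution in the 'counts' hash.
    counts = [int(v) for v in r.hvals('counts')]
    total = sum(counts)
    if total == 0:
        return 0.0
    return -sum((c / total) * math.log2(c / total) for c in counts if c > 0)

def rate():
    # Events per second, as maintained by the ingest process.
    value = r.get('rate')
    return float(value) if value is not None else 0.0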
Example No. 28
    def forward(self, quant_pred, target_wav):
        # Loss per embedding vector
        l2_loss_embeds = self.l2(self.bn.ze, self.bn.emb)
        com_loss_embeds = self.bn.l2norm_min * self.bn.gamma
        # l2_loss_embeds = self.l2(self.bn.ze, self.bn.emb).sqrt()
        # com_loss_embeds = self.bn.l2norm_min.sqrt() * self.bn.gamma

        log_pred = self.logsoftmax(quant_pred)
        log_pred_target = torch.gather(log_pred, 1, target_wav.unsqueeze(1))

        # Loss per timestep
        rec_loss_ts = -log_pred_target
        l2_loss_ts = self.combine(l2_loss_embeds.unsqueeze(1))[..., :-1]
        com_loss_ts = self.combine(com_loss_embeds.unsqueeze(1))[..., :-1]

        # Use only a subset of the overlapping windows
        sl = slice(0, 1)
        rec_loss_sel = rec_loss_ts[..., sl]
        l2_loss_sel = l2_loss_ts[..., sl]
        com_loss_sel = com_loss_ts[..., sl]

        total_loss_sel = rec_loss_sel + l2_loss_sel + com_loss_sel
        # total_loss_ts = l2_loss_ts
        # total_loss_ts = com_loss_ts
        # total_loss_ts = com_loss_ts + l2_loss_ts
        # total_loss_ts = log_pred_loss_ts + l2_loss_ts
        # total_loss_ts = log_pred_loss_ts
        # total_loss_ts = com_loss_ts - com_loss_ts

        total_loss = total_loss_sel.mean()

        nh = self.bn.ind_hist / self.bn.ind_hist.sum()

        self.metrics = {
            'rec': rec_loss_sel.mean(),
            'l2': l2_loss_sel.mean(),
            'com': com_loss_sel.mean(),
            #'ze_rng': self.bn.ze.max() - self.bn.ze.min(),
            #'emb_rng': self.bn.emb.max() - self.bn.emb.min(),
            'min_ze': self.bn.ze_norm.min(),
            'max_ze': self.bn.ze_norm.max(),
            'min_emb': self.bn.emb_norm.min(),
            'max_emb': self.bn.emb_norm.max(),
            'hst_ent': util.entropy(self.bn.ind_hist, True),
            'hst_100': util.entropy(util.int_hist(self.bn.circ_inds, -1),
                                    True),
            #'p_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
            #'p_sd': log_pred.max(dim=1)[0].to(torch.float).std(),
            'nunq': self.bn.uniq.nelement(),
            'pk_m': log_pred.max(dim=1)[0].to(torch.float).mean(),
            'pk_nuq': log_pred.max(dim=1)[1].unique().nelement(),
            # 'peak_unq': log_pred.max(dim=1)[1].unique(),
            'pk_sd': log_pred.max(dim=1)[0].to(torch.float).std(),
            # 'unq': self.bn.uniq,
            #'m_ze': self.bn.ze_norm.max(),
            #'m_emb': self.bn.emb_norm.max()
            #emb0 = emb - emb.mean(dim=0)
            #chan_var = (emb0 ** 2).sum(dim=0)
            #chan_covar = torch.matmul(emb0.transpose(1, 0), emb0) - torch.diag(chan_var)
        }
        # netmisc.print_metrics(losses, 10000000)

        return total_loss
Example No. 29
def get_entropy():
    entropy = util.entropy()
    return json.dumps({'entropy': entropy})
Example No. 30
    def learn(self, X, y):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree
        
        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a 
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        nrows = len(X)
        ncols = len(X[0]) if nrows > 0 else 0

        ent = entropy(y)

        self.tree['entropy'] = ent

        if ent < 0.1 or len(y) <= 100:
            if len(y) == 0:
                return self

            self.tree['class'] = scipy.stats.mode(y).mode[0]
            self.tree['split_attr'] = 'null'
            self.tree['split_val'] = 'null'
            self.tree['left_child'] = None
            self.tree['right_child'] = None
            return self

        info_gain = []
        split_val = []

        for idx in range(ncols - 1):
            best_val_for_column = 0
            best_gain_for_column = 0

            series = [row[idx] for row in X]
            steps = np.linspace(start=np.min(series), stop=np.max(series), num=5)[1:4]
            for val in steps:
                X_left, X_right, y_left, y_right = partition_classes(X, y, idx, val)
                gain = information_gain(y, [y_left, y_right])
                if gain > best_gain_for_column:
                    best_gain_for_column = gain
                    best_val_for_column = val

            info_gain.append(best_gain_for_column)
            split_val.append(best_val_for_column)
        best_split_col = np.argmax(info_gain)
        best_split_value = split_val[best_split_col]

        X_left, X_right, y_left, y_right = partition_classes(X, y, best_split_col, best_split_value)

        self.tree['class'] = 'parent'
        self.tree['split_attr'] = best_split_col
        self.tree['split_val'] = best_split_value

        self.tree['left_child'] = DecisionTree()
        self.tree['left_child'].learn(X_left, y_left)

        self.tree['right_child'] = DecisionTree()
        self.tree['right_child'].learn(X_right, y_right)

    def learn(self, X, y, par_node={}, depth=0):
        # TODO: Train the decision tree (self.tree) using the sample X and labels y
        # You will have to make use of the functions in utils.py to train the tree

        # Use the function best_split in util.py to get the best split and
        # data corresponding to left and right child nodes

        # One possible way of implementing the tree:
        #    Each node in self.tree could be in the form of a dictionary:
        #       https://docs.python.org/2/library/stdtypes.html#mapping-types-dict
        #    For example, a non-leaf node with two children can have a 'left' key and  a
        #    'right' key. You can add more keys which might help in classification
        #    (eg. split attribute and split value)
        ### Implement your code here
        #############################################

        entropy_y = entropy(y)

        if len(X) == 0 or len(y) == 0:
            self.tree['state'] = 'leaf'
            self.tree['result'] = 0
            return

        if len(set(y)) == 1:
            # if all same in y
            self.tree['state'] = 'leaf'
            self.tree['result'] = y[0]
            return

        y_dist = {}
        for yi in y:
            if yi in y_dist.keys():
                y_dist[yi] += 1
            else:
                y_dist[yi] = 1

        y_max_val = 0
        y_max_count = 0

        for k, v in y_dist.items():
            if v > y_max_count:
                y_max_val = k
                y_max_count = v

        all_same = True
        for i in range(1, len(X)):
            if X[i] == X[i - 1]:
                continue
            else:
                all_same = False
                break

        if all_same or depth == self.max_depth:
            self.tree['state'] = 'leaf'
            self.tree['result'] = y_max_val
            return

        split_column, split_value, X_left, X_right, y_left, y_right = best_split(
            X, y)

        self.tree['state'] = "parent"
        self.tree['result'] = "null"

        self.tree['split_attr'] = split_column
        self.tree['split_val'] = split_value

        self.tree['left'] = DecisionTree()
        self.tree['left'].learn(X_left, y_left, self, depth + 1)

        self.tree['right'] = DecisionTree()
        self.tree['right'].learn(X_right, y_right, self, depth + 1)