Ejemplo n.º 1
0
    def enforce_dummy_coded(self, X):
        """
        Snap every dummy-coded feature group to a strict one-hot encoding.

        For each group reported by ``StructuredDataset._parse_feature_names``
        the column holding the row-wise maximum is set to 1 and every other
        column of that group is set to 0. Intended to run after gradient
        ascent. ``X`` is modified in place and also returned.

        :param X: Feature matrix (dimension `n_instances x n_features`)
        :returns: X' (modified feature matrix)
        """
        dummy_groups = StructuredDataset._parse_feature_names(
            self.feature_names)[0]

        for base_name, categories in dummy_groups.items():
            # Column positions in X of this group's dummy columns.
            group_cols = [
                self.feature_names.index(base_name + '=' + category)
                for category in categories
            ]

            # Per row: position (within the group) of the largest entry.
            winners = np.argmax(X[:, group_cols], axis=1)

            # Clear the whole group, then switch on the winning column of
            # each row in a single indexed assignment.
            X[:, group_cols] = 0
            row_ids = np.arange(len(winners))
            X[row_ids, np.asarray(group_cols)[winners]] = 1

            # Post-condition: each row has exactly one active column.
            assert (X[:, group_cols].sum(axis=1) == 1).all()

        return X
Ejemplo n.º 2
0
 def _get_domain(self, ft):
     """
     Infers domain of feature.
     :param ft: Feature name
     :returns: Domain
     """
     if callable(self.domains[ft]):
         return [self.domains[ft]()]
     elif self._is_dummy_coded(ft):
         raise Exception("Can't use dummy coded for sim")
         warnings.warn(
             "Use set of values present in dataset to infer domain for feature "
             + ft)
         # discrete, dummy coded
         return StructuredDataset._parse_feature_names(
             self.feature_names)[0][ft]
     elif ft in self.discrete:
         # discrete
         #warnings.warn("Use set of values present in dataset to infer domain for feature " + ft)
         return list(set(self.features[:, self._ft_index(ft)]))
     else:
         # continious
         df, _ = self.convert_to_dataframe()
         warnings.warn("Used min/max for feature " + ft +
                       " to infer domain + unsupported/not implemented yet")
         return (min(df[ft]), max(df[:, ft]))
Ejemplo n.º 3
0
 def _is_dummy_coded(self, ft):
     """
     Tell whether a feature is dummy coded.

     The value returned is the length of the entry for ``ft`` in the first
     element of ``StructuredDataset._parse_feature_names(...)``, i.e. an
     ``int`` meant to be used as a truth value rather than a real ``bool``.

     :param ft: Feature name
     :returns: Number of entries registered for ``ft`` (non-zero is truthy,
         meaning ft is dummycoded)
     """
     # TODO: fix this -- the result is a count, not the boolean the
     # contract describes.
     # NOTE(review): presumably the mapping yields an empty sequence for
     # features that are not dummy coded; confirm against
     # _parse_feature_names.
     dummy_groups = StructuredDataset._parse_feature_names(
         self.feature_names)[0]
     categories = dummy_groups[ft]
     return len(categories)
Ejemplo n.º 4
0
    def _dedummy_code_obj(self, obj, sep='='):
        """
        Collapse the dummy-coded entries of a single instance.

        For every dummy-coded feature the ``<feature><sep><category>`` keys
        are removed and replaced by one ``<feature>`` key holding the
        category whose entry equals 1 (``None`` if no entry equals 1; the
        last matching category if several do). ``obj`` itself is not
        modified; a shallow copy is returned.

        :param obj: Instance (feature values) in object form (dict)
        :param sep: Separator used for dummy coding
        :returns: dedummy coded object
        """
        # Hand-rolled on purpose: the library routine is too slow when only
        # one row has to be converted.
        # NOTE(review): `sep` is not forwarded to _parse_feature_names --
        # confirm that the parser's own separator matches.
        dummy_groups = StructuredDataset._parse_feature_names(
            self.feature_names)[0]
        decoded = obj.copy()

        for base_name, categories in dummy_groups.items():
            # Categories whose dummy column is switched on for this instance.
            active = [
                category for category in categories
                if obj[base_name + sep + category] == 1
            ]

            # Store the plain (non-dummy) value under the base feature name.
            decoded[base_name] = active[-1] if active else None

            # Drop every "<feature><sep><category>" key of this group.
            for category in categories:
                decoded.pop(base_name + sep + category)

        return decoded
Ejemplo n.º 5
0
    def scale_dummy_coded(self, X):
        """
        Rescale each dummy-coded feature group so its values sum to 1.

        Within every group, each row is divided by that row's sum over the
        group's columns. Intended to run during gradient ascent; the
        write-up has the in-depth explanation. ``X`` is modified in place
        and also returned.

        :param X: Feature matrix (dimension `n_instances x n_features`)
        :returns: X' (modified feature matrix)
        """
        dummy_groups = StructuredDataset._parse_feature_names(
            self.feature_names)[0]

        for base_name, categories in dummy_groups.items():
            # Column positions in X of this group's dummy columns.
            group_cols = [
                self.feature_names.index(base_name + '=' + category)
                for category in categories
            ]

            group_values = X[:, group_cols]
            row_totals = group_values.sum(axis=1)
            # NOTE(review): a row whose group total is 0 is divided by zero
            # here; that case is not handled.
            X[:, group_cols] = group_values / row_totals[:, None]

            # Sanity check: the group's grand total must be (close to) the
            # number of rows.
            assert np.isclose(X[:, group_cols].sum(axis=1).sum(), len(X))

        return X