Example 1
    def insert(self, M_c, T, X_L_list, X_D_list, new_rows=None, N_GRID=31, CT_KERNEL=0):
        """
        Insert mutates the data T.
        """

        if new_rows is None:
            raise ValueError("new_row must exist")

        if not isinstance(new_rows, list):
            raise TypeError('new_rows must be a list of lists')
        if not isinstance(new_rows[0], list):
            raise TypeError('new_rows must be a list of lists')

        X_L_list, X_D_list, was_multistate = su.ensure_multistate(X_L_list, X_D_list)

        # get insert arg tuples
        arg_tuples = self.get_insert_arg_tuples(M_c, T, X_L_list, X_D_list, new_rows, N_GRID,
                                                CT_KERNEL)

        chain_tuples = self.mapper(self.do_insert, arg_tuples)
        X_L_list, X_D_list = zip(*chain_tuples)

        if not was_multistate:
            X_L_list, X_D_list = X_L_list[0], X_D_list[0]

        T.extend(new_rows)

        ret_tuple = X_L_list, X_D_list, T

        return ret_tuple
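A minimal call sketch for the method above. The engine object, M_c, T, X_L, and X_D are assumed to come from an earlier initialize/analyze step and are not defined here; only the call shape and the return values follow from the signature shown.

    # Hypothetical usage sketch: `engine`, `M_c`, `T`, `X_L`, `X_D` are assumed
    # to exist already (e.g. from engine.initialize / engine.analyze).
    new_rows = [
        [1.2, 0.0, 3.4],   # one inner list per new row, same width as the rows of T
        [0.7, 1.0, 2.1],
    ]
    X_L, X_D, T = engine.insert(M_c, T, X_L, X_D, new_rows=new_rows)
    # Note: T is also mutated in place, because insert() ends with T.extend(new_rows).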
Example 2
    def ensure_row_dep_constraint(self, M_c, T, X_L, X_D, row1, row2,
            dependent=True, wrt=None, max_iter=100, force=False):
        """Ensures dependencey or indepdendency between rows with respect to
        (wrt) columns."""
        X_L_list, X_D_list, was_multistate = su.ensure_multistate(X_L, X_D)
        if force:
            raise NotImplementedError
        else:
            kernel_list = ('row_partition_assignements',)
            for i, (X_L_i, X_D_i) in enumerate(zip(X_L_list, X_D_list)):
                iters = 0
                X_L_tmp = copy.deepcopy(X_L_i)
                X_D_tmp = copy.deepcopy(X_D_i)
                while not self.assert_row(X_L_tmp, X_D_tmp, row1, row2,
                        dependent=dependent, wrt=wrt):
                    if iters >= max_iter:
                        raise RuntimeError('Maximum ensure iterations reached.')
                    res = self.analyze(M_c, T, X_L_i, X_D_i, kernel_list=kernel_list,
                        n_steps=1, r=(row1,))
                    X_L_tmp = res[0]
                    X_D_tmp = res[1]
                    iters += 1
                X_L_list[i] = X_L_tmp
                X_D_list[i] = X_D_tmp

        if was_multistate:
            return X_L_list, X_D_list
        else:
            return X_L_list[0], X_D_list[0]
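A hedged call sketch, again assuming engine and the state objects already exist; the method keeps re-running the row-partition kernel until assert_row reports the requested (in)dependence or max_iter is exhausted.

    # Hypothetical usage sketch: require rows 0 and 4 to share a row cluster
    # with respect to columns 1 and 2, in every model of the multistate.
    X_L, X_D = engine.ensure_row_dep_constraint(
        M_c, T, X_L, X_D,
        row1=0, row2=4,
        dependent=True,     # dependent=False would require different clusters
        wrt=[1, 2],
        max_iter=100,
    )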
Example 3
    def assert_row(self, X_L, X_D, row1, row2, dependent=True, wrt=None):
        X_L_list, X_D_list, was_multistate = su.ensure_multistate(X_L, X_D)

        if wrt is None:
            num_cols = len(X_L_list[0]['column_partition']['assignments'])
            wrt = list(range(num_cols))
        else:
            if not isinstance(wrt, list):
                raise TypeError('wrt must be a list')

        model_assertions = []

        for X_L_i, X_D_i in zip(X_L_list, X_D_list):
            view_assg = X_L_i['column_partition']['assignments']
            views_wrt = list(set([view_assg[col] for col in wrt]))
            model_assertion = True
            for view in views_wrt:
                if (X_D_i[view][row1] == X_D_i[view][row2]) != dependent:
                    model_assertion = False
                    break
            model_assertions.append(model_assertion)

        if was_multistate:
            return model_assertions
        else:
            return model_assertions[0]
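The per-model test inside the loop is easy to reproduce on toy data. The sketch below is a self-contained illustration of that check only (no engine, no su.ensure_multistate): two rows count as dependent with respect to wrt exactly when they land in the same row cluster in every view that hosts one of the wrt columns.

    # Standalone illustration of the per-model test in assert_row.
    # Toy single-state latents: 4 columns split over 2 views, 5 rows.
    X_L_i = {'column_partition': {'assignments': [0, 0, 1, 1]}}  # column -> view
    X_D_i = [
        [0, 0, 1, 1, 0],   # view 0: row-cluster assignment of rows 0..4
        [2, 0, 2, 1, 2],   # view 1
    ]

    def rows_dependent(X_L_i, X_D_i, row1, row2, wrt):
        view_assg = X_L_i['column_partition']['assignments']
        views_wrt = set(view_assg[col] for col in wrt)
        return all(X_D_i[view][row1] == X_D_i[view][row2] for view in views_wrt)

    print(rows_dependent(X_L_i, X_D_i, 0, 4, wrt=[0, 1]))  # True: same cluster in view 0
    print(rows_dependent(X_L_i, X_D_i, 0, 3, wrt=[0, 2]))  # False: clusters differ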
Example 4
    def assert_col_dep_constraints(self,
                                   X_L,
                                   X_D,
                                   col1,
                                   col2,
                                   dependent=True,
                                   single_bool=False):
        # TODO: X_D is not used for anything other than ensure_multistate.
        # I should probably edit ensure_multistate to take X_L or X_D using
        # keyword arguments.
        X_L_list, _, was_multistate = su.ensure_multistate(X_L, X_D)
        model_assertions = []

        for X_L_i in X_L_list:
            assg = X_L_i['column_partition']['assignments']
            assertion = (assg[col1] == assg[col2]) == dependent
            if single_bool and not assertion:
                return False
            model_assertions.append(assertion)

        if single_bool:
            return True

        if was_multistate:
            return model_assertions
        else:
            return model_assertions[0]
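The column-level test is simpler still: within one model, two columns are treated as dependent exactly when they are assigned to the same view. A self-contained sketch of that per-model check:

    # Standalone illustration of the per-model test in assert_col_dep_constraints.
    X_L_i = {'column_partition': {'assignments': [0, 0, 1, 1, 0]}}  # column -> view

    def cols_dependent(X_L_i, col1, col2):
        assg = X_L_i['column_partition']['assignments']
        return assg[col1] == assg[col2]

    print(cols_dependent(X_L_i, 0, 4))  # True: both columns live in view 0
    print(cols_dependent(X_L_i, 0, 2))  # False: different views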
Example 5
    def ensure_col_dep_constraints(self, M_c, M_r, T, X_L, X_D,
            dep_constraints, seed, max_rejections=100):
        """Ensures dependencey or indepdendency between columns.

        dep_constraints is a list of where each entry is an (int, int, bool) tuple
        where the first two entries are column indices and the third entry
        describes whether the columns are to be dependent (True) or independent
        (False).

        Behavior Notes:
        ensure_col_dep_constraints will add col_ensure enforcement to the
        metadata (top level of X_L); unensure_col will remove it. Calling
        ensure_col_dep_constraints twice will replace the first ensure.

        This operation destroys the existing X_L and X_D metadata; the user
        should be aware that it will clobber any existing analyses.

        Implementation Notes:
        Initialization is implemented via rejection (by repeatedly initializing
        states and throwing out those that do not adhere to dep_constraints).
        This means that if the constraints in dep_constraints are complex, or
        impossible, the rejection algorithm may fail.

        The returned metadata looks like this:
        >>> dep_constraints
        [(1, 2, True), (2, 5, True), (1, 5, True), (1, 3, False)]
        >>> X_L['col_ensure']
        {
            "dependent" :
            {
                1 : [2, 5],
                2 : [1, 5],
                5 : [1, 2]
            },
            "independent" :
            {
                1 : [3],
                3 : [1]
        }
        """
        X_L_list, X_D_list, was_multistate = su.ensure_multistate(X_L, X_D)
        if was_multistate:
            num_states = len(X_L_list)
        else:
            num_states = 1

        col_ensure_md = dict()
        col_ensure_md[True] = dict()
        col_ensure_md[False] = dict()

        for col1, col2, dependent in dep_constraints:
            if col1 == col2:
                raise ValueError("Cannot specify same columns in dependence"\
                    " constraints.")
            if str(col1) in col_ensure_md[dependent]:
                col_ensure_md[dependent][str(col1)].append(col2)
            else:
                col_ensure_md[dependent][str(col1)] = [col2]
            if str(col2) in col_ensure_md[dependent]:
                col_ensure_md[dependent][str(col2)].append(col1)
            else:
                col_ensure_md[dependent][str(col2)] = [col1]

        def assert_dep_constraints(X_L, X_D, dep_constraints):
            for col1, col2, dep in dep_constraints:
                if not self.assert_col_dep_constraints(X_L, X_D, col1, col2,
                    dep, True):
                    return False
            return True

        X_L_out = []
        X_D_out = []
        get_next_seed = make_get_next_seed(seed)
        for _ in range(num_states):
            counter = 0
            X_L_i, X_D_i = self.initialize(M_c, M_r, T, get_next_seed())
            while not assert_dep_constraints(X_L_i, X_D_i, dep_constraints):
                if counter > max_rejections:
                    raise RuntimeError("Could not ranomly generate a partition"\
                        " that satisfies the constraints in dep_constraints.")
                counter += 1
                X_L_i, X_D_i = self.initialize(M_c, M_r, T, get_next_seed())

            X_L_i['col_ensure'] = dict()
            X_L_i['col_ensure']['dependent'] = col_ensure_md[True]
            X_L_i['col_ensure']['independent'] = col_ensure_md[False]

            X_D_out.append(X_D_i)
            X_L_out.append(X_L_i)

        if was_multistate:
            return X_L_out, X_D_out
        else:
            return X_L_out[0], X_D_out[0]
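The metadata built in the loop above is a plain pairwise adjacency map keyed by stringified column index. The self-contained sketch below reproduces that construction for the dep_constraints shown in the docstring, so the result can be compared against the documented X_L['col_ensure'] layout (setdefault replaces the explicit if/else, with the same effect):

    # Standalone reproduction of the col_ensure_md construction above.
    dep_constraints = [(1, 2, True), (2, 5, True), (1, 5, True), (1, 3, False)]

    col_ensure_md = {True: {}, False: {}}
    for col1, col2, dependent in dep_constraints:
        col_ensure_md[dependent].setdefault(str(col1), []).append(col2)
        col_ensure_md[dependent].setdefault(str(col2), []).append(col1)

    print(col_ensure_md[True])   # {'1': [2, 5], '2': [1, 5], '5': [2, 1]}
    print(col_ensure_md[False])  # {'1': [3], '3': [1]}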
Example 6
    def analyze(self, M_c, T, X_L, X_D, seed, kernel_list=(), n_steps=1, c=(),
                r=(),
                max_iterations=-1, max_time=-1, do_diagnostics=False,
                diagnostics_every_N=1,
                ROW_CRP_ALPHA_GRID=(),
                COLUMN_CRP_ALPHA_GRID=(),
                S_GRID=(), MU_GRID=(),
                N_GRID=31,
                do_timing=False,
                CT_KERNEL=0,
                ):
        """Evolve the latent state by running MCMC transition kernels

        :param seed: The random seed
        :type seed: int
        :param M_c: The column metadata
        :type M_c: dict
        :param T: The data table in mapped representation (all floats, generated
                  by data_utils.read_data_objects)
        :param X_L: the latent variables associated with the latent state
        :type X_L: dict
        :param X_D: the particular cluster assignments of each row in each view
        :type X_D: list of lists
        :param kernel_list: names of the MCMC transition kernels to run
        :type kernel_list: list of strings
        :param n_steps: the number of times to run each MCMC transition kernel
        :type n_steps: int
        :param c: the (global) column indices to run MCMC transition kernels on
        :type c: list of ints
        :param r: the (global) row indices to run MCMC transition kernels on
        :type r: list of ints
        :param max_iterations: the maximum number of times to run each MCMC
                               transition kernel. Applicable only if
                               max_time != -1.
        :type max_iterations: int
        :param max_time: the maximum amount of time (seconds) to run MCMC
                         transition kernels for before stopping to return
                         progress
        :type max_time: float
        :returns: X_L, X_D -- the evolved latent state

        """
        if n_steps <= 0:
            raise ValueError("You must do at least one analyze step.")

        if CT_KERNEL not in [0, 1]:
            raise ValueError("CT_KERNEL must be 0 (Gibbs) or 1 (MH)")

        if do_timing:
            # diagnostics and timing are exclusive
            do_diagnostics = False
        diagnostic_func_dict, reprocess_diagnostics_func = do_diagnostics_to_func_dict(
            do_diagnostics)
        X_L_list, X_D_list, was_multistate = su.ensure_multistate(X_L, X_D)
        arg_tuples = self.get_analyze_arg_tuples(M_c, T, X_L_list, X_D_list,
                                                 kernel_list, n_steps, c, r,
                                                 max_iterations, max_time,
                                                 diagnostic_func_dict, diagnostics_every_N,
                                                 ROW_CRP_ALPHA_GRID,
                                                 COLUMN_CRP_ALPHA_GRID,
                                                 S_GRID, MU_GRID,
                                                 N_GRID,
                                                 do_timing,
                                                 CT_KERNEL,
                                                 make_get_next_seed(seed))
        chain_tuples = self.mapper(self.do_analyze, arg_tuples)
        X_L_list, X_D_list, diagnostics_dict_list = zip(*chain_tuples)
        if do_timing:
            timing_list = diagnostics_dict_list
        if not was_multistate:
            X_L_list, X_D_list = X_L_list[0], X_D_list[0]
        ret_tuple = X_L_list, X_D_list
        #
        if diagnostic_func_dict is not None:
            diagnostics_dict = munge_diagnostics(diagnostics_dict_list)
            if reprocess_diagnostics_func is not None:
                diagnostics_dict = reprocess_diagnostics_func(diagnostics_dict)
            ret_tuple = ret_tuple + (diagnostics_dict, )
        if do_timing:
            ret_tuple = ret_tuple + (timing_list, )
        return ret_tuple
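A hedged call sketch for analyze, again assuming engine, M_c, T, X_L, and X_D already exist; everything else follows from the signature and docstring above.

    # Hypothetical usage sketch: run 100 MCMC steps with the default kernel_list
    # over every model in X_L / X_D. `engine`, `M_c`, `T`, `X_L`, `X_D` are
    # assumed to exist already (e.g. from engine.initialize).
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D, seed=0, n_steps=100)

    # With do_diagnostics=True the return tuple gains a diagnostics dict:
    X_L, X_D, diagnostics = engine.analyze(
        M_c, T, X_L, X_D, seed=0, n_steps=100, do_diagnostics=True)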
Example 7
    def ensure_col_dep_constraints(self,
                                   M_c,
                                   M_r,
                                   T,
                                   X_L,
                                   X_D,
                                   dep_constraints,
                                   seed,
                                   max_rejections=100):
        """Ensures dependencey or indepdendency between columns.

        `dep_constraints` is a list of where each entry is an (int, int, bool)
        tuple where the first two entries are column indices and the third entry
        describes whether the columns are to be dependent (True) or independent
        (False).

        Behavior Notes:
        `ensure_col_dep_constraints` will add `col_ensure` enforcement to the
        metadata (top level of `X_L`); unensure_col will remove it. Calling
        ensure_col_dep_constraints twice will replace the first ensure.

        This operation destroys the existing `X_L` and `X_D` metadata; the user
        should be aware that it will clobber any existing analyses.

        Implementation Notes:
        Initialization is implemented via rejection (by repeatedly initializing
        states and throwing out those that do not adhere to dep_constraints).
        This means that if the constraints in dep_constraints are complex, or
        impossible, the rejection algorithm may fail.

        The returned metadata looks like this:
        >>> dep_constraints
        [(1, 2, True), (2, 5, True), (1, 3, False)]
        >>> X_L['col_ensure']
        {
            "dependent" : {
                1 : (1, 2, 5),
                2 : (1, 2, 5),
                5 : (1, 5, 2),
            },
            "independent" : {
                1 : [3],
                3 : [1],
            }
        }
        """
        X_L_list, X_D_list, was_multistate = su.ensure_multistate(X_L, X_D)

        if was_multistate:
            num_states = len(X_L_list)
        else:
            num_states = 1

        dependencies = [(c[0], c[1]) for c in dep_constraints if c[2]]
        independencies = [(c[0], c[1]) for c in dep_constraints if not c[2]]

        col_ensure_md = dict()
        col_ensure_md[True] = {
            str(key): list(val)
            for key, val in gu.get_scc_from_tuples(dependencies).iteritems()
        }
        col_ensure_md[False] = {
            str(key): list(val)
            for key, val in gu.get_scc_from_tuples(independencies).iteritems()
        }

        def assert_dep_constraints(X_L, X_D, dep_constraints):
            for col1, col2, dep in dep_constraints:
                if not self.assert_col_dep_constraints(X_L, X_D, col1, col2,
                                                       dep, True):
                    return False
            return True

        X_L_out = []
        X_D_out = []
        get_next_seed = make_get_next_seed(seed)

        for _ in range(num_states):
            counter = 0
            X_L_i, X_D_i = self.initialize(M_c, M_r, T, get_next_seed())
            while not assert_dep_constraints(X_L_i, X_D_i, dep_constraints):
                if counter > max_rejections:
                    raise RuntimeError(
                        'Could not randomly generate a partition '
                        'that satisfies the constraints in dep_constraints.')
                counter += 1
                X_L_i, X_D_i = self.initialize(M_c, M_r, T, get_next_seed())

            X_L_i['col_ensure'] = dict()
            X_L_i['col_ensure']['dependent'] = col_ensure_md[True]
            X_L_i['col_ensure']['independent'] = col_ensure_md[False]

            X_D_out.append(X_D_i)
            X_L_out.append(X_L_i)

        if was_multistate:
            return X_L_out, X_D_out
        else:
            return X_L_out[0], X_D_out[0]
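Unlike the earlier version, this variant groups columns transitively (via gu.get_scc_from_tuples), so each column maps to its whole dependence group, itself included. The self-contained sketch below is not the gu helper, just a small connected-components pass that reproduces the "dependent" groups shown in the docstring example:

    # Standalone illustration of the transitive grouping in the docstring above
    # (NOT gu.get_scc_from_tuples, just a plain connected-components pass).
    dependencies = [(1, 2), (2, 5)]   # from [(1, 2, True), (2, 5, True), (1, 3, False)]

    def connected_components(pairs):
        groups = []                   # list of sets of mutually dependent columns
        for a, b in pairs:
            hits = [g for g in groups if a in g or b in g]
            merged = {a, b}.union(*hits) if hits else {a, b}
            groups = [g for g in groups if g not in hits] + [merged]
        return {col: sorted(g) for g in groups for col in g}

    print(connected_components(dependencies))
    # {1: [1, 2, 5], 2: [1, 2, 5], 5: [1, 2, 5]} -- same groups as the docstring's "dependent" block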