Example #1
    def split_cols_RDC_py(local_data, ds_context, scope):
        meta_types = ds_context.get_meta_types_by_scope(scope)
        domains = ds_context.get_domains_by_scope(scope)

        # on large slices, estimate the independent column groups on a
        # row subsample instead of the full data
        if local_data.shape[0] > max_sampling_threshold_cols:
            rdc_data = local_data[np.random.randint(local_data.shape[0], size=max_sampling_threshold_cols), :]
        else:
            rdc_data = local_data

        clusters = getIndependentRDCGroups_py(
            rdc_data,
            threshold,
            meta_types,
            domains,
            k=k,
            s=s,
            # ohe=True,
            non_linearity=non_linearity,
            n_jobs=n_jobs,
            rand_gen=rand_gen,
        )
        return split_data_by_clusters(local_data, clusters, scope, rows=False)
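All splitters on this page are inner functions: names such as threshold, k, s, non_linearity, n_jobs, rand_gen, and max_sampling_threshold_cols are not arguments but closure variables bound by an enclosing factory. A minimal sketch of that pattern follows (hypothetical factory and toy splitting logic; only split_data_by_clusters and its signature are taken from this page):

    import numpy as np
    from spn.algorithms.splitting.Base import split_data_by_clusters

    def get_split_cols_halves(frac=0.5):
        # `frac` is bound here, exactly like `threshold`, `k`, `s` above
        def split_cols_halves(local_data, ds_context, scope):
            # toy column splitter: first `frac` of the columns vs. the rest
            cut = int(local_data.shape[1] * frac)
            clusters = (np.arange(local_data.shape[1]) >= cut).astype(int)
            return split_data_by_clusters(local_data, clusters, scope, rows=False)

        return split_cols_halves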
Example #2
    def split_cols_RDC_py(local_data,
                          ds_context,
                          scope,
                          l_rfft=None,
                          is_pair=False):
        meta_types = ds_context.get_meta_types_by_scope(scope)
        domains = ds_context.get_domains_by_scope(scope)
        ### by zhongjie
        # local_data = local_data[:, scope]

        clusters = getIndependentRDCGroups_py(
            local_data,
            threshold,
            meta_types,
            domains,
            scope,
            l_rfft,
            is_pair,
            k=k,
            s=s,
            # ohe=True,
            non_linearity=non_linearity,
            n_jobs=n_jobs,
            rand_gen=rand_gen,
        )

        return split_data_by_clusters(local_data, clusters, scope, rows=False)
Example #3
    def split_rows_KMeans(local_data, ds_context, scope):
        data = preproc(local_data, ds_context, pre_proc, ohe)

        clusters = KMeans(n_clusters=n_clusters,
                          random_state=seed).fit_predict(data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
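A minimal usage sketch (not from this page): splitters like the one above are bound via their factories and handed to the generic structure learner. The import paths, factory names, and learn_structure signature are assumptions about SPFlow.

    import numpy as np
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Gaussian, create_parametric_leaf
    from spn.algorithms.StructureLearning import learn_structure
    from spn.algorithms.splitting.KMeans import get_split_rows_KMeans
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py

    # toy continuous dataset: 200 rows, 3 Gaussian features
    data = np.random.rand(200, 3)
    ds_context = Context(parametric_types=[Gaussian] * 3).add_domains(data)

    # bind hyperparameters once via the factories, then pass the callables
    spn = learn_structure(
        data,
        ds_context,
        get_split_rows_KMeans(n_clusters=2),
        get_split_cols_RDC_py(threshold=0.3),
        create_parametric_leaf,
    )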
Example #4
    def split_rows_RDC(local_data, ds_context, scope):
        data = get_RDC_transform(
            local_data, ds_context.get_meta_types_by_scope(scope), ohe, k=k, s=s)

        clusters = KMeans(n_clusters=n_clusters, random_state=seed, n_jobs=1).fit_predict(data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #5
    def split_rows_naive_mle_conditioning(local_data):
        # mle conditioning

        # baseline average log-likelihood of the unsplit slice (for reference)
        original_ll = naive_ll(local_data)

        scope = list(range(local_data.shape[1]))

        best_col_conditioning = None
        best_conditioning_ll = -np.inf
        for col_conditioning in range(local_data.shape[1]):
            ones = np.sum(local_data[:, col_conditioning])
            if ones == 0 or ones == local_data.shape[0]:
                # a constant column cannot split the data
                continue

            clusters = (local_data[:, col_conditioning] == 1).astype(int)
            data_slices = split_data_by_clusters(local_data, clusters, scope, rows=True)

            left_data_slice, left_scope_slice, left_proportion = data_slices[0]
            right_data_slice, right_scope_slice, right_proportion = data_slices[1]

            # drop the conditioning column from both slices
            left_data_slice = np.hstack((left_data_slice[:, :col_conditioning],
                                         left_data_slice[:, (col_conditioning + 1):]))
            right_data_slice = np.hstack((right_data_slice[:, :col_conditioning],
                                          right_data_slice[:, (col_conditioning + 1):]))

            left_ll = naive_ll(left_data_slice)
            right_ll = naive_ll(right_data_slice)

            # weighted average per-row log-likelihood of the induced mixture
            conditioning_ll = ((left_ll + np.log(left_proportion)) * left_data_slice.shape[0] +
                               (right_ll + np.log(right_proportion)) * right_data_slice.shape[0]) / local_data.shape[0]
            if conditioning_ll > best_conditioning_ll:
                best_conditioning_ll = conditioning_ll
                best_col_conditioning = col_conditioning

        return best_col_conditioning, best_col_conditioning is not None
Example #6
    def split_cols_RDC(local_data, ds_context, scope):
        adjm = get_RDC_adjacency_matrix(
            local_data, ds_context.get_meta_types_by_scope(scope), ohe, linear)

        clusters = clusters_by_adjacency_matrix(adjm, threshold,
                                                local_data.shape[1])

        return split_data_by_clusters(local_data, clusters, scope, rows=False)
Example #7
    def split_rows_KMeans(local_data, ds_context, scope):
        data = preproc(local_data, ds_context, pre_proc, ohe)

        from sklearn.cluster import KMeans
        km_model = KMeans(n_clusters=n_clusters, random_state=seed)
        clusters = km_model.fit_predict(data)
        return split_data_by_clusters(local_data, clusters, scope,
                                      rows=True), km_model
Example #8
    def split_conditional_rows_KMeans(local_data, ds_context, scope):
        y, x = get_YX(local_data, ds_context.feature_size)
        data = preproc(y, ds_context, pre_proc, ohe)

        clusters = KMeans(n_clusters=n_clusters,
                          random_state=seed,
                          precompute_distances=True).fit_predict(data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #9
    def split_rows_KMeans(local_data, ds_context, scope):
        data = preproc(local_data, ds_context, pre_proc, ohe)
        kmeans_data = TSNE(n_components=3,
                           verbose=verbose,
                           n_jobs=ncpus,
                           random_state=seed).fit_transform(data)
        clusters = KMeans(n_clusters=n_clusters,
                          random_state=seed).fit_predict(kmeans_data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #10
    def split_cols_random_partitions(local_data, ds_context, scope):
        if rand_gen.random_sample() < fail:
            return [(local_data, scope, 1.0)]

        clusters = np.zeros_like(scope)

        for i, new_scope in enumerate(np.array_split(np.argsort(scope), 2)):
            clusters[new_scope] = i

        return split_data_by_clusters(local_data, clusters, scope, rows=False)
Example #11
    def split_rows_binary_random_partition(local_data, ds_context, scope):
        # data = preproc(local_data, ds_context, pre_proc, ohe)

        # draw the split percentage from a Beta
        alloc_perc = rand_gen.beta(a=beta_a, b=beta_b)
        clusters = rand_gen.choice(2,
                                   size=local_data.shape[0],
                                   p=[alloc_perc, 1 - alloc_perc])

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #12
    def split_rows_Gower(local_data, ds_context, scope):
        data = preproc(local_data, ds_context, pre_proc, False)

        try:
            df = robjects.r["as.data.frame"](data)
            clusters = robjects.r["mixedclustering"](df, ds_context.distribution_family, n_clusters, seed)
            clusters = np.asarray(clusters)
        except Exception as e:
            np.savetxt("/tmp/errordata.txt", local_data)
            print(e)
            raise e

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #13
    def split_rows_GMM(local_data, ds_context, scope):
        data = preproc(local_data, ds_context, pre_proc, ohe)

        estimator = GaussianMixture(
            n_components=n_clusters,
            covariance_type=covariance_type,
            max_iter=max_iter,
            n_init=n_init,
            random_state=seed,
        )

        clusters = estimator.fit(data).predict(data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #14
    def split_cols_RDC_py(local_data, ds_context, scope):
        meta_types = ds_context.get_meta_types_by_scope(scope)
        domains = ds_context.get_domains_by_scope(scope)

        clusters = getIndependentRDCGroups_py(local_data,
                                              threshold,
                                              meta_types,
                                              domains,
                                              k=k,
                                              s=s,
                                              # ohe=True,
                                              non_linearity=non_linearity,
                                              n_jobs=n_jobs,
                                              rand_gen=rand_gen)

        return split_data_by_clusters(local_data, clusters, scope, rows=False)
Example #15
    def split_rows_KMeans(local_data, ds_context, scope):
        data = preproc(local_data, ds_context, pre_proc, ohe)

        if data.shape[0] > max_sampling_threshold_rows:
            data_sample = data[np.random.randint(data.shape[0], size=max_sampling_threshold_rows), :]

            kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
            clusters = kmeans.fit(data_sample).predict(data)
        else:
            kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
            clusters = kmeans.fit_predict(data)

        cluster_centers = kmeans.cluster_centers_
        result = split_data_by_clusters(local_data, clusters, scope, rows=True)

        return result, cluster_centers.tolist()
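Note that this variant deviates from the usual splitter contract: it returns (split, cluster_centers) rather than the split alone. A hypothetical adapter (not from the source), should a caller expect the plain split-only interface:

    # discard the extra cluster-centers return value
    def as_plain_splitter(split_fn):
        def wrapped(local_data, ds_context, scope):
            split, _centers = split_fn(local_data, ds_context, scope)
            return split

        return wrapped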
Example #16
    def split_rows_RDC_py(local_data, ds_context, scope):
        meta_types = ds_context.get_meta_types_by_scope(scope)
        domains = ds_context.get_domains_by_scope(scope)

        rdc_data = rdc_transformer(local_data,
                                   meta_types,
                                   domains,
                                   k=k,
                                   s=s,
                                   non_linearity=non_linearity,
                                   return_matrix=True,
                                   rand_gen=rand_gen)

        clusters = KMeans(n_clusters=n_clusters,
                          random_state=rand_gen, n_jobs=n_jobs).fit_predict(rdc_data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #17
    def split_rows_RuleClustering(
        local_data,
        ds_context,
        scope,
    ):
        data = preproc(local_data, ds_context, pre_proc, ohe)

        # https://stackoverflow.com/a/39772170/5595684
        km = KMeans(k, random_state=rand_state)
        km_clusters = km.fit_predict(data)
        lab, count = np.unique(km.labels_, return_counts=True)
        # inverse weight classes, todo test if this works ok
        N = len(data)
        lab_wgt = {lab: (N - count) / N for lab, count in zip(lab, count)}
        W = [lab_wgt[lab] for lab in km.labels_]

        if model == 'stump':
            dtc = DecisionTreeClassifier(
                random_state=rand_state,
                max_depth=1,
            ).fit(data, km.labels_, sample_weight=W)
            # dtc.cost_complexity_pruning_path()
            left_rule = tree_to_rule(dtc, scope, ds_context)
        elif model == 'tree':
            dtc = DecisionTreeClassifier(random_state=rand_state,
                                         max_depth=None,
                                         ccp_alpha=0.05,
                                         min_impurity_split=0.01
                                         # max_leaf_nodes=2*10**(self.k+1)
                                         ).fit(data,
                                               km.labels_,
                                               sample_weight=W)
            # dtc.cost_complexity_pruning_path()
            left_rule = tree_to_rule(dtc, scope, ds_context)
        elif model == 'm-estimate':
            raise ValueError('Not implemented')
        else:
            raise ValueError(str(model) + ' unknown model type')

        # todo try out rule clusters
        right_rule = left_rule.negate()
        rule_clusters = (right_rule.apply(
            data, scope_partial_data=scope)).astype(int)
        split = split_data_by_clusters(data, rule_clusters, scope, rows=True)
        assert len(split) == 2
        return split, (left_rule, right_rule)
Example #18
    def split_cols_binary_random_partitions(local_data, ds_context, scope):
        # data = preproc(local_data, ds_context, None, ohe)

        #
        # with some probability the column split fails, so that a row
        # partitioning step can happen instead
        clusters = None
        p = rand_gen.rand()
        #print('P', p)
        if p > threshold:
            #
            # draw the split percentage from a Beta
            alloc_perc = rand_gen.beta(a=beta_a, b=beta_b)
            clusters = rand_gen.choice(2,
                                       size=local_data.shape[1],
                                       p=[alloc_perc, 1 - alloc_perc])
            #print(clusters, clusters.sum(), clusters.shape, alloc_perc)
        else:
            clusters = np.zeros(local_data.shape[1])

        return split_data_by_clusters(local_data, clusters, scope, rows=False)
Example #19
    def split_rows_Gower(local_data, ds_context, scope):
        y, x = get_YX(local_data, ds_context.feature_size)
        data = preproc(y, ds_context, pre_proc, False)

        feature_types = []
        for s in scope:
            mt = ds_context.meta_types[s]
            if mt == MetaType.BINARY:
                feature_types.append("categorical")
            elif mt == MetaType.DISCRETE:
                feature_types.append("discrete")
            else:
                feature_types.append("continuous")

        try:
            df = robjects.r["as.data.frame"](data)
            clusters = robjects.r["mixedclustering"](df, feature_types,
                                                     n_clusters, seed)
            clusters = np.asarray(clusters)
        except Exception as e:
            np.savetxt("/tmp/errordata.txt", local_data)
            raise e

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #20
    def split_rows_DBScan(local_data, ds_context, scope):
        data = preproc(local_data, ds_context, pre_proc, ohe)

        clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #21
    def split_cols_random_partitions(local_data, ds_context, scope):
        # same as above, but transpose the data
        data = preproc(local_data.T, ds_context, None, ohe)
        clusters = above(make_planes(1, data.shape[1], rand_gen), data)[:, 0]

        return split_data_by_clusters(local_data, clusters, scope, rows=False)
Example #22
    def split_rows_random_partitions(local_data, ds_context, scope):
        data = preproc(local_data, ds_context, None, ohe)
        clusters = above(make_planes(1, local_data.shape[1], rand_gen),
                         data)[:, 0]

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Example #23
    def split_cols_GTest(local_data, ds_context, scope):

        domains = ds_context.domains
        clusters = gtest_greedy_feature_split(local_data, domains, threshold,
                                              rand_gen)
        return split_data_by_clusters(local_data, clusters, scope, rows=False)
Example #24
    def split_rows_RuleClustering(
        local_data,
        ds_context,
        scope,
    ):
        data = preproc(local_data, ds_context, pre_proc, ohe)

        # https://stackoverflow.com/a/39772170/5595684
        km = KMeans(k, random_state=rand_state)
        km_clusters = km.fit_predict(data)
        lab, count = np.unique(km.labels_, return_counts=True)
        # inverse weight classes, todo test if this works ok
        N = len(data)
        lab_wgt = {lab: (N - count) / N for lab, count in zip(lab, count)}
        W = [lab_wgt[lab] for lab in km.labels_]

        if model == 'stump':
            dtc = DecisionTreeClassifier(
                random_state=rand_state,
                max_depth=1,
            ).fit(data, km.labels_, sample_weight=W)
            # dtc.cost_complexity_pruning_path()
            left_rule = tree_to_rule(dtc, scope, ds_context)
        elif model == 'tree':
            dtc = DecisionTreeClassifier(
                random_state=rand_state,
                max_depth=None,
                ccp_alpha=0.05,
                min_impurity_split=0.01  #max_leaf_nodes=2*10**(self.k+1)
            ).fit(data, km.labels_, sample_weight=W)
            # dtc.cost_complexity_pruning_path()
            left_rule = tree_to_rule(dtc, scope, ds_context)
        elif model == 'm-estimate':
            raise ValueError('Not implemented')
        else:
            raise ValueError(str(model) + ' unknown model type')

        if debug:
            import matplotlib.pyplot as plt  # todo remove when everything is working
            dt_labels = dtc.predict(data)
            # plot_tree(dtc)
            # plt.show()
            print(export_text(dtc))

            if data.shape[1] == 2:
                fig, ax = plt.subplots()
                colors = np.full(dt_labels.shape, 'blue', dtype=object)
                np.putmask(colors, dt_labels.astype(bool), 'green')
                colors[km.labels_ != dt_labels] = 'black'
                # plot rule:
                assert len(left_rule) <= 2
                for cond in left_rule:
                    if cond['feature'] == 0:
                        ax.axvline(cond['threshhold'])
                    else:
                        ax.axhline(cond['threshhold'])
                ax.scatter(data[:, 0], data[:, 1], c=colors)
                plt.show()

        # todo try out rule clusters
        # rule_clusters = rule.apply(data)
        split = split_data_by_clusters(data, km_clusters, scope, rows=True)
        assert len(split) == 2
        right_rule = left_rule.negate()
        return split, (left_rule, right_rule)
Example #25
def learn_structure_cnet(
    dataset,
    ds_context,
    conditioning,
    create_leaf,
    next_operation_cnet=get_next_operation_cnet(),
    initial_scope=None,
    data_slicer=default_slicer,
):
    assert dataset is not None
    assert ds_context is not None
    assert create_leaf is not None
    assert next_operation_cnet is not None

    root = Product()
    root.children.append(None)

    if initial_scope is None:
        initial_scope = list(range(dataset.shape[1]))

    tasks = deque()
    tasks.append((dataset, root, 0, initial_scope))

    while tasks:

        local_data, parent, children_pos, scope = tasks.popleft()

        operation, op_params = next_operation_cnet(local_data, scope)

        logging.debug("OP: {} on slice {} (remaining tasks {})".format(
            operation, local_data.shape, len(tasks)))

        if operation == Operation.CONDITIONING:
            from spn.algorithms.splitting.Base import split_data_by_clusters

            conditioning_start_t = perf_counter()

            col_conditioning, found_conditioning = conditioning(local_data)

            if not found_conditioning:
                node = create_leaf(local_data, ds_context, scope)
                parent.children[children_pos] = node

                continue

            clusters = (local_data[:, col_conditioning] == 1).astype(int)
            data_slices = split_data_by_clusters(local_data,
                                                 clusters,
                                                 scope,
                                                 rows=True)

            node = Sum()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            for data_slice, scope_slice, proportion in data_slices:
                assert isinstance(scope_slice, list), "slice must be a list"

                node.weights.append(proportion)

                product_node = Product()
                node.children.append(product_node)
                node.children[-1].scope.extend(scope)

                right_data_slice = np.hstack(
                    (data_slice[:, :col_conditioning],
                     data_slice[:, (col_conditioning + 1):])).reshape(
                         data_slice.shape[0], data_slice.shape[1] - 1)
                product_node.children.append(None)
                tasks.append((
                    right_data_slice,
                    product_node,
                    len(product_node.children) - 1,
                    scope_slice[:col_conditioning] +
                    scope_slice[col_conditioning + 1:],
                ))

                left_data_slice = data_slice[:, col_conditioning].reshape(
                    data_slice.shape[0], 1)
                product_node.children.append(None)
                tasks.append((left_data_slice, product_node,
                              len(product_node.children) - 1,
                              [scope_slice[col_conditioning]]))

            conditioning_end_t = perf_counter()
            logging.debug("\t\tconditioning  (in {:.5f} secs)".format(
                conditioning_end_t - conditioning_start_t))

            continue

        elif operation == Operation.CREATE_LEAF:
            cltree_start_t = perf_counter()
            node = create_leaf(local_data, ds_context, scope)
            parent.children[children_pos] = node
            cltree_end_t = perf_counter()
            logging.debug("\t\tcreated leaf (in {:.5f} secs)".format(
                cltree_end_t - cltree_start_t))
        else:
            raise Exception("Invalid operation: " + str(operation))

    node = root.children[0]
    assign_ids(node)
    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err
    node = Prune(node)
    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err

    return node
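A minimal usage sketch for learn_structure_cnet (not from the source). The Context construction follows SPFlow's public API; create_cltree_leaf is an assumed leaf factory, and any conditioning function with Example #5's (local_data) -> (col, found) signature will do.

    import numpy as np
    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType

    # toy binary dataset: 500 rows, 4 binary features
    data = (np.random.rand(500, 4) > 0.5).astype(int)
    ds_context = Context(meta_types=[MetaType.BINARY] * 4).add_domains(data)

    cnet = learn_structure_cnet(
        data,
        ds_context,
        conditioning=split_rows_naive_mle_conditioning,  # e.g. Example #5, in scope
        create_leaf=create_cltree_leaf,  # assumed CLTree leaf factory
    )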