コード例 #1
0
def spc_querying_naive(g : graph_tool.Graph, paths, y, trust_own_predictions=True, weight=None, closed_interval=False):
    '''
    Label the vertices of every path by querying its endpoints and, if they disagree, searching the path.

    For each path both endpoints are queried (looked up in ``y``). If the endpoint labels agree, the
    whole path receives that label; otherwise ``binarySearch`` is called to obtain labels for the path
    together with the number of additional queries it needed.

    :param g: graph being labelled; only ``num_vertices()`` is used directly, the graph itself is
        forwarded to ``closure.compute_hull`` when ``closed_interval`` is set
    :param paths: list of paths, each a sequence of vertex indices
    :param y: ground truth labels, indexable by vertex index
    :param trust_own_predictions: if True, an endpoint whose label is already known from an earlier
        path is not queried (and not counted) again; if False both endpoints are always queried
    :param weight: edge weights, only forwarded to ``closure.compute_hull``
    :param closed_interval: if True, after each path the vertices returned by
        ``closure.compute_hull(..., compute_closure=False)`` for each of the first two classes of ``y``
        are labelled with that class
    :return: ``(known_labels, budget)``: ``known_labels`` has one entry per vertex (``-inf`` means
        unknown); ``budget`` has length ``g.num_vertices()`` and entry ``i`` counts the queries spent
        on ``paths[i]``
    '''
    known_labels = -np.ones(g.num_vertices())*np.inf
    budget = np.zeros(g.num_vertices())

    # y never changes inside the loop, so its class values are computed once instead of per path.
    classes = np.unique(y) if closed_interval else None

    for i, path in enumerate(paths):
        # Query first and last vertex of the path, in that order.
        for endpoint in (path[0], path[-1]):
            if not trust_own_predictions or known_labels[endpoint] == -np.inf:
                budget[i] += 1
                known_labels[endpoint] = y[endpoint]

        if known_labels[path[0]] == known_labels[path[-1]]:
            # Equal endpoint labels: the whole path is assigned that label.
            known_labels[path] = known_labels[path[0]]
        else:
            label_budget, new_labels = binarySearch(y[path], 0, len(path)-1, known_labels[path[0]], known_labels[path])
            known_labels[path] = new_labels
            budget[i] += label_budget

        if closed_interval:
            # Both index sets are computed before either is written back.
            p = closure.compute_hull(g, np.where(known_labels == classes[0])[0], weight, compute_closure=False)
            n = closure.compute_hull(g, np.where(known_labels == classes[1])[0], weight, compute_closure=False)

            known_labels[p] = classes[0]
            known_labels[n] = classes[1]

    return known_labels, budget
コード例 #2
0
def is_convex(dataset):
    '''
    Print the number of simplicial vertices of quantile-thresholded distance graphs of a benchmark set.

    Loads ``res/benchmark/SSL,set=<dataset>,X.tab`` and ``...,y.tab``, restricts them to the first
    300 rows and, for several quantiles ``q``, builds a graph that keeps only the pairs whose
    Euclidean distance is at most the ``q``-quantile of the distance matrix.

    :param dataset: identifier inserted into the benchmark file names
    :return: None; results are printed
    '''
    X = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',X.tab')
    #X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    y = (np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',y.tab'))

    n = 300
    # Loop-invariant work: only the first n points are ever used, so the labels are truncated and
    # the pairwise distances are computed once, on those points only.
    y = y[:n]
    dists = scipy.spatial.distance.cdist(X[:n], X[:n])

    for n_prime in [n]:
        print("================================")
        print("n_prime=", n_prime)
        for q in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05]:
            print("q=", q)

            # Work on a copy: the thresholding below overwrites entries with inf and must not leak
            # into the next quantile.
            W = dists.copy()  #np.exp(-(dists) ** 2 / (2 * sigma ** 2))
            np.fill_diagonal(W, 0)
            W[W > np.quantile(W, q)] = np.inf
            # W2 = np.copy(W) less edges is slower strangely
            # W2[W2 <= 0.1] = 0

            # Keep finite, strictly positive entries as weighted edges.
            edge_mask = (W < np.inf) & (W > 0)
            weights = W[edge_mask].flatten()
            edges = np.array(np.where(edge_mask)).T

            np.random.seed(0)

            g = gt.Graph()

            # construct actual graph
            g.add_vertex(n)
            g.add_edge_list(edges)
            weight_prop = g.new_edge_property("double", vals=weights)

            comps, hist = gt.topology.label_components(g)

            print(len(simplicial_vertices(g)))
            # NOTE(review): this `continue` disables everything below it in the loop body, so the
            # path-cover / hull experiment is currently unreachable. It is kept for reference.
            continue
            paths = shortest_path_cover_logn_apx(g, weight_prop)

            query_bound = 0
            for path in paths:
                query_bound += np.ceil(np.log2(len(path)))

            print("|S|=", len(paths))
            print("#queries<=", query_bound, "%:", query_bound / n)

            pos = list(np.arange(n)[y > 0])[:n_prime]
            neg = list(np.arange(n)[y <= 0])[:n_prime]

            print(n, pos, neg)
            print("p", len(pos))
            print("n", len(neg))

            pos_hull = closure.compute_hull(g, pos, weight_prop, comps, hist)
            print(np.sum(pos_hull))
            neg_hull = closure.compute_hull(g, neg, weight_prop, comps, hist)
            print(np.sum(neg_hull))
            print(
                len(
                    set(np.where(pos_hull)[0]).intersection(
                        set(np.where(neg_hull)[0]))) / n)
コード例 #3
0
def is_convex(directed):
    '''
    Run the naive shortest-path-cover querying on the Cora graph and print the results.

    Reads ``res/cora/cora.edges`` and ``res/cora/cora.node_labels``, loads a previously pickled
    path cover from ``res/cora/spc_<directed>.p`` and passes it to ``spc_querying_naive``.

    :param directed: forwarded to ``gt.Graph(directed=...)`` and used in the pickle file name
    :return: None; results are printed
    '''
    print("cora")
    np.random.seed(0)
    # `np.int` was removed from NumPy (1.24); the builtin `int` is the documented replacement.
    edges = np.genfromtxt('res/cora/cora.edges', dtype=int,
                          delimiter=',')[:, :2] - 1

    labels = np.genfromtxt('res/cora/cora.node_labels',
                           dtype=int,
                           delimiter=',')[:, 1]

    g = gt.Graph(directed=directed)

    g.add_edge_list(edges)

    weight = g.new_edge_property("double", val=1)

    comps, hist = gt.label_components(g)
    print(hist)
    dist_map = gt.shortest_distance(g, weights=weight)  #, weights=weight)
    simple = simplicial_vertices.simplicial_vertices(g)

    print("n=", g.num_vertices(), "s=", len(simple))

    # Cached result of shortest_path_cover_logn_apx(g, weight).
    # NOTE: pickle must only be used on trusted files; this one is a local cache.
    with open("res/cora/spc_" + str(directed) + ".p", "rb") as spc_file:
        spc = pickle.load(spc_file)

    a, b = spc_querying_naive(g, spc, labels)
    print(a)
    print(b, np.sum(b))
    print(np.sum(a == labels))
    # NOTE(review): this early return makes everything below unreachable; the remaining
    # experiments are kept for reference only.
    return

    print("len(spc)", len(spc))
    num_of_convex_paths = 0
    total_error = 0
    for p in spc:
        error = are_convex(labels[p])
        if error == 0:
            num_of_convex_paths += 1
        else:
            total_error += error

    print("#convex paths", num_of_convex_paths)
    print("total error on paths", total_error)
    return
    with open("res/cora/spc_" + str(directed) + ".p", "wb") as spc_file:
        pickle.dump(spc, spc_file)

    for c in np.unique(labels):
        print("class label", c)
        print("class size: ", np.sum(labels == c))
        cls = np.where(labels == c)[0]
        for sample_size in [5, 10, 20, len(cls)]:
            print("sample_size", sample_size)
            if sample_size <= 20:
                times = 5
            else:
                times = 1
            for _ in range(times):

                sample = np.random.choice(cls, sample_size, replace=False)

                hull_p = compute_hull(g,
                                      sample,
                                      dist_map=dist_map,
                                      comps=comps,
                                      hist=hist,
                                      compute_closure=False)
                print("size interval: ", np.sum(hull_p))
                print("number of correct in interval: ", np.sum(hull_p[cls]))

                hull_p = compute_hull(g,
                                      sample,
                                      dist_map=dist_map,
                                      comps=comps,
                                      hist=hist)
                print("size hull: ", np.sum(hull_p))
                print("number of correct in interval: ", np.sum(hull_p[cls]))

    print("==================================")
コード例 #4
0
def budgeted_heuristic_querying(g: graph_tool.Graph, y, weights=None, budget=50, compute_hulls_between_queries=False,
                          hull_as_optimization=False, use_adjacency=False):
    '''
    Greedily query ``budget`` vertex labels, each time picking the vertex with the best hull score.

    Every vertex keeps, per class, the hull it would produce if it were added to the already queried
    vertices of that class. In each round the unlabelled vertex maximising ``helper_sum_sizes`` is
    queried (ties: least overlap with the current class hulls, then random), its label is read from
    ``y``, and the candidate hulls for that class are recomputed. After each round a label
    propagation is run and its accuracy printed.

    :param g: graph to label
    :param y: ground truth labels, indexable by vertex index
    :param weights: edge weights forwarded to ``shortest_distance`` and ``compute_hull``
    :param budget: number of query rounds
    :param compute_hulls_between_queries: if True, vertices lying in the interval hull of exactly one
        class are treated as labelled with that class for the label propagation step only
    :param hull_as_optimization: forwarded as the last positional argument of ``compute_hull``
    :param use_adjacency: if True, label propagation gets the distance matrix with all entries > 1
        set to 0 instead of the distance matrix itself
    :return: array with one entry per vertex holding the queried label, ``-inf`` where not queried
    '''
    if use_adjacency:
        dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(range(g.num_vertices())).T

        adjacency = dist_map.copy()
        adjacency[adjacency > 1] = 0
    else:
        # to prevent overflow etc.
        dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(
            range(g.num_vertices())).T.astype(np.double)
        dist_map[dist_map > g.num_vertices()] = np.inf

    comps, hist = graph_tool.topology.label_components(g)
    n = g.num_vertices()
    classes = np.unique(y)
    known_labels = -np.ones(n) * np.inf

    # candidate_hulls[j][c]: boolean mask of the hull obtained by adding vertex j to class c.
    # `np.object` / `np.bool` were removed from NumPy; the builtins are the replacement.
    candidate_hulls = np.zeros(n, dtype=object)
    candidate_hull_sizes = np.zeros(n)
    known_classes = dict()   # class -> set of queried vertices
    classes_hulls = dict()   # class -> boolean mask of the current hull
    for j in range(n):
        candidate_hulls[j] = dict()

    for c in classes:
        known_classes[c] = set()
        classes_hulls[c] = np.zeros(n, dtype=bool)
        for j in range(n):
            one_hot = np.zeros(n, dtype=bool)
            one_hot[j] = True
            candidate_hulls[j][c] = one_hot  # singleton hull

    for z in range(budget):
        # compute most promising vertex; already queried vertices get score -1
        for p in range(n):
            if known_labels[p] == -np.inf:
                candidate_hull_sizes[p] = helper_sum_sizes(candidate_hulls[p], classes_hulls)
            else:
                candidate_hull_sizes[p] = -1

        maximizers = np.where(candidate_hull_sizes == np.max(candidate_hull_sizes))[0]

        # tie-break: prefer vertices covered by as few current class hulls as possible
        classes_hulls_overlap = np.sum(np.array(list(classes_hulls.values())), axis=0)
        maximizers = maximizers[np.where(classes_hulls_overlap[maximizers] == np.min(classes_hulls_overlap[maximizers]))[0]]

        p_star = np.random.choice(maximizers)

        # query it
        known_labels[p_star] = y[p_star]
        queried_class = known_labels[p_star]

        # update data structures
        known_classes[queried_class].add(p_star)
        classes_hulls[queried_class] = candidate_hulls[p_star][queried_class]

        # Only the queried class gained a vertex, so only its candidate hulls change.
        for j in range(n):
            if known_labels[j] == -np.inf:
                candidate_hulls[j][queried_class] = compute_hull(
                    g, list(known_classes[queried_class].union([j])), weights,
                    dist_map, comps, hist,
                    hull_as_optimization)

        if compute_hulls_between_queries:
            known_labels_augmented = known_labels.copy()
            known_classes_hulls_temp = np.zeros((n, len(classes)), dtype=bool)
            for i, c in enumerate(classes):
                known_classes_hulls_temp[:, i] = compute_hull(g, np.where(known_labels_augmented == c)[0], weights,
                                                              dist_map, comps, hist, compute_closure=False)

            # Only vertices inside exactly one class hull inherit that class.
            for i, c in enumerate(classes):
                only_c = known_classes_hulls_temp[:, i] & ~(
                    np.sum(known_classes_hulls_temp[:, np.arange(len(classes)) != i], axis=1).astype(bool))
                known_labels_augmented[only_c] = c

        else:
            known_labels_augmented = known_labels

        if use_adjacency:
            prediction = label_propagation(adjacency, known_labels_augmented, y, use_adjacency=use_adjacency)
        else:
            prediction = label_propagation(dist_map, known_labels_augmented, y, use_adjacency=use_adjacency)
        print("=====")
        print(z + 1, np.sum(known_labels > -np.inf))
        print(np.sum(np.array(list(classes_hulls.values())), axis=1))
        print("accuracy", np.sum(prediction == y) / y.size)
        #print(known_classes)

    return known_labels
コード例 #5
0
def budgeted_spc_querying(g : graph_tool.Graph, paths, y, weights=None, budget=50,  compute_hulls_between_queries=False, hull_as_optimization=False, use_adjacency=False):
    '''
    Greedily query ``budget`` vertex labels, choosing among one candidate vertex per path.

    Each path has a ``binarySearchGenerator`` that yields the position (within the path) of its next
    candidate. In every round the path whose candidate maximises ``helper_sum_sizes`` is chosen
    (preferring candidates at position 0), the candidate's label is read from ``y``, every path's
    candidate is advanced past labelled / already covered vertices and the candidate hulls are
    recomputed. After each round a label propagation is run and its accuracy printed.

    :param g: graph to label
    :param paths: list of paths, each a sequence of vertex indices
    :param y: ground truth labels, indexable by vertex index
    :param weights: edge weights forwarded to ``shortest_distance`` and ``compute_hull``
    :param budget: number of query rounds
    :param compute_hulls_between_queries: if True, vertices lying in the interval hull of exactly one
        class are treated as labelled with that class for the label propagation step only
    :param hull_as_optimization: forwarded as the last positional argument of ``compute_hull``
    :param use_adjacency: if True, label propagation gets the distance matrix with all entries > 1
        set to 0 instead of the distance matrix itself
    :return: array with one entry per vertex holding the queried label, ``-inf`` where not queried
    '''

    if use_adjacency:
        dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(range(g.num_vertices())).T

        adjacency = dist_map.copy()
        adjacency[adjacency > 1] = 0
    else:
        #to prevent overflow etc.
        dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(
            range(g.num_vertices())).T.astype(np.double)
        dist_map[dist_map > g.num_vertices()] = np.inf

    comps, hist = graph_tool.topology.label_components(g)
    n = g.num_vertices()
    classes = np.unique(y)
    known_labels = -np.ones(n)*np.inf

    # candidates[i] is a position inside paths[i], not a vertex index.
    # `np.int` / `np.object` / `np.bool` were removed from NumPy; the builtins are the replacement.
    candidates = np.zeros(len(paths), dtype=int)
    candidate_generators = np.zeros(len(paths), dtype=object)
    for i, path in enumerate(paths):
        candidate_generators[i] = binarySearchGenerator(known_labels, path, 0, len(path)-1)
        candidates[i] = next(candidate_generators[i])

    # candidate_hulls[j][c]: boolean mask of the hull obtained by adding path j's candidate to class c.
    candidate_hulls = np.zeros(len(paths), dtype=object)
    candidate_hull_sizes = np.zeros(len(paths))
    known_classes = dict()   # class -> set of queried vertices
    classes_hulls = dict()   # class -> boolean mask of the current hull

    for j in range(len(paths)):
        candidate_hulls[j] = dict()

    for c in classes:
        known_classes[c] = set()
        classes_hulls[c] = np.zeros(n, dtype=bool)  #empty hull
        for j, candidate in enumerate(candidates):
            singleton = np.zeros(n, dtype=bool)
            singleton[paths[j][candidate]] = True
            candidate_hulls[j][c] = singleton  #singleton hull

    for z in range(budget):
        #compute most promising vertex; paths whose candidate is already labelled get score -1
        for p in range(len(paths)):
            if known_labels[paths[p][candidates[p]]] == -np.inf:
                candidate_hull_sizes[p] = helper_sum_sizes(candidate_hulls[p], classes_hulls)
            else:
                candidate_hull_sizes[p] = -1

        maximizers = np.where(candidate_hull_sizes == np.max(candidate_hull_sizes))[0]

        #prefer not queried paths (candidate still at position 0)
        if np.any(candidates[maximizers] == 0):
            maximizers = maximizers[np.where(candidates[maximizers] == 0)[0]]
        p_star = np.random.choice(maximizers)
        candidate = paths[p_star][candidates[p_star]]

        #query it
        known_labels[candidate] = y[candidate]
        queried_class = known_labels[candidate]

        #update data structures
        known_classes[queried_class].add(candidate)
        classes_hulls[queried_class] = candidate_hulls[p_star][queried_class]

        for j in range(len(candidates)):
            path = paths[j]
            # Advance while the candidate is labelled or already inside the hull of the class that
            # was just extended. The hull is a boolean mask over vertices, so membership has to be
            # an index lookup; the previous `vertex in mask` compared the vertex id against the
            # mask's True/False values instead.
            while known_labels[path[candidates[j]]] != -np.inf or classes_hulls[queried_class][path[candidates[j]]]:
                try:
                    candidates[j] = next(candidate_generators[j])
                except StopIteration:
                    # generator exhausted: keep the last candidate
                    break
            for c in classes:
                candidate_hulls[j][c] = compute_hull(g, list(known_classes[c].union([path[candidates[j]]])), weights, dist_map, comps, hist, hull_as_optimization)

        if compute_hulls_between_queries:
            known_labels_augmented = known_labels.copy()
            known_classes_hulls_temp = np.zeros((n, len(classes)), dtype=bool)
            for i, c in enumerate(classes):
                known_classes_hulls_temp[:,i] = compute_hull(g, np.where(known_labels_augmented == c)[0], weights, dist_map, comps, hist, compute_closure=False)

            # Only vertices inside exactly one class hull inherit that class.
            for i, c in enumerate(classes):
                only_c = known_classes_hulls_temp[:,i] & ~(np.sum(known_classes_hulls_temp[:,np.arange(len(classes))!=i],axis=1).astype(bool))
                known_labels_augmented[only_c] = c

        else:
            known_labels_augmented = known_labels

        if use_adjacency:
            prediction = label_propagation(adjacency, known_labels_augmented, y, use_adjacency=use_adjacency)
        else:
            prediction = label_propagation(dist_map, known_labels_augmented, y, use_adjacency=use_adjacency)
        print("======")
        print(z+1, np.sum(known_labels>-np.inf))
        print("accuracy", np.sum(prediction==y)/y.size)
        #print(known_classes)

    return known_labels
コード例 #6
0
def spc_querying_with_shadow(g: graph_tool.Graph, paths, weights, y):
    '''
    Query labels along a path cover, choosing each query by the size of the resulting shadows.

    Every path keeps a queue of candidate positions. Starting from a random path with more than one
    vertex, the function repeatedly queries the candidate whose positive/negative shadow gain (as
    returned by ``closure.compute_shadow``) is largest on average, assigns the queried class to the
    whole corresponding shadow and continues until the hulls of both classes cover all vertices.

    Unknown labels are stored as ``-1`` and tested with ``>= 0``, so ``y`` is assumed to contain
    exactly two distinct, non-negative class values. The global NumPy random seed is reset to 55.

    :param g: graph to label
    :param paths: list of paths, each a sequence of vertex indices
    :param weights: edge weights forwarded to the distance, hull and shadow computations
    :param y: ground truth labels, indexable by vertex index
    :return: ``(known_labels, budget)``: per-vertex labels (``-1`` if unknown) and the number of
        queries made
    '''
    np.random.seed(55)
    #these two lines make repetitive closure computation a lot faster
    dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(range(g.num_vertices())).T
    comps, hist = graph_tool.topology.label_components(g)

    known_labels = -np.ones(g.num_vertices())
    num_of_known_labels = 0
    budget = 0

    # np.unique returns sorted values: pos_value is the smaller one.
    pos_value, neg_value = np.unique(y)

    next_candidate_queues = [Queue() for _ in paths]
    # `np.int` / `np.bool` were removed from NumPy; the builtins are the replacement.
    left = np.zeros(len(paths), dtype=int)
    right = np.array([len(p)-1 for p in paths], dtype=int)
    queue_idxs = list(range(len(paths)))  # indices of paths that still have candidates

    n = g.num_vertices()

    # Every path starts with its endpoints as candidates.
    for i,path in enumerate(paths):
        next_candidate_queues[i].put(0)
        if len(path) > 1:
            next_candidate_queues[i].put(len(path)-1)

    starting_idx = np.random.choice(np.where(right>0)[0])
    starting_path = paths[starting_idx]

    # Query both endpoints of the starting path.
    budget += 2
    l = next_candidate_queues[starting_idx].get()
    r = next_candidate_queues[starting_idx].get()
    known_labels[starting_path[l]] = y[starting_path[l]]
    known_labels[starting_path[r]] = y[starting_path[r]]

    if known_labels[starting_path[0]] == known_labels[starting_path[-1]]:
        #color the hull of the path in the color of the endpoints
        path_closure = np.where(compute_hull(g, starting_path, weights, dist_map, comps, hist))[0]
        known_labels[path_closure] = known_labels[starting_path[0]]
        num_of_known_labels = len(path_closure)
        # queue_idxs is still range(len(paths)), so position == path index here
        del queue_idxs[starting_idx]
    else:
        if (len(starting_path)>=3):
            next_candidate_queues[starting_idx].put(l + (r - l)//2)
        else:
            del queue_idxs[starting_idx]
        num_of_known_labels = 2

    pos = np.where(known_labels==pos_value)[0]
    neg = np.where(known_labels==neg_value)[0]

    candidates = np.zeros(len(paths), dtype=int)

    candidates[queue_idxs] = [next_candidate_queues[queue_idx].get() for queue_idx in queue_idxs] #this is always relative to the path

    candidate_pos_hulls = np.zeros((len(paths),n), dtype=bool)
    temp_pos_hulls =  np.zeros((n,n), dtype=bool)
    if len(pos) > 0:
        candidate_pos_hulls[queue_idxs] = [closure.compute_hull(g, np.append(pos, paths[idx][candidates[idx]]), weights, dist_map, comps, hist) for idx in queue_idxs]
    else:
        for idx in queue_idxs:
            candidate_pos_hulls[idx][paths[idx][candidates[idx]]] = True
    candidate_neg_hulls = np.zeros((len(paths),n), dtype=bool)
    temp_neg_hulls = np.zeros((n, n), dtype=bool)
    if len(neg) > 0:
        candidate_neg_hulls[queue_idxs] = [closure.compute_hull(g, np.append(neg, paths[idx][candidates[idx]]), weights, dist_map, comps, hist) for idx in queue_idxs]
    else:
        for idx in queue_idxs:
            candidate_neg_hulls[idx][paths[idx][candidates[idx]]] = True
    pos_gains = np.zeros(len(paths))
    neg_gains = np.zeros(len(paths))

    while num_of_known_labels < n:
        to_remove = []
        changed = []
        # Advance every path past candidates whose label is already known.
        for idx in queue_idxs:
            while known_labels[paths[idx][candidates[idx]]] >= 0:
                if not next_candidate_queues[idx].empty():
                    candidates[idx] = next_candidate_queues[idx].get()
                else:
                    maybe_remove = refill_queue_for_candidate(idx, candidates[idx], candidates, known_labels, left, next_candidate_queues, paths, queue_idxs, right)
                    if maybe_remove is not None:
                        to_remove.append(maybe_remove)
                        break
                    else:
                        candidates[idx] = next_candidate_queues[idx].get()
                changed.append(idx)

        # Hull of each class extended by every single vertex; passed to compute_shadow below.
        for i in range(n):
            temp_pos_hulls[i] = closure.compute_hull(g, np.append(pos, i), weights, dist_map, comps, hist, True, pos if len(pos) > 0 else None)
            temp_neg_hulls[i] = closure.compute_hull(g, np.append(neg, i), weights, dist_map, comps, hist, True, neg if len(neg) > 0 else None)

        for i in changed:
            candidate_pos_hulls[i] = closure.compute_shadow(g, np.append(pos, paths[i][candidates[i]]), neg, weights, dist_map, comps, hist, B_hulls=temp_neg_hulls)
            candidate_neg_hulls[i] = closure.compute_shadow(g, np.append(neg, paths[i][candidates[i]]), pos, weights, dist_map, comps, hist, B_hulls=temp_pos_hulls)

        for i in to_remove:
            queue_idxs.remove(i)
            # Sanity check: a path may only be dropped once all of its vertices are labelled.
            # The process exit code is kept as-is for compatibility.
            if np.sum(known_labels[paths[i]] >= 0) != len(paths[i]):
                exit(555)

        pos_gains[queue_idxs] = np.sum(candidate_pos_hulls[queue_idxs], axis=1) - len(pos)
        neg_gains[queue_idxs] = np.sum(candidate_neg_hulls[queue_idxs], axis=1) - len(neg)

        heuristic = np.average(np.array([pos_gains[queue_idxs], neg_gains[queue_idxs]]), axis=0)

        candidate_idx = queue_idxs[np.argmax(heuristic)]
        candidate_vertex = candidates[candidate_idx]  # position inside the path
        queried = paths[candidate_idx][candidate_vertex]  # actual vertex index

        # Sanity check: the chosen vertex must not already carry its true label.
        if known_labels[queried] == y[queried]:
            exit(9)
        known_labels[queried] = y[queried]

        budget += 1

        # The shadow of the queried class becomes the new known set of that class.
        if known_labels[queried] == pos_value:
            pos = np.where(candidate_pos_hulls[candidate_idx])[0]
            known_labels[pos] = pos_value
        else:
            neg = np.where(candidate_neg_hulls[candidate_idx])[0]
            known_labels[neg] = neg_value

        # Both families of shadows are recomputed in either case.
        candidate_pos_hulls[queue_idxs] = [closure.compute_shadow(g, np.append(pos, paths[idx][candidates[idx]]), neg, weights, dist_map, comps, hist, temp_neg_hulls) for idx in queue_idxs]
        candidate_neg_hulls[queue_idxs] = [closure.compute_shadow(g, np.append(neg, paths[idx][candidates[idx]]), pos, weights, dist_map, comps, hist, temp_pos_hulls) for idx in queue_idxs]

        # Move the queried path on to its next candidate, or retire it.
        if next_candidate_queues[candidate_idx].empty():

            maybe_remove = refill_queue_for_candidate(candidate_idx, candidate_vertex, candidates, known_labels, left, next_candidate_queues, paths, queue_idxs, right)
            if maybe_remove is None:
                candidates[candidate_idx] = next_candidate_queues[candidate_idx].get()
            else:
                queue_idxs.remove(candidate_idx)
        else:
            candidates[candidate_idx] = next_candidate_queues[candidate_idx].get()

        candidate_pos_hulls[candidate_idx] = closure.compute_shadow(g, np.append(pos, paths[candidate_idx][candidates[candidate_idx]]), neg, weights, dist_map, comps, hist, temp_neg_hulls)
        candidate_neg_hulls[candidate_idx] = closure.compute_shadow(g, np.append(neg, paths[candidate_idx][candidates[candidate_idx]]), pos, weights, dist_map, comps, hist, temp_pos_hulls)

        #pos = np.where(known_labels==pos_value)[0]
        #neg = np.where(known_labels==neg_value)[0]
        pos = np.where(compute_hull(g, np.where(known_labels==pos_value)[0], weights, dist_map, comps, hist))[0]
        neg = np.where(compute_hull(g, np.where(known_labels==neg_value)[0], weights, dist_map, comps, hist))[0]
        num_of_known_labels = len(pos) + len(neg)

        print(num_of_known_labels, n)

    return known_labels, budget
コード例 #7
0
def spc_querying_naive_one_convex(g : graph_tool.Graph, paths, y, convex_label, epsilon=0.5, weight=None, binary_search=False,closed_interval=False):
    '''
    Label the vertices of every path assuming only the class ``convex_label`` is convex.

    For each path, labels already known on it are first propagated outwards, then the still
    unlabelled vertices are sampled at a spacing derived from ``epsilon``. If a vertex of the convex
    class is found, its extent on the path is refined (by ``binarySearch`` or by querying one
    midpoint on each side); otherwise the unlabelled part of the path is labelled with the other
    class. After each path the result of ``closure.compute_hull`` for the convex class is labelled
    convex as well.

    :param g: graph to label
    :param paths: list of paths, each a sequence of vertex indices
    :param y: ground truth labels (NumPy array indexed by vertex); expected to have two classes
    :param convex_label: the class value that is assumed to be convex
    :param epsilon: sampling granularity; about ``ceil(1/epsilon) - 1`` vertices are sampled per path
    :param weight: edge weights, only forwarded to ``closure.compute_hull``
    :param binary_search: if True refine the borders of the convex region with ``binarySearch``,
        otherwise with a single midpoint query on each side
    :param closed_interval: unused
    :return: ``(known_labels, budget)``: per-vertex labels (``-inf`` if unknown) and an array of
        length ``g.num_vertices()`` whose entry ``i`` counts the queries spent on ``paths[i]``
    '''
    print("epsilon", epsilon)
    known_labels = -np.ones(g.num_vertices())*np.inf
    budget = np.zeros(g.num_vertices())

    # Pick "the other" class. The index is taken as a scalar explicitly because calling int() on a
    # one-element array is deprecated in recent NumPy versions.
    classes = np.unique(y)
    convex_idx = int(np.where(classes == convex_label)[0][0])
    non_convex_label = classes[(convex_idx + 1) % 2]

    for i, full_path in enumerate(paths):

        if np.any(known_labels[full_path] == convex_label):
            convex_positions = np.where(known_labels[full_path] == convex_label)[0]
            smallest = np.min(convex_positions)
            biggest = np.max(convex_positions)

            # Left of the convex region: everything before the last known non-convex vertex is
            # non-convex as well.
            left_non_convex = np.where(known_labels[full_path[:smallest]] == non_convex_label)[0]
            if left_non_convex.size > 0:
                known_labels[full_path[:np.max(left_non_convex)]] = non_convex_label

            # Right of the convex region: positions found in full_path[biggest:] are relative to
            # `biggest`, so the offset has to be added back before slicing full_path.
            right_non_convex = np.where(known_labels[full_path[biggest:]] == non_convex_label)[0]
            if right_non_convex.size > 0:
                known_labels[full_path[biggest + np.min(right_non_convex):]] = non_convex_label

        # From here on only the still unlabelled vertices of the path are considered.
        path = np.array(full_path)[known_labels[full_path] == -np.inf]

        for z in range(1,int(np.ceil(1/epsilon))):
            j = int(z*(np.ceil(epsilon*len(path))))
            while j < len(path) and known_labels[path[j]] != -np.inf:
                j += 1
            if j >= len(path):
                break

            # NOTE(review): this sums the *indices* of all unlabelled vertices of the graph rather
            # than counting unlabelled vertices on the path; the intended stop criterion should be
            # confirmed. Left unchanged.
            if np.sum(np.where(known_labels==-np.inf)[0]) <= epsilon*len(path):
                conv_region = np.where(known_labels[path] == convex_label)[0]
                if conv_region.size > 0:
                    known_labels[path] = known_labels[path[0]]
                    # conv_region holds positions inside `path`, so they are mapped to vertices
                    # through `path` before indexing known_labels.
                    known_labels[path[np.min(conv_region):np.max(conv_region)+1]] = convex_label
                break

            known_labels[path[j]] = y[path[j]]
            budget[i] += 1

        if np.any(known_labels[path] == convex_label):
            smallest = np.min(np.where(known_labels[path] == convex_label)[0])
            biggest = np.max(np.where(known_labels[path] == convex_label)[0])
            if binary_search:
                # Refine the left border between path[0] and the first convex vertex.
                l_path = path[:smallest+1]
                if known_labels[l_path[0]] == -np.inf:
                    known_labels[l_path[0]] = y[l_path[0]]
                    budget[i] += 1
                label_budget, new_labels = binarySearch(y[l_path], 0, len(l_path) - 1, known_labels[l_path[0]], known_labels[l_path])
                known_labels[l_path] = new_labels
                budget[i] += label_budget

                # Refine the right border between the last convex vertex and path[-1].
                r_path = path[biggest:]
                if known_labels[r_path[-1]] == -np.inf:
                    known_labels[r_path[-1]] = y[r_path[-1]]
                    budget[i] += 1
                label_budget, new_labels = binarySearch(y[r_path], 0, len(r_path) - 1, known_labels[r_path[0]], known_labels[r_path])
                known_labels[r_path] = new_labels
                budget[i] += label_budget
            else:
                # Nearest labelled positions on either side of the convex region. Clamped at 0 so
                # that a region starting at position 0 does not wrap around to path[-1].
                j_minus = max(smallest - 1, 0)
                while j_minus > 0 and known_labels[path[j_minus]] == -np.inf:
                    j_minus -= 1
                j_plus = biggest+ 1
                while j_plus < len(path) and known_labels[path[j_plus]] == -np.inf:
                    j_plus += 1

                # Query the midpoint of each gap once.
                left_mid = path[j_minus + (smallest - j_minus)//2]
                if known_labels[left_mid] == -np.inf:
                    known_labels[left_mid] = y[left_mid]
                    budget[i] += 1
                right_mid = path[biggest + (j_plus - biggest) // 2]
                if known_labels[right_mid] == -np.inf:
                    known_labels[right_mid] = y[right_mid]
                    budget[i] += 1

                smallest = np.min(np.where(known_labels[path] == convex_label)[0])
                biggest = np.max(np.where(known_labels[path] == convex_label)[0])

                known_labels[path[smallest:biggest+1]] = convex_label

                # Everything outside [smallest, biggest] is non-convex, on both sides.
                if smallest > 0:
                    known_labels[path[:smallest]] = non_convex_label
                if biggest < len(path)-1:
                    known_labels[path[biggest+1:]] = non_convex_label
        else:
            known_labels[path] = non_convex_label

        convex_class = closure.compute_hull(g, np.where(known_labels == convex_label)[0], weight)
        known_labels[convex_class] = convex_label
    return known_labels, budget
コード例 #8
0
def _print_hull_report(name, seeds, hull, labels, label, num_vertices):
    """Print the size of ``hull`` and how many of its vertices truly carry ``label``."""
    print(name, seeds.size)
    print("hull size: ", np.sum(hull))
    print("hull correctness overall",
          np.sum(hull & (labels == label)))
    # Exclude the seed vertices to measure correctness on newly covered vertices only.
    # `np.bool` was removed from NumPy; the builtin `bool` is the replacement.
    mask = np.ones(num_vertices, dtype=bool)
    mask[seeds] = False
    print("hull correctness on new vertices",
          np.sum(hull[mask] & (labels == label)[mask]))


def spc_semi_supervised_experiments(g: gt.Graph,
                                    weight_prop: gt.EdgePropertyMap, labels):
    """Compare label propagation accuracy with and without hull-based label augmentation.

    For each budget the ``s2`` baseline is run once; then, five times, ``budget`` random vertices
    are revealed and label propagation is evaluated on (a) the revealed labels only, (b) after
    additionally labelling the interval hulls (``compute_closure=False``) of both classes and
    (c) after additionally labelling their closures. All results are printed.

    :param g: graph to run the experiments on
    :param weight_prop: edge weights used for the shortest distances and hulls
    :param labels: ground truth labels as a NumPy array; exactly two classes are expected
    :return: None
    """
    np.random.seed(1)
    num_vertices = g.num_vertices()
    dist_map = gt.topology.shortest_distance(g, weights=weight_prop)
    W = dist_map.get_2d_array(range(num_vertices))  # original distance map
    classes = np.unique(labels)
    # 0/1 encoding of the ground truth: 1 for the second class, 0 otherwise.
    new_labels = np.zeros(num_vertices)
    new_labels[labels == classes[1]] = 1
    for budget in [10, 20, 50, 100]:
        print("========================================================")
        print("budget: ", budget, "|V|=", num_vertices)
        print("==================s2=====================")
        overall_labelling = shortest_shortest_path_querying.s2(
            g, weight_prop, labels, budget)
        print("accuracy after label_prop: ",
              np.sum(overall_labelling == new_labels) / num_vertices)
        for _ in range(5):
            starting_vertices = np.random.choice(range(num_vertices),
                                                 budget,
                                                 replace=False)

            known_labels = -np.ones(num_vertices) * np.inf
            known_labels[starting_vertices] = labels[starting_vertices]

            pos_label, neg_label = classes

            # Seeds are fixed here; hull labels added below do not change them.
            pos = np.where(known_labels == pos_label)[0]
            neg = np.where(known_labels == neg_label)[0]
            print("=============without hull===================")
            print("label propagation")
            overall_labelling = label_propagation(W, known_labels, classes)
            print("accuracy after label_prop: ",
                  np.sum(overall_labelling == new_labels) / num_vertices)

            print("=============interval============")
            pos_hull = compute_hull(g,
                                    pos,
                                    weight_prop,
                                    dist_map,
                                    compute_closure=False)
            neg_hull = compute_hull(g,
                                    neg,
                                    weight_prop,
                                    dist_map,
                                    compute_closure=False)
            _print_hull_report("pos", pos, pos_hull, labels, pos_label, num_vertices)
            known_labels[pos_hull] = pos_label

            _print_hull_report("neg", neg, neg_hull, labels, neg_label, num_vertices)
            known_labels[neg_hull] = neg_label

            print("label propagation")
            overall_labelling = label_propagation(W, known_labels, classes)
            print("accuracy after label_prop: ",
                  np.sum(overall_labelling == new_labels) / num_vertices)

            print("==============closure=================")
            pos_hull = compute_hull(g, pos, weight_prop, dist_map)
            neg_hull = compute_hull(g, neg, weight_prop, dist_map)
            _print_hull_report("pos", pos, pos_hull, labels, pos_label, num_vertices)
            known_labels[pos_hull] = pos_label

            _print_hull_report("neg", neg, neg_hull, labels, neg_label, num_vertices)
            print("label propagation")
            known_labels[neg_hull] = neg_label

            overall_labelling = label_propagation(W, known_labels, classes)
            print("accuracy after label_prop: ",
                  np.sum(overall_labelling == new_labels) / num_vertices)
コード例 #9
0
def is_convex(dir, prefix, target_column, weighted=False):
    '''
    Print interval and hull statistics for the two classes of a graph that is
    stored as CSV files.

    The edge list is read from ``dir + prefix + '_edges.csv'`` and the vertex
    table from ``dir + prefix + '_target.csv'``. The column ``target_column``
    of the vertex table is factorized into two labels. For both classes the
    size of the interval (``compute_hull`` with ``compute_closure=False``),
    the size of the hull and the size of the respective intersections are
    printed, after ``spc_semi_supervised_experiments`` has been run.

    :param dir: path prefix that is concatenated directly with the file
        names; it is also searched for the substrings ``'twitch'`` and
        ``'facebook'`` to select how edge weights are derived
    :param prefix: file name prefix of the ``_edges.csv``/``_target.csv`` pair
    :param target_column: positional index of the label column; it has to
        contain exactly two distinct values
    :param weighted: if true, derive edge weights from vertex attributes and
        read the cover file with the weighted suffix
    :return: None
    '''
    print(dir)
    np.random.seed(0)
    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    edges = np.genfromtxt(dir + prefix + '_edges.csv',
                          skip_header=True,
                          dtype=int,
                          delimiter=',')

    df = pd.read_csv(dir + prefix + '_target.csv')  #.sort_values('new_id')
    print(dir, "weighted", weighted)

    weight = 1
    if weighted:
        if 'twitch' in dir:
            # Min-max scale columns 1 and 3 into local arrays. Nothing is
            # written back into df, so the table keeps its original dtypes.
            scaled = []
            for column in (1, 3):
                values = df.iloc[:, column]
                lowest = values.min()
                highest = values.max()
                scaled.append(
                    np.asarray((values - lowest) / (highest - lowest)))

            # Squared Euclidean distance of the two scaled attributes,
            # computed for all edges at once (rows are addressed by position).
            source = edges[:, 0]
            target = edges[:, 1]
            weight = ((scaled[0][source] - scaled[0][target])**2 +
                      (scaled[1][source] - scaled[1][target])**2)

        elif 'facebook' in dir:
            with open('res/git/' + dir +
                      '/facebook_features.json') as feature_file:
                attributes = json.load(feature_file)
            # Edge weight = number of features that only one endpoint has.
            weight = np.zeros(edges.shape[0])
            for i, e in enumerate(edges):
                weight[i] = len(
                    set(attributes[str(e[0])]).symmetric_difference(
                        attributes[str(e[1])]))

    labels, _ = pd.factorize(df.iloc[:, target_column])

    # Unpacking fails on purpose if there are not exactly two classes.
    pos_label, neg_label = np.unique(labels)

    g = gt.Graph(directed=False)
    g.add_edge_list(edges)

    if weighted:
        weight = g.new_edge_property("double", vals=weight)
    else:
        weight = g.new_edge_property("double", val=1)

    #dist_map = gt.shortest_distance(g, weights=weight)
    simple = simplicial_vertices.simplicial_vertices(g)
    gt.stats.remove_self_loops(g)
    print("n=", g.num_vertices(), "simplicial=", len(simple))
    #spc = shortest_path_cover_logn_apx(g, weight)
    # The spelling "_weigted_" is part of the on-disk file name; do not
    # correct it without renaming the stored files.
    weighted_str = "_weigted_" if weighted else ""
    #pickle.dump(spc, open(dir+'spc'+weighted_str+'.p', 'wb'))
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # cover files that were produced locally.
    # NOTE(review): the loaded cover is not used below. The load is kept so
    # that a missing cover file still raises, as it did before.
    with open(dir + 'spc' + weighted_str + '.p', 'rb') as cover_file:
        spc = pickle.load(cover_file)

    # NOTE(review): everything below runs unweighted; the edge property
    # created above is discarded here.
    weight = None

    pos = np.where(labels == pos_label)[0]
    neg = np.where(labels == neg_label)[0]

    print("pos", len(pos))
    print("neg", len(neg))
    spc_semi_supervised_experiments(g, weight, labels)

    p_interval = compute_hull(g, pos, weight, compute_closure=False)
    n_interval = compute_hull(g, neg, weight, compute_closure=False)

    print("pos_interval size: ", np.sum(p_interval))
    print("neg_interval size: ", np.sum(n_interval))
    print("intersection of intervals size: ", np.sum(p_interval & n_interval))

    p_hull = compute_hull(g, pos, weight)
    n_hull = compute_hull(g, neg, weight)

    print("pos_hull size: ", np.sum(p_hull))
    print("neg_hull size: ", np.sum(n_hull))
    print("intersection of hulls size: ", np.sum(p_hull & n_hull))
コード例 #10
0
def is_convex(weighted, run_hull_sampling=False):
    '''
    Report how well a shortest path cover of the "digit1" benchmark graph
    recovers the labels, and how many cover paths are label-convex.

    The graph connects two points whenever their Euclidean distance is
    positive and not above the 0.004-quantile of all pairwise distances.

    :param weighted: if true, edges carry their distance as weight and the
        path cover is computed; otherwise unit weights are used and a
        precomputed cover is read from ``res/benchmark``
    :param run_hull_sampling: if true, additionally print interval and hull
        sizes for random samples of every class. This part needs all-pairs
        shortest distances, so it is skipped by default (the previous
        behaviour).
    :return: None
    '''
    print("digit1")
    np.random.seed(0)

    # Both constants are part of the name of the stored cover file.
    dataset = 1
    quantile = 0.004

    X = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',X.tab')
    # X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    y = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',y.tab')

    n = X.shape[0]
    y = y[:n]

    W = scipy.spatial.distance.cdist(X, X)
    np.fill_diagonal(W, 0)
    # Distances above the quantile are marked as "no edge".
    W[W > np.quantile(W, quantile)] = np.inf

    # The same mask selects weights and edges, so both are in row-major order.
    is_edge = (W < np.inf) & (W > 0)
    weights = W[is_edge]
    edges = np.argwhere(is_edge)

    g = gt.Graph()

    # construct actual graph
    g.add_vertex(n)
    g.add_edge_list(edges)
    if weighted:
        weight_prop = g.new_edge_property("double", vals=weights)
    else:
        weight_prop = g.new_edge_property("double", val=1)

    if not weighted:
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load cover files that were produced locally.
        cover_path = ("res/benchmark/spc_" + str(dataset) + "_q_" +
                      str(quantile) + "_weighted_" + str(weighted) + ".p")
        with open(cover_path, "rb") as cover_file:
            spc = pickle.load(cover_file)
    else:
        spc = shortest_path_cover_logn_apx(g, weight_prop)
    labels = y

    known_labels, budget = spc_querying_naive(g, spc, labels)
    print(known_labels)
    print(budget, np.sum(budget))
    print(np.sum(known_labels == labels))
    print("len(spc)", len(spc))

    # are_convex yields 0 for a path without label errors.
    path_errors = [are_convex(labels[p]) for p in spc]
    num_of_convex_paths = sum(1 for error in path_errors if error == 0)
    total_error = sum(path_errors)

    print("#convex paths", num_of_convex_paths)
    print("total error on paths", total_error)

    if not run_hull_sampling:
        return

    # Only the sampling experiment needs these, so they are computed lazily.
    comps, hist = gt.label_components(g)
    dist_map = gt.shortest_distance(g, weights=weight_prop)

    for c in np.unique(labels):
        print("class label", c)
        print("class size: ", np.sum(labels == c))
        cls = np.where(labels == c)[0]
        for sample_size in [5, 10, 20, len(cls)]:
            print("sample_size", sample_size)
            # Small samples are repeated; the full class is evaluated once.
            times = 5 if sample_size <= 20 else 1
            for _ in range(times):

                sample = np.random.choice(cls, sample_size, replace=False)

                interval = compute_hull(g,
                                        sample,
                                        dist_map=dist_map,
                                        comps=comps,
                                        hist=hist,
                                        compute_closure=False)
                print("size interval: ", np.sum(interval))
                print("number of correct in interval: ",
                      np.sum(interval[cls]))

                hull = compute_hull(g,
                                    sample,
                                    dist_map=dist_map,
                                    comps=comps,
                                    hist=hist)
                print("size hull: ", np.sum(hull))
                print("number of correct in hull: ", np.sum(hull[cls]))

    print("==================================")
コード例 #11
0
    for v in g.vertices():

        #try to find a clique around v
        #TODO: Replace with numpy style
        for x, y in itertools.combinations(g.get_all_neighbors(v), 2):
            if g.edge(x, y) is None:
                break
        else:
            simplicial_vertices.append(int(v))
            #print(len(g.get_all_neighbors(v)))

    return simplicial_vertices


if __name__ == "__main__":
    # For growing random graphs, compare the hull size of the simplicial
    # vertices with the hull size of equally many randomly drawn vertices.
    for i in range(1, 100):
        n_vertices = i * 100

        def deg_sampler():
            # Degree drawn uniformly from [1, i * 50).
            return np.random.randint(1, i * 50)

        g = random_graph(n_vertices, deg_sampler, directed=False)
        weight = g.new_edge_property("int", val=1)
        s = simplicial_vertices(g)
        print(n_vertices, len(s))

        if len(s) > 0:
            simplicial_hull = compute_hull(g, s, weight)
            print(np.sum(simplicial_hull > 0))

            random_seeds = np.random.randint(0, n_vertices, len(s))
            random_hull = compute_hull(g, random_seeds, weight)
            print(np.sum(random_hull > 0))

        print("=========================")
def _hull_if_separated(g, own, other, v):
    '''
    Try to extend one side by a single vertex.

    :param g: the graph
    :param own: boolean vertex mask of the side that should grow
    :param other: boolean vertex mask of the opposite side
    :param v: vertex to add to ``own``
    :return: the hull of ``own`` together with ``v`` if it shares no vertex
        with ``other``, otherwise None. ``own`` itself is never modified.
    '''
    seeds = own.copy()
    seeds[v] = True
    # compute_hull is assumed to return a boolean vertex mask; it is combined
    # with another mask via ``&`` right below.
    hull = compute_hull(g, np.where(seeds)[0])
    if np.any(other & hull):
        return None
    return hull


def florians_procedure(g: gt.Graph, use_simplicial):
    '''
    Greedily grow two disjoint hulls from two random seed vertices.

    Every remaining vertex is offered to one side first and, if the hull of
    that side would then intersect the other side, to the other side. The
    side that is asked first alternates from vertex to vertex. A vertex that
    fits neither side stays unassigned. The number of vertices still to be
    processed is printed after every step.

    :param g: the graph
    :param use_simplicial: if false, the two seeds are drawn among the
        vertices that ``simplicial_vertices(g)`` does not return; if true,
        any two distinct vertices may be drawn
    :return: the two boolean vertex masks ``A`` and ``B``
    :raises ValueError: if fewer than two vertices are eligible as seeds
    '''
    n = g.num_vertices()

    # Vertices that must not become seeds.
    excluded = set() if use_simplicial else set(simplicial_vertices(g))
    if n - len(excluded) < 2:
        # Without this check the rejection sampling below would never stop.
        raise ValueError("need at least two eligible seed vertices")

    a = np.random.randint(0, n)
    while a in excluded:
        a = np.random.randint(0, n)

    b = a
    while b == a or b in excluded:
        b = np.random.randint(0, n)

    # np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.
    A = np.zeros(n, dtype=bool)
    A[a] = True
    B = np.zeros(n, dtype=bool)
    B[b] = True
    sides = [A, B]

    F = set(range(n)).difference((a, b))

    i = 0
    while len(F) > 0:
        e = F.pop()

        # Even steps ask A first, odd steps ask B first.
        first = i % 2
        for own in (first, 1 - first):
            grown = _hull_if_separated(g, sides[own], sides[1 - own], e)
            if grown is not None:
                sides[own] = grown
                # Everything the grown side absorbed is settled.
                F = F.difference(np.where(grown)[0].tolist())
                break

        i += 1
        print(len(F))
    return sides[0], sides[1]
コード例 #13
0
def is_convex(dataset,q,weighted=True):
    '''
    Build a thresholded distance graph on the first 100 points of a benchmark
    data set and print statistics of a shortest path cover and of the labels
    obtained by querying along it.

    :param dataset: identifier inserted into the benchmark file names
    :param q: NOTE(review): overwritten below with the 0.1-quantile of the
        distances before it is ever read, so the passed value has no effect
    :param weighted: if false, all distances up to the threshold are set to 1
        before the edges are extracted
    '''
    X = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',X.tab')
    #X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    y = (np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',y.tab'))

    # only the first n points are used
    n = 100
    dists = scipy.spatial.distance.cdist(X, X)
    y = y[:n]
    # floor division: entries equal to the maximum become 1, smaller ones 0
    y = (y-np.min(y))//(np.max(y)-np.min(y))
    #q = 0.04
    W = dists[:n,:n]#np.exp(-(dists) ** 2 / (2 * sigma ** 2))
    # replaces the q argument by the 0.1-quantile of the pairwise distances
    q = np.quantile(W, 0.1)
    # distances above the threshold are marked as "no edge"
    W[W > q] = np.inf
    # W2 = np.copy(W) less edges is slower strangely
    if not weighted:
        W[W <= q] = 1
    np.fill_diagonal(W, 0)

    # NOTE(review): weights is not handed to the graph; weight_prop below is
    # the constant 1 regardless of `weighted`
    weights = W[(W<np.inf) & (W>0)].flatten()
    # one (row, column) pair per finite, positive entry of W
    edges = np.array(np.where((W<np.inf) & (W>0))).T

    print("e",len(edges))
    #return

    np.random.seed(0)

    g = gt.Graph()

    # construct actual graph
    g.add_vertex(n)
    g.add_edge_list(edges)
    weight_prop = g.new_edge_property("double", val=1)


    # comps and hist are only referenced by the commented-out calls below
    comps,hist = gt.topology.label_components(g)

    simpl = simplicial_vertices(g)

    # number of simplicial vertices and size of their hull
    print(len(simpl), np.sum(closure.compute_hull(g, simpl, weight_prop)>0))
    #return
    paths = shortest_path_cover_logn_apx(g, weight_prop)



    # accumulates ceil(log2(path length)) over all paths; presumably a bound
    # on the binary-search queries per path (it is printed as "#queries<=").
    # NOTE: this local name shadows the builtin sum
    sum = 0
    for i in paths:
        sum += np.ceil(np.log2(len(i)))

    print("|S|=", len(paths))
    print("#queries<=", sum, "%:", sum / n)


    # vertex indices with label > 0 and label <= 0; the [:n] slice cannot
    # shorten these lists because they hold at most n entries
    pos = list(np.arange(n)[y > 0])[:n]
    neg = list(np.arange(n)[y <= 0])[:n]

    print(n,pos,neg)
    print("p",len(pos))
    print("n",len(neg))

    #pos_hull = closure.compute_hull(g,pos, weight_prop,comps,hist)
    #print(np.sum(pos_hull))
    #neg_hull = closure.compute_hull(g, neg, weight_prop,comps,hist)
    #print(np.sum(neg_hull))
    #print(len(set(np.where(pos_hull)[0]).intersection(set(np.where(neg_hull)[0])))/n)

    print("===============================================================")
    known_labels, budget = spc_querying_with_closure(g, paths,weight_prop,y)
    # mean absolute difference between the obtained labels and y
    print(np.sum(np.abs(known_labels-y)/n))
    print(budget)