Example 1
def test_H_observed_EC2_variants():
    """Illustrate the variants of H_observed"""
    print(
        "\n\n-- test_H_observed_EC2_variants(): 'H_observed', 'M_observed', uses: 'planted_distribution_model_H' --"
    )

    # --- Parameters for graph
    n = 3000
    a = 1
    h = 8
    d = 2
    k = 3
    f = 0.2
    distribution = 'uniform'

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)

    # --- Create graph
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=None,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, _ = replace_fraction_of_rows(X0, f, avoidNeighbors=False)

    # --- Print first rows of matrices
    distance = 3

    print("First rows of powers of H0:")
    for p in range(1, distance + 1):  # 'p' avoids shadowing the class count 'k' above
        print("{}: {}".format(p, np.linalg.matrix_power(H0, p)[0]))

    print("\nNumber of observed edges between labels (M_observed):")
    M = M_observed(W, X1, distance=distance, NB=True)
    print("M[0]:\n{}".format(M[0]))
    print("M[2]:\n{}".format(M[1]))

    for EC in [False, True]:
        for variant in [1, 2]:
            print("\nP (H observed): variant {} with EC={}".format(
                variant, EC))
            H_vec = H_observed(W,
                               X1,
                               distance=distance,
                               NB=EC,
                               variant=variant)
            for i, H in enumerate(H_vec):
                print("{}:\n{}".format(i, H))
def calculate_nVec_from_Xd(Xd):
    """Calculates 'n_vec': the number of times each node class occurs in graph.
    Given graph with explicit beliefs in dictionary format 'Xd'.
    Assumes zeroindexing.
    """
    X0 = from_dictionary_beliefs(Xd)
    return X0.sum(axis=0)
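A short usage sketch for the function above (zero-indexed classes assumed):

# Usage sketch:
Xd_demo = {0: 0, 1: 1, 2: 1}                 # node -> class
print(calculate_nVec_from_Xd(Xd_demo))       # expected: [1 2] (one node of class 0, two of class 1)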
def calculate_Ptot_from_graph(W, Xd, zeroindexing=True):
    """Calculates [k x k] array 'P_tot': the number of times each edge type occurs in graph.
    Uses a sparse directed (incl. undirected) [n x n] adjacency matrix 'W' and explicit beliefs in dictionary format 'Xd'.
    [Does not ignore weights of 'W'. Updated with simpler multiplication]
    Assumes zeroindexing.
    If normalizing is required later:
        m = sum(P_tot.flatten())       # sum of all entries = number of edges
        Pot = 1. * P_tot / m           # Potential: normalized sum = 1
        P_tot = Pot
    """
    X0 = from_dictionary_beliefs(Xd, zeroindexing=zeroindexing)
    return X0.transpose().dot(W.dot(X0))
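A quick verification sketch on a tiny graph, including the normalization mentioned in the docstring (assumes numpy/scipy as imported by this module):

# Verification sketch:
from scipy.sparse import csr_matrix

W_demo = csr_matrix(np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 0, 0]]))    # symmetric: 4 directed edge entries
Xd_demo = {0: 0, 1: 1, 2: 1}
P_tot = calculate_Ptot_from_graph(W_demo, Xd_demo)
print(P_tot)                                  # expected: [[0 2] [2 0]]
m = P_tot.sum()                               # sum of all entries = number of edge entries = 4
print(1. * P_tot / m)                         # normalized Potential, sums to 1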
def calculate_outdegree_distribution_from_graph(W, Xd=None):
    """Given a graph 'W', returns a dictionary {degree -> number of nodes with that degree}.
    If a dictionary 'Xd' of explicit beliefs is given, then returns a list of dictionaries, one for each node class.
    Takes weights into account [OLD version ignored weights of 'W']. Assumes zeroindexing.
    Transpose W to get indegrees.
    """
    n, _ = W.shape
    countDegrees = W.dot(np.ones((n, 1))).flatten().astype(int)

    # # OLD version that ignored the weight
    # row, col = W.nonzero()                      # transform the sparse W back to row col format
    # countNodes = collections.Counter(row)       # count number of times every node appears in rows
    # # Add all nodes to the counter (thus nodes with 0 occurrences have "node key -> 0", important for later statistics)
    # for key in range(n):
    #     countNodes.setdefault(key, 0)

    if Xd is None:
        countOutdegrees = collections.Counter(countDegrees)  # count degree multiplicities
        return countOutdegrees
        # # OLD version
        # countOutdegrees = collections.Counter(countNodes.values())   # count degree multiplicities

    else:
        listCountOutdegrees = []

        X0 = from_dictionary_beliefs(Xd)
        for col in X0.transpose():
            countDegreesInClass = countDegrees*col      # entry-wise multiplication
            countDegreesInClass = countDegreesInClass[np.nonzero(countDegreesInClass)]
            countOutdegreesInClass = collections.Counter(countDegreesInClass)
            listCountOutdegrees.append(countOutdegreesInClass)

        # # OLD version
        # k = max(Xd.values()) + 1
        # listCounterNodes = [{} for _ in range(k)]
        # for key, value in countNodes.iteritems():
        #     j = Xd[key]
        #     listCounterNodes[j][key] = value
        # for dict in listCounterNodes:
        #     countIndegrees = collections.Counter(dict.values())   # count multiplicies of nodes classes
        #     # listCountIndegrees.append(countIndegrees)

        return listCountOutdegrees
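A usage sketch; note that edge weights count toward the degree, and that transposing W yields indegrees, as noted in the docstring:

# Usage sketch:
from scipy.sparse import csr_matrix

W_demo = csr_matrix(np.array([[0, 2, 0],
                              [0, 0, 1],
                              [0, 0, 0]]))
print(calculate_outdegree_distribution_from_graph(W_demo))
# weighted outdegrees are 2, 1, 0 -> a Counter mapping each degree to one node
print(calculate_outdegree_distribution_from_graph(W_demo.transpose()))   # indegree distribution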
Example 5
def test_M_observed():
    """Illustrate M_observed: non-backtracking or not
    Also shows that W^2 is denser for powerlaw graphs than uniform
    """
    print(
        "\n-- test_M_observed(): 'M_observed', uses: 'planted_distribution_model_H' --"
    )

    # --- Parameters for graph
    n = 3000
    a = 1
    h = 8
    d = 10  # variant 2
    d = 2  # variant 1
    k = 3
    distribution = 'powerlaw'  # variant 2
    distribution = 'uniform'  # variant 1
    exponent = -0.5

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)

    # --- Create graph
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)

    # --- Print results
    distance = 8

    M_vec = M_observed(W, X0, distance=distance, NB=False)
    M_vec_EC = M_observed(W, X0, distance=distance, NB=True)

    print("Graph with n={} nodes and uniform d={} degrees".format(n, d))
    print("\nSum of entries and first rows of M_vec (without NB)")
    for i, M in enumerate(M_vec):
        print("{}: {}, {}".format(i, np.sum(M), M[0]))

    print("\nSum of entries and first rows of M_vec (with NB)")
    for i, M in enumerate(M_vec_EC):
        print("{}: {}, {}".format(i, np.sum(M), M[0]))

    if True:
        print("\nFull matrices:")
        print("M_vec")
        for i, M in enumerate(M_vec):
            print("{}: \n{}".format(i, M))

        print("\nM_vec_EC")
        for i, M in enumerate(M_vec_EC):
            print("{}: \n{}".format(i, M))
Example 6
def test_gradient_optimization2():
    print(
        "\n-- 'estimateH', 'define_gradient_energy_H', 'define_energy_H'; uses: 'planted_distribution_model_H', 'H_observed', 'M_observed' --"
    )

    # --- Parameters for graph
    n = 10000
    a = 1
    h = 2
    d = 10
    k = 7
    distribution = 'powerlaw'
    exponent = -0.3

    np.set_printoptions(precision=4)

    alpha0 = create_parameterized_alpha(k, a)
    H0 = create_parameterized_H(k, h, symmetric=True)
    f = 0.02
    print("Graph n={}, d={}, f={}".format(n, d, f))
    print("H0:\n{}".format(H0))

    # --- Create graph
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, ind = replace_fraction_of_rows(X0, 1 - f)

    # --- M_vec, H_vec statistics
    distance = 5

    print("M_vec:")
    M_vec = M_observed(W, X1, distance=distance)
    for i, M in enumerate(M_vec):
        print("{}:\n{}".format(i, M))

    print("\nH_vec_observed:")
    H_vec = H_observed(W, X1, distance=distance)
    for i, H in enumerate(H_vec):
        print("{}:\n{}".format(i, H))

    # --- estimate_H based on distance 1 and uninformative point
    distance = 1
    weights = [1, 0, 0, 0, 0]
    print(
        "\n= Estimate H based on X1 and distance={} from uninformative point:".
        format(distance))
    h0 = np.ones(int(k * (k - 1) / 2)) / k  # uninformative starting point: all free parameters equal 1/k
    energy_H = define_energy_H(H_vec_observed=H_vec,
                               weights=weights,
                               distance=distance)
    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)

    start = time.time()
    H1 = estimateH(X1, W, distance=distance, weights=weights, gradient=False)
    time_est = time.time() - start
    print("Estimated H without gradient:\n{}".format(H1))
    print("Time :{}".format(time_est))
    e = energy_H(H1)
    print("Energy at estimated point: {}".format(e))

    start = time.time()
    H2 = estimateH(X1, W, distance=distance, weights=weights, gradient=True)
    time_est = time.time() - start
    print("Estimated H with gradient:\n{}".format(H2))
    print("Time :{}".format(time_est))
    e = energy_H(H2)
    print("Energy at estimated point: {}".format(e))

    G = gradient_energy_H(H2)
    h = derivative_H_to_h(G)
    print("Gradient matrix at estimated point:\n{}".format(G))
    print("Gradient vector at estimated point:\n{}".format(h))
Example 7
def test_gradient_optimization():
    print(
        "\n-- 'estimateH', 'define_gradient_energy_H', 'define_energy_H'; uses: 'planted_distribution_model_H', 'H_observed', 'M_observed' --"
    )

    # --- Parameters for graph
    n = 1000
    a = 1
    h = 8
    d = 25
    k = 3
    distribution = 'powerlaw'
    exponent = -0.3

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    f = 0.1
    print("Graph n={}, d={}, f={}".format(n, d, f))
    print("H0:\n{}".format(H0))

    # --- Create graph
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed
    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, ind = replace_fraction_of_rows(X0, 1 - f)

    # --- M_vec, H_vec statistics
    distance = 5
    print("\nH_vec_observed:")
    H_vec = H_observed(W, X1, distance=distance)
    for i, H in enumerate(H_vec):
        print("{}:\n{}".format(i, H))

    # --- estimate_H based on distance 1
    print(
        "\n= Estimate H based on X1 and distance=1 (old without or with gradient):"
    )
    distance = 1
    weights = [1, 0, 0, 0, 0]

    start = time.time()
    H1 = estimateH(X1, W, distance=distance, weights=weights, gradient=False)
    time_est = time.time() - start
    print("Estimated H without gradient:\n{}".format(H1))
    print("Time :{}".format(time_est))

    start = time.time()
    H2 = estimateH(X1, W, distance=distance, weights=weights, gradient=True)
    time_est = time.time() - start
    print("Estimated H with gradient:\n{}".format(H2))
    print("Time :{}".format(time_est))

    # --- estimate_H based on distance 5 and uninformative point
    print(
        "\n= Estimate H based on X1 and distance=5 (ignoring distances 1-4) from various points (old without or with gradient):"
    )
    print(
        "From uninformative point (all methods get stuck, even with gradient !!!):"
    )
    distance = 5
    weights = [0, 0, 0, 0, 1]
    h0 = np.ones(3) / k  # uninformative starting point

    start = time.time()
    H1 = estimateH(X1, W, distance=distance, weights=weights, gradient=False)
    time_est = time.time() - start
    print("Estimated H without gradient:\n{}".format(H1))
    print("Time :{}".format(time_est))

    start = time.time()
    H2 = estimateH(X1, W, distance=distance, weights=weights, gradient=True)
    time_est = time.time() - start
    print("Estimated H with gradient:\n{}".format(H2))
    print("Time :{}".format(time_est))

    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)
    g = gradient_energy_H(transform_hToH(h0, 3))
    h = derivative_H_to_h(g)
    print("Gradient at uninformative point:\n{}".format(g))
    print("Gradient at uninformative point: {}".format(h))
    energy_H = define_energy_H(H_vec_observed=H_vec,
                               weights=weights,
                               distance=distance)
    e = energy_H(H2)
    print("Energy at estimated point: {}".format(e))

    # --- estimate_H based on distance 5 and wrong point
    print(
        "\n= From wrong point (gradient method with BFGS can fix it, SLSQP stays stuck !!!):"
    )
    distance = 5
    weights = [0, 0, 0, 0, 1]
    h0 = np.array([0.4, 0.3, 0.3])

    start = time.time()
    H1 = estimateH(X1,
                   W,
                   distance=distance,
                   weights=weights,
                   gradient=False,
                   initial_h0=h0)
    time_est = time.time() - start
    print("Estimated H without gradient:\n{}".format(H1))
    print("Time :{}".format(time_est))

    start = time.time()
    H2 = estimateH(X1,
                   W,
                   distance=distance,
                   weights=weights,
                   gradient=True,
                   initial_h0=h0)
    time_est = time.time() - start
    print("Estimated H with gradient:\n{}".format(H2))
    print("Time :{}".format(time_est))

    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)
    g = gradient_energy_H(transform_hToH(h0, 3))
    h = derivative_H_to_h(g)
    print("Gradient at wrong point:\n{}".format(g))
    print("Gradient at wrong point: {}".format(h))
    energy_H = define_energy_H(H_vec_observed=H_vec,
                               weights=weights,
                               distance=distance)
    e = energy_H(H2)
    print("Energy at estimated point: {}".format(e))

    # --- estimate_H based on distance 5 and some closer point
    print(
        "\n= From closer point (converges for BFGS, but not always for SLSQP!!!):"
    )
    distance = 5
    weights = [0, 0, 0, 0, 1]
    h0 = np.array([0.3, 0.4, 0.3])

    start = time.time()
    H1 = estimateH(X1,
                   W,
                   distance=distance,
                   weights=weights,
                   gradient=False,
                   initial_h0=h0)
    time_est = time.time() - start
    print("Estimated H without gradient:\n{}".format(H1))
    print("Time :{}".format(time_est))

    start = time.time()
    H2 = estimateH(X1,
                   W,
                   distance=distance,
                   weights=weights,
                   gradient=True,
                   initial_h0=h0)
    time_est = time.time() - start
    print("Estimated H with gradient:\n{}".format(H2))
    print("Time :{}".format(time_est))

    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)
    g = gradient_energy_H(transform_hToH(h0, 3))
    h = derivative_H_to_h(g)
    print("Gradient at closer point:\n{}".format(g))
    print("Gradient at closer point: {}".format(h))
    energy_H = define_energy_H(H_vec_observed=H_vec,
                               weights=weights,
                               distance=distance)
    e = energy_H(H2)
    print("Energy at estimated point: {}".format(e))

    # --- estimate_H based on distance 5 and an even closer point
    print("\n= From even closer point:")
    distance = 5
    weights = [0, 0, 0, 0, 1]
    h0 = np.array([0.2, 0.4, 0.2])

    start = time.time()
    H1 = estimateH(X1,
                   W,
                   distance=distance,
                   weights=weights,
                   gradient=False,
                   initial_h0=h0)
    time_est = time.time() - start
    print("Estimated H without gradient:\n{}".format(H1))
    print("Time :{}".format(time_est))

    start = time.time()
    H2 = estimateH(X1,
                   W,
                   distance=distance,
                   weights=weights,
                   gradient=True,
                   initial_h0=h0)
    time_est = time.time() - start
    print("Estimated H with gradient:\n{}".format(H2))
    print("Time :{}".format(time_est))

    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)
    g = gradient_energy_H(transform_hToH(h0, 3))
    h = derivative_H_to_h(g)
    print("Gradient at closer point:\n{}".format(g))
    print("Gradient at closer point: {}".format(h))
    energy_H = define_energy_H(H_vec_observed=H_vec,
                               weights=weights,
                               distance=distance)
    e = energy_H(H2)
    print("Energy at estimated point: {}".format(e))

    # --- estimate_H based on distance 5, with constraints
    print(
        "\n= Variant with constraints (constraints only work with SLSQP !!!):")

    start = time.time()
    H2 = estimateH(X1,
                   W,
                   distance=distance,
                   weights=weights,
                   gradient=True,
                   initial_h0=h0,
                   constraints=True)
    time_est = time.time() - start
    print("Estimated H with gradient and constraints:\n{}".format(H2))
    print("Time :{}".format(time_est))

    e = energy_H(H2)
    print("Energy at estimated point: {}".format(e))
def forced_block_model(n, d, H, alpha, directed=True, clamped=True):
    """Returns a graph with n nodes and n*b edges.
    Nodes are divided into classes exactly according to alpha (no uncertainty).
    Each node as source actively tries to connect to exactly b other nodes as targets. Thus outdegree = b.
    Targets are chosen according to row-normalized H matrix.

    Parameters
    ----------
    n : int
        The number of nodes
    d : int
        The outdegree of each node (exact up to the repMax retry limit; see Notes), thus the number of edges per node
    H : [k,k] ndarray
        row-normalized homophily matrix
    alpha : k ndarray
        a prior probability distribution of classes
    directed : bool, optional (Default = True)
        model creates directed edges.
    clamped : bool, optional (Default = True)
        new model with fixed Potential. If False, then original model with expected alpha and H

    Returns
    -------
    W : sparse.csr_matrix
        sparse weighted adjacency matrix
    X : np.array int
        Explicit belief matrix
        (! perhaps returning Xd would be better)

    Notes
    -----
    Some nodes may have more incoming edges than others, but the average indegree is also d.
    Uses function weighted_sample for random neighbor draws.
    Directed edges are drawn so as to never go in both directions (!!! may want to add a flag later).
    Internal parameter repMax: each node tries to connect to a randomly chosen node of the target type.
        If not possible after repMax attempts, it gives up. Thus the graph can have fewer edges than the expected n*d.
    """

    repMax = 10     # !!! max number of attempts to connect to a certain node type
    k = len(alpha)

    # Determine node classes
    # Xl: list that maps each index i to the respective class k of that node i
    # Xd, X
    if clamped:                        # clamp the number of nodes for each class exactly
        classNum = np.array(alpha*n, int)  # array of number of nodes in each class
        delta = np.sum(classNum) - n
        classNum[k-1] = classNum[k-1] - delta     # make sure sum(N)=n, in case there are rounding errors, correct the last entry
        Xl = [ [i]*classNum[i] for i in range(k) ]     # create a list that maps each node index to its class, in 3 steps
        Xl = np.hstack(Xl)          # flatten nested array
        np.random.shuffle(Xl)       # random order of those classes. Array that maps i -> k
    else:                           # old code that chooses classes independently. Led to a random instantiation of the actual alpha
        Xl = np.random.choice(k, n, replace=True, p=alpha)  # Array that maps i -> k
    Xd = {i : Xl[i] for i in range(n)}      # Xd: dictionary that maps i -> k
    X = from_dictionary_beliefs(Xd, n, k)

    # ClassNodes [maps k -> array of nodes of class k]
    classNodes =[[] for i in range(k)]
    for c in range(k):
        classNodes[c] = np.array([i for (i,j) in Xd.items() if j == c])

    # legacy
    if not clamped:
        classNum = []
        for c in range(k):
            classNum.append( np.size(classNodes[c]) )
        assert(min(classNum) > 0)   # at least one node for each class

    # row, col: index structure for edges
    # edges: set of edges
    # He: count edge matrix
    # T: T[j] is list T[j,i] of node types to which the i-th edge links
    row = []
    col = []
    edges = set()       # set of edges, used to verify if edge already exists

    # Determine end classes for each edge with a given edge source type
    if clamped:
        He = []         # Count edge matrix
        for j in range(k):
            Z = np.array(H[j]*classNum[j]*d, int)  # number of edges from class j to each target class
            delta = np.sum(Z) - classNum[j]*d      # balance possible rounding errors
            Z[k-1] = Z[k-1] - delta
            He.append(Z)
        He = np.array(He)
        delta = np.sum(He) - n*d            # balance possible rounding errors
        He[k-1,k-1] = He[k-1,k-1] - delta

        T = []          # list of lists (for a given source node class) of edge target node classes for each edge
        id_T = []       # list of indexes
        for j in range(k):
            Z1 = [ [i]*He[j,i] for i in range(k) ]
            Z2 = np.hstack(Z1).astype(np.int64)          # flatten nested array. Then make sure it is integer
            np.random.shuffle(Z2)       # random order of those classes. Array that maps i -> k
            T.append(Z2)
            id_T.append(0)

        # determine actual edges
        for i in range(n):
            j = Xd[i]           # class of start node
            for _ in range(d):
                c = T[j][id_T[j]]  # class of end node
                connected = False  # has the new node already found another node to connect to
                attempt = 1        # attempt number for this node type (avoids shadowing the loop variable)
                while attempt <= repMax and not connected:
                    v = random.choice(classNodes[c])    # choose a random node of that class
                    if (not v == i and                  # don't connect if edge already exists in either direction
                            not (i, v) in edges and
                            not (v, i) in edges):
                        connected = True
                        row.append(i)
                        col.append(v)
                        edges.add((i, v))
                    attempt += 1
                id_T[j] += 1

    else:
        # Actual loop for each node and edge
        # !!! better use: to_scipy_sparse_matrix(G,nodelist=None,dtype=None):
        for i in range(n):
            pk = X[i].dot(H)
            pk = np.squeeze(np.asarray(pk)) # probability distribution of neighbor classes

            for _ in range(d):
                c = weighted_sample(pk)     # class of next neighbor to connect to

                connected = False  # has the new node already found another node to connect to
                attempt = 1        # attempt number for this node type
                while attempt <= repMax and not connected:
                    v = random.choice(classNodes[c])
                    if (not v == i and
                            not (i, v) in edges and
                            not (v, i) in edges):
                        connected = True
                        row.append(i)
                        col.append(v)
                        edges.add((i, v))
                    attempt += 1

    # Create sparse matrix. If directed=False then insert edges in both directions => symmetric W
    if directed is False:
        row2 = list(row)    # need to make a temp copy
        row.extend(col)
        col.extend(row2)
    Ws = csr_matrix(([1]*len(row), (row, col)), shape=(n, n))
    return Ws, X
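A usage sketch for the generator above, checking that 'clamped' fixes the class counts exactly (assumes the module-level numpy import and helpers used by forced_block_model):

# Usage sketch:
H_demo = np.array([[0.1, 0.8, 0.1],
                   [0.8, 0.1, 0.1],
                   [0.1, 0.1, 0.8]])                 # row-normalized compatibility matrix
alpha_demo = np.array([0.5, 0.3, 0.2])
W_demo, X_demo = forced_block_model(120, 3, H_demo, alpha_demo, directed=True, clamped=True)
print(X_demo.sum(axis=0))   # expected [60 36 24]: class counts clamped exactly to alpha*n
print(W_demo.sum())         # <= 120*3 = 360; smaller if some repMax retry budgets are exhausted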
Example 9
def run(choice,
        create_data=False,
        add_data=False,
        create_fig=True,
        show_plot=False,
        create_pdf=False,
        show_pdf=False,
        shorten_length=False,
        show_arrows=True):
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf

    csv_filename = 'Fig_MHE_Optimal_ScalingFactor_d_{}.csv'.format(CHOICE)
    header = [
        'currenttime',
        'option',  # one option corresponds to one choice of weight vector. In practice, one choice of scaling factor (for weight vector)
        'd',
        'scaling',
        'diff'
    ]  # L2 norm between H and estimate
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # -- Default Graph parameters
    randomize = False
    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1
    rep = 26
    EC = True
    scaling_vec = [0] + [0.1 * pow(10, 1 / 8)**x for x in range(33)]
    num_options = len(scaling_vec)
    scaling_vec = np.array(scaling_vec)
    weight = np.array([np.power(scaling_vec, i) for i in range(5)])
    weight = weight.transpose()
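    # Note: scaling_vec is a log-spaced grid 0.1 * 10**(x/8) for x = 0..32 (i.e., 33 points
    # from 0.1 to 1000, equivalent to np.logspace(-1, 3, 33)), with 0 prepended as a baseline;
    # each row of 'weight' is then [1, s, s**2, s**3, s**4] for one scaling factor s.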
    d_vec = list(range(3, 9)) + [10 * pow(10, 1 / 12)**x for x in range(13)]
    # print(d_vec)
    d_vec = [int(i) for i in d_vec]
    fraction_of_minimum = 1.1  # scaling factors whose diff is within this factor of the optimum are included
    ymin2 = 0.3
    ymax2 = 500
    xmin1 = 3
    xmax1 = 100
    xmin2 = 2.87
    xmax2 = 105
    xtick_lab = [3, 5, 10, 30, 100]
    # ytick_lab1 = np.arange(0, 1, 0.1)
    ytick_lab1 = [0.001, 0.01, 0.1, 1]
    ytick_lab2 = [0.3, 1, 10, 100, 1000]
    ymax1 = 0.2
    ymin1 = 0.001
    k = 3
    a = 1

    # -- Options
    if CHOICE == 1:  # #=100
        n = 1000
        h = 8
        f = 0.1
        distribution = 'uniform'
        ytick_lab1 = [0.01, 0.1, 0.5]
        ymax1 = 0.5
        ymin1 = 0.01

    elif CHOICE == 2:  # selection #=124
        n = 10000
        h = 8
        f = 0.1
        distribution = 'powerlaw'
        ymin1 = 0.003

    elif CHOICE == 3:  # special selection #=100
        n = 10000
        h = 8
        f = 0.05
        distribution = 'powerlaw'
        ymin1 = 0.005
        ymax1 = 0.5

    elif CHOICE == 4:  # selection #=100
        n = 10000
        h = 3
        f = 0.1
        distribution = 'powerlaw'
        ymin1 = 0.003

    elif CHOICE == 5:  # #=5
        n = 10000
        h = 3
        f = 0.1
        distribution = 'uniform'

    elif CHOICE == 6:  # #=5
        n = 10000
        h = 8
        f = 0.1
        distribution = 'uniform'

    elif CHOICE == 7:  # special selection #=100
        n = 10000
        h = 3
        f = 0.05
        distribution = 'powerlaw'
        ymax1 = 0.401
        ymin1 = 0.003

    else:
        raise ValueError("Incorrect choice!")

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed
    #print("CHOICE: {}".format(CHOICE))

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for r in range(1, rep + 1):
            # print('Repetition {}'.format(r))
            for d in d_vec:
                # print('d: {}'.format(d))

                # -- Create graph
                W, Xd = planted_distribution_model_H(n,
                                                     alpha=alpha0,
                                                     H=H0,
                                                     d_out=d,
                                                     distribution=distribution,
                                                     exponent=exponent,
                                                     directed=False,
                                                     debug=False)
                X0 = from_dictionary_beliefs(Xd)
                X1, ind = replace_fraction_of_rows(X0, 1 - f)

                # -- Create estimates and compare against GT
                for option in range(num_options):
                    H_est = estimateH(X1,
                                      W,
                                      method='MHE',
                                      variant=variant,
                                      distance=length,
                                      EC=EC,
                                      weights=weight[option],
                                      randomize=randomize,
                                      initial_h0=initial_h0)
                    diff = LA.norm(H_est - H0)

                    record = [str(datetime.datetime.now())]     # 'record' avoids shadowing the builtin 'tuple'
                    text = [option, d, scaling_vec[option], diff]
                    record.extend(text)
                    save_csv_record(join(data_directory, csv_filename), record)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    #print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # Aggregate repetitions
    df2 = df1.groupby(['d', 'scaling']).agg \
        ({'diff': [np.mean, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values
                   ]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'diff_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # find minimum diff for each d, then join it back into df2
    df3 = df2.groupby(['d']).agg \
        ({'diff_mean': [np.min],  # Multiple Aggregates
          })
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values
                   ]  # flatten the column hierarchy
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(90)))
    df4 = pd.merge(
        df2, df3, left_on='d', right_index=True
    )  # ! join df2 and df3 on column "d" from df2, and index (=d) from df3
    # df4 = df4.drop(['index'], axis=1)     # does not work
    # print("\n-- df4 (length {}):\n{}".format(len(df4.index), df4.head(25)))

    # Select columns for energy comparison plot: H0
    df5 = df4.query('scaling==0')
    # print("\n-- df5 (length {}):\n{}".format(len(df5.index), df5.head(90)))
    # df5.drop('option', axis=1, inplace=True)  # gives warning
    df5 = df5.drop(['diff_mean_amin'], axis=1)
    # print("\n-- df5: scaling==0 (length {}):\n{}".format(len(df5.index), df5.head(90)))
    X_d = df5['d'].values  # plot value
    Y_diff0 = df5['diff_mean'].values  # plot value
    Y_diff0_std = df5['diff_std'].values  # plot value

    # Select columns for energy comparison plot: H5 optimal
    df6 = df4.copy()
    # print("\n-- df6 (length {}):\n{}".format(len(df6.index), df6.head(90)))
    df6['cond'] = np.where((df6['diff_mean'] == df6['diff_mean_amin']), True,
                           False)
    df6 = df6.query('cond==True')
    df6.drop([
        'cond',
    ], axis=1, inplace=True)
    # print("\n-- df6: best scaling (length {}):\n{}".format(len(df6.index), df6.head(90)))
    Y_diff1 = df6['diff_mean'].values  # plot value
    Y_diff1_std = df6['diff_std'].values  # plot value
    Y_scaling = df6['scaling'].values  # plot value

    # Select all (d, scaling) combinations that are close to optimal
    df4['cond'] = np.where(
        (df4['diff_mean'] <= fraction_of_minimum * df4['diff_mean_amin']),
        True, False)
    df7 = df4.query('cond==True')
    df7.drop([
        'cond',
    ], axis=1, inplace=True)
    # print("\n-- df7: all good data points(length {}):\n{}".format(len(df7.index), df7.head(90)))
    X_points = df7['d'].values  # plot value
    Y_points = df7['scaling'].values  # plot value

    # Select average (and lower and upper bound) on good data points
    df8 = df7.groupby(['d']).agg \
        ({'scaling': [np.mean, np.amin, np.amax, ],  # Multiple Aggregates
          })
    df8.columns = ['_'.join(col).strip() for col in df8.columns.values
                   ]  # flatten the column hierarchy
    df8.reset_index(inplace=True)  # remove the index hierarchy
    # print("\n-- df8: input for moving average (length {}):\n{}".format(len(df8.index), df8.head(15)))
    Y_point_mean = df8['scaling_mean'].values  # plot value

    if SHOW_PLOT or SHOW_PDF or CREATE_PDF:
        # -- Setup figure
        fig_filename = 'Fig_MHE_Optimal_ScalingFactor_diff_d_{}.pdf'.format(
            CHOICE)
        mpl.rcParams['backend'] = 'pdf'
        mpl.rcParams['lines.linewidth'] = 3
        mpl.rcParams['font.size'] = 14
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 16
        mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['figure.figsize'] = [4, 4]
        mpl.rcParams[
            'xtick.major.pad'] = 4  # padding of tick labels: default = 4
        mpl.rcParams[
            'ytick.major.pad'] = 4  # padding of tick labels: default = 4
        fig = plt.figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Draw the plots
        p1 = ax.plot(X_d, Y_diff0, color='blue', linewidth=2)
        ax.fill_between(X_d,
                        Y_diff0 + Y_diff0_std,
                        Y_diff0 - Y_diff0_std,
                        facecolor='blue',
                        alpha=0.2,
                        edgecolor='none',
                        label=r'$\tilde {\mathbf{H}}$')
        p2 = ax.plot(X_d, Y_diff1, color='red', linewidth=2)
        ax.fill_between(X_d,
                        Y_diff1 + Y_diff1_std,
                        Y_diff1 - Y_diff1_std,
                        facecolor='red',
                        alpha=0.2,
                        edgecolor='none',
                        label=r'$\tilde {\mathbf{H}}^{\ell}_{\mathrm{EC}}$')
        plt.xscale('log')
        plt.yscale('log')

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        plt.title(r'$\!\!\!n\!=\!{}\mathrm{{k}}, h\!=\!{}, f\!=\!{}{}'.format(
            int(n / 1000), h, f, distribution_label))
        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles,
            labels,
            loc='upper right',  # 'upper right'
            handlelength=1.5,
            labelspacing=0,  # distance between label entries
            handletextpad=
            0.3,  # distance between label and the line representation
            # title='Variants',
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.3,  # padding inside legend box
        )
        frame = legend.get_frame()
        # frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_lab)
        plt.yticks(ytick_lab1, ytick_lab1)
        plt.grid(b=True,
                 which='minor',
                 axis='both',
                 alpha=0.2,
                 linestyle='solid',
                 linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.grid(b=True,
                 which='major',
                 axis='y',
                 alpha=0.2,
                 linestyle='solid',
                 linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.xlabel(r'$d$', labelpad=0)  # labelpad=0
        plt.ylabel(r'L$^2$ norm', labelpad=-5)

        if xmin1 is None:
            xmin1 = plt.xlim()[0]
        if xmax1 is None:
            xmax1 = plt.xlim()[1]
        if ymin1 is None:
            ymin1 = plt.ylim()[0]
        if ymax1 is None:
            ymax1 = plt.ylim()[1]
        plt.xlim(xmin1, xmax1)
        plt.ylim(ymin1, ymax1)
        if CREATE_PDF:
            plt.savefig(join(figure_directory, fig_filename),
                        format='pdf',
                        dpi=None,
                        edgecolor='w',
                        orientation='portrait',
                        transparent=False,
                        bbox_inches='tight',
                        pad_inches=0.05,
                        frameon=None)
        if SHOW_PDF:
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
        if SHOW_PLOT:
            plt.show()

    if SHOW_PLOT or SHOW_PDF or CREATE_PDF:
        # -- Setup figure
        fig_filename = 'Fig_MHE_Optimal_ScalingFactor_lambda_d_{}.pdf'.format(
            CHOICE)
        mpl.rcParams['backend'] = 'pdf'
        mpl.rcParams['lines.linewidth'] = 3
        mpl.rcParams['font.size'] = 14
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['figure.figsize'] = [4, 4]
        mpl.rcParams[
            'xtick.major.pad'] = 4  # padding of tick labels: default = 4
        mpl.rcParams[
            'ytick.major.pad'] = 4  # padding of tick labels: default = 4
        fig = plt.figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Draw the plots
        p1 = ax.plot(
            X_points,
            Y_points,
            color='0.8',
            linewidth=0,
            marker='o',
            markeredgewidth=0.0,
            clip_on=False,  # cut off data points outside of plot area
            zorder=9,
            markevery=1,
            label=r'$\!\leq${} Opt'.format(fraction_of_minimum))
        p2 = ax.plot(
            X_d,
            Y_scaling,
            color='red',
            linewidth=0,
            marker='o',
            clip_on=False,  # cut off data points outside of plot area
            zorder=10,
            markevery=1,
            label=r'Opt$(\lambda|d)$')
        plt.xscale('log')
        plt.yscale('log')

        # Draw the moving average from Y_point_mean
        def movingaverage(interval, window_size):
            window = np.ones(int(window_size)) / float(window_size)
            return np.convolve(interval, window, 'same')

        Y_point_mean_window = movingaverage(Y_point_mean, 3)
        p5 = ax.plot(X_d,
                     Y_point_mean_window,
                     color='red',
                     linewidth=1,
                     marker=None)
        # p3 = ax.plot(X_d, Y_point_mean, color='red', linewidth=1, marker=None)

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        plt.title(r'$\!\!\!n\!=\!{}\mathrm{{k}}, h\!=\!{}, f\!=\!{}{}'.format(
            int(n / 1000), h, f, distribution_label))
        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles[::-1],
            labels[::-1],
            loc='upper left',  # 'upper right'
            handlelength=1,
            labelspacing=0,  # distance between label entries
            handletextpad=
            0.3,  # distance between label and the line representation
            borderaxespad=0.3,  # distance between legend and the outer axes
            borderpad=0.1,  # padding inside legend box
            numpoints=1,  # put the marker only once
        )
        frame = legend.get_frame()
        # frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_lab)
        plt.yticks(ytick_lab2, ytick_lab2)
        plt.grid(b=True,
                 which='minor',
                 alpha=0.2,
                 linestyle='solid',
                 linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.xlabel(r'$d$', labelpad=0)  # labelpad=0
        plt.ylabel(r'$\lambda$', labelpad=0, rotation=0)

        if xmin2 is None:
            xmin2 = plt.xlim()[0]
        if xmax2 is None:
            xmax2 = plt.xlim()[1]
        if ymin2 is None:
            ymin2 = plt.ylim()[0]
        if ymax2 is None:
            ymax2 = plt.ylim()[1]
        plt.xlim(xmin2, xmax2)
        plt.ylim(ymin2, ymax2)
        if CREATE_PDF:
            plt.savefig(join(figure_directory, fig_filename),
                        format='pdf',
                        dpi=None,
                        edgecolor='w',
                        orientation='portrait',
                        transparent=False,
                        bbox_inches='tight',
                        pad_inches=0.05,
                        frameon=None)
        if SHOW_PDF:
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
        if SHOW_PLOT:
            plt.show()
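A typical invocation of this plotting script might look as follows; the flag combination is an assumption based on the signature above, not a prescribed workflow:

if __name__ == '__main__':
    run(choice=2, create_data=True, show_plot=True)   # generate data for CHOICE 2, then display the plots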
def test_dictionary_transform():
    print("\n-- 'check_dictionary_beliefs', 'from_dictionary_beliefs' --")
    Xd = {1: 1, 2: 2, 3: 3, 5: 1}
    print("Xd:", Xd)

    print("zeroindexing=True:")
    print("X:\n", from_dictionary_beliefs(Xd,
                                          n=None,
                                          k=None,
                                          zeroindexing=True))
    print("zeroindexing=False:")
    print("X:\n",
          from_dictionary_beliefs(Xd, n=None, k=None, zeroindexing=False))
    print("zeroindexing=True, n=7, k=5:")
    print("X:\n", from_dictionary_beliefs(Xd, n=7, k=5, zeroindexing=True))

    print("\nzeroindexing=False, fullBeliefs=True:")
    X1 = {1: 1, 2: 2, 3: 3, 4: 1}
    assert check_dictionary_beliefs(X1,
                                    n=None,
                                    k=None,
                                    zeroindexing=False,
                                    fullBeliefs=True)
    print("X1:", X1)

    print("zeroindexing=True, fullBeliefs=True:")
    X2 = {0: 0, 1: 1, 2: 2, 3: 0}
    assert check_dictionary_beliefs(X2,
                                    n=None,
                                    k=None,
                                    zeroindexing=True,
                                    fullBeliefs=True)
    print("X2:", X2)

    print("zeroindexing=True, fullBeliefs=False:")
    X3 = {0: 0, 1: 1, 2: 2, 4: 0}
    assert check_dictionary_beliefs(X3,
                                    n=None,
                                    k=None,
                                    zeroindexing=True,
                                    fullBeliefs=False)
    print("X3:", X3)

    print("zeroindexing=True, fullBeliefs=False:")
    X4 = {0: 1, 2: 2, 4: 0}
    assert check_dictionary_beliefs(X4,
                                    n=None,
                                    k=None,
                                    zeroindexing=True,
                                    fullBeliefs=False)
    print("X4:", X4)

    print("\nerrors:")
    X5 = {0: 0, 1: 1, 2: 3, 3: 0}
    print("X5:", X5)
    assert not check_dictionary_beliefs(
        X5, n=None, k=None, zeroindexing=False, fullBeliefs=True)
    X6 = {0: 1, 1: 1, 2: 2, 4: 1}
    print("X6:", X6)
    assert not check_dictionary_beliefs(
        X6, n=None, k=None, zeroindexing=True, fullBeliefs=True)

    print("\n-- 'check_explicit_beliefs', 'to_dictionary_beliefs' --")
    X = np.array([
        [1., 0, 0],
        [0, 0, 0],
        [0, 0, 1],
    ])
    print("original X:\n", X)
    print("Stacked X:\n", np.hstack(X))
    print("List of X entires:\n", set(np.hstack(X)))
    print("Verify: ", set(np.hstack(X)) == {0, 1})
    print("Verify: ", {0., 1.} == {0, 1})
    assert check_explicit_beliefs(X)
    Y = np.array([1., 0, 0])
    assert check_explicit_beliefs(Y)
    Xd = to_dictionary_beliefs(X)
    print("Xd: ", Xd)
Example 11
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False,
        show_arrows=False):
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf

    SHOW_STD = True         # False for just scatter plot points
    SHOW_ARROWS = show_arrows


    # -- Default Graph parameters
    rep_SameGraph = 1       # iterations on same graph
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    variant = 1
    EC = False
    numberOfSplits = 1
    scaling_vec = [None]*10
    ymin = 0.3
    ymax = 1
    xmin = 1e-3
    xmax = 1e3
    xtick_lab = [1e-3, 0.01, 0.1, 1, 10, 100, 1000]
    xtick_labels = [r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$10^{2}$', r'$10^{3}$']
    ytick_lab = np.arange(0, 1.1, 0.1)
    k = 3
    a = 1
    rep_DifferentGraphs = 1   # iterations on different graphs
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = 0.99
    facecolor_vec = ["#4C72B0", "#55A868", "#8172B2", "#C44E52", "#CCB974", "#64B5CD"]
    label_vec = ['MCE', 'LCE', 'DCE', 'Holdout']
    linewidth_vec = [4, 3, 1, 2, 2, 1]
    # clip_ons = [True, True, True, True, True, True]
    FILEZNAME = 'Fig_timing_accuracy_learning'
    marker_vec = ['s', '^', 'v', 'o', 'x', '+', 'None']   #'^'
    length_vec = [5]
    stratified = True
    f = 0.01
    numMaxIt_vec = [10]*7
    alpha_vec = [0] * 7
    beta_vec = [0] * 7  # TODO: LinBP does not use beta. Also SSLH uses alpha, but not beta for W^row! Now fixed
    gamma_vec = [0] * 7
    s_vec = [0.5] * 7


    # -- Main Options
    if CHOICE == 1:         # Main graph
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8]


    elif CHOICE == 2:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'GS']
        randomize_vec = [False]*3 + [True] + [None]
        scaling_vec = [None]*2 + [10, 100] + [None]


    elif CHOICE == 3:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'GS']
        randomize_vec = [False]*3 + [True] + [None]
        scaling_vec = [None]*2 + [10, 100] + [None]
        f = 0.02


    elif CHOICE == 4:         # TODO: Overnight Wolfgang
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8, 16]


    elif CHOICE == 5:         # Toy graph with 100 nodes
        n = 100
        h = 3
        d = 8
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8]
        f=0.05


    elif CHOICE == 6:         # To be run by Prakhar on Cluster
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8]
        f = 0.003
        xmin = 1e-2
        xmax = 1e3
        ymin = 0.2
        ymax = 0.9



    elif CHOICE == 7:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8, 16]
        f=0.009

    # elif CHOICE == 8:       # not working well
    #     n = 1000
    #     h = 3
    #     d = 25
    #     option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    #     learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
    #     label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
    #     randomize_vec = [False]*3 + [True] + [None]*2
    #     scaling_vec = [None]*2 + [10, 100] + [None]*2
    #     splits_vec = [1, 2, 4, 8, 16]
    #     f=0.005



    else:
        raise ValueError("Incorrect choice!")



    csv_filename = '{}_{}.csv'.format(FILEZNAME, CHOICE)
    header = ['currenttime',
              'option',
              'lensplit',
              'f',
              'accuracy',
              'timetaken']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    H0c = to_centering_beliefs(H0)


    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))


    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            # print("\ni: {}".format(i))

            W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                      distribution=distribution,
                                                      exponent=exponent,
                                                      directed=False,
                                                      debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for j in range(rep_SameGraph):  # repeat several times for same graph
                # print("j: {}".format(j))

                ind = None
                X1, ind = replace_fraction_of_rows(X0, 1-f, avoidNeighbors=avoidNeighbors, W=W, ind_prior=ind, stratified=stratified)     # TODO: stratified sampling option = True
                X2 = introduce_errors(X1, ind, err)

                for option_index, (learning_method, alpha, beta, gamma, s, numMaxIt, weight, randomize, option) in \
                        enumerate(zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, scaling_vec, randomize_vec, option_vec)):

                    # weight = np.array([np.power(scaling, i) for i in range(5)])       # TODO: now enough to specify weight as a scalar!
                    H_est_dict = {}
                    timeTaken_dict = {}

                    # -- Learning
                    if learning_method == 'Holdout' :
                        for numberOfSplits in splits_vec:
                            prev_time = time.time()
                            H_est_dict[numberOfSplits] = estimateH_baseline_serial(X2, ind, W, numMax=numMaxIt,
                                                                                   # ignore_rows=ind,
                                                                                   numberOfSplits=numberOfSplits,
                                                                                   # method=learning_method, variant=1, distance=length,
                                                                                   EC=EC,
                                                                                   weights=weight, alpha=alpha, beta=beta, gamma=gamma)
                            timeTaken = time.time() - prev_time
                            timeTaken_dict[numberOfSplits] = timeTaken

                    elif learning_method in ['LHE', 'MHE', 'DHE']:      # TODO: no smartInit, just randomization as option
                        for length in length_vec:
                            prev_time = time.time()
                            H_est_dict[length] = estimateH(X2, W, method=learning_method, variant=1, randomize=randomize, distance=length, EC=EC, weights=weight)
                            timeTaken = time.time() - prev_time
                            timeTaken_dict[length] = timeTaken

                    elif learning_method == 'GS':
                        H_est_dict['GS'] = H0

                    for key in H_est_dict:
                        H_est = H_est_dict[key]
                        H2c = to_centering_beliefs(H_est)
                        # print("H_estimated by {} is \n".format(learning_method), H_est)
                        # print("H0 is \n", H0)
                        # print("randomize was: ", randomize)

                        # Propagation
                        X2c = to_centering_beliefs(X2, ignoreZeroRows=True)  # try without
                        eps_max = eps_convergence_linbp_parameterized(H2c, W,
                                                                      method='noecho',
                                                                      alpha=alpha, beta=beta, gamma=gamma,
                                                                      X=X2)

                        eps = s * eps_max
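                        # eps_max is the convergence boundary of linearized BP for this
                        # (H, W) pair; scaling by s < 1 keeps the propagation convergent.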

                        # print("Max Eps ", eps_max)

                        try:
                            F, actualIt, actualPercentageConverged = \
                                linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                              method='noecho',
                                                              alpha=alpha, beta=beta, gamma=gamma,
                                                              numMaxIt=numMaxIt,
                                                              convergencePercentage=convergencePercentage_W,
                                                              convergenceThreshold=0.99,
                                                              debug=2)

                        except ValueError as e:
                            print(
                                "ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                        else:
                            accuracy_X = matrix_difference(X0, F, ignore_rows=ind)

                            record = [str(datetime.datetime.now())]  # renamed from 'tuple', which shadows the builtin
                            if learning_method == 'Holdout':
                                text = [option, "split{}".format(key), f, accuracy_X, timeTaken_dict[key]]
                            elif learning_method in ['MHE', 'DHE', 'LHE']:
                                text = [option, "len{}".format(key), f, accuracy_X, timeTaken_dict[key]]
                            elif learning_method == 'GS':
                                text = [option, 0, f, accuracy_X, 0]

                            record.extend(text)
                            # print("option: {}, f: {}, actualIt: {}, accuracy: {}".format(option, f, actualIt, accuracy_X))
                            save_csv_record(join(data_directory, csv_filename), record)





    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # Aggregate repetitions
    df2 = df1.groupby(['option', 'lensplit', 'f']).agg \
        ({'accuracy': [np.mean, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    df3 = df1.groupby(['option', 'lensplit', 'f']).agg({'timetaken': [np.median] })
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # resultdf3 = df3.sort(['timetaken'], ascending=1)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(15)))

    X_time_median_dict = {}
    Y_acc_dict = {}
    Y_std_dict = {}

    for option in option_vec:
        Y_acc_dict[option] = df2.loc[(df2['option'] == option), "accuracy_mean"].values
        Y_std_dict[option] = df2.loc[(df2['option'] == option), "accuracy_std"].values
        X_time_median_dict[option] = df3.loc[(df3['option'] == option), "timetaken_median"].values

        # print("option: ", option)
        # print("Y_acc_dict[option]: ", Y_acc_dict[option])
        # print("Y_std_dict[option]: ", Y_std_dict[option])
        # print("X_time_median_dict[option]: ", X_time_median_dict[option])



    # -- Setup figure
    fig_filename = '{}_{}.pdf'.format(FILEZNAME, CHOICE)
    mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
    mpl.rcParams['axes.labelsize'] = 18
    mpl.rcParams['xtick.labelsize'] = 16
    mpl.rcParams['ytick.labelsize'] = 16
    mpl.rcParams['axes.titlesize'] = 16
    mpl.rcParams['legend.fontsize'] = 14
    mpl.rcParams['grid.color'] = '777777'  # grid color
    mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
    mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
    mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
    mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
    mpl.rcParams['figure.figsize'] = [4, 4]
    fig = figure()
    ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])


    SHOW_ARROWS = True

    for choice, color, learning_method, label, linewidth, marker in \
            zip(option_vec, facecolor_vec, learning_method_vec, label_vec, linewidth_vec, marker_vec):

        if learning_method == 'Holdout':
            # Draw std
            X1 = X_time_median_dict[choice]
            s = X1.argsort()
            X1 = X1[s]
            Y1 = Y_acc_dict[choice][s]
            Y2 = Y_std_dict[choice][s]

            if SHOW_STD:
                ax.fill_between(X1, Y1 + Y2, Y1 - Y2, facecolor=color, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X1, Y1 + Y2, linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X1, Y1 - Y2, linewidth=0.5, color='0.8', linestyle='solid')
                ax.set_ylim(bottom=ymin)

                ax.plot(X1, Y1, linewidth=linewidth, color=color, linestyle='solid', label=label, zorder=20, marker='x', markersize=linewidth + 5, markeredgewidth=1)
                ax.annotate(np.round(X1[1], decimals=1), xy=(X1[1], Y1[1] - 0.05), color=color, va='center', annotation_clip=False, zorder=5)

            else:
                ax.scatter(list(X1), list(Y1),
                           color=color, label=label, marker='x', s=42)


        elif learning_method == 'GS':
            ax.plot([1e-4, 1e4], [Y_acc_dict[choice], Y_acc_dict[choice]],
                    linewidth=1, color='black',
                    linestyle='dashed', zorder=0,
                    marker=None,
                    label=label,
                    )

        else:       # For all other
            if SHOW_STD:
                ax.errorbar(list(X_time_median_dict[choice]), list(Y_acc_dict[choice]), yerr=Y_std_dict[choice],
                            fmt='-o', linewidth=2, color=color,
                            label=label, marker=marker, markersize=8)
                ax.annotate(np.round(X_time_median_dict[choice], decimals=2), xy=(X_time_median_dict[choice], Y_acc_dict[choice]-0.05), color=color, va='center',
                            annotation_clip=False, zorder=5)

            else:
                ax.scatter(list(X_time_median_dict[choice]), list(Y_acc_dict[choice]),
                           color=color, label=label, marker=marker, s=42)

        if SHOW_ARROWS:
            dce_opt = 'opt4'
            holdout_opt = 'opt5'

            ax.annotate('', xy=(X_time_median_dict[dce_opt], Y_acc_dict[dce_opt]-0.3), xytext=(X_time_median_dict[holdout_opt][2]+0.02, Y_acc_dict[dce_opt]-0.3), arrowprops=dict(arrowstyle='<->'))  # pass the annotation text positionally: the 's=' keyword was renamed in newer matplotlib
            ax.annotate(str(int(np.round(X_time_median_dict[holdout_opt][2] / X_time_median_dict[dce_opt]))) + 'x', xy=((X_time_median_dict[dce_opt] + X_time_median_dict[holdout_opt][2])/100, Y_acc_dict[dce_opt]-0.28),
                        color='black', va='center',
                        # bbox = dict(boxstyle="round,pad=0.3", fc="w"),
                        annotation_clip=False, zorder=5)






    # -- Title and legend
    title(r'$\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}$'.format(int(n / 1000), d, h, f))
    handles, label_vec = ax.get_legend_handles_labels()
    for i, (handle, learning_method) in enumerate(zip(handles, learning_method_vec)):        # remove error bars in legend
        if isinstance(handle, collections.abc.Container):  # 'collections.Container' moved to 'collections.abc'
            handles[i] = handle[0]

    # plt.legend(loc='upper left', numpoints=1, ncol=3, fontsize=8, bbox_to_anchor=(0, 0))

    SHOW_STD = False  # force the compact scatter-style legend below

    legend = plt.legend(handles, label_vec,
                        loc='upper right',
                        handlelength=2,
                        fontsize=10,
                        labelspacing=0.2,  # distance between label entries
                        handletextpad=0.3,  # distance between label and the line representation
                        borderaxespad=0.2,  # distance between legend and the outer axes
                        borderpad=0.3,  # padding inside legend box
                        numpoints=1,  # put the marker only once
                        scatterpoints=1  # display only one scatter point in legend
                        )

    # # legend.set_zorder(1)
    frame = legend.get_frame()
    frame.set_linewidth(0.0)
    frame.set_alpha(0.9)  # 0.8


    # -- Figure settings and save
    plt.xscale('log')
    plt.xticks(xtick_lab, xtick_labels)
    plt.yticks(ytick_lab, ytick_lab)
    ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_ylim(bottom=ymin)

    grid(b=True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
    grid(b=True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',

    xlim(xmin, xmax)
    ylim(ymin, ymax)


    xlabel(r'Time Median (sec)', labelpad=0)      # labelpad=0
    ylabel(r'Accuracy', labelpad=0)
    if CREATE_PDF:
        savefig(join(figure_directory, fig_filename), format='pdf',
                dpi=None,
                edgecolor='w',
                orientation='portrait',
                transparent=False,
                bbox_inches='tight',
                pad_inches=0.05,
                frameon=None)

    if SHOW_PDF:
        showfig(join(figure_directory, fig_filename))

    if SHOW_PLOT:
        plt.show()
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False,
        show_fig=True):
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    SHOW_PDF = show_pdf
    SHOW_FIG1 = show_fig
    SHOW_FIG2 = show_fig

    csv_filename = 'Fig_MHE_Optimal_ScalingFactor_f_lambda10_{}.csv'.format(
        CHOICE)
    header = [
        'currenttime',
        'option',  # one option corresponds to one choice of weight vector. In practice, one choice of scaling factor (for weight vector)
        'f',
        'scaling',
        'diff'  # L2 norm between H and estimate
    ]
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # -- Default Graph parameters
    rep = 100
    randomize = False
    initial_h0 = None  # initial vector to start finding optimal H
    distribution = 'powerlaw'
    exponent = -0.3
    rep_differentGraphs = 1
    EC = True
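    # Geometric grid for the label fraction f: 12 points per decade,
    # from 0.9 down to 0.9 * 10**(-41/12) (about 3.4e-4), 42 points overall.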
    f_vec = [0.9 * pow(0.1, 1 / 12)**x for x in range(42)]
    fraction_of_minimum = 1.1  # scaling parameters that lead to optimum except for this scaling factor are included
    ymin2 = 0.28
    ymax2 = 500
    xmin = 0.001
    # xmin = 0.0005
    xmax = None
    xtick_lab = [0.001, 0.01, 0.1, 1]
    # ytick_lab1 = np.arange(0, 1, 0.1)
    ytick_lab2 = [0.3, 1, 10, 100, 1000]
    ymax1 = 1.2
    ymin1 = 0.001
    # ytick_lab1 = [0.001, 0.01, 0.1, 1]
    k = 3
    a = 1
    stratified = True
    gradient = False
    n = 10000
    # color_vec = ['blue', 'orange', 'red']
    color_vec = ["#4C72B0", "#55A868", "#C44E52", "#CCB974", "#64B5CD"]
    color_vec = ["#4C72B0", "#8172B2", "#C44E52"]
    # label_vec = [r'$\tilde {\mathbf{H}}$', r'$\tilde{\mathbf{H}}^{(5)}_{\mathrm{NB}}$', r'$\tilde {\mathbf{H}}^{(5)}_{\mathrm{NB}}$ r']
    label_vec = ['MCE', 'DCE', 'DCEr']
    marker_vec = ['s', 'x', 'o']
    legendPosition = 'upper right'

    # -- Options
    if CHOICE == 11:
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 12:
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 13:
        h = 8
        d = 10
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 14:
        h = 3
        d = 10
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 15:
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 100]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    # elif CHOICE == 16:
    #     n = 10000
    #     h = 3
    #     d = 10
    #     option_vec = ['opt1', 'opt2', 'opt3']
    #     scaling_vec = [0, 50, 50]
    #     randomize_vec = [False, False, True]
    #     length_vec = [1, 5, 5]

    elif CHOICE == 17:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 100]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 18:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    # -- Options
    elif CHOICE == 19:
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 100]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 20:
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 100]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]
        gradient = True
        legendPosition = 'center right'

    else:
        raise ValueError("Incorrect choice!")

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
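    # H0: symmetric [k x k] compatibility matrix whose same-label (diagonal)
    # affinities are controlled by the homophily parameter h.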
    H0 = create_parameterized_H(k, h, symmetric=True)
    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for rs in range(1, rep_differentGraphs + 1):
            # print('Graph {}'.format(rs))

            # -- Create graph
            W, Xd = planted_distribution_model_H(n,
                                                 alpha=alpha0,
                                                 H=H0,
                                                 d_out=d,
                                                 distribution=distribution,
                                                 exponent=exponent,
                                                 directed=False,
                                                 debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for r in range(1, rep + 1):
                # print('Repetition {}'.format(r))

                for f in f_vec:
                    # -- Sample labeled data
                    X1, ind = replace_fraction_of_rows(X0,
                                                       1 - f,
                                                       stratified=stratified)

                    # -- Calculate number of labeled neighbors
                    M_vec = M_observed(W, X1, distance=5, NB=True)
                    M = M_vec[1]
                    num_N = np.sum(M)
                    # print("f={:1.4f}, number labeled neighbors={}".format(f, num_N))
                    # print("M_vec:\n{}".format(M_vec))

                    # -- Create estimates and compare against GT
                    for option, scaling, randomize, length in zip(
                            option_vec, scaling_vec, randomize_vec,
                            length_vec):
                        H_est = estimateH(X1,
                                          W,
                                          method='DHE',
                                          variant=1,
                                          distance=length,
                                          EC=EC,
                                          weights=scaling,
                                          randomize=randomize,
                                          initial_H0=initial_h0,
                                          gradient=gradient)
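                        # 'diff' is the Frobenius (entry-wise L2) norm between the
                        # estimated and the ground-truth compatibility matrix.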
                        diff = LA.norm(H_est - H0)

                        record = [str(datetime.datetime.now())]  # renamed from 'tuple', which shadows the builtin
                        text = [option, f, scaling, diff]
                        record.extend(text)
                        save_csv_record(join(data_directory, csv_filename),
                                        record)

                        # print("diff={:1.4f}, H_est:\n{}".format(diff, H_est))

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # Aggregate repetitions
    df2 = df1.groupby(['option', 'f']).agg \
        ({'diff': [np.mean, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values
                   ]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'diff_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # Pivot table
    df3 = pd.pivot_table(df2,
                         index=['f'],
                         columns=['option'],
                         values=['diff_mean', 'diff_std'])  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values
                   ]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))

    # Extract values
    X_f = df3['f'].values  # plot x values
    Y = []
    Y_std = []
    for option in option_vec:
        Y.append(df3['diff_mean_{}'.format(option)].values)
        Y_std.append(df3['diff_std_{}'.format(option)].values)

    # print("X_f:\n", X_f)
    # print("Y:\n", Y)
    # print("Y_std:\n", Y_std)

    if SHOW_FIG1:
        # -- Setup figure
        fig_filename = 'Fig_MHE_Optimal_ScalingFactor_diff_f_lambda10_{}.pdf'.format(
            CHOICE)
        mpl.rcParams['backend'] = 'pdf'
        mpl.rcParams['lines.linewidth'] = 3
        mpl.rcParams['font.size'] = 14
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 16
        mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['figure.figsize'] = [4, 4]
        mpl.rcParams['xtick.major.pad'] = 4  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4
        fig = plt.figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Draw the plots
        for i, (color, marker) in enumerate(zip(color_vec, marker_vec)):
            p = ax.plot(X_f,
                        Y[i],
                        color=color,
                        linewidth=3,
                        label=label_vec[i],
                        marker=marker)
            if i != 1:
                ax.fill_between(X_f,
                                Y[i] + Y_std[i],
                                Y[i] - Y_std[i],
                                facecolor=color,
                                alpha=0.2,
                                edgecolor='none')
        plt.xscale('log')
        plt.yscale('log')

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        plt.title(r'$\!\!\!n\!=\!{}\mathrm{{k}}, h\!=\!{}, d\!=\!{}{}'.format(
            int(n / 1000), h, d, distribution_label))
        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles,
            labels,
            loc=legendPosition,  # 'upper right'
            handlelength=1.5,
            labelspacing=0,  # distance between label entries
            handletextpad=0.3,  # distance between label and the line representation
            # title='Variants',
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.1,  # padding inside legend box
        )
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_lab)
        # plt.yticks(ytick_lab1, ytick_lab1)
        plt.grid(b=True,
                 which='minor',
                 axis='both',
                 alpha=0.2,
                 linestyle='solid',
                 linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.grid(b=True,
                 which='major',
                 axis='y',
                 alpha=0.2,
                 linestyle='solid',
                 linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.xlabel(r'Label Sparsity $(f)$', labelpad=0)  # labelpad=0
        plt.ylabel(r'L2 norm', labelpad=-5)

        if xmin is None:
            xmin = plt.xlim()[0]
        if xmax is None:
            xmax = plt.xlim()[1]
        if ymin1 is None:
            ymin1 = plt.ylim()[0]
        if ymax1 is None:
            ymax1 = plt.ylim()[1]
        plt.xlim(xmin, xmax)
        plt.ylim(ymin1, ymax1)

        if CREATE_PDF:
            plt.savefig(join(figure_directory, fig_filename),
                        format='pdf',
                        dpi=None,
                        edgecolor='w',
                        orientation='portrait',
                        transparent=False,
                        bbox_inches='tight',
                        pad_inches=0.05,
                        frameon=None)

        if SHOW_FIG1:
            plt.show()
        if SHOW_PDF:
            os.system('{} "'.format(open_cmd[sys.platform]) +
                      join(figure_directory, fig_filename) +
                      '"')  # shows actually created PDF
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False,
        shorten_length=False):
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf
    SHOW_ARROWS = False
    STD_FILL = False

    CALCULATE_DATA_STATISTICS = False
    csv_filename = 'Fig_timing_VaryK_{}.csv'.format(CHOICE)
    header = ['currenttime', 'option', 'k', 'f', 'time']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # -- Default Graph parameters
    rep_SameGraph = 2  # iterations on same graph
    initial_h0 = None  # initial vector to start finding optimal H
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    variant = 1
    EC = True  # Non-backtracking for learning
    ymin = 0.0
    ymax = 1
    xmin = 2
    xmax = 7.5
    xtick_lab = [2, 3, 4, 5, 6, 7, 8]
    xtick_labels = ['2', '3', '4', '5', '6', '7', '8']
    ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 50]
    ytick_labels = [
        r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$50$'
    ]
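    # Label fractions on a geometric grid: 5 points per decade, from 0.9 down to 9e-5.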
    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
    k_vec = [3, 4, 5]
    rep_DifferentGraphs = 1000  # iterations on different graphs
    err = 0
    avoidNeighbors = False
    gradient = False
    pruneRandom = False  # only enabled by some choices below; read when gradient is True
    convergencePercentage_W = None
    stratified = True
    label_vec = ['*'] * 10
    clip_on_vec = [True] * 15
    draw_std_vec = range(10)
    numberOfSplits = 1
    linestyle_vec = ['solid'] * 15
    linewidth_vec = [3, 2, 4, 2, 3, 2] + [3] * 15
    marker_vec = ['^', 's', 'o', 'x', 'o', '+', 's'] * 3
    markersize_vec = [8, 7, 8, 10, 7, 6] + [10] * 10
    facecolor_vec = [
        "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#64B5CD"
    ]
    legend_location = 'upper right'

    # -- Options with propagation variants
    if CHOICE == 600:  ## 1k nodes
        n = 1000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['GT', 'MHE', 'DHE', 'Holdout']
        weight_vec = [10] * 4
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True]
        xmin = 3.
        xmax = 10.
        ymin = 0.
        ymax = 50.
        label_vec = ['GT', 'MCE', 'DCE', 'Holdout']
        facecolor_vec = [
            'black'
        ] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 4
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5, 6]
        ytick_lab = [0, 1e-3, 1e-2, 1e-1, 1, 10, 50]
        ytick_labels = [
            r'$0$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$50$'
        ]

    elif CHOICE == 601:  ## 10k nodes
        n = 10000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['GT', 'MHE', 'DHE', 'Holdout']
        weight_vec = [10] * 4
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 15 + [True]
        xmin = 3.
        xmax = 8.
        ymin = 0.
        ymax = 500.
        label_vec = ['GT', 'MCE', 'DCE', 'Holdout']
        facecolor_vec = [
            'black'
        ] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 4
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5]
        ytick_lab = [0, 1e-3, 1e-2, 1e-1, 1, 10, 100, 300]
        ytick_labels = [
            r'$0$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$300$'
        ]

    elif CHOICE == 602:  ## 10k nodes
        n = 10000
        h = 8
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 3 + [True] + [False]
        ymin = 0.01
        ymax = 500
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"
        ] * 4
        f_vec = [0.01]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7, 8]

        # option_vec = ['opt2', 'opt3', 'opt6']
        # learning_method_vec = ['MHE', 'DHE', 'LHE']
        # k_vec = [2, 3, 4, 5]

    elif CHOICE == 603:  ## 10k nodes

        n = 10000
        h = 3
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 4 + [True]

        xmin = 1.8
        xmax = 8.2
        ymin = 0.01
        ymax = 500
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"
        ] * 4
        f_vec = [0.01]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7, 8]

        legend_location = 'upper right'

        # option_vec = ['opt2', 'opt3', 'opt6']
        # learning_method_vec = ['MHE', 'DHE', 'LHE']
        # k_vec = [2, 3, 4, 5]

        # option_vec = ['opt4', 'opt3']
        # learning_method_vec = ['MHE', 'MHE']
        # randomize_vec = [True, False]
        # k_vec = [2, 3, 4, 5]

    elif CHOICE == 604:  ## 10k nodes with Gradient
        n = 10000
        h = 3
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 4 + [True]
        ymin = 0.00
        ymax = 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"
        ] * 4
        f_vec = [0.01]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [7, 8]
        gradient = True
        legend_location = 'center right'

    elif CHOICE == 605:  ## 10k nodes with Gradient   with f = 0.005
        n = 10000
        h = 3
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 4 + [True]
        ymin = 0.00
        ymax = 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"
        ] * 4
        f_vec = [0.005]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7]
        # k_vec = [7, 8]
        gradient = True
        legend_location = 'center right'

    elif CHOICE == 606:  ## 10k nodes with Gradient   with f = 0.005 and Gradient and PruneRandom
        n = 10000
        h = 3
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 4 + [True]

        xmin = 1.8
        xmax = 7.2
        ymin = 0.01
        ymax = 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"
        ] * 4
        f_vec = [0.005]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7]

        gradient = True
        pruneRandom = True
        legend_location = 'upper right'

    elif CHOICE == 607:  ## 10k nodes   with gradient and PruneRandom
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 3 + [True] + [False]

        xmin = 1.8
        xmax = 7.
        ymin = 0.01
        ymax = 800
        label_vec = ['LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = [
            "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"
        ] * 4
        legend_location = 'upper left'
        marker_vec = [None, 's', 'x', 'o', '^', '+'] * 3
        markersize_vec = [8, 7, 10, 8, 7, 6] + [10] * 10
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10
        gradient = True
        pruneRandom = True
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

    elif CHOICE == 608:  ## 10k nodes   with gradient and PruneRandom
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 3 + [True] + [False]

        xmin = 1.8
        xmax = 7.2
        ymin = 0.01
        ymax = 800
        label_vec = ['LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = [
            "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"
        ] * 4
        legend_location = 'upper left'
        marker_vec = [None, 's', 'x', 'o', '^', '+'] * 3
        markersize_vec = [8, 7, 10, 8, 7, 6] + [10] * 10
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10
        gradient = True
        pruneRandom = True
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]
        rep_DifferentGraphs = 10

    else:
        raise ValueError("Incorrect choice!")

    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            # print("\ni: {}".format(i))

            for k in k_vec:
                # print("\nk: {}".format(k))

                H0 = create_parameterized_H(k, h, symmetric=True)
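                # Centered version of H0: entries shifted around the uninformative
                # 1/k baseline, the residual form that linearized BP operates on.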
                H0c = to_centering_beliefs(H0)

                a = [1.] * k
                alpha0 = np.array(a)
                alpha0 = alpha0 / np.sum(alpha0)

                W, Xd = planted_distribution_model_H(n,
                                                     alpha=alpha0,
                                                     H=H0,
                                                     d_out=d,
                                                     distribution=distribution,
                                                     exponent=exponent,
                                                     directed=False,
                                                     debug=False)
                X0 = from_dictionary_beliefs(Xd)

                for j in range(
                        rep_SameGraph):  # repeat several times for same graph
                    # print("j: {}".format(j))

                    ind = None
                    for f in f_vec:  # Remove fraction (1-f) of rows from X0 (notice that different from first implementation)
                        X1, ind = replace_fraction_of_rows(
                            X0,
                            1 - f,
                            avoidNeighbors=avoidNeighbors,
                            W=W,
                            ind_prior=ind,
                            stratified=stratified)
                        X2 = introduce_errors(X1, ind, err)

                        for option_index, (learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \
                                enumerate(zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)):

                            # -- Learning
                            if learning_method == 'GT':
                                timeTaken = 0.0

                            elif learning_method == 'Holdout':

                                prev_time = time.time()
                                H2 = estimateH_baseline_serial(
                                    X2,
                                    ind,
                                    W,
                                    numMax=numMaxIt,
                                    numberOfSplits=numberOfSplits,
                                    EC=EC,
                                    alpha=alpha,
                                    beta=beta,
                                    gamma=gamma)
                                timeTaken = time.time() - prev_time

                            else:
                                prev_time = time.time()
                                # pass 'gradient' only when random pruning is also enabled
                                H2 = estimateH(X2,
                                               W,
                                               method=learning_method,
                                               variant=1,
                                               distance=length,
                                               EC=EC,
                                               weights=weights,
                                               randomize=randomize,
                                               gradient=gradient and pruneRandom)
                                timeTaken = time.time() - prev_time

                            record = [str(datetime.datetime.now())]  # renamed from 'tuple', which shadows the builtin
                            text = [option_vec[option_index], k, f, timeTaken]
                            record.extend(text)
                            # print("option: {}, f: {}, timeTaken: {}".format(option_vec[option_index], f, timeTaken))
                            save_csv_record(join(data_directory, csv_filename),
                                            record)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # -- Aggregate repetitions
    df2 = df1.groupby(['option', 'k', 'f']).agg \
        ({'time': [np.mean, np.std, np.size, np.median],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values
                   ]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # -- Pivot table
    df3 = pd.pivot_table(df2,
                         index=['f', 'k'],
                         columns=['option'],
                         values=['time_mean', 'time_std',
                                 'time_median'])  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values
                   ]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(100)))

    # X_f = k_vec
    X_f = df3['k'].values  # read k from values instead

    Y_hash = defaultdict(dict)
    Y_hash_std = defaultdict(dict)

    for f in f_vec:
        for option in option_vec:
            Y_hash[f][option] = list()
            Y_hash_std[f][option] = list()

    for f in f_vec:
        for option in option_vec:
            Y_hash[f][option] = df3.loc[df3['f'] == f]['time_mean_{}'.format(
                option)].values  # mean
            # Y_hash[f][option] = df3.loc[df3['f'] == f]['time_median_{}'.format(option)].values          # median
            Y_hash_std[f][option] = df3.loc[df3['f'] == f][
                'time_std_{}'.format(option)].values

    if SHOW_PLOT or SHOW_PDF or CREATE_PDF:

        # -- Setup figure
        fig_filename = 'Fig_Time_varyK_{}.pdf'.format(CHOICE)
        mpl.rc(
            'font', **{
                'family': 'sans-serif',
                'sans-serif': [u'Arial', u'Liberation Sans']
            })
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        opt_f_vecs = [(option, f) for option in option_vec for f in f_vec]

        for ((option, f), color, linewidth, clip_on, linestyle, marker, markersize) in \
            zip(opt_f_vecs, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec):

            label = label_vec[option_vec.index(option)]
            # label = label + " " + str(f)

            if STD_FILL:
                ax.fill_between(X_f,
                                Y_hash[f][option] + Y_hash_std[f][option],
                                Y_hash[f][option] - Y_hash_std[f][option],
                                facecolor=color,
                                alpha=0.2,
                                edgecolor=None,
                                linewidth=0)
                ax.plot(X_f,
                        Y_hash[f][option] + Y_hash_std[f][option],
                        linewidth=0.5,
                        color='0.8',
                        linestyle='solid')
                ax.plot(X_f,
                        Y_hash[f][option] - Y_hash_std[f][option],
                        linewidth=0.5,
                        color='0.8',
                        linestyle='solid')

            ax.plot(X_f,
                    Y_hash[f][option],
                    linewidth=linewidth,
                    color=color,
                    linestyle=linestyle,
                    label=label,
                    zorder=4,
                    marker=marker,
                    markersize=markersize,
                    markeredgecolor='black',
                    markeredgewidth=1,
                    clip_on=clip_on)

        if SHOW_ARROWS:
            for indx in [2, 3]:
                ax.annotate('',  # annotation text passed positionally: 's=' was renamed in newer matplotlib
                            xy=(X_f[indx] - 0.05, Y_hash[f]['opt4'][indx]),
                            xytext=(X_f[indx] - 0.05, Y_hash[f]['opt5'][indx]),
                            arrowprops=dict(facecolor='blue',
                                            arrowstyle='<->'))
                ax.annotate(
                    str(
                        int(
                            np.round(Y_hash[f]['opt5'][indx] /
                                     Y_hash[f]['opt4'][indx]))) + 'x',
                    xy=(X_f[indx] - 0.4,
                        (Y_hash[f]['opt5'][indx] + Y_hash[f]['opt4'][indx]) /
                        10),
                    color='black',
                    va='center',
                    annotation_clip=False,
                    zorder=5)

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        if n < 1000:
            n_label = '{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        title(r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(
            n_label, d, h, f, distribution_label))
        handles, label_vec = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles,
            label_vec,
            loc=legend_location,  # 'upper right'
            handlelength=2,
            labelspacing=0,  # distance between label entries
            handletextpad=0.3,  # distance between label and the line representation
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.3,  # padding inside legend box
            numpoints=1,  # put the marker only once
        )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        # -- Figure settings and save
        plt.yscale('log')
        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)

        grid(b=True,
             which='major',
             axis='both',
             alpha=0.2,
             linestyle='solid',
             linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True,
             which='minor',
             axis='both',
             alpha=0.2,
             linestyle='solid',
             linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Number of Classes $(k)$', labelpad=0)  # labelpad=0
        ylabel(r'Time [sec]', labelpad=0)

        if CREATE_PDF:
            savefig(join(figure_directory, fig_filename),
                    format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    frameon=None)

        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
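# The aggregate-and-pivot pattern used by each run() above, as a self-contained
# sketch with made-up data (the column and option names here are illustrative
# assumptions, not values from the experiments):
import pandas as pd

df = pd.DataFrame({'option': ['opt1', 'opt1', 'opt2', 'opt2'],
                   'k':      [3, 3, 3, 3],
                   'time':   [0.10, 0.14, 1.20, 1.40]})
agg = df.groupby(['option', 'k']).agg({'time': ['mean', 'std', 'size']})
agg.columns = ['_'.join(col).strip() for col in agg.columns.values]  # flatten the MultiIndex
agg.reset_index(inplace=True)
wide = pd.pivot_table(agg, index=['k'], columns=['option'],
                      values=['time_mean', 'time_std'])
wide.columns = ['_'.join(col).strip() for col in wide.columns.values]
wide.reset_index(inplace=True)
print(wide)  # columns: k, time_mean_opt1, time_mean_opt2, time_std_opt1, time_std_opt2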
Example n. 14
def run(option, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, show_fig=True):

    # -- Setup
    OPTION = option
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    SHOW_PDF = show_pdf
    SHOW_FIG2 = show_fig  # curve figure toggle

    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed

    header = ['currenttime',
              'option',     # one option corresponds to one choice of weight vector. In practice, one choice of scaling factor (for weight vector)
              'variant',    # 1, 2, 3 (against GT), and 1-2, 1-3, 2-3 (against each other)
              'length',
              'diff',       # L2 norm between H and estimate
              'time']


    # Default Graph parameters and options
    n = 10000
    d = 25
    h = 8
    distribution = 'powerlaw'
    randomize = False
    initial_h0 = None           # initial vector to start finding optimal H
    initial_H0 = None
    exponent = -0.3
    length = 5
    rep_differentGraphs = 1
    rep = 10
    EC = [False] + [True] * 35
    # scaling_vec = [1, 0.1, 0.14, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.4, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    scaling_vec = [1] + [round(0.1 * pow(10, 1/8)**x, 4) for x in range(33)]
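    # Scaling factors on a log grid: 8 points per decade, from 0.1 up to
    # 0.1 * 10**(32/8) = 1000; the leading 1 pairs with the single EC=False entry.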
    num_options = len(scaling_vec)
    scaling_vec = np.array(scaling_vec)
    # weight = np.array([np.power(scaling_vec, i) for i in range(5)])
    # weight = weight.transpose()
    # ymin1 = None
    # ymax1 = None
    xmin2 = 0.1
    xmax2 = 1000
    ymax2 = 1.
    ymin2 = 0
    stratified = False
    xtick_lab = [0.1, 1, 10, 100, 1000]
    # ytick_lab = [0.05, 0.1, 0.5, 1]
    # fig1_index = [0, 11, 16, 21, 23, 24, 25, 26]         # which index of scaling options to display if CHOICE_FIG_BAR_VARIANT==True
    smartInit = False
    smartInitRandomize = False
    delta = 0.1
    variant = 1           # for figure 2, to speed up calculations
    logarithm = False

    if OPTION == 1:
        CHOICE_vec = [18, 50, 51, 52, 53, 54]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 4
        randomize_vec = [False]*2 + [True]*4
        delta_vec = [None]*2 + [0.1, 0.2, 0.3] + [0.1]
        constraints_vec = [False]*5 + [True]

    # elif OPTION == 0:
    #     CHOICE_vec = [54]
    #     initial_H0_vec = [None]
    #     randomize_vec = [True]
    #     delta_vec = [0.1]
    #     constraints_vec = [True]
    #
    # elif OPTION == 2:
    #     f = 0.003
    #     CHOICE_vec = [101, 102, 103, 104, 105]
    #     initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 3
    #     randomize_vec = [False]*2 + [True]*3
    #     delta_vec = [None]*2 + [0.1, 0.3] + [0.1]
    #     constraints_vec = [False]*4 + [True]
    #
    # elif OPTION == 3:
    #     f = 0.003
    #     h = 3
    #     CHOICE_vec = [111, 112, 113, 114, 115]
    #     initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 3
    #     randomize_vec = [False]*2 + [True]*3
    #     delta_vec = [None]*2 + [0.1, 0.3] + [0.1]
    #     constraints_vec = [False]*4 + [True]

    # elif OPTION == 4:
    #     f = 0.001
    #     h = 8
    #     CHOICE_vec = [121, 122, 123, 124]
    #     initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 2
    #     randomize_vec = [False]*2 + [True]*2
    #     delta_vec = [None]*2 + [0.1, 0.3]
    #     constraints_vec = [False]*4

    elif OPTION == 5:
        f = 0.001
        h = 8
        ymax2 = 2
        ymin2 = 4e-2
        CHOICE_vec = [131, 132, 133]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False]*2 + [True]*1
        delta_vec = [None]*2 + [0.1]
        constraints_vec = [False]*3
        stratified = True
        # CHOICE_vec = [131, 132, 133, 134]
        # initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 2
        # randomize_vec = [False]*2 + [True]*2
        # delta_vec = [None]*2 + [0.1, 0.3]
        # constraints_vec = [False]*4
        # stratified = True


    # elif OPTION == 6:
    #     f = 0.003
    #     h = 8
    #     CHOICE_vec = [141, 142, 143]
    #     initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
    #     randomize_vec = [False]*2 + [True]*1
    #     delta_vec = [None]*2 + [0.1]
    #     constraints_vec = [False]*3
    #     # CHOICE_vec = [141, 142, 143, 144]
    #     # initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 2
    #     # randomize_vec = [False]*2 + [True]*2
    #     # delta_vec = [None]*2 + [0.1, 0.3]
    #     # constraints_vec = [False]*4

    elif OPTION == 7:
        f = 0.003
        h = 8
        CHOICE_vec = [151, 152, 153]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False]*2 + [True]*1
        delta_vec = [None]*2 + [0.1]
        constraints_vec = [False]*3
        stratified = True
        # CHOICE_vec = [151, 152, 153, 154]
        # initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 2
        # randomize_vec = [False]*2 + [True]*2
        # delta_vec = [None]*2 + [0.1, 0.3]
        # constraints_vec = [False]*4
        # stratified = True

    # elif OPTION == 8:
    #     f = 0.001
    #     h = 3
    #     CHOICE_vec = [161, 162, 163]
    #     initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
    #     randomize_vec = [False]*2 + [True]*1
    #     delta_vec = [None]*2 + [0.1]
    #     constraints_vec = [False]*3
    #     # CHOICE_vec = [161, 162, 163, 164]
    #     # initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 2
    #     # randomize_vec = [False]*2 + [True]*2
    #     # delta_vec = [None]*2 + [0.1, 0.3]
    #     # constraints_vec = [False]*4

    elif OPTION == 9:
        f = 0.001
        h = 3
        CHOICE_vec = [171, 172, 173]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False]*2 + [True]*1
        delta_vec = [None]*2 + [0.1]
        constraints_vec = [False]*3
        stratified = True
        ymin2 = 6e-2
        ymax2 = 1
        # CHOICE_vec = [171, 172, 173, 174]
        # initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 2
        # randomize_vec = [False]*2 + [True]*2
        # delta_vec = [None]*2 + [0.1, 0.3]
        # constraints_vec = [False]*4


    elif OPTION == 10:
        f = 0.001
        h = 3
        d = 10
        CHOICE_vec = [181, 182, 183]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False] * 2 + [True] * 1
        delta_vec = [None] * 2 + [0.1]
        constraints_vec = [False] * 3
        stratified = True


    elif OPTION == 11:
        f = 0.05
        h = 8
        d = 25
        ymax2 = 0.08
        CHOICE_vec = [191, 192, 193]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False] * 2 + [True] * 1
        delta_vec = [None] * 2 + [0.1]
        constraints_vec = [False] * 3
        stratified = True

    elif OPTION == 12:
        f = 0.05
        h = 3
        d = 25
        ymax2 = 0.08
        CHOICE_vec = [201, 202, 203]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False] * 2 + [True] * 1
        delta_vec = [None] * 2 + [0.1]
        constraints_vec = [False] * 3
        stratified = True

    elif OPTION == 13:
        n=1000
        f = 0.01
        h = 3
        CHOICE_vec = [211, 212, 213]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False]*2 + [True]*1
        delta_vec = [None]*2 + [0.1]
        constraints_vec = [False]*3
        stratified = True
        ymin2 = 6e-2
        ymax2 = 1


    elif OPTION == 15:
        n=100000
        f = 0.01
        h = 3
        CHOICE_vec = [221, 222, 223]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False]*2 + [True]*1
        delta_vec = [None]*2 + [0.1]
        constraints_vec = [False]*3
        stratified = True
        ymin2 = 5e-3
        ymax2 = 2e-1

    elif OPTION == 16:      # variant on 13 with logarithm
        n=1000
        f = 0.01
        h = 3
        CHOICE_vec = [231, 232, 233]
        initial_H0_vec = [None] + [create_parameterized_H(3, h)] + [None] * 1
        randomize_vec = [False]*2 + [True]*1
        delta_vec = [None]*2 + [0.1]
        constraints_vec = [False]*3
        stratified = True
        ymin2 = 6e-2
        ymax2 = 1
        logarithm = True

    elif OPTION == 17:      
        f = 0.001
        h = 8
        ymax2 = 2
        ymin2 = 4e-2
        CHOICE_vec = [133]
        initial_H0_vec = [None] * 1
        randomize_vec = [True]*1
        delta_vec = [0.1]
        constraints_vec = [False]*1
        stratified = True

    else:
        raise Warning("Incorrect choice!")

    k = 3
    a = 1
    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)



    if CREATE_DATA:
        for CHOICE in CHOICE_vec:
            csv_filename = 'Fig_MHE_Variants_{}.csv'.format(CHOICE)
            save_csv_record(join(data_directory, csv_filename), header, append=False)

    # print("OPTION: {}".format(OPTION))



    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for rs in range(1, rep_differentGraphs+1):
            # print('Graph {}'.format(rs))

            # -- Create graph
            W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                      distribution=distribution,
                                                      exponent=exponent,
                                                      directed=False,
                                                      debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for r in range(1, rep + 1):
                # print('Repetition {}'.format(r))

                X1, ind = replace_fraction_of_rows(X0, 1 - f, stratified=stratified)
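                # (a (1 - f) fraction of the label rows is blanked out above, i.e. a fraction f stays labeled)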

                for CHOICE, initial_H0, randomize, delta, constraints in zip(CHOICE_vec, initial_H0_vec, randomize_vec, delta_vec, constraints_vec):
                    csv_filename = 'Fig_MHE_Variants_{}.csv'.format(CHOICE)

                    # -- Create estimates and compare against GT, or against each other
                    for length in range(1, length + 1):     # NB: reuses the outer 'length' as the bound; range() is evaluated once
                        for option in range(num_options):
                            start = time.time()

                            if smartInit:
                                startWeight = 0.2
                                initial_H0 = estimateH(X1, W, method='DHE', variant=variant,
                                                       distance=5,
                                                       EC=EC[option], weights=startWeight,
                                                       randomize=smartInitRandomize,
                                                       logarithm=logarithm)

                            # print(option)
                            # print(scaling_vec)
                            # print(scaling_vec[option])


                            H_est = estimateH(X1, W, method='DHE', variant=variant,
                                              distance=length, EC=EC[option], weights=scaling_vec[option],
                                              randomize=randomize,
                                              initial_H0=initial_H0,
                                              constraints=constraints,
                                              delta=delta)
                            time_est = time.time() - start
                            diff = LA.norm(H_est - H0)

                            # if np.amin(H_est) < 0:
                            # if True:
                            #     print("\nCHOICE: {}, weight: {}".format(CHOICE, scaling_vec[option]))
                            #     print("length:{}".format(length))
                            #     print("H_est:\n{}".format(H_est))
                            #     print("diff: {}".format(diff))

                            row = [str(datetime.datetime.now())]    # 'row' avoids shadowing the builtin 'tuple'
                            text = [option, variant, length, diff, time_est]
                            # np.asarray(text) is deliberately not used here: it would convert the integers to float
                            row.extend(text)
                            save_csv_record(join(data_directory, csv_filename), row)




    if SHOW_FIG2:

        for CHOICE, initial_h0, randomize, delta in zip(CHOICE_vec, initial_H0_vec, randomize_vec, delta_vec):
            csv_filename = 'Fig_MHE_Variants_{}.csv'.format(CHOICE)


            # -- Read, aggregate, and pivot data for all options
            df1 = pd.read_csv(join(data_directory, csv_filename))
            # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(15)))
            df2 = df1.groupby(['option', 'variant', 'length']).agg \
                ({'diff': [np.mean, np.std, np.size],  # Multiple Aggregates
                  'time': [np.mean, np.std],
                  })
            df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
            df2.reset_index(inplace=True)  # remove the index hierarchy
            df2.rename(columns={'diff_size': 'count'}, inplace=True)
            # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(30)))

            df2['length'] = df2['length'].astype(str)               # convert to string so the later '_'.join over pivoted column names works
            df3 = df2.query('variant=="1"')  # We only focus on variant 1 (as close to row stochastic matrix as possible)
            # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(n=20)))

            df4 = pd.pivot_table(df3, index=['option'], columns=['length'], values=['diff_mean', 'diff_std'])  # Pivot
            # print("\n-- df4 (length {}):\n{}".format(len(df4.index), df4.head(30)))
            df4.columns = ['_'.join(col).strip() for col in df4.columns.values]     # flatten the column hierarchy (requires all-string column labels)
            df4.reset_index(level=0, inplace=True)  # get length into columns
            # print("\n-- df4 (length {}):\n{}".format(len(df4.index), df4.head(30)))

            # Add scaling factor for each row
            option = df4['option'].values       # extract the values from dataframe
            scaling = scaling_vec[option]       # look up the scaling factor in original list
            scaling = pd.Series(scaling)
            # print("scaling:\n{}".format(scaling))
            df5 = df4.assign(scaling=scaling.values)
            # print("\n-- df5 (length {}):\n{}".format(len(df5.index), df5.head(30)))

            # Filter rows
            select_rows = [i for i in range(num_options) if EC[i]]      # keep only the options with EC == True
            df6 = df5[df5['option'].isin(select_rows)]
            # print("\n-- df6 (length {}):\n{}".format(len(df6.index), df6.head(30)))



            fig_filename = 'Fig_MHE_ScalingFactor_{}.pdf'.format(CHOICE)

            # -- Setup figure
            mpl.rcParams['backend'] = 'pdf'
            mpl.rcParams['lines.linewidth'] = 3
            mpl.rcParams['font.size'] = 14
            mpl.rcParams['axes.labelsize'] = 20
            mpl.rcParams['axes.titlesize'] = 16
            mpl.rcParams['xtick.labelsize'] = 16
            mpl.rcParams['ytick.labelsize'] = 16
            mpl.rcParams['legend.fontsize'] = 14
            mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
            mpl.rcParams['grid.color'] = '777777'  # grid color
            mpl.rcParams['figure.figsize'] = [4, 4]
            mpl.rcParams['xtick.major.pad'] = 4  # padding of tick labels: default = 4
            mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4
            fig = plt.figure()
            ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

            # -- Extract values into columns (plotting the dataframe directly with bars plus error lines caused trouble)
            scaling = df6['scaling'].values  # .tolist() does not work with bar plot, requires np.array
            diff_mean_1 = df6['diff_mean_1'].values
            diff_mean_2 = df6['diff_mean_2'].values
            diff_mean_3 = df6['diff_mean_3'].values
            diff_mean_4 = df6['diff_mean_4'].values
            diff_mean_5 = df6['diff_mean_5'].values
            diff_std_5 = df6['diff_std_5'].values

            # -- Draw the plots
            p1 = ax.plot(scaling, diff_mean_1, color='black', linewidth=1, linestyle='--', label=r'$\ell_\mathrm{max} = 1$')
            p2 = ax.plot(scaling, diff_mean_2, color='orange', label=r'$\ell_\mathrm{max} = 2$')
            p3 = ax.plot(scaling, diff_mean_3, color='blue', label=r'$\ell_\mathrm{max} = 3$')
            p4 = ax.plot(scaling, diff_mean_4, color='green', label=r'$\ell_\mathrm{max} = 4$')
            p5 = ax.plot(scaling, diff_mean_5, color='red', marker='o', label=r'$\ell_\mathrm{max} = 5$')

            plt.xscale('log')
            plt.yscale('log')

            upper = diff_mean_5 + diff_std_5
            lower = diff_mean_5 - diff_std_5
            ax.fill_between(scaling, upper, lower, facecolor='red', alpha=0.2, edgecolor='none')


            # -- Title and legend
            if distribution == 'uniform':
                distribution_label = ',$uniform'    # closes the math mode of the title and appends 'uniform' as text
            else:
                distribution_label = '$'
            plt.title(r'$\!\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(int(n / 1000), d, h, f, distribution_label))
            handles, labels = ax.get_legend_handles_labels()
            # print("labels: {}".format(labels))
            legend = plt.legend(handles, labels,
                                loc='upper center',     # 'upper right'
                                handlelength=2,
                                labelspacing=0,  # distance between label entries
                                handletextpad=0.3,  # distance between label and the line representation
                                # title='Variants',
                                borderaxespad=0.3,  # distance between legend and the outer axes
                                borderpad=0.1,  # padding inside legend box
                                )
            frame = legend.get_frame()
            frame.set_linewidth(0.0)
            frame.set_alpha(0.9)  # 0.8

            # -- Figure settings
            # ax.set_xticks(range(10))
            plt.grid(b=True, which='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
            plt.xlabel(r'Scaling factor $(\lambda)$', labelpad=0)
            plt.ylabel(r'L2 norm', labelpad=0)

            if xmin2 is None:
                xmin2 = plt.xlim()[0]
            if xmax2 is None:
                xmax2 = plt.xlim()[1]
            if ymin2 is None:
                ymin2 = plt.ylim()[0]
                ymin2 = max(ymin2, 0)
            if ymax2 is None:
                ymax2 = plt.ylim()[1]
            plt.xlim(xmin2, xmax2)
            plt.ylim(ymin2, ymax2)
            plt.tick_params(
                axis='x',  # changes apply to the x-axis
                which='both',  # both major and minor ticks are affected
                # bottom='off',  # ticks along the bottom edge are off
                top='off',  # ticks along the top edge are off
                right='off',  # ticks along the right edge are off
                # labelbottom='off',    # labels along the bottom edge are off
            )

            plt.xticks(xtick_lab)
            # plt.yticks(ytick_lab, ytick_lab)

            if SHOW_PLOT:
                plt.show()
            if CREATE_PDF:
                plt.savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    frameon=None)
            if SHOW_PDF:
                showfig(join(figure_directory, fig_filename))
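# A minimal, self-contained sketch (an editor-added illustration, not part of the
# original example) of the pandas pattern used in SHOW_FIG2 above: groupby with
# multiple aggregates, flatten the resulting MultiIndex columns, then pivot so
# that each 'length' value becomes its own column.
def _demo_agg_flatten_pivot():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({'option': [0, 0, 1, 1] * 2,
                       'length': [1, 2, 1, 2] * 2,
                       'diff': np.random.rand(8)})
    agg = df.groupby(['option', 'length']).agg({'diff': [np.mean, np.std]})
    agg.columns = ['_'.join(col).strip() for col in agg.columns.values]  # e.g. 'diff_mean'
    agg.reset_index(inplace=True)
    agg['length'] = agg['length'].astype(str)  # string labels join cleanly after pivoting
    wide = pd.pivot_table(agg, index=['option'], columns=['length'],
                          values=['diff_mean', 'diff_std'])
    wide.columns = ['_'.join(col).strip() for col in wide.columns.values]  # 'diff_mean_1', ...
    return wide.reset_index()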
Example n. 15
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    # -- Setup
    CHOICE = choice
    #300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    experiments = [CHOICE]
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf

    SHOW_FIG = SHOW_PLOT or SHOW_PDF or CREATE_PDF
    STD_FILL = True
    TIMING = False
    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 10  # iterations on same graph

    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1

    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.001, 0.01, 0.1, 1]
    xtick_labels = ['0.1\%', '1\%', '10\%', '100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [False] * 4 + [True]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']

    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10] * 2
    randomize_vec = [False] * 4 + [True] * 2
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True
    num_restarts = None

    raw_std_vec = range(10)
    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
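    # (f_vec: 21 label fractions, geometrically spaced from 0.9 down to 9e-5, shrinking by a factor 10^(1/5) per step)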
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    H_heuristic = ""

    def choose(choice):
        # -- Default Graph parameters
        nonlocal n
        nonlocal d
        nonlocal rep_SameGraph
        nonlocal FILENAMEZ
        nonlocal initial_h0
        nonlocal exponent
        nonlocal length
        nonlocal variant

        nonlocal alpha_vec
        nonlocal beta_vec
        nonlocal gamma_vec
        nonlocal s_vec
        nonlocal clip_on_vec
        nonlocal numMaxIt_vec

        # Plotting Parameters
        nonlocal xtick_lab
        nonlocal xtick_labels
        nonlocal ytick_lab
        nonlocal xmax
        nonlocal xmin
        nonlocal ymin
        nonlocal ymax
        nonlocal labels
        nonlocal facecolor_vec
        nonlocal draw_std_vec
        nonlocal linestyle_vec
        nonlocal linewidth_vec
        nonlocal marker_vec
        nonlocal markersize_vec
        nonlocal legend_location

        nonlocal option_vec
        nonlocal learning_method_vec

        nonlocal Macro_Accuracy
        nonlocal EC
        nonlocal constraints
        nonlocal weight_vec
        nonlocal randomize_vec
        nonlocal k
        nonlocal err
        nonlocal avoidNeighbors
        nonlocal convergencePercentage_W
        nonlocal stratified
        nonlocal gradient
        nonlocal doubly_stochastic
        nonlocal num_restarts
        nonlocal numberOfSplits
        nonlocal H_heuristic
        nonlocal fig_label

        nonlocal select_lambda_vec
        nonlocal lambda_vec
        nonlocal f_vec

        if choice == 0:
            pass  # keep all defaults

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]

        elif choice == 305:  # DCEr Only experiment
            choose(605)
            choose(304)

            select_lambda_vec = [False] * 6

        elif choice == 306:
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 307:  # heuristic comparison
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Heuristic')
            labels.append('Heuristic')
            H_heuristic = np.array([[.476, .0476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- MovieLens dataset
        elif choice == 401:
            FILENAMEZ = 'movielens'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'MovieLens'
            legend_location = 'upper left'

            n = 26850
            d = 25.0832029795

        elif choice == 402:
            choose(401)
            select_lambda_vec = [False] * 3 + [True] * 3  # allow to choose lambda for different f in f_vec

            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 403:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 404:
            choose(401)

            select_lambda_vec = [True] * 3  # allow to choose lambda for different f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

            labels = ['GS', 'DCEr', 'Homophily']
            facecolor_vec = ['black', "#C44E52", "#64B5CD"]
            draw_std_vec = [False, True, False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 2, 2, 2, 2]
            marker_vec = [None, '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]

            weight_vec = [None, 10, None]
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
            randomize_vec = [False, True, False]
            learning_method_vec = ['GT', 'DHE']  #TODO

        elif choice == 405:  # DCEr ONLY experiment
            choose(605)
            choose(401)
            learning_method_vec += ['Holdout']
            labels += ['Holdout']

        elif choice == 406:  # comparison with a static heuristic matrix
            choose(402)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        elif choice == 407:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [1] * 21  # same length as f_vec

        elif choice == 408:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # Do not run with CREATE_DATA=True; if you do, restore the data from
        # data/sigmod-movielens-fig.csv
        elif choice == 409:
            choose(402)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#8172B2", "#C44E52",
                "#C44E52", "#CCB974", "#64B5CD"
            ]
            labels = [
                'GS', 'LCE', 'MCE', 'DCE1', 'DCE10', 'DCEr1', 'DCEr10',
                'Holdout'
            ]
            draw_std_vec = [False] * 5 + [True] * 2 + [False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [2, 2, 2, 2, 2, 2, 2, 2]
            marker_vec = [None, 'o', 'x', 's', 'p', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8, 8]
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]
            legend_location = 'upper left'
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'

            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            ymin = 0.3
            ymax = 0.7
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)

            select_lambda_vec = [False] * 4 + [True] * 2  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)

            select_lambda_vec = [False] * 3 + [True] * 2  # allow to choose lambda for different f in f_vec
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 604:  ## with weight = 1
            choose(603)

            lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 605:
            choose(601)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD", 'orange'
            ]
            draw_std_vec = [False] + [True] * 10
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [3] * 10
            marker_vec = [None, 'o', 'x', '^', 'v', '+', 'o', 'x']
            markersize_vec = [0] + [8] * 10

            randomize_vec = [True] * 8
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]

            learning_method_vec = [
                'GT', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE'
            ]
            select_lambda_vec = [False] * 8
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            weight_vec = [0, 0, 1, 2, 5, 10, 15]

            labels = ['GT'] + ['DCEr {}'.format(w) for w in weight_vec[:6]]
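            # resulting labels: ['GT', 'DCEr 0', 'DCEr 0', 'DCEr 1', 'DCEr 2', 'DCEr 5', 'DCEr 10']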

        elif choice == 606:  # heuristic experiment
            choose(602)
            labels.append('Heuristic')
            learning_method_vec.append('Heuristic')
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)

            select_lambda_vec = [False] * 4 + [True] * 2  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)

            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(605)
            choose(801)
            #learning_method_vec += ['Holdout']
            #labels += ['Holdout']
        elif choice == 806:  # Heuristic experiment
            choose(802)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[0.76, 0.08, 0.08, 0.08],
                                    [0.08, 0.08, 0.76, 0.08],
                                    [0.08, 0.76, 0.08, 0.76],
                                    [0.08, 0.08, 0.76, 0.08]])

        # MASC Dataset
        elif choice == 901:
            FILENAMEZ = 'masc'
            Macro_Accuracy = False
            fig_label = 'MASC'
            legend_location = 'lower right'
            n = 0
            d = 0
            ymin = 0
            num_restarts = 100

            select_lambda_vec = [False] * 4 + [True]  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        # MASC collapsed Dataset
        elif choice == 1001:
            FILENAMEZ = 'masc-collapsed'
            fig_label = 'MASC Collapsed'
            legend_location = 'lower right'
            n = 43724
            d = 7.2
            ymin = 0
            num_restarts = 20
            select_lambda_vec = [False] * 4 + [True]  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 1002:
            choose(1001)
            Macro_Accuracy = True

        # MASC Reduced dataset
        elif choice == 1101:
            FILENAMEZ = 'masc-reduced'
            fig_label = 'MASC Reduced'
            legend_location = 'lower right'
            n = 31000
            d = 8.3
            ymin = 0
            select_lambda_vec = [False] * 4 + [True]  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 1102:
            choose(1101)
            Macro_Accuracy = True

        else:
            raise Warning("Incorrect choice!")

    def _f_worker_(X0, W, f, f_index):
        RANDOMSEED = None  # For repeatability
        random.seed(RANDOMSEED)  # seeds some other python random generator
        np.random.seed(
            seed=RANDOMSEED
        )  # seeds the actually used numpy random generator; both are used and thus needed

        X1, ind = replace_fraction_of_rows(X0,
                                           1 - f,
                                           avoidNeighbors=avoidNeighbors,
                                           W=W,
                                           stratified=stratified)
        X2 = introduce_errors(X1, ind, err)


        for option_index, (label, select_lambda, learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \
                enumerate(zip(labels, select_lambda_vec, learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)):
            learn_time = -1
            # -- Learning
            if learning_method == 'GT':
                H2c = H0c
            elif learning_method == 'Heuristic':
                # print('Heuristic')
                H2c = H_heuristic

            elif learning_method == 'Holdout':
                # print('Holdout')
                H2 = estimateH_baseline_serial(
                    X2,
                    ind,
                    W,
                    numMax=numMaxIt,
                    # ignore_rows=ind,
                    numberOfSplits=numberOfSplits,
                    # method=learning_method, variant=1,
                    # distance=length,
                    EC=EC,
                    alpha=alpha,
                    beta=beta,
                    gamma=gamma,
                    doubly_stochastic=doubly_stochastic)
                H2c = to_centering_beliefs(H2)

            else:
                if "DCEr" in learning_method:
                    learning_method = "DCEr"
                elif "DCE" in learning_method:
                    learning_method = "DCE"

                # -- choose optimal lambda: allows to specify different lambda for different f
                # print("option: ", option_index)
                if select_lambda == True:
                    weight = lambda_vec[f_index]
                    # print("weight : ", weight)
                else:
                    weight = weights

                # -- learn H
                learn_start = time.time()
                H2 = estimateH(X2,
                               W,
                               method=learning_method,
                               variant=1,
                               distance=length,
                               EC=EC,
                               weights=weight,
                               randomrestarts=num_restarts,
                               randomize=randomize,
                               constraints=constraints,
                               gradient=gradient,
                               doubly_stochastic=doubly_stochastic)
                learn_time = time.time() - learn_start
                H2c = to_centering_beliefs(H2)

            # if learning_method not in ['GT', 'GS']:

            # print(FILENAMEZ, f, learning_method)
            # print(H2c)

            # -- Propagation
            prop_start = time.time()
            # X2c = to_centering_beliefs(X2, ignoreZeroRows=True)       # try without
            eps_max = eps_convergence_linbp_parameterized(H2c,
                                                          W,
                                                          method='noecho',
                                                          alpha=alpha,
                                                          beta=beta,
                                                          gamma=gamma,
                                                          X=X2)
            eps = s * eps_max
            # print("Max eps: {}, eps: {}".format(eps_max, eps))
            # eps = 1

            try:
                F, actualIt, actualPercentageConverged = \
                    linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                  method='noecho',
                                                  alpha=alpha, beta=beta, gamma=gamma,
                                                  numMaxIt=numMaxIt,
                                                  convergencePercentage=convergencePercentage_W,
                                                  debug=2)
                prop_time = time.time() - prop_start
                if Macro_Accuracy:
                    accuracy_X = matrix_difference_classwise(X0,
                                                             F,
                                                             ignore_rows=ind)
                    precision = matrix_difference_classwise(
                        X0, F, similarity='precision', ignore_rows=ind)
                    recall = matrix_difference_classwise(X0,
                                                         F,
                                                         similarity='recall',
                                                         ignore_rows=ind)
                else:
                    accuracy_X = matrix_difference(X0, F, ignore_rows=ind)
                    precision = matrix_difference(X0,
                                                  F,
                                                  similarity='precision',
                                                  ignore_rows=ind)
                    recall = matrix_difference(X0,
                                               F,
                                               similarity='recall',
                                               ignore_rows=ind)

                result = [str(datetime.datetime.now())]
                text = [
                    label, f, accuracy_X, precision, recall, learn_time,
                    prop_time
                ]
                result.extend(text)
                # print("method: {}, f: {}, actualIt: {}, accuracy: {}, precision:{}, recall: {}, learning time: {}, propagation time: {}".format(label, f, actualIt, accuracy_X, precision, recall, learn_time, prop_time))
                save_csv_record(join(data_directory, csv_filename), result)

            except ValueError as e:

                print("ERROR: {} with {}: d={}, h={}".format(
                    e, learning_method, d, h))
                raise e

        return 'success'

    def multi_run_wrapper(args):
        """Unpack the argument tuple passed to the pool worker.

        NOTE: This wrapper could be removed on Python >= 3.3, where
        multiprocessing.Pool.starmap_async() passes multiple arguments
        to the mapped function directly.
        """
        return _f_worker_(*args)
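    # For reference, a sketch (an editor-added assumption, not in the original) of
    # the Python >= 3.3 equivalent of the pool setup further below, which makes
    # the wrapper above unnecessary:
    #
    #   with multiprocessing.Pool(2) as pool:
    #       pool.starmap(_f_worker_, [(X0, W, f, ix) for ix, f in enumerate(f_vec)])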

    for choice in experiments:

        choose(choice)
        filename = 'Fig_End-to-End_accuracy_realData_{}_{}'.format(
            choice, FILENAMEZ)
        csv_filename = '{}.csv'.format(filename)

        header = [
            'currenttime', 'method', 'f', 'accuracy', 'precision', 'recall',
            'learntime', 'proptime'
        ]
        if CREATE_DATA:
            save_csv_record(join(data_directory, csv_filename),
                            header,
                            append=False)

        # print("choice: {}".format(choice))

        # --- print data statistics
        if CALCULATE_DATA_STATISTICS:

            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())
            d = (len(W.nonzero()[0]) * 2) / n

            k = len(X0[0])

            print("FILENAMEZ:", FILENAMEZ)
            print("k:", k)
            print("n:", n)
            print("d:", d)

            # -- Graph statistics
            n_vec = calculate_nVec_from_Xd(Xd)
            print("n_vec:\n", n_vec)
            d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
            print("d_vec:\n", d_vec)
            P = calculate_Ptot_from_graph(W, Xd)
            print("P:\n", P)
            for i in range(k):
                Phi = calculate_degree_correlation(W, X0, i, NB=True)
                print("Degree Correlation, Class {}:\n{}".format(i, Phi))

            # -- Various compatibilities
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            print("H0 w/  constraints:\n", np.round(H0, 2))
            #raw_input() # Why?

            H2 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H4 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H5 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H6 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H7 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)

            print()
            # print("H MCE w/o constraints:\n", np.round(H0, 3))
            print("H MCE w/  constraints:\n", np.round(H2, 3))
            # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
            print("H DCE 2 w/  constraints:\n", np.round(H5, 3))
            # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
            print("H DCE 20 w/  constraints:\n", np.round(H7, 3))

            print()
            H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
            print("H_est_1:\n", np.round(H_row_vec[0], 3))
            print("H_est_2:\n", np.round(H_row_vec[1], 3))
            print("H_est_3:\n", np.round(H_row_vec[2], 3))

        # --- Create data
        if CREATE_DATA or ADD_DATA:

            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())  ## number of nodes in graph
            k = len(X0[0])
            d = (len(W.nonzero()[0]) * 2) / n
            #print(n)
            #print(d)
            #print("contraint = {}".format(constraints))
            #print('select lambda: {}'.format(len(select_lambda_vec)))
            #print('learning method: {}'.format(len(learning_method_vec)))
            #print('alpha: {}'.format(len(alpha_vec)))
            #print('beta: {}'.format(len(beta_vec)))
            #print('gamma: {}'.format(len(gamma_vec)))
            #print('s: {}'.format(len(s_vec)))
            #print('maxit: {}'.format(len(numMaxIt_vec)))
            #print('weight: {}'.format(len(weight_vec)))
            #print('randomize: {}'.format(len(randomize_vec)))
            # ---  Calculating True Compatibility matrix
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=constraints,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            # print(H0)
            H0c = to_centering_beliefs(H0)

            num_results = len(f_vec) * len(learning_method_vec) * rep_SameGraph

            # Start a process pool. Using (cpu_count() - 10) workers, one per CPU,
            # is meant for a supercomputer:
            #pool = multiprocessing.Pool(int(multiprocessing.cpu_count()-10))
            # Use this for a reasonably powerful home computer:
            #pool = multiprocessing.Pool(int(multiprocessing.cpu_count()/2))
            # Use this for anything else:
            pool = multiprocessing.Pool(2)

            f_processes = f_vec * rep_SameGraph
            workers = []
            results = [(X0, W, f, ix)
                       for ix, f in enumerate(f_vec)] * rep_SameGraph
            # print('Expected results: {}'.format(num_results))
            try:  # workaround for a bug in Python 2.7 multiprocessing
                # Distribute the accuracy evaluations over the pool; the explicit
                # .get() timeout works around Python 2.7's map_async not being
                # fully featured
                pool.map_async(multi_run_wrapper, results).get(num_results * 2)
            except multiprocessing.TimeoutError as e:
                continue
            finally:
                pool.close()
                pool.join()

        # -- Read data for all options and plot
        df1 = pd.read_csv(join(data_directory, csv_filename))
        acc_filename = '{}_accuracy_plot.pdf'.format(filename)
        pr_filename = '{}_PR_plot.pdf'.format(filename)
        if TIMING:
            print('=== {} Timing Results ==='.format(FILENAMEZ))
            print('Prop Time:\navg: {}\nstddev: {}'.format(
                np.average(df1['proptime'].values),
                np.std(df1['proptime'].values)))
            for learning_method in labels:
                rs = df1.loc[df1["method"] == learning_method]
                avg = np.average(rs['learntime'])
                std = np.std(rs['learntime'])
                print('{} Learn Time:\navg: {}\nstd: {}'.format(
                    learning_method, avg, std))

        sslhv.plot(df1,
                   join(figure_directory, acc_filename),
                   n=n,
                   d=d,
                   k=k,
                   labels=labels,
                   dataset=FILENAMEZ,
                   line_styles=linestyle_vec,
                   xmin=xmin,
                   ymin=ymin,
                   xmax=xmax,
                   ymax=ymax,
                   marker_sizes=markersize_vec,
                   draw_stds=draw_std_vec,
                   markers=marker_vec,
                   line_colors=facecolor_vec,
                   line_widths=linewidth_vec,
                   legend_location=legend_location,
                   show=SHOW_PDF,
                   save=CREATE_PDF,
                   show_plot=SHOW_PLOT)
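# Hypothetical invocation (editor-added; assumes data_directory, figure_directory,
# and the real-data CSVs referenced above are set up elsewhere in this repository):
#
#   run(304, create_data=False, show_plot=True)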
Example n. 16
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False):
    CHOICE = choice

    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf

    STD_FILL = True
    SHORTEN_LENGTH = shorten_length  # honor the parameter instead of hardcoding False

    fig_filename = 'Fig_homophily_{}.pdf'.format(CHOICE)
    csv_filename = 'Fig_homophily_{}.csv'.format(CHOICE)
    header = ['currenttime',
              'option',
              'f',
              'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)


    # -- Default Graph parameters
    k = 3
    rep_DifferentGraphs = 1
    rep_SameGraph = 2
    initial_h0 = None
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    constraint = True

    variant = 1
    EC = True                   # Non-backtracking for learning
    global f_vec, labels, facecolor_vec

    s = 0.5
    err = 0
    numMaxIt = 10
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True


    clip_on_vec = [True] * 10
    draw_std_vec = range(10)
    ymin = 0.3
    ymax = 1
    xmin = 0.001
    xmax = 1
    xtick_lab = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    xtick_labels = ['1e-5', '0.01\%', '0.1\%', '1\%', '10\%', '100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [5, 2, 3, 3, 3, 3] + [3]*10
    marker_vec = [None, '^', 'v', 'o', '^'] + [None]*10
    markersize_vec = [0, 8, 8, 8, 6, 6] + [6]*10
    facecolor_vec = ['black', "#C44E52",  "#64B5CD"]


    # -- Options with propagation variants
    if CHOICE == 101:
        n = 10000
        h = 3
        d = 15
        f_vec = [0.9 * pow(0.1, 1 / 5) ** x for x in range(21)]
        option_vec = ['opt1', 'opt2', 'opt3']
        learning_method_vec = ['GT','DHE','Homophily']
        weight_vec = [None] + [10] + [None]
        randomize_vec = [None] + [True] + [None]
        xmin = 0.001
        ymin = 0.3
        ymax = 1
        labels = ['GS', 'DCEr', 'Homophily']

    else:
        raise Warning("Incorrect choice!")

    a = 1
    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)

    H0 = create_parameterized_H(k, h, symmetric=True)
    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))


    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters

            W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d, distribution=distribution,
                                                      exponent=exponent, directed=False, debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for j in range(rep_SameGraph):  # repeat several times for same graph
                # print("Graph:{} and j: {}".format(i,j))

                ind = None
                for f in f_vec:
                    X1, ind = replace_fraction_of_rows(X0, 1-f, avoidNeighbors=avoidNeighbors, W=W, ind_prior=ind, stratified=stratified)
                    X2 = introduce_errors(X1, ind, err)

                    for option_index, (option, learning_method,  weights, randomize) in \
                            enumerate(zip(option_vec, learning_method_vec, weight_vec, randomize_vec)):

                        # -- Learning
                        if learning_method == 'GT':
                            H2 = H0
                        elif learning_method == 'Homophily':
                            H2 = np.identity(k)

                        elif learning_method == 'DHE':
                            H2 = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC, weights=weights, randomize=randomize, constraints=constraint)
                            # print("learning_method:", learning_method)
                            # print("H:\n{}".format(H2))

                        # -- Propagation
                        H2c = to_centering_beliefs(H2)
                        X2c = to_centering_beliefs(X2, ignoreZeroRows=True)

                        try:
                            eps_max = eps_convergence_linbp_parameterized(H2c, W,
                                                                          method='noecho',
                                                                          X=X2)
                            eps = s * eps_max

                            F, actualIt, actualPercentageConverged = \
                                linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                              method='noecho',
                                                              numMaxIt=numMaxIt,
                                                              convergencePercentage=convergencePercentage_W,
                                                              debug=2)
                        except ValueError as e:
                            print("ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                        else:
                            accuracy_X = matrix_difference_classwise(X0, F, ignore_rows=ind)


                            row = [str(datetime.datetime.now())]    # 'row' avoids shadowing the builtin 'tuple'
                            text = [option_vec[option_index],
                                    f,
                                    accuracy_X]
                            row.extend(text)
                            # print("option: {}, f: {}, actualIt: {}, accuracy: {}".format(option_vec[option_index], f, actualIt, accuracy_X))
                            save_csv_record(join(data_directory, csv_filename), row)



    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))
    desired_decimals = 7
    df1['f'] = df1['f'].apply(lambda x: round(x, desired_decimals))                   # rounding due to different starting points
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))


    # Aggregate repetitions
    df2 = df1.groupby(['option', 'f']).agg \
        ({'accuracy': [np.mean, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(10)))

    # Pivot table
    df3 = pd.pivot_table(df2, index=['f'], columns=['option'], values=['accuracy_mean', 'accuracy_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(10)))

    # Extract values
    X_f = df3['f'].values                     # plot x values
    Y = []
    Y_std = []
    for option in option_vec:
        Y.append(df3['accuracy_mean_{}'.format(option)].values)
        if STD_FILL:
            Y_std.append(df3['accuracy_std_{}'.format(option)].values)


    if SHORTEN_LENGTH:
        SHORT_FACTOR = 2        # keep every Nth element
        X_f = np.copy(X_f[::SHORT_FACTOR])

        for i in range(len(Y)):
            Y[i] = np.copy(Y[i][::SHORT_FACTOR])
            if STD_FILL:
                Y_std[i] = np.copy(Y_std[i][::SHORT_FACTOR])






    if CREATE_PDF or SHOW_PLOT or SHOW_PDF:

        # -- Setup figure
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['font.size'] = 16
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])


        #  -- Drawing
        if STD_FILL:
            for choice, (option, facecolor) in enumerate(zip(option_vec, facecolor_vec)):
                ax.fill_between(X_f, Y[choice] + Y_std[choice], Y[choice] - Y_std[choice],
                                facecolor=facecolor, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, Y[choice] + Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, Y[choice] - Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')

        for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
                enumerate(zip(option_vec, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):
            P = ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker,
                    markersize=markersize, markeredgewidth=1, clip_on=clip_on, markeredgecolor='black')

        plt.xscale('log')

        # -- Title and legend
        distribution_label = '$'
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        n_label = '{}k'.format(int(n / 1000))
        if n < 1000:
            n_label='{}'.format(n)
        a_label = ''
        if a != 1:
            a_label = r', a\!=\!{}'.format(a)

        titleString = r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}{}{}'.format(n_label, d, h, a_label, distribution_label)
        plt.title(titleString)

        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(handles, labels,
                            loc='upper left',     # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)


        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        grid(b=True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Label Sparsity $(f)$', labelpad=0)      # labelpad=0
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    frameon=None)
        
        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))  # shows actually created PDF
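# A minimal sketch (editor-added, with made-up data) of the mean/std band drawing
# used above: shade mean +/- std with fill_between, then draw the mean curve on a
# log-scaled x axis.
def _demo_std_band():
    import numpy as np
    import matplotlib.pyplot as plt
    x = np.logspace(-3, 0, 21)                   # label fractions f
    mean = 0.4 + 0.2 * (np.log10(x) + 3)         # fake accuracy curve rising with f
    std = np.full_like(mean, 0.03)
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.fill_between(x, mean + std, mean - std, facecolor="#C44E52", alpha=0.2, linewidth=0)
    ax.plot(x, mean, color="#C44E52", label='DCEr')
    ax.set_xscale('log')
    ax.set_xlabel(r'Label Sparsity $(f)$')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='upper left')
    return fig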
Example n. 17
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    global n
    global d
    global rep_SameGraph
    global FILENAMEZ
    global initial_h0
    global H0c
    global exponent
    global length
    global variant

    global alpha_vec
    global beta_vec
    global gamma_vec
    global s_vec
    global clip_on_vec
    global numMaxIt_vec

    # Plotting Parameters
    global xtick_lab
    global xtick_labels
    global ytick_lab
    global xmax
    global xmin
    global ymin
    global ymax
    global labels
    global facecolor_vec
    global draw_std_vec
    global linestyle_vec
    global linewidth_vec
    global marker_vec
    global markersize_vec
    global legend_location

    global option_vec
    global learning_method_vec

    global Macro_Accuracy
    global EC
    global constraints
    global weight_vec
    global randomize_vec
    global k
    global fig_label
    global err
    global avoidNeighbors
    global convergencePercentage_W
    global stratified
    global gradient
    global doubly_stochastic
    global numberOfSplits

    global select_lambda_vec
    global lambda_vec
    global f_vec
    # -- Setup
    CHOICE = choice
    #500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    STD_FILL = True

    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 3  # iterations on same graph

    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1

    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    xtick_labels = [r'0.001\%', r'0.01\%', r'0.1\%', r'1\%', r'10\%', r'100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [0, 3, 4, 4, 4, 4]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']

    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10]
    randomize_vec = [False] * 4 + [True]
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True

    draw_std_vec = range(10)
    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
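    # f_vec: 21 points from 0.9 down to 0.9 * 0.1**4 = 9e-5, geometric with ratio 0.1**(1/5) (five points per decade)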
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    global exp_backoff
    exp_backoff = [2**n for n in range(6, 12)]

    def choose(choice):
        # -- Default Graph parameters
        global n
        global d
        global rep_SameGraph
        global FILENAMEZ
        global initial_h0
        global exponent
        global length
        global variant

        global alpha_vec
        global beta_vec
        global gamma_vec
        global s_vec
        global clip_on_vec
        global numMaxIt_vec

        # Plotting Parameters
        global xtick_lab
        global xtick_labels
        global ytick_lab
        global xmax
        global xmin
        global ymin
        global ymax
        global labels
        global facecolor_vec
        global draw_std_vec
        global linestyle_vec
        global linewidth_vec
        global marker_vec
        global markersize_vec
        global legend_location

        global option_vec
        global learning_method_vec

        global Macro_Accuracy
        global EC
        global constraints
        global weight_vec
        global randomize_vec
        global k
        global fig_label
        global err
        global avoidNeighbors
        global convergencePercentage_W
        global stratified
        global gradient
        global doubly_stochastic
        global numberOfSplits

        global select_lambda_vec
        global lambda_vec
        global f_vec
        if choice == 0:
            pass

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            # select_lambda_vec = [False] * 3 + [True] * 2  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            # lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 305:  # Test row stochastic cases
            choose(304)
            doubly_stochastic = False

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'

            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)

            select_lambda_vec = [False] * 4 + [True]  # allow choosing lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)

            select_lambda_vec = [False] * 3 + [True] * 2  # allow choosing lambda for different f in f_vec
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 604:  ## with weight = 1
            draw_std_vec = [4]
            choose(603)

            lambda_vec = [0.5] * 21  # same length as f_vec

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp.txt'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)

            select_lambda_vec = [False] * 4 + [True]  # allow choosing lambda for different f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)

            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(801)
            doubly_stochastic = False

        elif choice == 821:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            constraints = True  # True
            gradient = True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [0.2, 0.2]

            randomize_vec = [False] * 4 + [True]
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.7
            labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Enron'
            legend_location = 'lower right'
            n = 46463  # for figures
            d = 23.4  # for figures

            alpha = 0.0
            beta = 0.0
            gamma = 0.0
            s = 0.5
            numMaxIt = 10

            select_lambda_vec = [False] * 3 + [True] * 2
            lambda_vec = [0.2] * 13 + [10] * 8  # same length as f_vec
            captionText = "DCE weight=[0.2*13] [10*8], s={}, numMaxIt={}".format(
                s, numMaxIt)

        # -- Cora dataset
        elif choice == 901:
            FILENAMEZ = 'cora'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.9
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Cora'
            legend_location = 'lower right'
            n = 2708
            d = 7.8

        # -- Citeseer dataset
        elif choice == 1001:
            FILENAMEZ = 'citeseer'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Citeseer'
            legend_location = 'lower right'
            n = 3312
            d = 5.6

        elif choice == 1101:
            FILENAMEZ = 'hep-th'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.1
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Hep-th'
            legend_location = 'lower right'
            n = 27770
            d = 5.6

        elif choice == 1204:
            FILENAMEZ = 'pokec-gender'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.000015
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [0, 3, 4, 4, 4, 4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Pokec-Gender'
            legend_location = 'lower right'
            n = 1632803
            d = 54.6

        else:
            raise ValueError("Incorrect choice: {}".format(choice))

    choose(CHOICE)

    csv_filename = 'Fig_End-to-End_accuracy_{}_{}.csv'.format(
        CHOICE, FILENAMEZ)
    header = ['currenttime', 'method', 'f', 'precision', 'recall', 'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # print("choice: {}".format(CHOICE))

    # --- print data statistics
    if CALCULATE_DATA_STATISTICS:

        Xd, W = load_Xd_W_from_csv(
            join(realDataDir, FILENAMEZ) + '-classes.csv',
            join(realDataDir, FILENAMEZ) + '-neighbors.csv')

        X0 = from_dictionary_beliefs(Xd)
        n = len(Xd.keys())
        d = (len(W.nonzero()[0]) * 2) / n

        print("FILENAMEZ:", FILENAMEZ)
        print("n:", n)
        print("d:", d)

        # -- Graph statistics
        n_vec = calculate_nVec_from_Xd(Xd)
        print("n_vec:\n", n_vec)
        d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
        print("d_vec:\n", d_vec)
        P = calculate_Ptot_from_graph(W, Xd)
        print("P:\n", P)

        # -- Various compatibilities
        H0 = estimateH(X0,
                       W,
                       method='MHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=1,
                       randomize=False,
                       constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        print("H0 w/  constraints:\n", np.round(H0, 2))
        input()  # pause so the first estimate can be inspected

        H2 = estimateH(X0,
                       W,
                       method='MHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=1,
                       randomize=False,
                       constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H4 = estimateH(X0,
                       W,
                       method='DHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=2,
                       randomize=False,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H5 = estimateH(X0,
                       W,
                       method='DHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=2,
                       randomize=False,
                       constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H6 = estimateH(X0,
                       W,
                       method='DHE',
                       variant=1,
                       distance=2,
                       EC=EC,
                       weights=10,
                       randomize=False,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H7 = estimateH(X0,
                       W,
                       method='DHE',
                       variant=1,
                       distance=2,
                       EC=EC,
                       weights=10,
                       randomize=False,
                       constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)

        # print("H MCE w/o constraints:\n", np.round(H0, 3))
        print("H MCE w/  constraints:\n", np.round(H2, 3))
        # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
        print("H DCE 2 w/  constraints:\n", np.round(H5, 3))
        # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
        print("H DCE 20 w/  constraints:\n", np.round(H7, 3))

        H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
        print("H_est_1:\n", np.round(H_row_vec[0], 3))
        print("H_est_2:\n", np.round(H_row_vec[1], 3))
        print("H_est_3:\n", np.round(H_row_vec[2], 3))

    # --- Create data
    if CREATE_DATA or ADD_DATA:

        Xd, W = load_Xd_W_from_csv(
            join(realDataDir, FILENAMEZ) + '-classes.csv',
            join(realDataDir, FILENAMEZ) + '-neighbors.csv')

        X0 = from_dictionary_beliefs(Xd)
        n = len(Xd.keys())  ## number of nodes in graph

        d = (len(W.nonzero()[0]) * 2) / n
        # print(n)
        # print(d)
        # print("contraint = {}".format(constraints))

        # ---  Calculating True Compatibility matrix
        H0 = estimateH(X0,
                       W,
                       method='MHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=1,
                       randomize=False,
                       constraints=constraints,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        # print(H0)
        H0c = to_centering_beliefs(H0)

        graph_workers = []
        gq = multiprocessing.Queue()
        for j in range(rep_SameGraph):  # repeat several times for same graph

            # print("Graph: {}".format(j))
            graph_workers.append(
                multiprocessing.Process(target=graph_worker, args=(X0, W, gq)))

        for gw in graph_workers:
            gw.start()

        for gw in graph_workers:
            for t in exp_backoff:
                gw.join(t)
                if gw.exitcode is None:
                    print(
                        "failed to join graph worker {} after {} seconds, retrying"
                        .format(gw, t))
                else:
                    break  # worker has exited; stop retrying
            else:  # no break: every join timed out
                print("Failed to join graph worker {}.".format(gw))

        gq.put('STOP')
        for i in iter(gq.get, 'STOP'):
            save_csv_record(join(data_directory, csv_filename), i)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    acc_filename = 'Fig_End-to-End_accuracy_realData{}_{}.pdf'.format(
        CHOICE, FILENAMEZ)
    pr_filename = 'Fig_End-to-End_PR_realData{}_{}.pdf'.format(
        CHOICE, FILENAMEZ)
    # generate_figure(data_directory, acc_filename, df1)
    # generate_figure(data_directory, pr_filename, df1, metric='pr')

    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(5)))

    # Aggregate repetitions
    if "option" in df1.columns.values:
        pivot_col = "option"
        pivot_vec = option_vec
    else:
        pivot_col = "method"
        pivot_vec = learning_method_vec

    df2 = df1.groupby([pivot_col, 'f']).agg \
        ({'accuracy': [np.mean, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values
                   ]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(500)))

    # Pivot table
    df3 = pd.pivot_table(df2,
                         index='f',
                         columns=pivot_col,
                         values=['accuracy_mean', 'accuracy_std'])  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values
                   ]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(5)))

    # Extract values
    X_f = df3['f'].values  # plot x values
    Y = []
    Y_std = []
    for val in pivot_vec:
        Y.append(df3['accuracy_mean_{}'.format(val)].values)
        if STD_FILL:
            Y_std.append(df3['accuracy_std_{}'.format(val)].values)

    if CREATE_PDF or SHOW_PDF or SHOW_PLOT:
        print("Setting up figure...")

        # -- Setup figure
        fig_filename = 'Fig_End-to-End_accuracy_realData{}_{}.pdf'.format(
            CHOICE, FILENAMEZ)
        mpl.rc(
            'font', **{
                'family': 'sans-serif',
                'sans-serif': [u'Arial', u'Liberation Sans']
            })
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14  # 6
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        #  -- Drawing
        if STD_FILL:
            for choice, (option,
                         facecolor) in enumerate(zip(option_vec,
                                                     facecolor_vec)):
                if choice in draw_std_vec:
                    ax.fill_between(X_f,
                                    Y[choice] + Y_std[choice],
                                    Y[choice] - Y_std[choice],
                                    facecolor=facecolor,
                                    alpha=0.2,
                                    edgecolor=None,
                                    linewidth=0)
                    ax.plot(X_f,
                            Y[choice] + Y_std[choice],
                            linewidth=0.5,
                            color='0.8',
                            linestyle='solid')
                    ax.plot(X_f,
                            Y[choice] - Y_std[choice],
                            linewidth=0.5,
                            color='0.8',
                            linestyle='solid')

        for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
                enumerate(zip(option_vec, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):
            ax.plot(X_f,
                    Y[choice],
                    linewidth=linewidth,
                    color=color,
                    linestyle=linestyle,
                    label=label,
                    zorder=4,
                    marker=marker,
                    markersize=markersize,
                    markeredgewidth=1,
                    clip_on=clip_on)

        # -- Title and legend
        if n < 1000:
            n_label = '{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        title(r'$\!\!\!\!\!\!\!${}: $n={}, d={}$'.format(
            fig_label, n_label, np.round(d, 1)))
        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles,
            labels,
            loc=legend_location,  # 'upper right'
            handlelength=2,
            labelspacing=0,  # distance between label entries
            handletextpad=0.3,  # distance between label and the line representation
            # title='Variants',
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.3,  # padding inside legend box
            numpoints=1,  # put the marker only once
        )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8
        plt.xscale('log')

        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        grid(b=True,
             which='major',
             axis='both',
             alpha=0.2,
             linestyle='solid',
             linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True,
             which='minor',
             axis='both',
             alpha=0.2,
             linestyle='solid',
             linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Label Sparsity $(f)$', labelpad=0)  # labelpad=0
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            print("saving PDF of figure...")
            savefig(join(figure_directory, fig_filename),
                    format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    frameon=None)

        if SHOW_PLOT:
            print("Showing plot...")
            plt.show()

        if SHOW_PDF:
            print("Showing pdf...")
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
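
A minimal sketch of how this run() might be driven, assuming the surrounding module defines data_directory, figure_directory, realDataDir, and the helpers used above; the driver below is hypothetical and not part of the original listing (choice 801 maps to Enron in choose()):

if __name__ == '__main__':
    run(801,                 # Enron, per the choose() mapping above
        create_data=False,   # reuse Fig_End-to-End_accuracy_801_enron.csv if it already exists
        add_data=False,
        show_plot=False,
        create_pdf=True,     # write the figure PDF into figure_directory
        show_pdf=False)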
Example 18
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, show_fig=True):
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    CREATE_PDF = create_pdf
    SHOW_PDF = show_pdf
    SHOW_FIG1 = show_fig        # bar diagram
    SHOW_FIG2 = False      # curve

    csv_filename = 'Fig_MHE_Variants_{}.csv'.format(CHOICE)
    header = ['currenttime',
              'option',     # one option corresponds to one choice of weight vector. In practice, one choice of scaling factor (for weight vector)
              'variant',    # 1, 2, 3 (against GT), and 1-2, 1-3, 2-3 (against each other)
              'length',
              'diff',       # L2 norm between H and estimate
              'time']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)


    # Default Graph parameters and options
    n = 10000
    f = 0.1
    h = 8
    distribution = 'uniform'
    randomize = False
    initial_h0 = None           # initial vector to start finding optimal H
    initial_H0 = None
    exponent = -0.3
    length = 5
    rep = 10       # number of repetitions per setting
    EC = [False] + [True] * 31
    scaling_vec = [1, 0.1, 0.14, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.4, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    num_options = len(scaling_vec)
    scaling_vec = np.array(scaling_vec)
    weight = np.array([np.power(scaling_vec, i) for i in range(5)])
    weight = weight.transpose()
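    # each row of 'weight' is the geometric series [1, s, s**2, s**3, s**4] for one scaling factor s;
    # weight[option] later supplies the per-distance weights passed to estimateH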
    ymin1 = None
    ymax1 = None
    xmin2 = None
    xmax2 = None
    ymin2 = None
    ymax2 = None
    # fig1_index = [0, 11, 16, 21, 23, 24, 25, 26]         # which index of scaling options to display if CHOICE_FIG_BAR_VARIANT==True
    fig1_index = [21]
    smartInit = False
    smartInitRandomize = False
    delta = 0.1
    variant_vec = [1, 2, 3]     # for figure 1
    variant_vec = [1]           # overrides the line above; figure 2 only needs variant 1, to speed up calculations

    if CHOICE == 1:     # ok
        n = 1000
        d = 10
        ymax2 = 0.24

    elif CHOICE == 2:   # ok
        n = 1000
        d = 10
        distribution = 'powerlaw'
        ymax2 = 0.24

    elif CHOICE == 3:   # ok
        n = 1000
        d = 5
        distribution = 'powerlaw'
        ymax2 = 0.4

    elif CHOICE == 4:   # ok
        n = 1000
        d = 25
        distribution = 'powerlaw'
        ymax2 = 0.16


    elif CHOICE == 10:  # ok
        d = 10
        ymax2 = 0.1

    elif CHOICE == 11:  # (selection)
        d = 10
        distribution = 'powerlaw'
        exponent = -0.5
        ymax2 = 0.1
        ymax1 = 0.14

    elif CHOICE == 12:
        d = 3
        ymax2 = 0.19
        ymax1 = 0.2

    elif CHOICE == 13:
        d = 25
        ymax2 = 0.05

    elif CHOICE == 14:  # selection (for comparison)
        d = 25
        distribution = 'powerlaw'
        ymax2 = 0.046
        ymax1 = 0.08

    elif CHOICE == 15:   # selection
        d = 25
        f = 0.05
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 0.095



    elif CHOICE == 16:   # selection TODO !!!
        d = 25
        f = 0.01
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 0.4
        ymin2 = 0

    elif CHOICE == 17:   # selection TODO !!!
        d = 25
        f = 0.003
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0

    elif CHOICE == 18:   # selection TODO !!!
        d = 25
        f = 0.001
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0



    elif CHOICE == 20:
        h = 3
        d = 10
        ymax1 = 0.12

    elif CHOICE == 21:      # selection (for comparison against f=0.05: 26)
        h = 3
        d = 10
        distribution = 'powerlaw'
        exponent = -0.5
        ymax1 = 0.15
        ymax2 = 0.099

    elif CHOICE == 22:      # selection (for comparison with start from GT: 44)
        h = 3
        d = 3
        ymax1 = 0.25
        ymax2 = 0.39

    elif CHOICE == 23:      # ok
        h = 3
        d = 25
        ymax1 = 0.1
        ymax2 = 0.12

    elif CHOICE == 24:
        h = 3
        d = 25
        distribution = 'powerlaw'
        ymax1 = 0.08

    elif CHOICE == 25:      # main selection
        h = 3
        d = 25
        f = 0.05
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 0.125

    elif CHOICE == 26:      # selection, #=200
        h = 3
        d = 10
        f = 0.05
        distribution = 'powerlaw'
        ymax1 = 0.21
        ymax2 = 0.26

    elif CHOICE == 27:      # selection, #=200
        d = 10
        f = 0.05
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 0.21

    elif CHOICE == 60:   # ??? #=50 !!!, 50 more
        d = 25
        f = 0.01
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 0.6

    elif CHOICE == 61:   # ??? #=50, 100 more
        d = 25
        f = 0.005
        distribution = 'powerlaw'
        ymax1 = 0.99
        ymax2 = 0.99

    elif CHOICE == 62:   # ??? #=50, 150 more
        h = 3
        d = 25
        f = 0.01
        distribution = 'powerlaw'
        ymax1 = 0.6
        ymax2 = 0.6

    elif CHOICE == 63:   # ??? #=50, 150 more
        h = 3
        d = 25
        f = 0.005
        distribution = 'powerlaw'
        ymax1 = 1.2
        ymax2 = 1.0




    # --- Randomization ---
    # randomized 22
    elif CHOICE == 32:
        randomize = True
        h = 3
        d = 3
        ymax1 = 0.25
        ymax2 = 0.4


    # --- GT ---
    # version of 22 where GT is supplied to start optimization
    # just to check if the global optimum of the energy function actually corresponds to the GT
    elif CHOICE == 42:      # selection, #=200 (for comparison with start from GT)
        initial_h0 = [0.2, 0.6, 0.2]        # start optimization at optimal point
        h = 3
        d = 3
        ymax1 = 0.25
        ymax2 = 0.39

    # version of 15 where GT is supplied to start optimization
    elif CHOICE == 43:      # selection, #=200
        initial_h0 = [0.1, 0.8, 0.1]  # start optimization at optimal point
        h = 8
        d = 25
        f = 0.05
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 0.095

    # version of 12 where GT is supplied to start optimization
    elif CHOICE == 44:      # selection, #=200 (for comparison)
        initial_h0 = [0.1, 0.8, 0.1]  # start optimization at optimal point
        h = 8
        d = 3
        ymax1 = 0.2
        ymax2 = 0.19

    # version of 25 where GT is supplied to start optimization
    elif CHOICE == 45:      # selection, #=200
        h = 3
        d = 25
        f = 0.05
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 0.125


    # version of 12 where GT is supplied to start optimization
    elif CHOICE == 46:   # selection TODO !!!
        initial_h0 = [0.1, 0.8, 0.1]  # start optimization at optimal point
        d = 25
        f = 0.01
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 0.4
        ymin2 = 0.0

    # version of 12 where GT is supplied to start optimization
    elif CHOICE == 47:   # selection TODO !!!
        initial_h0 = [0.1, 0.8, 0.1]  # start optimization at optimal point
        d = 25
        f = 0.003
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0.0


    # version of 12 with smart init
    elif CHOICE == 48:   # selection TODO !!!
        d = 25
        f = 0.003
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0.0
        smartInit = True


    # version of 12 with smart init
    elif CHOICE == 49:   # selection TODO !!!
        d = 25
        f = 0.003
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0.0
        smartInit = True
        smartInitRandomize = True       # initialize optimization at several random points for smart init only


    elif CHOICE == 50:   # selection TODO !!!
        initial_h0 = [0.1, 0.8, 0.1]  # start optimization at optimal point
        d = 25
        f = 0.001
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0.0

    elif CHOICE == 51:   # selection TODO !!!
        d = 25
        f = 0.001
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0.0
        randomize = True        # start optimization at several random points
        delta = 0.1

    elif CHOICE == 52:   # selection TODO !!!
        d = 25
        f = 0.001
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0.0
        randomize = True        # start optimization at several random points
        delta = 0.2

    elif CHOICE == 53:   # selection TODO !!!
        d = 25
        f = 0.001
        distribution = 'powerlaw'
        ymax1 = 0.15
        ymax2 = 1.
        ymin2 = 0.0
        randomize = True        # start optimization at several random points
        delta = 0.3




    else:
        raise ValueError("Incorrect choice: {}".format(CHOICE))


    k = 3
    a = 1
    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    RANDOMSEED = None  # set to a fixed integer for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))


    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for r in range(1, rep+1):
            # print('Repetition {}'.format(r))

            # -- Create graph
            W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                      distribution=distribution,
                                                      exponent=exponent,
                                                      directed=False,
                                                      debug=False)
            X0 = from_dictionary_beliefs(Xd)
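            # withhold labels from a (1-f) fraction of rows so that a fraction f of nodes stays labeled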
            X1, ind = replace_fraction_of_rows(X0, 1 - f)

            # -- Create estimates and compare against GT, or against each other
            H_est = {}
            for length in range(1, length + 1):  # reuses the outer 'length' as loop variable; the range is fixed when first evaluated
                for option in range(num_options):
                    for variant in variant_vec:
                        start = time.time()

                        if smartInit:
                            startWeight = 0.2
                            initial_H0 = estimateH(X1, W, method='DHE', variant=variant,
                                                   distance=5,
                                                   EC=EC[option], weights=startWeight,
                                                   randomize=smartInitRandomize)

                        H_est[variant] = estimateH(X1, W, method='DHE', variant=variant,
                                                   distance=length, EC=EC[option], weights=weight[option],
                                                   randomize=randomize,
                                                   initial_h0=initial_h0,
                                                   initial_H0=initial_H0,
                                                   delta=delta)
                        time_est = time.time() - start
                        diff = LA.norm(H_est[variant] - H0)

                        row = [str(datetime.datetime.now())]
                        text = [option, variant, length, diff, time_est]
                        # text = np.asarray(text)  # (without np, entries get ugly format) not used here because it transforms integers to float !!
                        row.extend(text)
                        save_csv_record(join(data_directory, csv_filename), row)

                    # -- Compare against each other
                    for variant1 in variant_vec:
                        for variant2 in variant_vec:
                            if variant1 < variant2:
                                diff = LA.norm(H_est[variant1] - H_est[variant2])

                                row = [str(datetime.datetime.now())]
                                text = [option, "{}-{}".format(variant1, variant2), length, diff, time_est]
                                row.extend(text)
                                save_csv_record(join(data_directory, csv_filename), row)

    if SHOW_FIG1:
        # -- Read, aggregate, and pivot data for all options
        df1 = pd.read_csv(join(data_directory, csv_filename))
        # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(15)))
        df2 = df1.groupby(['option', 'variant', 'length']).agg \
            ({'diff': [np.mean, np.std, np.size],  # Multiple Aggregates
              'time': [np.mean, np.std],
              })
        df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
        df2.reset_index(inplace=True)  # remove the index hierarchy
        df2.rename(columns={'diff_size': 'count'}, inplace=True)
        # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(90)))


        # -- Create one separate figure for each option
        for option in range(num_options):
            if option not in fig1_index:
                continue
            scaling = scaling_vec[option]

            fig_filename = 'Fig_MHE_Variants_{}_{}.pdf'.format(CHOICE, option)

            df3 = df2.query('option==@option')  # Query
            df3 = pd.pivot_table(df3, index=['length'], columns=['variant'], values=['diff_mean', 'diff_std'])  # Pivot
            # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))

            df3.columns = ['_'.join(col).strip() for col in df3.columns.values]     # flatten the column hierarchy
            df3.reset_index(level=0, inplace=True)                                  # get length into columns
            # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
            df3.drop(['diff_std_1-2', 'diff_std_1-3', 'diff_std_2-3', ], axis=1, inplace=True)
            # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))


            # -- Setup figure
            mpl.rcParams['backend'] = 'pdf'
            mpl.rcParams['lines.linewidth'] = 3
            mpl.rcParams['font.size'] = 16
            mpl.rcParams['axes.labelsize'] = 20
            mpl.rcParams['axes.titlesize'] = 16
            mpl.rcParams['xtick.labelsize'] = 16
            mpl.rcParams['ytick.labelsize'] = 16
            mpl.rcParams['legend.fontsize'] = 14
            mpl.rcParams['axes.edgecolor'] = '111111'   # axes edge color
            mpl.rcParams['grid.color'] = '777777'   # grid color
            mpl.rcParams['figure.figsize'] = [4, 4]
            mpl.rcParams['xtick.major.pad'] = 4     # padding of tick labels: default = 4
            mpl.rcParams['ytick.major.pad'] = 4         # padding of tick labels: default = 4
            fig = plt.figure()
            ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])


            # -- Extract values into columns (plotting dataframes with bars plus error lines gave trouble)
            l_vec = df3['length'].values                   # .tolist() does not work with bar plot, requires np.array
            diff_mean_1 = df3['diff_mean_1'].values
            diff_mean_2 = df3['diff_mean_2'].values
            diff_mean_3 = df3['diff_mean_3'].values
            diff_std_1 = df3['diff_std_1'].values
            diff_std_2 = df3['diff_std_2'].values
            diff_std_3 = df3['diff_std_3'].values


            # -- Draw the bar plots
            width = 0.2       # the width of the bars
            bar1 = ax.bar(l_vec-1.5*width, diff_mean_1, width, color='blue',
                          yerr=diff_std_1, error_kw={'ecolor':'black', 'linewidth':2},    # error-bars colour
                          label=r'1')
            bar2 = ax.bar(l_vec-0.5*width, diff_mean_2, width, color='darkorange',
                          yerr=diff_std_2, error_kw={'ecolor':'black', 'linewidth':2},  # error-bars colour
                          label=r'2')
            bar3 = ax.bar(l_vec+0.5*width, diff_mean_3, width, color='green',
                          yerr=diff_std_3, error_kw={'ecolor':'black', 'linewidth':2},    # error-bars colour
                          label=r'3')

            if CHOICE == 15 and option == 0:
                ax.annotate(np.round(diff_mean_1[1], 2), xy=(1.6, 0.15), xytext=(0.8, 0.122),
                            arrowprops=dict(facecolor='black', arrowstyle="->"), )


            # -- Legend
            handles, labels = ax.get_legend_handles_labels()
            # print("labels: {}".format(labels))
            legend = plt.legend(handles, labels,
                                loc='upper right',
                                handlelength=2,
                                labelspacing=0,             # distance between label entries
                                handletextpad=0.3,          # distance between label and the line representation
                                title='Variants',
                                borderaxespad=0.3,        # distance between legend and the outer axes
                                borderpad=0.1,                # padding inside legend box
                                )
            frame = legend.get_frame()
            frame.set_linewidth(0.0)
            frame.set_alpha(0.8)        # 0.8


            # -- Title and figure settings
            if distribution == 'uniform':
                distribution_label = ',$uniform'
            else:
                distribution_label = '$'
            plt.title(r'$\!\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(int(n / 1000), d, h, f, distribution_label))
            # ax.set_xticks(range(10))
            plt.grid(b=True, which='both', alpha=0.2, linestyle='solid', axis='y', linewidth=0.5)       # linestyle='dashed', which='minor'
            plt.xlabel(r'Max path length ($\ell_{{\mathrm{{max}}}})$', labelpad=0)
            plt.ylabel(r'L2 norm', labelpad=0)

            if ymin1 is None:
                ymin1 = plt.ylim()[0]
                ymin1 = max(ymin1, 0)
            if ymax1 is None:
                ymax1 = plt.ylim()[1]
            plt.ylim(ymin1, ymax1)
            plt.xlim(0.5,5.5)
            plt.xticks([1, 2, 3, 4, 5])
            plt.tick_params(
                axis='x',          # changes apply to the x-axis
                which='both',      # both major and minor ticks are affected
                bottom='off',      # ticks along the bottom edge are off
                top='off',         # ticks along the top edge are off
                # labelbottom='off',    # labels along the bottom edge are off
            )
            plt.annotate(r'$\lambda={:g}$'.format(float(scaling)), xycoords = 'axes fraction', xy=(0.5, 0.9), ha="center", va="center")

            if CREATE_PDF:
                plt.savefig(join(figure_directory, fig_filename), format='pdf',
                        dpi=None,
                        edgecolor='w',
                        orientation='portrait',
                        transparent=False,
                        bbox_inches='tight',
                        pad_inches=0.05,
                        frameon=None)
            if SHOW_FIG1:
                plt.show()
            if SHOW_PDF:
                os.system('{} "'.format(open_cmd[sys.platform]) + join(figure_directory, fig_filename) + '"')       # shows actually created PDF

    if SHOW_FIG2:
        # -- Read, aggregate, and pivot data for all options
        df1 = pd.read_csv(join(data_directory, csv_filename))
        # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(15)))
        df2 = df1.groupby(['option', 'variant', 'length']).agg \
            ({'diff': [np.mean, np.std, np.size],  # Multiple Aggregates
              'time': [np.mean, np.std],
              })
        df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
        df2.reset_index(inplace=True)  # remove the index hierarchy
        df2.rename(columns={'diff_size': 'count'}, inplace=True)
        # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(90)))

        df2['length'] = df2['length'].astype(str)               # transform numbers into string for later join: '.join(col).strip()'
        df3 = df2.query('variant=="1"')  # We only focus on variant 1 (as close to row stochastic matrix as possible)
        # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(n=20)))

        df4 = pd.pivot_table(df3, index=['option'], columns=['length'], values=['diff_mean', 'diff_std'])  # Pivot
        # print("\n-- df4 (length {}):\n{}".format(len(df4.index), df4.head(30)))
        df4.columns = ['_'.join(col).strip() for col in df4.columns.values]     # flatten the column hierarchy, requires to have only strings
        df4.reset_index(level=0, inplace=True)  # get length into columns
        # print("\n-- df4 (length {}):\n{}".format(len(df4.index), df4.head(30)))

        # Add scaling factor for each row
        option = df4['option'].values       # extract the values from dataframe
        scaling = scaling_vec[option]       # look up the scaling factor in original list
        scaling = pd.Series(scaling)
        # print("scaling:\n{}".format(scaling))
        df5 = df4.assign(scaling=scaling.values)
        # print("\n-- df5 (length {}):\n{}".format(len(df5.index), df5.head(30)))

        # Filter rows
        select_rows = [i for i in range(num_options) if EC[i]]      # keep only the options for which EC is True
        df6 = df5[df5['option'].isin(select_rows)]
        # print("\n-- df6 (length {}):\n{}".format(len(df6.index), df6.head(30)))



        fig_filename = 'Fig_MHE_ScalingFactor_{}.pdf'.format(CHOICE)

        # -- Setup figure
        mpl.rcParams['backend'] = 'pdf'
        mpl.rcParams['lines.linewidth'] = 3
        mpl.rcParams['font.size'] = 14
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['figure.figsize'] = [4, 4]
        mpl.rcParams['xtick.major.pad'] = 4  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4
        fig = plt.figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Extract values into columns (plotting dataframes with bars plus error lines gave trouble)
        scaling = df6['scaling'].values  # .tolist() does not work with bar plot, requires np.array
        diff_mean_1 = df6['diff_mean_1'].values
        diff_mean_2 = df6['diff_mean_2'].values
        diff_mean_3 = df6['diff_mean_3'].values
        diff_mean_4 = df6['diff_mean_4'].values
        diff_mean_5 = df6['diff_mean_5'].values
        diff_std_5 = df6['diff_std_5'].values

        # -- Draw the plots
        p1 = ax.plot(scaling, diff_mean_1, color='black', linewidth=1, linestyle='--', label=r'$\ell_\mathrm{max} = 1$')
        p2 = ax.plot(scaling, diff_mean_2, color='orange', label=r'$\ell_\mathrm{max} = 2$')
        p3 = ax.plot(scaling, diff_mean_3, color='blue', label=r'$\ell_\mathrm{max} = 3$')
        p4 = ax.plot(scaling, diff_mean_4, color='green', label=r'$\ell_\mathrm{max} = 4$')
        p5 = ax.plot(scaling, diff_mean_5, color='red', marker='o', label=r'$\ell_\mathrm{max} = 5$')
        plt.xscale('log')

        upper = diff_mean_5 + diff_std_5
        lower = diff_mean_5 - diff_std_5
        ax.fill_between(scaling, upper, lower, facecolor='red', alpha=0.2, edgecolor='none')


        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        plt.title(r'$\!\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(int(n / 1000), d, h, f, distribution_label))
        handles, labels = ax.get_legend_handles_labels()
        # print("labels: {}".format(labels))
        legend = plt.legend(handles, labels,
                            loc='upper center',     # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            # title='Variants',
                            borderaxespad=0.3,  # distance between legend and the outer axes
                            borderpad=0.1,  # padding inside legend box
                            )
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        # -- Figure settings
        # ax.set_xticks(range(10))
        plt.grid(b=True, which='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.xlabel(r'$\lambda$', labelpad=0)
        plt.ylabel(r'L$^2$ norm', labelpad=0)

        if xmin2 is None:
            xmin2 = plt.xlim()[0]
        if xmax2 is None:
            xmax2 = plt.xlim()[1]
        if ymin2 is None:
            ymin2 = plt.ylim()[0]
            ymin2 = max(ymin2, 0)
        if ymax2 is None:
            ymax2 = plt.ylim()[1]
        plt.xlim(xmin2, xmax2)
        plt.ylim(ymin2, ymax2)
        plt.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            # bottom='off',  # ticks along the bottom edge are off
            top='off',  # ticks along the top edge are off
            right='off',  # ticks along the right edge are off
            # labelbottom='off',    # labels along the bottom edge are off
        )

        if CREATE_PDF:
            plt.savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    frameon=None)
        if SHOW_FIG2:
            plt.show()
        if SHOW_PDF:
            os.system('{} "'.format(open_cmd[sys.platform]) + join(figure_directory, fig_filename) + '"')  # shows actually created PDF
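
As above, a hypothetical driver for this run(), assuming data_directory and figure_directory are configured; choice 25 is the 'main selection' in the chain above (not part of the original listing):

if __name__ == '__main__':
    run(25,                # h=3, d=25, f=0.05, powerlaw ('main selection')
        create_data=True,  # simulate graphs and write Fig_MHE_Variants_25.csv
        show_fig=True,     # draw the bar diagram (SHOW_FIG1)
        create_pdf=False,
        show_pdf=False)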
Example 19
def test_H_observed():
    """Illustrate H_observed"""
    print(
        "\n\n-- test_H_observed(): 'H_observed', uses: 'planted_distribution_model_H' --"
    )

    # --- Parameters for graph
    n = 3000
    a = 1
    h = 8
    d = 2
    k = 3
    distribution = 'uniform'

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)

    # --- Create graph
    RANDOMSEED = None  # set to a fixed integer for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=None,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)

    # --- Print first rows of matrices
    distance = 8

    print("First rows of powers of H0:")
    for k in range(1, distance + 1):
        print("{}: {}".format(k, np.linalg.matrix_power(H0, k)[0]))

    H_vec = H_observed(W, X0, distance=distance, NB=False)
    H_vec_EC = H_observed(W, X0, distance=distance, NB=True)

    print("First rows of H_vec (without NB)")
    for i, H in enumerate(H_vec):  # H_vec[i] corresponds to distance i+1
        print("{}: {}".format(i + 1, H[0]))

    print("First rows of H_vec (with NB)")
    for i, H in enumerate(H_vec_EC):
        print("{}: {}".format(i + 1, H[0]))

    # --- Print just the top entry in first row (easier to compare)
    h_vec = []
    for k in range(1, distance + 1):
        h_vec.append(np.max(np.linalg.matrix_power(H0, k)[0]))

    hrow_vec = []
    for H in H_vec:
        hrow_vec.append(np.max(H[0]))

    hrow_EC_vec = []
    for H in H_vec_EC:
        hrow_EC_vec.append(np.max(H[0]))

    print("\nh_vec:\n{}".format(np.around(h_vec, 3)))
    print("hrow_vec (estimated without NB):\n{}".format(np.around(hrow_vec,
                                                                  3)))
    print("hrow_EC_vec (estimated with NB):\n{}".format(
        np.around(hrow_EC_vec, 3)))
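
For reference, the h_vec printed above is just the largest entry in the first row of the k-th power of H0. A self-contained sketch with a made-up symmetric, row-stochastic H (illustrative only; not the actual output of create_parameterized_H):

import numpy as np

H_demo = np.array([[0.1, 0.8, 0.1],
                   [0.8, 0.1, 0.1],
                   [0.1, 0.1, 0.8]])  # symmetric and row-stochastic, hence doubly stochastic
for k in range(1, 9):
    # powers converge to the uniform matrix, so the largest first-row entry decays toward 1/3
    print(k, np.round(np.max(np.linalg.matrix_power(H_demo, k)[0]), 3))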
Example 20
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    global n
    global d
    global rep_SameGraph
    global FILENAMEZ
    global csv_filename
    global initial_h0
    global exponent
    global length
    global variant

    global alpha_vec
    global beta_vec
    global gamma_vec
    global s_vec
    global clip_on_vec
    global numMaxIt_vec

    # Plotting Parameters
    global xtick_lab
    global xtick_labels
    global ytick_lab
    global xmax
    global xmin
    global ymin
    global ymax
    global labels
    global facecolor_vec
    global draw_std_vec
    global linestyle_vec
    global linewidth_vec
    global marker_vec
    global markersize_vec
    global legend_location

    global option_vec
    global learning_method_vec

    global Macro_Accuracy
    global EC
    global constraints
    global weight_vec
    global randomize_vec
    global k
    global err
    global avoidNeighbors
    global convergencePercentage_W
    global stratified
    global gradient
    global doubly_stochastic
    global num_restarts
    global numberOfSplits
    global H_heuristic

    global select_lambda_vec
    global lambda_vec
    global f_vec
    global H0c

    # -- Setup
    CHOICE = choice
    # 300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron,
    # 900 Cora, 1000 Citeseer, 1100 Hep-th, 1200 Pokec
    experiments = [CHOICE]
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf

    SHOW_FIG = SHOW_PLOT or SHOW_PDF or CREATE_PDF
    STD_FILL = True
    TIMING = False
    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 10  # iterations on same graph

    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1

    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.001, 0.01, 0.1, 1]
    xtick_labels = ['0.1\%', '1\%', '10\%', '100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [False] * 4 + [True]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']

    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10] * 2
    randomize_vec = [False] * 4 + [True] * 2
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True
    num_restarts = None

    raw_std_vec = range(10)  # NOTE: unused; possibly intended as 'draw_std_vec'
    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]  # 21 label fractions, log-spaced 5 per decade, from 0.9 down to 9e-5
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    H_heuristic = ""

    def choose(choice):
        global n
        global d
        global rep_SameGraph
        global FILENAMEZ
        global initial_h0
        global exponent
        global length
        global variant

        global alpha_vec
        global beta_vec
        global gamma_vec
        global s_vec
        global clip_on_vec
        global numMaxIt_vec

        # Plotting Parameters
        global xtick_lab
        global xtick_labels
        global ytick_lab
        global xmax
        global xmin
        global ymin
        global ymax
        global labels
        global facecolor_vec
        global draw_std_vec
        global linestyle_vec
        global linewidth_vec
        global marker_vec
        global markersize_vec
        global legend_location

        global option_vec
        global learning_method_vec

        global Macro_Accuracy
        global EC
        global constraints
        global weight_vec
        global randomize_vec
        global k
        global err
        global avoidNeighbors
        global convergencePercentage_W
        global stratified
        global gradient
        global doubly_stochastic
        global num_restarts
        global numberOfSplits
        global H_heuristic

        global select_lambda_vec
        global lambda_vec
        global f_vec

        # -- Default Graph parameters

        if choice == 0:
            pass

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]

        elif choice == 305:  # DCEr Only experiment
            choose(605)
            choose(304)

            select_lambda_vec = [False] * 6

        elif choice == 306:
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 307:  # heuristic comparison
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Heuristic')
            labels.append('Heuristic')
            H_heuristic = np.array([[.476, .0476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- MovieLens dataset
        elif choice == 401:
            FILENAMEZ = 'movielens'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'MovieLens'
            legend_location = 'upper left'

            n = 26850
            d = 25.0832029795

        elif choice == 402:
            choose(401)
            select_lambda_vec = [False] * 3 + [True] * 3  # allows choosing lambda per f in f_vec

            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 403:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 404:
            choose(401)

            select_lambda_vec = [True] * 3  # allows choosing lambda per f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

            labels = ['GS', 'DCEr', 'Homophily']
            facecolor_vec = ['black', "#C44E52", "#64B5CD"]
            draw_std_vec = [False, True, False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 2, 2, 2, 2]
            marker_vec = [None, '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]

            weight_vec = [None, 10, None]
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
            randomize_vec = [False, True, False]
            learning_method_vec = ['GT', 'DHE']  #TODO

        elif choice == 405:  # DCEr ONLY experiment
            choose(605)
            choose(401)
            learning_method_vec += ['Holdout']
            labels += ['Holdout']

        elif choice == 406:  # comparison with a static heuristic matrix
            choose(402)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        elif choice == 407:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [1] * 21  # same length as f_vec

        elif choice == 408:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # Do not run with CREATE_DATA=True; if you do, restore the data from
        # data/sigmod-movielens-fig.csv
        elif choice == 409:
            choose(402)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#8172B2", "#C44E52",
                "#C44E52", "#CCB974", "#64B5CD"
            ]
            labels = [
                'GS', 'LCE', 'MCE', 'DCE1', 'DCE10', 'DCEr1', 'DCEr10',
                'Holdout'
            ]
            draw_std_vec = [False] * 5 + [True] * 2 + [False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [2, 2, 2, 2, 2, 2, 2, 2]
            marker_vec = [None, 'o', 'x', 's', 'p', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8, 8]
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]
            legend_location = 'upper left'
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'

            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)

            select_lambda_vec = [False] * 4 + [True]  # allows choosing lambda per f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)

            select_lambda_vec = [False] * 3 + [True] * 2  # allows choosing lambda per f in f_vec
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 604:  ## with weight = 1
            draw_std_vec = [4]
            choose(603)

            lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 605:
            choose(601)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD", 'orange'
            ]
            draw_std_vec = [False] + [True] * 10
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [3] * 10
            marker_vec = [None, 'o', 'x', '^', 'v', '+', 'o', 'x']
            markersize_vec = [0] + [8] * 10

            randomize_vec = [True] * 8
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]

            learning_method_vec = [
                'GT', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE'
            ]
            select_lambda_vec = [False] * 8
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            weight_vec = [0, 0, 1, 2, 5, 10, 15]

            labels = ['GT'] + [
                i + ' {}'.format(weight_vec[ix])
                for ix, i in enumerate(['DCEr'] * 6)
            ]

        elif choice == 606:  # heuristic experiment
            choose(602)
            labels.append('Heuristic')
            learning_method_vec.append('Heuristic')
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)

            select_lambda_vec = [False] * 4 + [True] * 2  # allows choosing lambda per f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)

            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(605)
            choose(801)
            #learning_method_vec += ['Holdout']
            #labels += ['Holdout']
        elif choice == 806:  # Heuristic experiment
            choose(802)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[0.76, 0.08, 0.08, 0.08],
                                    [0.08, 0.08, 0.76, 0.08],
                                    [0.08, 0.76, 0.08, 0.76],
                                    [0.08, 0.08, 0.76, 0.08]])

        elif choice == 821:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            constraints = True  # True
            gradient = True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [0.2, 0.2]

            randomize_vec = [False] * 4 + [True]
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.7
            labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Enron'
            legend_location = 'lower right'
            n = 46463  # for figures
            d = 23.4  # for figures

            alpha = 0.0
            beta = 0.0
            gamma = 0.0
            s = 0.5
            numMaxIt = 10

            select_lambda_vec = [False] * 3 + [True] * 2
            lambda_vec = [0.2] * 13 + [10] * 8  # same length as f_vec

        # -- Cora dataset
        elif choice == 901:
            FILENAMEZ = 'cora'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.9
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Cora'
            legend_location = 'lower right'
            n = 2708
            d = 7.8

        # -- Citeseer dataset
        elif choice == 1001:
            FILENAMEZ = 'citeseer'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Citeseer'
            legend_location = 'lower right'
            n = 3312
            d = 5.6

        elif choice == 1101:
            FILENAMEZ = 'hep-th'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.1
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Hep-th'
            legend_location = 'lower right'
            n = 27770
            d = 5.6

        elif choice == 1102:
            choose(1101)
            Macro_Accuracy = True

        elif choice == 1204:
            FILENAMEZ = 'pokec-gender'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.000015
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [0, 3, 4, 4, 4, 4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Pokec-Gender'
            legend_location = 'lower right'
            n = 1632803
            d = 54.6

        else:
            raise Warning("Incorrect choice!")

    for choice in experiments:

        choose(choice)
        filename = 'Fig_End-to-End_accuracy_realData_{}_{}'.format(
            choice, FILENAMEZ)
        csv_filename = '{}.csv'.format(filename)

        header = [
            'currenttime', 'method', 'f', 'accuracy', 'precision', 'recall',
            'learntime', 'proptime'
        ]
        if CREATE_DATA:
            save_csv_record(join(data_directory, csv_filename),
                            header,
                            append=False)

        # print("choice: {}".format(choice))

        # --- print data statistics
        if CALCULATE_DATA_STATISTICS:

            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())
            d = (len(W.nonzero()[0]) * 2) / n

            k = len(X0[0])

            print("FILENAMEZ:", FILENAMEZ)
            print("k:", k)
            print("n:", n)
            print("d:", d)

            # -- Graph statistics
            n_vec = calculate_nVec_from_Xd(Xd)
            print("n_vec:\n", n_vec)
            d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
            print("d_vec:\n", d_vec)
            P = calculate_Ptot_from_graph(W, Xd)
            print("P:\n", P)
            for i in range(k):
                Phi = calculate_degree_correlation(W, X0, i, NB=True)
                print("Degree Correlation, Class {}:\n{}".format(i, Phi))

            # -- Various compatibilities
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            print("H0 w/  constraints:\n", np.round(H0, 2))

            H2 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H4 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H5 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H6 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H7 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)

            print()
            # print("H MCE w/o constraints:\n", np.round(H0, 3))
            print("H MCE w/  constraints:\n", np.round(H2, 3))
            # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
            print("H DCE 2 w/  constraints:\n", np.round(H5, 3))
            # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
            print("H DCE 10 w/  constraints:\n", np.round(H7, 3))

            print()
            H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
            print("H_est_1:\n", np.round(H_row_vec[0], 3))
            print("H_est_2:\n", np.round(H_row_vec[1], 3))
            print("H_est_3:\n", np.round(H_row_vec[2], 3))

        # --- Create data
        if CREATE_DATA or ADD_DATA:

            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())  ## number of nodes in graph
            k = len(X0[0])
            d = (len(W.nonzero()[0]) * 2) / n
            #print(n)
            #print(d)
            #print("contraint = {}".format(constraints))
            #print('select lambda: {}'.format(len(select_lambda_vec)))
            #print('learning method: {}'.format(len(learning_method_vec)))
            #print('alpha: {}'.format(len(alpha_vec)))
            #print('beta: {}'.format(len(beta_vec)))
            #print('gamma: {}'.format(len(gamma_vec)))
            #print('s: {}'.format(len(s_vec)))
            #print('maxit: {}'.format(len(numMaxIt_vec)))
            #print('weight: {}'.format(len(weight_vec)))
            #print('randomize: {}'.format(len(randomize_vec)))
            # ---  Calculating True Compatibility matrix
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=constraints,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            # print(H0)
            H0c = to_centering_beliefs(H0)

            num_results = len(f_vec) * len(learning_method_vec) * rep_SameGraph

            # Start a worker process pool: at least 2 processes, and
            # cpu_count - 4 on machines with many cores
            pool = multiprocessing.Pool(max(2, multiprocessing.cpu_count() - 4))

            f_processes = f_vec * rep_SameGraph
            workers = []
            results = [(X0, W, f, ix)
                       for ix, f in enumerate(f_vec)] * rep_SameGraph
            # print('Expected results: {}'.format(num_results))
            try:
                # Distribute the accuracy evaluations over the pool;
                # map_async(...).get(timeout) instead of map() works around a
                # known Python 2.7 multiprocessing limitation
                pool.map_async(multi_run_wrapper, results).get(num_results * 2)
            except multiprocessing.TimeoutError as e:
                continue
            finally:
                pool.close()
                pool.join()
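            # 'multi_run_wrapper' is defined elsewhere in this module. A
            # minimal sketch of its expected shape (an assumption, since the
            # helper is not shown in this excerpt): it unpacks one work tuple
            # and evaluates a single (X0, W, f, ix) configuration, e.g.
            #
            #   def multi_run_wrapper(args):
            #       return run_single_experiment(*args)  # hypothetical helper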

        # -- Read data for all options and plot
        df1 = pd.read_csv(join(data_directory, csv_filename))
        acc_filename = '{}_accuracy_plot.pdf'.format(filename)
        pr_filename = '{}_PR_plot.pdf'.format(filename)
        if TIMING:
            print('=== {} Timing Results ==='.format(FILENAMEZ))
            print('Prop Time:\navg: {}\nstddev: {}'.format(
                np.average(df1['proptime'].values),
                np.std(df1['proptime'].values)))
            for learning_method in labels:
                rs = df1.loc[df1["method"] == learning_method]
                avg = np.average(rs['learntime'])
                std = np.std(rs['learntime'])
                print('{} Learn Time:\navg: {}\nstd: {}'.format(
                    learning_method, avg, std))

        sslhv.plot(df1,
                   join(figure_directory, acc_filename),
                   n=n,
                   d=d,
                   k=k,
                   labels=labels,
                   dataset=FILENAMEZ,
                   line_styles=linestyle_vec,
                   xmin=xmin,
                   ymin=ymin,
                   xmax=xmax,
                   ymax=ymax,
                   marker_sizes=markersize_vec,
                   draw_stds=draw_std_vec,
                   markers=marker_vec,
                   line_colors=facecolor_vec,
                   line_widths=linewidth_vec,
                   legend_location=legend_location,
                   show=SHOW_PDF,
                   save=CREATE_PDF,
                   show_plot=SHOW_PLOT)
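# Hypothetical invocation (an editorial sketch, not from the original source):
# plot previously created Prop37 results without regenerating the CSV data:
#   run(304, create_data=False, show_plot=True)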
Example n. 21
def run(choice,
        variant,
        create_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    """main parameterized method to produce all figures.
    Can be run from external jupyther notebook or method to produce all figures in PDF
    """

    # %% -- Setup
    CREATE_DATA = create_data
    CHOICE = choice
    VARIANT = variant
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    SHOW_PDF = show_pdf
    SHOW_TITLE = True
    LEGEND_MATCH_COLORS = False
    SHOW_DISTRIBUTION_IN_TITLE = True

    SHOW_BACKTRACK_ESTIMATE = True
    SHOW_NONBACKTRACK_ESTIMATE = True
    plot_colors = ['darkgreen', 'darkorange', 'blue']
    label_vec = [
        r'$\mathbf{H}^{\ell}\,\,\,\,$', r'$\mathbf{\hat P}^{(\ell)}$',
        r'$\mathbf{\hat P}_{\mathrm{NB}}^{(\ell)}$'
    ]

    csv_filename = 'Fig_Backtracking_Advantage_{}.csv'.format(CHOICE)
    fig_filename = 'Fig_Backtracking_Advantage_{}-{}.pdf'.format(
        CHOICE, VARIANT)

    header = [
        'currenttime',
        'choice',  # H, Hrow, HrowEC
        'l',
        'valueH',  # maximal value in first row of H
        'valueM',  # average value across entries in M
    ]
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # %% -- Default parameters
    ymin = 0.3
    ymax = 1
    exponent = None

    # %% -- CHOICES and VARIANTS
    if CHOICE == 1:  # n=1000, shows NB to be slightly lower for l=2: probably due to sampling issues (d=3, thus very few points available)
        n = 1000
        h = 8
        d = 3
        f = 0.1
        distribution = 'uniform'
        rep = 10000
        length = 8

    elif CHOICE == 2:
        n = 1000
        h = 8
        d = 10
        f = 0.1
        distribution = 'uniform'
        rep = 10000
        length = 8

    elif CHOICE == 3:  # nice: shows nicely that difference is even bigger for smaller h
        n = 1000
        h = 3
        d = 10
        f = 0.1
        distribution = 'uniform'
        rep = 10000
        length = 8
        ymax = 0.8

    elif CHOICE == 4:
        n = 10000
        h = 3
        d = 10
        f = 0.1
        distribution = 'uniform'
        rep = 100
        length = 8
        ymin = 0.333
        ymax = 0.65

    elif CHOICE == 5:
        n = 10000
        h = 3
        d = 3
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8

    elif CHOICE == 6:  # n=1000, the powerlaw problem with small graphs and high exponent
        n = 1000
        h = 8
        d = 3
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.5
        rep = 10000
        length = 8

    elif CHOICE == 7:
        n = 10000
        h = 8
        d = 3
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8
        # ymin = 0.4
        ymax = 1

    elif CHOICE == 8:
        n = 10000
        h = 8
        d = 10
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8
        # ymin = 0.4
        ymax = 1

    elif CHOICE == 9:  # shows lower NB due to problem with sampling from high powerlaw -0.5
        n = 10000
        h = 8
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.5
        rep = 1000
        length = 8

    elif CHOICE == 10:
        n = 10000
        h = 8
        d = 3
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.5
        rep = 1000
        length = 8

    elif CHOICE == 11:  # problem: shows that NB is too low (probably because of problem with sampling from -0.5 factor)
        n = 1000
        h = 8
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.5
        rep = 1000
        length = 8

    elif CHOICE == 12:  # problem: shows no problem with NB (probably because no problem with sampling from -0.2 factor)
        n = 1000
        h = 8
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.2
        rep = 1000
        length = 8

    elif CHOICE == 20:
        n = 10000
        h = 3
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.3
        rep = 1000
        length = 8
        ymin = 0.333
        ymax = 0.65

    elif CHOICE == 21:  # originally used before color change
        n = 10000
        h = 3
        d = 25
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.3
        rep = 1000
        length = 8
        ymin = 0.333
        ymax = 0.65

        if VARIANT == 1:
            SHOW_TITLE = False
            plot_colors = ['red', 'blue', 'darkorange']
            label_vec = [r'$\mathbf{H}^{\ell}\quad\quad$', 'naive', 'better']
            LEGEND_MATCH_COLORS = True

        if VARIANT == 2:
            SHOW_TITLE = False
            plot_colors = ['red', 'blue', 'darkorange']
            label_vec = [r'$\mathbf{H}^{\ell}\quad\quad$', 'naive', 'better']
            SHOW_NONBACKTRACK_ESTIMATE = False
            LEGEND_MATCH_COLORS = True

        if VARIANT == 3:
            SHOW_TITLE = False
            plot_colors = ['red', 'blue', 'darkorange']
            label_vec = [r'$\mathbf{H}^{\ell}\quad\quad$', 'naive', 'better']
            SHOW_BACKTRACK_ESTIMATE = False
            SHOW_NONBACKTRACK_ESTIMATE = False
            LEGEND_MATCH_COLORS = True

        if VARIANT == 4:
            plot_colors = ['red', 'blue', 'darkorange']
            LEGEND_MATCH_COLORS = True

    elif CHOICE == 25:
        n = 10000
        h = 8
        d = 5
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8

    elif CHOICE == 26:
        n = 10000
        h = 8
        d = 25
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8
        ymax = 0.9
        ymin = 0.4

    elif CHOICE == 27:
        n = 10000
        h = 8
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.3
        rep = 1000
        length = 8
        ymax = 0.9
        ymin = 0.33

    elif CHOICE == 31:
        n = 10000
        h = 3
        d = 10
        f = 0.1
        distribution = 'uniform'
        length = 8
        ymin = 0.333
        ymax = 0.65
        SHOW_DISTRIBUTION_IN_TITLE = False
        plot_colors = ['red', 'blue', 'darkorange']
        LEGEND_MATCH_COLORS = True

        if VARIANT == 0:
            rep = 1000

        if VARIANT == 1:
            rep = 20

    else:
        raise Warning("Incorrect choice!")

    k = 3
    a = 1
    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    RANDOMSEED = None  # set to a fixed integer for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))

    # %% -- Create data
    if CREATE_DATA:

        # Calculations H
        print("Max entry of first rows of powers of H0:")
        for l in range(1, length + 1):
            valueH = np.max(np.linalg.matrix_power(H0, l)[0])

            row = [str(datetime.datetime.now())]  # renamed from 'tuple' to avoid shadowing the built-in
            text = ['H', l, valueH, '']
            text = np.asarray(text)  # without np, entries get an ugly format
            row.extend(text)
            print("{}: {}".format(l, valueH))
            save_csv_record(join(data_directory, csv_filename), row)

        # Calculations Hrow and HrowEC
        for r in range(rep):
            print('Repetition {}'.format(r))

            # Create graph
            start = time.time()
            W, Xd = planted_distribution_model_H(
                n,
                alpha=alpha0,
                H=H0,
                d_out=d,  # for undirected graphs, the actual degree is 2*d
                distribution=distribution,
                exponent=exponent,
                directed=False,
                debug=False)
            X0 = from_dictionary_beliefs(Xd)
            X1, ind = replace_fraction_of_rows(X0, 1 - f)
            time_calc = time.time() - start
            # print("\nTime for graph:{}".format(time_calc))

            print("Average outdegree: {}".format(
                calculate_average_outdegree_from_graph(W)))

            # Calculate H_vec and M_vec versions (M_vec is used to compute the average value across entries of M)
            H_vec = H_observed(W, X1, distance=length, NB=False, variant=1)
            H_vec_EC = H_observed(W, X1, distance=length, NB=True, variant=1)
            M_vec = M_observed(W, X1, distance=length, NB=False)
            M_vec_EC = M_observed(W, X1, distance=length, NB=True)

            # Calculation H_vec
            # print("Max entry of first rows of H_vec")
            for l, H in enumerate(H_vec):
                # Index a fixed entry rather than 'np.max(H[0])', which can
                # pick a different (higher) entry and bias the estimate
                valueH = H[0][(l + 1) % 2]
                valueM = np.average(M_vec[l + 1])
                # print(M_vec[l+1])
                # print(valueM)

                row = [str(datetime.datetime.now())]
                text = ['Hrow', l + 1, valueH, valueM]
                text = np.asarray(text)  # without np, entries get an ugly format
                row.extend(text)
                # print("{}: {}".format(l + 1, valueH))
                save_csv_record(join(data_directory, csv_filename), row)

            # Calculation H_vec_EC
            # print("Max entry of first rows of H_vec_EC")
            for l, H in enumerate(H_vec_EC):
                valueH = H[0][(l + 1) % 2]
                valueM = np.average(M_vec_EC[l + 1])
                # print(M_vec_EC[l+1])
                # print(valueM)

                row = [str(datetime.datetime.now())]
                text = ['HrowEC', l + 1, valueH, valueM]
                text = np.asarray(text)  # without np, entries get an ugly format
                row.extend(text)
                # print("{}: {}".format(l + 1, valueH))
                save_csv_record(join(data_directory, csv_filename), row)

    #%% -- Read, aggregate, and pivot data
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(15)))
    df2 = df1.groupby(['choice', 'l']).agg({
        'valueH': [np.mean, np.std, np.size],  # multiple aggregates
        'valueM': [np.mean],
    })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'valueH_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(30)))
    df3 = pd.pivot_table(df2,
                         index=['l'],
                         columns=['choice'],
                         values=['valueH_mean', 'valueH_std', 'valueM_mean'])  # pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    # df3.drop(['valueM_mean_H', 'valueH_std_H'], axis=1, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.reset_index(level=0, inplace=True)  # get l into columns
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
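    # Toy illustration (editorial sketch, mirrors the pattern above): the
    # groupby aggregation yields MultiIndex columns, which are flattened into
    # names such as 'valueH_mean_HrowEC' before plotting:
    #   tmp = pd.DataFrame({'choice': ['H', 'H'], 'l': [1, 2], 'valueH': [0.8, 0.7]})
    #   agg = tmp.groupby(['choice', 'l']).agg({'valueH': ['mean', 'std']})
    #   agg.columns = ['_'.join(c) for c in agg.columns]  # -> 'valueH_mean', 'valueH_std'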

    #%% -- Setup figure
    mpl.rcParams['backend'] = 'pdf'
    mpl.rcParams['lines.linewidth'] = 3
    mpl.rcParams['font.size'] = 16
    mpl.rcParams['axes.labelsize'] = 20
    mpl.rcParams['axes.titlesize'] = 16
    mpl.rcParams['xtick.labelsize'] = 16
    mpl.rcParams['ytick.labelsize'] = 16
    mpl.rcParams['legend.fontsize'] = 20
    mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
    mpl.rcParams['grid.color'] = '777777'  # grid color
    mpl.rcParams['figure.figsize'] = [4, 4]
    mpl.rcParams['xtick.major.pad'] = 4  # padding of tick labels: default = 4
    mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4

    fig = plt.figure()
    ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

    #%% -- Extract values into columns (plotting a dataframe with bars plus error lines gave trouble)
    l_vec = df3['l'].values  # .tolist() does not work with bar plot
    mean_H_vec = df3['valueH_mean_H'].values
    mean_Hrow_vec = df3['valueH_mean_Hrow'].values
    mean_Hrow_vecEC = df3['valueH_mean_HrowEC'].values
    std_Hrow_vec = df3['valueH_std_Hrow'].values
    std_Hrow_vecEC = df3['valueH_std_HrowEC'].values

    #%% -- Draw the plot and annotate
    width = 0.3  # the width of the bars
    if SHOW_BACKTRACK_ESTIMATE:
        left_vec = l_vec
        if SHOW_NONBACKTRACK_ESTIMATE:
            left_vec = left_vec - width
        bar1 = ax.bar(
            left_vec,
            mean_Hrow_vec,
            width,
            color=plot_colors[1],
            yerr=std_Hrow_vec,
            error_kw={
                'ecolor': 'black',
                'linewidth': 2
            },  # error-bars colour
            label=label_vec[1])
    if SHOW_NONBACKTRACK_ESTIMATE:
        bar2 = ax.bar(
            l_vec,
            mean_Hrow_vecEC,
            width,
            color=plot_colors[2],
            yerr=std_Hrow_vecEC,
            error_kw={
                'ecolor': 'black',
                'linewidth': 2
            },  # error-bars colour
            label=label_vec[2])
    gt = ax.plot(l_vec,
                 mean_H_vec,
                 color=plot_colors[0],
                 linestyle='solid',
                 linewidth=2,
                 marker='o',
                 markersize=10,
                 markeredgewidth=2,
                 markerfacecolor='None',
                 markeredgecolor=plot_colors[0],
                 label=label_vec[0])

    if CHOICE == 4 or CHOICE == 20:
        ax.annotate(
            np.round(mean_Hrow_vec[1], 2),
            xy=(2.15, 0.65),
            xytext=(2.1, 0.60),
            arrowprops=dict(facecolor='black', arrowstyle="->"),
        )

    #%% -- Legend
    if distribution == 'uniform' and SHOW_DISTRIBUTION_IN_TITLE:
        distribution_label = ',$uniform'
    else:
        distribution_label = '$'
    if SHOW_TITLE:
        plt.title(
            r'$\!\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.
            format(int(n / 1000), 2 * d, h, f, distribution_label))  # actual degree is 2*d since the graph is undirected

    handles, labels = ax.get_legend_handles_labels()
    legend = plt.legend(
        handles,
        labels,
        loc='upper right',
        handlelength=1.5,
        labelspacing=0,  # distance between label entries
        handletextpad=0.3,  # distance between label and the line representation
        # title='Iterations'
        borderaxespad=0.1,  # distance between legend and the outer axes
        borderpad=0.1,  # padding inside legend box
        numpoints=1,  # put the marker only once
    )

    if LEGEND_MATCH_COLORS:  # TODO: how to get back the nicer line spacing defined in legend above after changing the legend text colors
        legend.get_texts()[0].set_color(plot_colors[0])
        if SHOW_BACKTRACK_ESTIMATE:
            legend.get_texts()[1].set_color(plot_colors[1])
        if SHOW_NONBACKTRACK_ESTIMATE:
            legend.get_texts()[2].set_color(plot_colors[2])

    frame = legend.get_frame()
    frame.set_linewidth(0.0)
    frame.set_alpha(0.8)  # 0.8

    # %% -- Figure settings & plot
    ax.set_xticks(range(10))
    plt.grid(b=True,
             which='both',
             alpha=0.2,
             linestyle='solid',
             axis='y',
             linewidth=0.5)  # linestyle='dashed', which='minor'
    plt.xlabel(r'Path length ($\ell$)', labelpad=0)
    plt.ylim(ymin, ymax)  # placed after yticks
    plt.xlim(0.5, 5.5)
    plt.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off (booleans required since matplotlib 2.2; the old 'off' strings stopped working)
        top=False,  # ticks along the top edge are off
        # labelbottom=False,  # labels along the bottom edge are off
    )

    if CREATE_PDF:
        plt.savefig(
            join(figure_directory, fig_filename),
            format='pdf',
            dpi=None,
            edgecolor='w',
            orientation='portrait',
            transparent=False,
            bbox_inches='tight',
            pad_inches=0.05,
            # frameon=None
        )
    if SHOW_PDF:
        showfig(join(figure_directory, fig_filename))
    if SHOW_PLOT:
        plt.show()
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False):

    verbose = False
    repeat_diffGraph = 1000
    SUBSET = True
    NOGT = False        ## if True, do not draw the ground-truth comparison
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf

    STD_FILL = False

    csv_filename = 'Fig_fast_optimal_restarts_Accv2_{}.csv'.format(CHOICE)
    fig_filename = 'Fig_fast_optimal_restarts_Accv2_{}.pdf'.format(CHOICE)
    header = ['currenttime',
              'k',
              'restarts',
              'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)




    # -- Default Graph parameters
    global f_vec, labels, facecolor_vec
    global number_of_restarts



    initial_h0 = None
    distribution = 'powerlaw'
    exponent = -0.3  # for powerlaw
    length = 4  # path length
    constraint = True
    gradient = True
    variant = 1
    EC = True
    delta = 0.001
    numMaxIt = 10
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True

    learning_method = 'DHE'
    weights = 10
    randomize = True
    return_min_energy = True
    number_of_restarts = [8, 6, 5, 4]



    clip_on_vec = [True] * 20
    draw_std_vec = range(10)
    ymin = 0.3
    ymax = 1
    xmin = 0.001
    xmax = 1
    xtick_lab = []
    xtick_labels = []
    ytick_lab = np.arange(0, 1.1, 0.1)
    linestyle_vec = ['solid', 'solid', 'solid'] * 20
    linewidth_vec = [4, 4, 4, 4] * 10
    marker_vec = ['x', 'v', '^', '+', '>', '<'] * 10
    markersize_vec = [10, 8, 8, 8, 8, 8, 8] * 10
    facecolor_vec = ["#C44E52", "#4C72B0", "#8172B2", "#CCB974", "#55A868", "#64B5CD"] * 5




    # -- Options mainly change k

    if CHOICE == 101:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 10, 13, 16, 18, 20]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [30, 20, 10, 7, 5, 4, 3, 2, 1, 50, 99, 100]
        ### special codes: 100 = GT, 99 = GTr (DCEr initialized with GT),
        ### 50 = min{30 restarts, GTr}, 1 = uninformative initialization

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]


    elif CHOICE == 102:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        # number_of_restarts = [30, 20, 10, 7, 5, 4, 3, 2, 1, 50, 99, 100]

        number_of_restarts = [20, 10, 5, 4, 3, 2]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]


    elif CHOICE == 103:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 99]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]


    elif CHOICE == 104:
        n = 10000
        h = 8
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 99]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]



    elif CHOICE == 105:
        n = 10000
        h = 8
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 106:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]


    elif CHOICE == 107:

        n = 10000
        h = 8
        d = 15
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [10, 5, 4, 3, 2, 99]
        # number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['x', 'v', '^', 's', 'o',  's', None] * 10
        markersize_vec = [10, 6, 6, 6, 6, 6, 6] * 10

        labels = [r'$r=$' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 108:

        n = 10000
        h = 8
        d = 15
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [10, 5, 4, 3, 2, 99]
        # number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['x', 'v', '^', 's', 'o',  's', None] * 10
        markersize_vec = [10, 6, 6, 6, 6, 6, 6] * 10

        labels = [r'$r=$' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]
        repeat_diffGraph = 10

    else:
        raise Warning("Incorrect choice!")

    RANDOMSEED = None  # set to a fixed integer for repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))



    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for _ in range(repeat_diffGraph):

            for k in k_vec:
                a = [1.] * k
                k_star = int(k * (k - 1) / 2)
                alpha0 = np.array(a)
                alpha0 = alpha0 / np.sum(alpha0)

                # Generate Graph
                # print("Generating Graph: n={} h={} d={} k={}".format(n, h, d, k))
                H0 = create_parameterized_H(k, h, symmetric=True)
                W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d, distribution=distribution, exponent=exponent, directed=False, debug=False)
                H0_vec = transform_HToh(H0)
                # print("\nGold standard {}".format(np.round(H0_vec, decimals=3)))

                X0 = from_dictionary_beliefs(Xd)
                X2, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W, ind_prior=None, stratified=stratified)

                h0 = [1.] * int(k_star)
                h0 = np.array(h0)
                h0 = h0 / k

                delta = 1 / (3 * k)  # perturbation magnitude; note: relies on Python 3 true division
                # print("delta: ", delta)

                # Rejection-sample distinct random sign patterns (+-delta) as
                # starting-point perturbations; stop early once all 2**k_star
                # patterns have been found
                perm = []
                while len(perm) < number_of_restarts[0]:
                    temp = [random.choice([-delta, delta]) for _ in range(k_star)]
                    if temp not in perm:
                        perm.append(temp)
                    if len(perm) >= 2 ** k_star:
                        break
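                # Deterministic alternative (editorial sketch, not in the
                # original): enumerate the first r sign patterns directly,
                #   from itertools import islice, product
                #   perm = [list(p) for p in islice(
                #       product([-delta, delta], repeat=k_star),
                #       number_of_restarts[0])]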

                E_list = []   ## each entry has the format [energy, H_vec]
                for vec in perm:
                    H2_vec, energy = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC,
                                               weights=weights, randomize=False, constraints=constraint,
                                               gradient=gradient, return_min_energy=True, verbose=verbose,
                                               initial_h0=h0 + np.array(vec))
                    E_list.append([energy, list(H2_vec)])

                # print("All Optimizaed vector:")
                # [print(i) for i in E_list ]

                # print("Outside Energy:{} optimized vec:{} \n".format(min_energy_vec[0], optimized_Hvec))

                # min_energy_vec = min(E_list)
                # optimized_Hvec = min_energy_vec[1]
                #
                # print("\nEnergy:{} optimized vec:{}  \n\n".format(min_energy_vec[0],optimized_Hvec))
                #
                #

                GTr_optimized_Hvec, GTr_energy = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC,
                                                   weights=weights, randomize=False, constraints=constraint,
                                                   gradient=gradient, return_min_energy=True, verbose=verbose,
                                                   initial_h0=H0_vec)

                uninformative_optimized_Hvec, uninformative_energy = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC,
                                                   weights=weights, randomize=False, constraints=constraint,
                                                   gradient=gradient, return_min_energy=True, verbose=verbose,
                                                   initial_h0=h0)


                iterative_permutations = list(E_list)
                for restartz in number_of_restarts:
                    if (k == 2 or k == 3) and restartz > 8 and restartz < 99:
                        continue

                    if restartz <= number_of_restarts[0]:
                        iterative_permutations = random.sample(iterative_permutations, restartz)
                    # print("For restart:{}, we have vectors:\n".format(restartz))
                    # [print(i) for i in  iterative_permutations]


                    if restartz == 100:       ## for GT
                        H2c = to_centering_beliefs(H0)
                        # print("\nGT: ", transform_HToh(H0,k))

                    elif restartz == 99:       ## for DCEr init with GT
                        H2c = to_centering_beliefs(transform_hToH(GTr_optimized_Hvec, k))
                        # print("\nGTr: ", GTr_optimized_Hvec)

                    elif restartz == 1:  ## for DCEr with uninformative initial
                        H2c = to_centering_beliefs(transform_hToH(uninformative_optimized_Hvec, k))
                        # print("\nUninformative: ", uninformative_optimized_Hvec)

                    elif restartz == 50:  ## for min{DCEr , GTr}
                        # print("Length:",len(E_list))
                        # [print(i) for i in E_list]
                        mod_E_list = list(E_list)+[[GTr_energy , list(GTr_optimized_Hvec)]]     #Add GTr to list and take min
                        # print("Mod Length:", len(mod_E_list))
                        # [print(i) for i in mod_E_list]
                        min_energy_vec = min(mod_E_list)
                        # print("\nSelected for 50:",min_energy_vec)
                        optimized_Hvec = min_energy_vec[1]

                        H2c = to_centering_beliefs(transform_hToH(optimized_Hvec, k))

                    else:
                        min_energy_vec = min(iterative_permutations)
                        optimized_Hvec = min_energy_vec[1]
                        H2c = to_centering_beliefs(transform_hToH(optimized_Hvec, k))

                    # print("Inside Chosen Energy:{} optimized vec:{} \n".format(min_energy_vec[0], optimized_Hvec))

                    try:
                        eps_max = eps_convergence_linbp_parameterized(H2c, W, method='noecho', X=X2)
                        s = 0.5
                        eps = s * eps_max
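                        # propagate with H scaled to a fraction s of the spectral
                        # convergence boundary eps_max, keeping linBP convergent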

                        F, actualIt, actualPercentageConverged = \
                            linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                          method='noecho',
                                                          numMaxIt=numMaxIt,
                                                          convergencePercentage=convergencePercentage_W,
                                                          debug=2)
                    except ValueError as e:
                        print(
                            "ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                    else:
                        acc = matrix_difference_classwise(X0, F, ignore_rows=ind)

                        row = [str(datetime.datetime.now())]
                        text = [k,
                                restartz,
                                acc]
                        row.extend(text)

                        if verbose:
                            print("\nGold standard {}".format(np.round(H0_vec, decimals=3)))
                        # print("k:{}  Restart:{}  OptimizedVec:{}  Energy:{}  Accuracy:{}".format(k, restartz, np.round(min_energy_vec[1], decimals=3), min_energy_vec[0], acc  ))
                        # print("k:{}  Restart:{}   Accuracy:{}".format(k, 1, L2_dist))
                        save_csv_record(join(data_directory, csv_filename), row)



    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(20)))

    # Aggregate repetitions
    df2 = df1.groupby(['k', 'restarts']).agg \
        ({'accuracy': [np.mean, np.std, np.size], })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    df2['restarts'] = df2['restarts'].astype(str)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(20)))

    # Pivot table
    df3 = pd.pivot_table(df2, index=['k'], columns=['restarts'], values=['accuracy_mean', 'accuracy_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(10)))




    df4 = df3.drop('k', axis=1)
    if NOGT:
        df4 = df3.drop(['k', 'accuracy_mean_0', 'accuracy_mean_1', 'accuracy_std_0', 'accuracy_std_1'], axis=1)

    # df4 = df3.drop(['k', 'accuracy_mean_100', 'accuracy_std_100'], axis=1)


    # row-wise normalization: divide by the per-k max across restart options ('Relative Accuracy')
    df5 = df4.div(df4.max(axis=1), axis=0)
    df5['k'] = df3['k']
    # print("\n-- df5 (length {}):\n{}".format(len(df5.index), df5.head(100)))

    # df5 = df3     ## for normalization

    X_f = df5['k'].values            # read k from values instead
    Y=[]
    Y_std=[]
    for rez in number_of_restarts:
        if NOGT:
            if rez == 100 or rez==99:
                continue
        Y.append(df5['accuracy_mean_{}'.format(rez)].values)
        if STD_FILL:
            Y_std.append(df5['accuracy_std_{}'.format(rez)].values)



    if CREATE_PDF or SHOW_PDF or SHOW_PLOT:

        # -- Setup figure
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['font.size'] = 16
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])


        #  -- Drawing
        if STD_FILL:
            for choice, (option, facecolor) in enumerate(zip(number_of_restarts, facecolor_vec)):
                if option == 100:  ## GT
                    if NOGT:
                        continue
                    facecolor = 'black'
                elif option == 99:  ## GT-r
                    if NOGT:
                        continue
                    facecolor = 'black'

                ax.fill_between(X_f, Y[choice] + Y_std[choice], Y[choice] - Y_std[choice],
                                facecolor=facecolor, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, Y[choice] + Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, Y[choice] - Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')

        for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
                enumerate(zip(number_of_restarts, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):

            if option == 100:     ## GT
                if NOGT:
                    continue
                linestyle='dashed'
                linewidth=3
                color='black'
                label='GS'
                marker='x'
                markersize=6
            elif option == 99:       ## GT-r
                if NOGT:
                    continue
                linestyle='dashed'
                linewidth=2
                color='black'
                label='Global Minimum'
                marker = None
                markersize = 6
            elif option == 1:     ## uninformative initialization
                color="#CCB974"
                linewidth = 2
                label='Uninfo'
            elif option == 50:       ## min{DCEr, GTr}
                label='min{30,GTr}'

            P = ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker,
                    markersize=markersize, markeredgecolor='black',  markeredgewidth=1, clip_on=clip_on)

        # plt.xscale('log')

        # -- Title and legend
        distribution_label = '$'
        if distribution == 'uniform':
            distribution_label = ',$uniform'   # NOTE: distribution_label is unused in titleString below
        n_label = '{}k'.format(int(n / 1000))
        if n < 1000:
            n_label='{}'.format(n)

        titleString = r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{} $'.format(n_label, d, h, f)
        title(titleString)

        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(handles, labels,
                            loc='lower left',     # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            # bbox_to_anchor=(1.1, 0)
                            )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        plt.xticks(xtick_lab, xtick_labels)
        # plt.yticks(ytick_lab, ytick_lab)


        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.2f'))
        # ax.xaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.0f'))

        grid(b=True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Number of Classes $(k)$', labelpad=0)      # labelpad=0
        ylabel(r'Relative Accuracy', labelpad=0)

        xlim(2.9, 7.1)
        ylim(0.65, 1.015)

        if CREATE_PDF:
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    # frameon=None          # deprecated; removed in newer matplotlib
                    )
        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))  # shows actually created PDF
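
# The read/aggregate/pivot stage above follows one pandas pattern throughout:
# groupby -> agg -> flatten the column MultiIndex -> pivot_table.  A minimal,
# self-contained sketch of that pattern on toy data (_demo_aggregate_pivot and
# the toy values are illustrative only, not part of the original experiments):
import numpy as np
import pandas as pd

def _demo_aggregate_pivot():
    """Toy illustration of the groupby/agg/flatten/pivot pattern used above."""
    df1 = pd.DataFrame({'k':        [3, 3, 3, 3, 4, 4],
                        'restarts': [1, 1, 5, 5, 1, 5],
                        'accuracy': [0.7, 0.8, 0.9, 0.85, 0.6, 0.75]})
    df2 = df1.groupby(['k', 'restarts']).agg({'accuracy': [np.mean, np.std, np.size]})
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten hierarchy
    df2.reset_index(inplace=True)
    df2['restarts'] = df2['restarts'].astype(str)  # string keys give clean pivot column names
    df3 = pd.pivot_table(df2, index=['k'], columns=['restarts'],
                         values=['accuracy_mean', 'accuracy_std'])
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]
    df3.reset_index(inplace=True)
    return df3  # columns like accuracy_mean_1, accuracy_mean_5, ...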
Example n. 23
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False):

    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    STD_FILL = True


    csv_filename = 'Fig_End-to-End_accuracy_VaryK_{}.csv'.format(CHOICE)
    header = ['currenttime',
              'option',
              'k',
              'f',
              'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)


    # -- Default Graph parameters
    rep_SameGraph = 10       # iterations on same graph
    initial_h0 = None           # initial vector to start finding optimal H
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    variant = 1
    EC = True                   # Non-backtracking for learning
    ymin = 0.3
    ymax = 1
    xmax = 8
    xtick_lab = [2,3,4,5,6,7, 8]
    xtick_labels = ['2', '3', '4', '5', '6', '7', '8']
    ytick_lab = np.arange(0, 1.1, 0.1)
    f_vec = [0.9 * pow(0.1, 1 / 5) ** x for x in range(21)]
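    # 21 geometrically spaced label fractions 0.9 * 10**(-x/5): five points per
    # decade, from 0.9 down to ~9e-5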
    k_vec = [3, 4, 5 ]
    rep_DifferentGraphs = 10   # iterations on different graphs
    err = 0
    avoidNeighbors = False
    gradient = False
    pruneRandom = False
    convergencePercentage_W = None
    stratified = True
    label_vec = ['*'] * 10
    clip_on_vec = [False] * 10
    draw_std_vec = range(10)
    numberOfSplits = 1
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [5, 4, 3, 3] + [3] * 10
    marker_vec = [None, None, 'o', 'x', 'o', '^', 'o', 'x', 'o', '^', 'o', 'x', 'o', '^']
    markersize_vec = [0, 0, 4, 8] + [6] * 10
    facecolor_vec = ["#4C72B0", "#55A868", "#C44E52", "#8172B2", "#CCB974", "#64B5CD"]


    # -- Options with propagation variants
    if CHOICE == 500:     ## 1k nodes
        n = 1000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        learning_method_vec = ['GS', 'MHE', 'DHE']
        weight_vec = [10] * 3
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 2 + [True]
        xmin = 3.
        ymin = 0.
        ymax = 1.
        label_vec = ['GS', 'MCE', 'DCEr']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5, 6]

    elif CHOICE == 501:        ## 10k nodes
        n = 10000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        learning_method_vec = ['GT', 'MHE', 'DHE']
        weight_vec = [10] * 3
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 2 + [True]
        xmin = 2.
        ymin = 0.
        ymax = 1.
        label_vec = ['GT', 'MCE', 'DCEr']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [2, 3, 4, 5]


    elif CHOICE == 502:        ## 10k nodes
        n = 10000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        ymin = 0.6
        ymax = 1.
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]

        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE']
        # k_vec = [2, 3, 4, 5]



    elif CHOICE == 503:        ## 10k nodes
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        ymin = 0.3
        ymax = 0.9
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [6, 7, 8]
        clip_on_vec = [True] * 10

        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE']
        # k_vec = [2, 3, 4, 5]



    elif CHOICE == 504:        ## 10k nodes
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        # k_vec = [2, 3, 4, 5, 6, 7, 8]
        k_vec = [7]
        clip_on_vec = [True] * 10




    elif CHOICE == 505:        ## 10k nodes    with f = 0.005
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.005]
        k_vec = [2, 3, 4, 5, 6, 7]
        # k_vec = [7]
        clip_on_vec = [True] * 10

    # elif CHOICE == 506:        ## 10k nodes    with f = 0.005
    #     n = 10000
    #     h = 3
    #     d = 25
    #     option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
    #     learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
    #     weight_vec = [10] * 10
    #     alpha_vec = [0] * 10
    #     beta_vec = [0] * 10
    #     gamma_vec = [0] * 10
    #     s_vec = [0.5] * 10
    #     numMaxIt_vec = [10] * 10
    #     randomize_vec = [False] * 4 + [True] + [False]
    #     xmin = 2
    #     xmax = 7
    #     ymin = 0.2
    #     ymax = 0.9
    #     label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr']
    #     facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
    #     f_vec = [0.005]
    #     k_vec = [2,3,4,5,6,7]
    #     # k_vec = [7]
    #     clip_on_vec = [True] * 10




    elif CHOICE == 506:        ## 10k nodes
        n = 10000
        h = 8
        d = 25
        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']              # overridden below (no Holdout)
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']        # overridden below (no Holdout)
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.005]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [5]
        clip_on_vec = [True] * 10

        rep_SameGraph = 1       # iterations on same graph
        rep_DifferentGraphs = 1  # iterations on same graph

    elif CHOICE == 507:  ## 10k nodes   with gradient and PruneRandom
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GS', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        ymin = 0.1
        ymax = 0.9
        label_vec = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [6, 7, 8]
        clip_on_vec = [True] * 10

        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE']
        # k_vec = [2, 3, 4, 5]

        gradient = True
        pruneRandom = True


    elif CHOICE == 508:  ## 10k nodes   with gradient and PruneRandom
        n = 1000
        h = 3
        d = 10
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GS', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        ymin = 0.1
        ymax = 0.9
        label_vec = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [6, 7, 8]
        clip_on_vec = [True] * 10

        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE']
        # k_vec = [2, 3, 4, 5]

        gradient = True
        pruneRandom = True
        rep_DifferentGraphs = 1
        rep_SameGraph = 1



    else:
        raise ValueError("Incorrect choice!")


    RANDOMSEED = None  # set to an integer for repeatable runs; None uses a fresh random seed
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))


    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            # print("\ni: {}".format(i))

            for k in k_vec:
                # print("\nk: {}".format(k))

                H0 = create_parameterized_H(k, h, symmetric=True)
                H0c = to_centering_beliefs(H0)

                a = [1.] * k
                alpha0 = np.array(a)
                alpha0 = alpha0 / np.sum(alpha0)

                W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                          distribution=distribution,
                                                          exponent=exponent,
                                                          directed=False,
                                                          debug=False)
                X0 = from_dictionary_beliefs(Xd)

                for j in range(rep_SameGraph):  # repeat several times for same graph
                    # print("j: {}".format(j))

                    ind = None
                    for f in f_vec:             # remove fraction (1-f) of rows from X0 (note: differs from the first implementation)
                        X1, ind = replace_fraction_of_rows(X0, 1-f, avoidNeighbors=avoidNeighbors, W=W, ind_prior=ind, stratified=stratified)
                        X2 = introduce_errors(X1, ind, err)
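                        # ind is threaded back in as ind_prior, so the successively
                        # smaller f values extend the same set of removed rows and
                        # the labeled sets stay nested across the f sweep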



                        for option_index, (learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \
                                enumerate(zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)):

                            # -- Learning
                            if learning_method == 'GT':
                                H2c = H0c


                            elif learning_method == 'Holdout':


                                H2 = estimateH_baseline_serial(X2, ind, W, numMax=numMaxIt,
                                                               # ignore_rows=ind,
                                                               numberOfSplits=numberOfSplits,
                                                               # method=learning_method, variant=1, distance=length,
                                                               EC=EC,
                                                               alpha=alpha, beta=beta, gamma=gamma)
                                H2c = to_centering_beliefs(H2)

                            elif learning_method != 'DHE':
                                H2 = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC, weights=weights, randomize=randomize)
                                H2c = to_centering_beliefs(H2)

                            else:
                                H2 = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC, weights=weights, randomize=randomize, gradient=gradient, randomrestarts=pruneRandom)
                                H2c = to_centering_beliefs(H2)


                            # -- Propagation
                            X2c = to_centering_beliefs(X2, ignoreZeroRows=True)       # try without
                            eps_max = eps_convergence_linbp_parameterized(H2c, W,
                                                                          method='noecho',
                                                                          alpha=alpha, beta=beta, gamma=gamma,
                                                                          X=X2)
                            eps = s * eps_max
                            try:
                                F, actualIt, actualPercentageConverged = \
                                    linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                                  method='noecho',
                                                                  alpha=alpha, beta=beta, gamma=gamma,
                                                                  numMaxIt=numMaxIt,
                                                                  convergencePercentage=convergencePercentage_W,
                                                                  debug=2)
                            except ValueError as e:
                                print("ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                            else:
                                accuracy_X = matrix_difference(X0, F, ignore_rows=ind)

                                row = [str(datetime.datetime.now())]
                                text = [option_vec[option_index],
                                        k,
                                        f,
                                        accuracy_X]
                                # text = ['' if v is None else v for v in text]       # TODO: test with vocabularies
                                # text = np.asarray(text)         # without np, entries get ugly format
                                row.extend(text)
                                # print("option: {}, f: {}, actualIt: {}, accuracy: {}".format(option_vec[option_index], f, actualIt, accuracy_X))
                                save_csv_record(join(data_directory, csv_filename), row)


    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # -- Aggregate repetitions
    df2 = df1.groupby(['option', 'k', 'f']).agg \
        ({'accuracy': [np.mean, np.std, np.size, np.median],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # -- Pivot table
    df3 = pd.pivot_table(df2, index=['f', 'k'], columns=['option'], values=[ 'accuracy_mean', 'accuracy_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(100)))



    # X_f = k_vec
    X_f = df3['k'].values            # read k from values instead

    Y_hash = defaultdict(dict)
    Y_hash_std = defaultdict(dict)
    for f in f_vec:
        for option in option_vec:
            Y_hash[f][option] = df3.loc[df3['f'] == f]['accuracy_mean_{}'.format(option)].values
            Y_hash_std[f][option] = df3.loc[df3['f'] == f]['accuracy_std_{}'.format(option)].values




    if CREATE_PDF or SHOW_PLOT or SHOW_PDF:

        # -- Setup figure
        fig_filename = 'Fig_End-to-End_accuracy_varyK_{}.pdf'.format(CHOICE)
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        opt_f_vecs = [(option, f) for option in option_vec for f in f_vec]

        for ((option, f), color, linewidth, clip_on, linestyle, marker, markersize) in \
            zip(opt_f_vecs, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec):

            # label = learning_method_vec[option_vec.index(option)]
            label = label_vec[option_vec.index(option)]
            # label = label + " " + str(f)

            if STD_FILL:


                # print((X_f))
                # print(Y_hash[f][option])


                ax.fill_between(X_f, Y_hash[f][option] + Y_hash_std[f][option], Y_hash[f][option] - Y_hash_std[f][option],
                                facecolor=color, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, Y_hash[f][option] + Y_hash_std[f][option], linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, Y_hash[f][option] - Y_hash_std[f][option], linewidth=0.5, color='0.8', linestyle='solid')

            ax.plot(X_f, Y_hash[f][option], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker,
                markersize=markersize, markeredgewidth=1, markeredgecolor='black', clip_on=clip_on)

        if CHOICE==507:
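            # random-guessing baseline: expected accuracy is 1/k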
            Y_f = [1/float(i) for i in X_f]

            ax.plot(X_f, Y_f, linewidth=2, color='black', linestyle='dashed',
                    label='Random', zorder=4, marker='x',
                markersize=8, markeredgewidth=1, markeredgecolor='black', clip_on=clip_on)

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        if n < 1000:
            n_label='{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        title(r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(n_label, d, h, f, distribution_label))
        handles, label_vec = ax.get_legend_handles_labels()
        legend = plt.legend(handles, label_vec,
                            loc='upper right',  # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8


        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')

        grid(b=True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Number of Classes $(k)$', labelpad=0)      # labelpad=0
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    # frameon=None          # deprecated; removed in newer matplotlib
                    )

        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))
def run(choice, variant, create_data=False, show_plot=False, create_pdf=False, show_pdf=False, append_data=False):
    """main parameterized method to produce all figures.
    Can be run from external jupyther notebook or method to produce all figures, optionally as PDF
    CHOICE uses a different saved experimental run
    VARIANT uses a different wayt o plot
    """

    # %% -- Setup
    CREATE_DATA = create_data
    APPEND_DATA = append_data   # allows to add more data, requires CREATE_DATA to be true
    CHOICE = choice
    VARIANT = variant
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    SHOW_PDF = show_pdf
    BOTH = True  # show both figures for W and H
    SHOW_TITLE = True  # show parameters in title of plot
    f = 1  # fraction of labeled nodes for H estimation

    csv_filename = 'Fig_Scaling_Hrow_{}.csv'.format(CHOICE)
    fig_filename = 'Fig_Scaling_Hrow_{}-{}.pdf'.format(CHOICE, VARIANT)

    plot_colors = ['darkorange', 'blue']
    header = ['currenttime',
              'choice',  # W, or H
              'l',
              'time']
    if CREATE_DATA and not APPEND_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=APPEND_DATA)
    RANDOMSEED = None  # set to an integer for repeatable runs; None uses a fresh random seed
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed

    # %% -- Default parameters
    n = 10000
    ymax = 10
    h = 3
    d = 10  # actual degree is double
    distribution = 'uniform'
    exponent = None

    # %% -- CHOICES and VARIANTS
    if CHOICE == 1:
        W_repeat = [0, 0, 30, 5, 3, 1]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 4.3
        W_annotate_y = 1
        H_annotate_x = 6
        H_annotate_y = 0.005

    elif CHOICE == 2:  # small degree d=3; does not show the advantage well
        d = 3
        W_repeat = [0, 0, 10, 5, 5, 5, 5, 5, 5]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 5
        W_annotate_y = 0.08
        H_annotate_x = 6.5
        H_annotate_y = 0.004

    elif CHOICE == 3:  # small degree d=2; does not show the advantage well
        d = 2
        W_repeat = [0, 0, 50, 50, 50, 50, 50, 50, 50]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 6.5
        W_annotate_y = 0.02
        H_annotate_x = 6.5
        H_annotate_y = 0.004

    elif CHOICE == 4:
        distribution = 'powerlaw'
        exponent = -0.5
        W_repeat = [0, 0, 50, 9, 5, 3]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 4
        W_annotate_y = 1
        H_annotate_x = 6.5
        H_annotate_y = 0.006

        if VARIANT == 1:
            plot_colors = ['blue', 'darkorange']
            SHOW_TITLE = False

        if VARIANT == 2:
            plot_colors = ['blue', 'darkorange']
            BOTH = False
            SHOW_TITLE = False

    elif CHOICE == 5:
        distribution = 'powerlaw'
        exponent = -0.5
        W_repeat = [0, 0, 1, 1]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0] + [1] * 8
        W_annotate_x = 4
        W_annotate_y = 1
        H_annotate_x = 6.5
        H_annotate_y = 0.006

    elif CHOICE == 11:
        W_repeat = [0, 0, 1, 1, 0, 0]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 4.3
        W_annotate_y = 1
        H_annotate_x = 6
        H_annotate_y = 0.005

    elif CHOICE == 12:
        W_repeat = [0, 0, 31, 11, 5, 3, 3, 3, 3]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 4.3
        W_annotate_y = 2.5
        H_annotate_x = 5.5
        H_annotate_y = 0.004
        f = 0.1
        plot_colors = ['blue', 'darkorange']
        ymax = 100

        if VARIANT == 1:    # TODO: when trying to add additional data, this creates 7 instead of 4 rows,
                            # although the same CREATE vs ADD data logic appears to work in Fig_MHE_Optimal_Lambda;
                            # to replicate, run: run(12, 1, create_pdf=True, show_pdf=True, create_data=False, append_data=True)
            W_repeat = [0, 0, 0, 0, 0, 0, 0, 0, 0]  # index starts with 0. useful only for W^2 and later
            H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]

    else:
        raise ValueError("Incorrect choice!")

    # %% -- Create data
    if CREATE_DATA or APPEND_DATA:

        # Create graph
        k = 3
        a = 1
        alpha0 = np.array([a, 1., 1.])
        alpha0 = alpha0 / np.sum(alpha0)
        H0 = create_parameterized_H(k, h, symmetric=True)
        start = time.time()
        W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                             distribution=distribution,
                                             exponent=exponent,
                                             directed=False,
                                             debug=False)
        X0 = from_dictionary_beliefs(Xd)
        time_calc = time.time() - start
        # print("\nTime for graph:{}".format(time_calc))
        # print("Average outdegree: {}".format(calculate_average_outdegree_from_graph(W)))

        # Calculations W
        for length, rep in enumerate(W_repeat):

            for _ in range(rep):
                start = time.time()
                # compute W^length by repeated sparse multiplication
                # (naive chaining, as nothing can be faster for general sparse W)
                result = W
                for _ in range(length - 1):
                    result = result.dot(W)
                time_calc = time.time() - start

                tuple = [str(datetime.datetime.now())]
                text = ['W',
                        length,
                        time_calc]
                text = np.asarray(text)  # without np, entries get ugly format
                tuple.extend(text)
                # print("W, d: {}, time: {}".format(length, time_calc))
                save_csv_record(join(data_directory, csv_filename), tuple)

        # Calculations H_NB
        for length, rep in enumerate(H_repeat):

            for _ in range(rep):
                X0 = from_dictionary_beliefs(Xd)
                X1, ind = replace_fraction_of_rows(X0, 1 - f)

                start = time.time()
                result = H_observed(W, X=X1, distance=length, NB=True, variant=1)
                time_calc = time.time() - start

                tuple = [str(datetime.datetime.now())]
                text = ['H',
                        length,
                        time_calc]
                text = np.asarray(text)  # without np, entries get ugly format
                tuple.extend(text)
                # print("H, d: {}, time: {}".format(length, time_calc))
                save_csv_record(join(data_directory, csv_filename), tuple)

        # Calculate and display M statistics
        for length, _ in enumerate(H_repeat):
            M = M_observed(W, X=X0, distance=length, NB=True)
            M = M[-1]
            s = np.sum(M)
            # print("l: {}, sum: {:e}, M:\n{}".format(length, s, M))

    # %% -- Read, aggregate, and pivot data
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(15)))
    df2 = df1.groupby(['choice', 'l']).agg \
        ({'time': [np.max, np.mean, np.median, np.min, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(30)))
    df3 = pd.pivot_table(df2, index=['l'], columns=['choice'], values='time_median', )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))

    #%% -- Setup figure
    mpl.rcParams['backend'] = 'pdf'
    mpl.rcParams['lines.linewidth'] = 3
    mpl.rcParams['font.size'] = 20
    mpl.rcParams['axes.labelsize'] = 20
    mpl.rcParams['axes.titlesize'] = 16
    mpl.rcParams['xtick.labelsize'] = 16
    mpl.rcParams['ytick.labelsize'] = 16
    mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
    mpl.rcParams['grid.color'] = '777777'  # grid color
    mpl.rcParams['figure.figsize'] = [4, 4]
    mpl.rcParams['xtick.major.pad'] = 6  # padding of tick labels: default = 4
    mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4
    fig = plt.figure()
    ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

    #%% -- Draw the plot and annotate
    df4 = df3['H']
    # print("\n-- df4 (length {}):\n{}".format(len(df4.index), df4.head(30)))

    Y1 = df3['W'].plot(logy=True, color=plot_colors[0], marker='o', legend=None,
                       clip_on=False,  # cut off data points outside of plot area
                       # zorder=3
                       )  # style='o', kind='bar', style='o-',

    plt.annotate(r'$\mathbf{W}^\ell$',
                 xy=(W_annotate_x, W_annotate_y),
                 color=plot_colors[0],
                 )

    if BOTH:
        Y2 = df3['H'].plot(logy=True, color=plot_colors[1], marker='o', legend=None,
                           clip_on=False,  # cut off data points outside of plot area
                           zorder=3
                           )  # style='o', kind='bar', style='o-',

        plt.annotate(r'$\mathbf{\hat P}_{\mathrm{NB}}^{(\ell)}$',
                     xy=(H_annotate_x, H_annotate_y),
                     color=plot_colors[1],
                     )
    if SHOW_TITLE:
        plt.title(r'$\!\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}$'.format(int(n / 1000), 2 * d, h, f))

    # %% -- Figure settings & plot
    plt.grid(b=True, which='both', alpha=0.2, linestyle='solid', axis='y', linewidth=0.5)  # linestyle='dashed', which='minor'
    plt.xlabel(r'Path length ($\ell$)', labelpad=0)
    plt.ylabel(r'$\!$Time [sec]', labelpad=1)
    plt.ylim(0.001, ymax)  # placed after yticks
    plt.xticks(range(1, 9))

    if SHOW_PLOT:
        plt.show()
    if CREATE_PDF:
        plt.savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    # frameon=None
                    )
    if SHOW_PDF:
        # os.system('{} "'.format(open_cmd[sys.platform]) + join(figure_directory, fig_filename) + '"')       # shows actually created PDF
        showfig(join(figure_directory, fig_filename))  # shows actually created PDF       # TODO replace with this method
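
# The W-power timing loop above can be reproduced in isolation.  A minimal
# sketch on a synthetic sparse matrix (scipy.sparse.random and the helper name
# _demo_time_matrix_powers are assumptions for illustration, not part of the
# original experiment):
import time
import scipy.sparse as sp

def _demo_time_matrix_powers(n=2000, density=5e-3, max_len=5):
    """Time W^l computed by repeated sparse multiplication, for l = 2..max_len."""
    W = sp.random(n, n, density=density, format='csr')
    timings = {}
    for length in range(2, max_len + 1):
        start = time.time()
        result = W
        for _ in range(length - 1):
            result = result.dot(W)
        timings[length] = time.time() - start
    return timings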
Example n. 25
def test_estimate_synthetic():
    print(
        "\n\n-- test_estimate_synthetic(): 'estimateH', uses: 'M_observed', 'planted_distribution_model_H', --"
    )

    # --- Parameters for graph
    n = 1000
    a = 1
    h = 8
    d = 25
    k = 3
    distribution = 'powerlaw'
    exponent = -0.3
    f = 0.05
    print("n={}, a={},d={}, f={}".format(n, a, d, f))

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    print("H0:\n{}".format(H0))

    # --- Create graph
    RANDOMSEED = None  # set to an integer for repeatable runs; None uses a fresh random seed
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, ind = replace_fraction_of_rows(X0, 1 - f)

    # --- Print some neighbor statistics
    M_vec = M_observed(W, X0, distance=3, NB=True)
    print("\nNeighbor statistics in fully labeled graph:")
    print("M^(1): direct neighbors:\n{}".format(M_vec[1]))
    print("M^(2): distance-2 neighbors:\n{}".format(M_vec[2]))
    print("M^(3): distance-3 neighbors:\n{}".format(M_vec[3]))

    # --- MHE ---
    print("\nMHE: Estimate H based on X0 (fully labeled graph):")
    start = time.time()
    H1 = estimateH(X0, W, method='MHE', variant=1)
    H2 = estimateH(X0, W, method='MHE', variant=2)
    H3 = estimateH(X0, W, method='MHE', variant=3)
    time_est = time.time() - start
    print("Estimated H based on X0 (MHE), variant 1:\n{}".format(H1))
    print("Estimated H based on X0 (MHE), variant 2:\n{}".format(H2))
    print("Estimated H based on X0 (MHE), variant 3:\n{}".format(H3))
    print("Time for all three variants:{}".format(time_est))

    print("\nMHE: Estimate H based on X1 with f={}:".format(f))
    start = time.time()
    H1 = estimateH(X1, W, method='MHE', variant=1)
    H2 = estimateH(X1, W, method='MHE', variant=2)
    H3 = estimateH(X1, W, method='MHE', variant=3)
    time_est = time.time() - start
    print("Estimated H based on X1 (MHE), variant 1:\n{}".format(H1))
    print("Estimated H based on X1 (MHE), variant 2:\n{}".format(H2))
    print("Estimated H based on X1 (MHE), variant 3:\n{}".format(H3))
    print("Time for all three variants:{}".format(time_est))

    print(
        "\nMHE, variant=1: Estimate H based on X1 with f={}, but with an initial correct vector:".format(f)
    )
    weight = [0, 0, 0, 0, 0]  # ignored for MHE
    initial_h0 = [0.1, 0.8, 0.1]
    H5 = estimateH(X1, W, method='MHE', weights=weight)
    H5_r = estimateH(X1, W, method='MHE', weights=weight, randomize=True)
    H5_i = estimateH(X1,
                     W,
                     method='MHE',
                     weights=weight,
                     initial_H0=transform_hToH(initial_h0, 3))
    print("Estimated H based on X5 only (MHE): \n{}".format(H5))
    print("Estimated H based on X5 only (MHE), randomize:\n{}".format(H5_r))
    print("Estimated H based on X5 only (MHE), initial=GT:\n{}".format(H5_i))

    # --- DHE ---
    print("\nDHE: Estimate H based on X1 with f={}:".format(f))
    start = time.time()
    H1 = estimateH(X1, W, method='DHE', variant=1, distance=1)
    H2 = estimateH(X1, W, method='DHE', variant=2, distance=1)
    H3 = estimateH(X1, W, method='DHE', variant=3, distance=1)
    time_est = time.time() - start
    print(
        "Estimated H based on X1 (DHE, distance=1), variant 1:\n{}".format(H1))
    print(
        "Estimated H based on X1 (DHE, distance=1), variant 2:\n{}".format(H2))
    print(
        "Estimated H based on X1 (DHE, distance=1), variant 3:\n{}".format(H3))
    print("Time for all three variants:{}".format(time_est))

    # --- LHE ---
    print("\nLHE: Estimate H based on X1 with f={}:".format(f))
    start = time.time()
    H1 = estimateH(X1, W, method='LHE')
    time_est = time.time() - start
    print("Estimated H based on X1 (LHE):\n{}".format(H1))
    print("Time for LHE:{}".format(time_est))

    # --- Baseline holdout method ---
    f2 = 0.5
    X2, ind2 = replace_fraction_of_rows(X0, 1 - f2)
    print("\nHoldout method: Estimate H based on X2 with f={}):".format(f2))
    start = time.time()
    H2 = estimateH_baseline_serial(X2=X2,
                                   ind=ind2,
                                   W=W,
                                   numberOfSplits=1,
                                   numMax=10)
    time_est = time.time() - start
    print("Estimated H based on X2 (Holdout method) with f={}:\n{}".format(
        f2, H2))
    print("Time for Holdout method:{}".format(
        time_est))  # TODO: result suggests this method does not work?
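
# For intuition: with k=3 and h=8, the ground-truth free-parameter vector used
# above is [0.1, 0.8, 0.1], i.e. one entry h times the others, normalized to
# sum to 1.  A hypothetical sketch of that h:1 ratio idea (illustrative only;
# the real create_parameterized_H / transform_hToH may encode H differently,
# and _sketch_parameterized_h is not part of the original code):
def _sketch_parameterized_h(k_star, h, dominant=1):
    """Free-parameter vector with one entry h times the others, summing to 1."""
    v = np.ones(k_star)
    v[dominant] = h
    return v / v.sum()   # k_star=3, h=8 -> [0.1, 0.8, 0.1]
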
def run(choice,
        variant,
        create_data=False,
        add_data=False,
        create_graph=False,
        create_fig=True,
        show_plot=False,
        create_pdf=False,
        show_pdf=False,
        shorten_length=False,
        show_arrows=True):
    """main parameterized method to produce all figures.
    Can be run from external jupyther notebook or method to produce all figures in PDF
    """

    # -- Setup
    CHOICE = choice  # determines the CSV data file to use
    VARIANT = variant  # determines the variant of how the figures are plotted
    CREATE_DATA = create_data  # starts new CSV file and stores experimental timing results
    ADD_DATA = add_data  # adds data to existing file
    CREATE_GRAPH = create_graph  # creates the actual graph for experiments (stores W and X in CSV files)

    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_FIG = create_fig
    CREATE_PDF = create_pdf
    SHORTEN_LENGTH = shorten_length  # to prune certain fraction of data to plot
    SHOW_SCALING_LABELS = True  # first entry in the legend is for the dashed line of scalability
    SHOW_TITLE = True  # show parameters in title of plot
    SHOW_DCER_WITH_BOX = True  # show DCER value in an extra box
    LABEL_FONTSIZE = 16  # size of number labels in figure
    SHOW_LINEAR = True  # show dashed line for linear scaling

    SHOW_ARROWS = show_arrows  # show extra visual comparison of speed-up

    csv_filename = 'Fig_Timing_{}.csv'.format(
        CHOICE)  # CSV filename includes CHOICE
    filename = 'Fig_Timing_{}-{}'.format(
        CHOICE, VARIANT)  # PDF filename includes CHOICE and VARIANT
    header = ['n', 'type', 'time']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # -- Default Graph parameters
    distribution = 'powerlaw'
    exponent = -0.3
    k = 3
    a = 1  # this value was erroneously set to 5 previously!!! TODO: fix everywhere else
    # err = 0
    avoidNeighbors = False
    f = 0.1
    est_EC = True  # !!! TODO: for graph estimation
    weights = 10
    pyamg = False
    convergencePercentage_W = None
    alpha = 0
    beta = 0
    gamma = 0
    s = 0.5
    numMaxIt = 10
    xtick_lab = [0.001, 0.01, 0.1, 1]
    ytick_lab = np.arange(0, 1, 0.1)
    xmin = 1e2
    xmax = 1e8
    # xmax = 1e6
    ymin = 1e-3
    ymax = 5e3
    color_vec = [
        "#4C72B0", "#55A868", "#8172B2", "#C44E52", "#CCB974", 'black',
        'black', "#64B5CD", "black"
    ]
    marker_vec = ['s', '^', 'x', 'o', 'None', 'None', 'None', 'None']
    linestyle_vec = ['solid'] * 6 + ['dashed']
    linewidth_vec = [3] * 3 + [4, 3, 4] + [3] * 7
    SHOWMAXNUMBER = True
    show_num_vec = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max']

    # %% -- Main Options
    if CHOICE == 3:
        n_vec = [
            100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400,
            204800, 409600, 819200, 1638400, 3276800, 6553600
        ]
        # # n_vec = [1638400]  # graph:  12021 sec = 3.4h, 18600 sec = 5h, 21824 sec (34000 sec old laptop)
        # # n_vec = [3276800]  # graph:  49481 sec = 13.8h, 68145 sec (125233 sec old laptop)
        # # n_vec = [6553600]  # graph: 145020 sec = 40h
        h = 8
        d = 5

        repeat_vec_vec = [[50, 50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 5, 5, 3, 3, 3, 3],
                          [5, 5, 5, 5, 3, 3, 3, 3, 3, 1, 1],
                          [20, 20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 3, 3, 1, 1, 1, 1]]
        method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']]

        if VARIANT == 1:
            method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop']
            label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop']
            show_num_vec = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop']

        if VARIANT == 2:  # version used for main paper figure
            method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop']
            label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop']
            linestyle_vec = ['solid'] * 5 + ['dashed']
            SHOW_ARROWS = False

        if VARIANT == 3:  # version used for main paper figure
            method_vec_fig = ['DHEr', 'Holdout', 'prop']
            label_vec = [
                'DCEr', 'Holdout', 'Propagation', r'$\epsilon_{\mathrm{max}}$'
            ]
            linestyle_vec = ['solid'] * 2 + ['dashed']
            color_vec = [
                "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black"
            ]
            marker_vec = ['o', 'x', 'None', 'None', 'None']
            linestyle_vec = ['solid'] * 3 + ['dashed']
            linewidth_vec = [4, 3, 4] + [3] * 7
            ymin = 1e-2
            SHOW_ARROWS = True

        if VARIANT == 4:  # figure used in slides
            method_vec_fig = ['prop']
            label_vec = ['Propagation']
            color_vec = ['black']
            marker_vec = ['None']
            linestyle_vec = ['solid'] * 1
            linewidth_vec = [2]
            ymin = 1e-2
            SHOW_ARROWS = False
            SHOW_SCALING_LABELS = False
            SHOW_TITLE = False
            SHOW_DCER_WITH_BOX = False
            LABEL_FONTSIZE = 20
            SHOW_LINEAR = False

        if VARIANT == 5:  # figure used in slides
            method_vec_fig = ['prop', 'Holdout']
            label_vec = ['Propagation', 'Baseline']
            color_vec = ['black', "#CCB974"]
            marker_vec = ['None', '^']
            linestyle_vec = ['solid'] * 2
            linewidth_vec = [2, 4]
            ymin = 1e-2
            SHOW_ARROWS = True
            SHOW_SCALING_LABELS = False
            SHOW_TITLE = False
            SHOW_DCER_WITH_BOX = False
            LABEL_FONTSIZE = 20
            SHOW_LINEAR = False

        if VARIANT == 6:  # figure used in slides
            method_vec_fig = ['prop', 'Holdout', 'DHEr']
            label_vec = ['Propagation', 'Baseline', 'Our method']
            color_vec = ['black', "#CCB974", "#C44E52"]
            marker_vec = ['None', '^', 'o', 'None', 'None']
            linestyle_vec = ['solid'] + ['solid'] * 2
            linewidth_vec = [2, 4, 4]
            ymin = 1e-2
            SHOW_ARROWS = True
            SHOW_SCALING_LABELS = False
            SHOW_TITLE = True
            SHOW_DCER_WITH_BOX = False
            LABEL_FONTSIZE = 20
            SHOW_LINEAR = False

        graph_cvs = 'Fig_Timing_SSLH_1'  # re-use existing large graphs

    elif CHOICE == 4:
        n_vec = [
            200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400,
            204800, 409600, 819200
        ]
        # n_vec = [819200]    # graph: 47905 sec = 13.3h. 90562 sec = 25h (180527 sec old laptop)
        h = 3
        d = 25
        repeat_vec_vec = [[50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 3, 3, 3],
                          [5, 5, 5, 3, 1, 1, 1, 1, 1],
                          [20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 1, 1, 1]]
        method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']]

        VARIANT = 2  # NOTE: hard-coded override of the caller's VARIANT for this CHOICE

        if VARIANT == 1:
            method_vec_fig = [
                'MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max'
            ]
            label_vec = [
                'MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop',
                r'$\epsilon_{\mathrm{max}}$'
            ]
            show_num_vec = [
                'MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max'
            ]

        if VARIANT == 2:
            method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop']
            label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop']
            linestyle_vec = ['solid'] * 5 + ['dashed']

        if VARIANT == 3:
            method_vec_fig = ['DHEr', 'Holdout', 'prop']

            label_vec = [
                'DCEr', 'Holdout', 'Propagation', r'$\epsilon_{\mathrm{max}}$'
            ]
            color_vec = [
                "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black"
            ]
            marker_vec = ['o', 'x', 'None', 'None', 'None']
            linestyle_vec = ['solid'] * 3 + ['dashed']
            linewidth_vec = [4, 3, 4] + [3] * 7
            ymin = 1e-2

        graph_cvs = 'Fig_Timing_SSLH_2'  # re-use existing large graphs
        xmin = 1e3
        xmax = 5e7
        ymax = 1e3

    elif CHOICE == 2:
        # rep_Estimation = 10
        # n_vec = [200, 400, 800, 1600, 3200, 6400, 12800,
        #          25600, 51200, 102400, 204800, 409600, 819200]
        # repeat_vec = [20, 20, 20, 20, 20, 10, 10,
        #               10, 10, 10, 5, 5, 1]
        # n_vec = [819200]    # graph: 47905 sec = 13.3h. 90562 sec = 25h (180527 sec old laptop)
        n_vec = [1638400]  # TODO: this run has not been completed yet
        repeat_vec = [1]
        h = 3
        d = 25
        xmax = 5e7
        graph_cvs = 'Fig_Timing_SSLH_2'

    elif CHOICE == 10:  # same as 3 but with difference bars
        n_vec = [
            100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400,
            204800, 409600, 819200, 1638400, 3276800, 6553600
        ]
        # n_vec = [1638400]   # graph:  12021 sec = 3.4h, 18600 sec = 5h, 21824 sec (34000 sec old laptop)
        # n_vec = [3276800]   # graph:  49481 sec = 13.8h, 68145 sec (125233 sec old laptop)
        # n_vec = [6553600]   # graph: 145020 sec = 40h
        h = 8
        d = 5

        repeat_vec_vec = [[50, 50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 5, 5, 3, 3, 3, 3],
                          [5, 5, 5, 5, 3, 3, 3, 3, 3, 1, 1],
                          [20, 20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 3, 3, 1, 1, 1, 1]]
        method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']]

        method_vec_fig = ['DHEr', 'Holdout', 'prop']
        label_vec = [
            'DCEr', 'Holdout', 'Propagation', r'$\epsilon_{\mathrm{max}}$'
        ]
        color_vec = [
            "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black"
        ]
        marker_vec = ['o', 'x', 'None', 'None', 'None']
        linestyle_vec = ['solid'] * 3 + ['dashed']
        linewidth_vec = [4, 3, 4] + [3] * 7
        ymin = 1e-2

        graph_cvs = 'Fig_Timing_SSLH_1'  # re-use existing large graphs

    else:
        raise ValueError("Incorrect CHOICE: {}".format(CHOICE))

    # %% -- Common options

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    H0c = to_centering_beliefs(H0)
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds Python's built-in random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus both need seeding

    # print("CHOICE: {}".format(CHOICE))

    def save_tuple(n, label, time):
        # each row gets a leading timestamp; note the CSV header above has only
        # 3 columns, so pandas later reads the timestamp column as the index
        record = [str(datetime.datetime.now())]  # avoid shadowing the built-in 'tuple'
        record.extend([n, label, time])
        print("time {}: {}".format(label, time))
        save_csv_record(join(data_directory, csv_filename), record)

    # %% -- Create data
    if CREATE_DATA or ADD_DATA:

        for repeat_vec, method_vec in zip(repeat_vec_vec, method_vec_vec):
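            # Each method group has its own repetition schedule: the cheap estimators
            # (MHE/DHE/DHEr/LHE) repeat often, Holdout and propagation repeat less;
            # since zip() below truncates to the shorter list, a short repeat_vec
            # simply skips a method on the largest graph sizes.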

            for n, repeat in zip(n_vec, repeat_vec):
                print("\nn: {}".format(n))
                # repeat = repeat_vec[j]

                # -- Graph
                if CREATE_GRAPH:
                    start = time.time()
                    W, Xd = planted_distribution_model(
                        n,
                        alpha=alpha0,
                        P=H0,
                        m=d * n,
                        distribution=distribution,
                        exponent=exponent,
                        directed=False,
                        debug=False)
                    X0 = from_dictionary_beliefs(Xd)
                    time_graph = time.time() - start

                    save_W(join(data_directory,
                                '{}_{}_W.csv'.format(graph_cvs, n)),
                           W,
                           saveWeights=False)
                    save_X(
                        join(data_directory,
                             '{}_{}_X.csv'.format(graph_cvs, n)), X0)
                    save_tuple(n, 'graph', time_graph)

                else:
                    W, _ = load_W(join(data_directory,
                                       '{}_{}_W.csv'.format(graph_cvs, n)),
                                  skiprows=1,
                                  zeroindexing=True,
                                  n=None,
                                  doubleUndirected=False)
                    X0, _, _ = load_X(join(data_directory,
                                           '{}_{}_X.csv'.format(graph_cvs, n)),
                                      n=None,
                                      k=None,
                                      skiprows=1,
                                      zeroindexing=True)

                # -- Repeat loop
                for i in range(repeat):
                    print("\n  repeat: {}".format(i))
                    X2, ind = replace_fraction_of_rows(
                        X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W)
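                    # replace_fraction_of_rows zeroes out (1 - f) of the belief rows,
                    # so X2 keeps explicit labels for only a fraction f of the nodes;
                    # ind holds the affected row indices (used by the Holdout baseline)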

                    for method in method_vec:

                        if method == 'DHE':
                            start = time.time()
                            H2 = estimateH(X2,
                                           W,
                                           method='DHE',
                                           variant=1,
                                           distance=5,
                                           EC=est_EC,
                                           weights=weights)
                            time_est = time.time() - start
                            save_tuple(n, 'DHE', time_est)

                        elif method == 'DHEr':
                            start = time.time()
                            H2 = estimateH(X2,
                                           W,
                                           method='DHE',
                                           variant=1,
                                           distance=5,
                                           EC=est_EC,
                                           weights=weights,
                                           randomize=True)
                            time_est = time.time() - start
                            save_tuple(n, 'DHEr', time_est)

                        elif method == 'MHE':
                            start = time.time()
                            H2 = estimateH(X2,
                                           W,
                                           method='MHE',
                                           variant=1,
                                           distance=1,
                                           EC=est_EC,
                                           weights=None)
                            time_est = time.time() - start
                            save_tuple(n, 'MHE', time_est)

                        elif method == 'LHE':
                            start = time.time()
                            H2 = estimateH(X2,
                                           W,
                                           method='LHE',
                                           variant=1,
                                           distance=1,
                                           EC=est_EC,
                                           weights=None)
                            time_est = time.time() - start
                            save_tuple(n, 'LHE', time_est)

                        elif method == 'Holdout':
                            start = time.time()
                            H2 = estimateH_baseline_serial(
                                X2,
                                ind,
                                W,
                                numMax=numMaxIt,
                                numberOfSplits=1,
                                # EC=EC,
                                # weights=weight,
                                alpha=alpha,
                                beta=beta,
                                gamma=gamma)
                            time_est = time.time() - start
                            save_tuple(n, 'Holdout', time_est)

                        elif method == 'prop':
                            H2c = to_centering_beliefs(H0)  # uses the true H0 rather than an estimated H
                            X2c = to_centering_beliefs(
                                X2, ignoreZeroRows=True)  # NOTE: unused below; eps_convergence receives the uncentered X2
                            start = time.time()
                            eps_max = eps_convergence_linbp_parameterized(
                                H2c,
                                W,
                                method='noecho',
                                alpha=alpha,
                                beta=beta,
                                gamma=gamma,
                                X=X2,
                                pyamg=pyamg)
                            time_eps_max = time.time() - start
                            save_tuple(n, 'eps_max', time_eps_max)

                            # -- Propagate
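                            # scale the centered H: s = 0.5 keeps the propagation at
                            # half of the convergence boundary eps_max computed above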
                            eps = s * eps_max
                            try:
                                start = time.time()
                                F, actualIt, actualPercentageConverged = \
                                    linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                                  method='noecho',
                                                                  alpha=alpha, beta=beta, gamma=gamma,
                                                                  numMaxIt=numMaxIt,
                                                                  convergencePercentage=convergencePercentage_W,
                                                                  debug=2)
                                time_prop = time.time() - start
                            except ValueError as e:
                                print("ERROR: {}: d={}, h={}".format(e, d, h))
                            else:
                                save_tuple(n, 'prop', time_prop)

                        else:
                            raise ValueError("Unknown method: {}".format(method))

    # %% -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(50)))

    # Aggregate repetitions
    df2 = df1.groupby(['n', 'type']).agg(
        {'time': [np.mean, np.median, np.std, np.size]})  # multiple aggregates per group
    df2.columns = ['_'.join(col).strip()
                   for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # Pivot table
    df3 = pd.pivot_table(df2,
                         index=['n'],
                         columns=['type'],
                         values=['time_mean', 'time_median'])  # Pivot
    # df3 = pd.pivot_table(df2, index=['n'], columns=['type'], values=['time_mean', 'time_median', 'time_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip()
                   for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))

    # Extract values
    X = df3['n'].values  # plot x values
    X = X * d / 2  # calculate edges (!!! notice dividing by 2 as one edge appears twice in symmetric adjacency matrix)
    Y = {}
    for method in method_vec_fig:
        # Y[method] = df3['time_mean_{}'.format(method)].values
        Y[method] = df3['time_median_{}'.format(method)].values
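    # NOTE: the median (not the commented-out mean) is plotted, presumably because
    # timing distributions are skewed by occasional slow runs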

    if SHORTEN_LENGTH:
        SHORT_FACTOR = 4  # keep every SHORT_FACTOR-th element
        X = np.copy(X[::SHORT_FACTOR])
        for method in method_vec_fig:
            Y[method] = np.copy(Y[method][::SHORT_FACTOR])

    # %% -- Figure
    if CREATE_FIG:
        fig_filename = '{}.pdf'.format(filename)  # TODO: reuse this naming pattern in the other figure scripts
        mpl.rcParams['backend'] = 'agg'
        mpl.rcParams['lines.linewidth'] = 3
        mpl.rcParams['font.size'] = LABEL_FONTSIZE
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 12
        mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['figure.figsize'] = [4, 4]
        mpl.rcParams['xtick.major.pad'] = 4  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4
        fig = plt.figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Draw the plots
        if SHOW_LINEAR:
            ax.plot([1, 1e8], [1e-5, 1e3],
                    linewidth=1,
                    color='gray',
                    linestyle='dashed',
                    label='1sec/100k edges',
                    clip_on=True,
                    zorder=3)
        for i, (method, color, marker, linewidth, linestyle) in enumerate(
                zip(method_vec_fig, color_vec, marker_vec, linewidth_vec,
                    linestyle_vec)):
            ax.plot(X,
                    Y[method],
                    linewidth=linewidth,
                    color=color,
                    linestyle=linestyle,
                    label=label_vec[i],
                    clip_on=True,
                    marker=marker,
                    markersize=6,
                    markeredgewidth=1,
                    markeredgecolor='black',
                    zorder=4)

            # for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
            #         enumerate(zip(option_vec, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):
            #     P = ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker,
            #                 markersize=markersize, markeredgewidth=1, markeredgecolor='black', clip_on=clip_on)

            if SHOWMAXNUMBER and method in show_num_vec:
                if method == 'DHEr' and SHOW_DCER_WITH_BOX:
                    j = np.argmax(np.ma.masked_invalid(
                        Y[method]))  # mask nan, then get index of max element
                    ax.annotate(int(np.round(Y[method][j])),
                                xy=(X[j] * 1.5, Y[method][j]),
                                color=color,
                                va='center',
                                bbox=dict(boxstyle="round,pad=0.3", fc="w"),
                                annotation_clip=False,
                                zorder=5)
                else:
                    j = np.argmax(np.ma.masked_invalid(
                        Y[method]))  # mask nan, then get index of max element
                    ax.annotate(int(np.round(Y[method][j])),
                                xy=(X[j] * 1.5, Y[method][j]),
                                color=color,
                                va='center',
                                annotation_clip=False,
                                zorder=5)

        if SHOW_ARROWS:
            dce_opt = 'DHEr'
            holdout_opt = 'Holdout'
            prop_opt = 'prop'

            j_holdout = np.argmax(np.ma.masked_invalid(Y[holdout_opt]))

            if dce_opt in Y:
                j_dce = np.argmax(np.ma.masked_invalid(Y[dce_opt]))
                ax.annotate('',
                            xy=(X[j_dce], Y[prop_opt][j_dce]),
                            xytext=(X[j_dce], Y[dce_opt][j_dce]),
                            arrowprops=dict(arrowstyle='<->'))
                ax.annotate(
                    str(int(np.round(Y[prop_opt][j_dce] / Y[dce_opt][j_dce])))
                    + 'x',
                    xy=(X[j_dce],
                        int(Y[prop_opt][j_dce] + Y[dce_opt][j_dce]) / 6),
                    color='black',
                    va='center',
                    fontsize=14,
                    # bbox = dict(boxstyle="round,pad=0.3", fc="w"),
                    annotation_clip=False,
                    zorder=5)

                ax.annotate('',
                            xy=(X[j_holdout], Y[holdout_opt][j_holdout]),
                            xytext=(X[j_holdout], Y[dce_opt][j_holdout]),
                            arrowprops=dict(arrowstyle='<->'))
                ax.annotate(
                    str(
                        int(
                            np.round(Y[holdout_opt][j_holdout] /
                                     Y[dce_opt][j_holdout]))) + 'x',
                    xy=(X[j_holdout],
                        int(Y[holdout_opt][j_holdout] + Y[dce_opt][j_holdout])
                        / 8),
                    color='black',
                    va='center',
                    fontsize=14,
                    # bbox = dict(boxstyle="round,pad=0.3", fc="w"),
                    annotation_clip=False,
                    zorder=5)

            else:  # in case dce_opt not shown, then show arrow as compared to prop method
                ax.annotate('',
                            xy=(X[j_holdout], Y[holdout_opt][j_holdout]),
                            xytext=(X[j_holdout], Y[prop_opt][j_holdout]),
                            arrowprops=dict(arrowstyle='<->'))
                ax.annotate(
                    str(
                        int(
                            np.round(Y[holdout_opt][j_holdout] /
                                     Y[prop_opt][j_holdout]))) + 'x',
                    xy=(X[j_holdout],
                        int(Y[holdout_opt][j_holdout] + Y[prop_opt][j_holdout])
                        / 8),
                    color='black',
                    va='center',
                    fontsize=14,
                    # bbox = dict(boxstyle="round,pad=0.3", fc="w"),
                    annotation_clip=False,
                    zorder=5)

        if SHOW_TITLE:
            plt.title(r'$\!\!\!d\!=\!{}, h\!=\!{}$'.format(d, h))

        handles, labels = ax.get_legend_handles_labels()
        if not SHOW_SCALING_LABELS and SHOW_LINEAR:
            handles = handles[1:]
            labels = labels[1:]

        legend = plt.legend(
            handles,
            labels,
            loc='upper left',  # alternative: 'upper right'
            handlelength=2,
            labelspacing=0,  # distance between label entries
            handletextpad=0.3,  # distance between label and the line representation
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.3,  # padding inside the legend box
            numpoints=1,  # put the marker only once
        )
        legend.set_zorder(3)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.2)  # 0.8

        # -- Figure settings and save
        plt.minorticks_on()
        plt.xscale('log')
        plt.yscale('log')
        minorLocator = LogLocator(
            base=10, subs=[0.1 * i for i in range(1, 10)], numticks=40
        )  # TODO: discuss with Paul the trick that helped with grid lines last time;
        # needed to create the log locators (otherwise the wanted minor ticks do not show)
        #         ax.xaxis.set_minor_locator(minorLocator)
        plt.xticks([1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9])
        plt.grid(True,
                 which='both',
                 axis='both',
                 alpha=0.2,
                 linestyle='-',
                 linewidth=1,
                 zorder=1)  # linestyle='dashed', which='minor', axis='y',
        # grid(b=True, which='minor', axis='x', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.xlabel(r'Number of edges ($m$)', labelpad=0)  # labelpad=0
        plt.ylabel(r'Time [sec]', labelpad=0)
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)
        # print(ax.get_xaxis().get_minor_locator())

        if CREATE_PDF:
            plt.savefig(
                join(figure_directory, fig_filename),
                format='pdf',
                dpi=None,
                edgecolor='w',
                orientation='portrait',
                transparent=False,
                bbox_inches='tight',
                pad_inches=0.05,
                # frameon=None
            )
        if SHOW_PDF:
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
        if SHOW_PLOT:
            plt.show()
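
Aside (not part of the original example): the read/aggregate/pivot step above is the
part most worth isolating. Below is a minimal, self-contained sketch of that pandas
pattern on synthetic timing records; the column names 'n', 'type', 'time' match the
CSV header the example writes, while all data values are made up for illustration.

import numpy as np
import pandas as pd

# Synthetic timing records: several repetitions per (graph size, method)
df1 = pd.DataFrame({
    'n':    [100, 100, 100, 100, 200, 200, 200, 200],
    'type': ['DHEr', 'DHEr', 'prop', 'prop'] * 2,
    'time': [0.02, 0.03, 0.05, 0.06, 0.04, 0.05, 0.11, 0.12],
})

# Aggregate repetitions: one row per (n, type) with mean/median/std/count
df2 = df1.groupby(['n', 'type']).agg({'time': [np.mean, np.median, np.std, np.size]})
df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # e.g. 'time_median'
df2.reset_index(inplace=True)

# Pivot: one row per n, one column per (statistic, method)
df3 = pd.pivot_table(df2, index=['n'], columns=['type'],
                     values=['time_mean', 'time_median'])
df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # e.g. 'time_median_DHEr'
df3.reset_index(inplace=True)
print(df3)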
Example n. 27
0
def test_gradient():
    print(
        "\n-- test_gradient(): 'define_gradient_energy_H', 'define_energy_H', uses: 'planted_distribution_model_H', 'H_observed', 'M_observed' --"
    )

    # --- Parameters for graph
    n = 1000
    a = 1
    h = 8
    d = 25
    k = 3
    distribution = 'powerlaw'
    exponent = -0.3

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    f = 0.5
    print("Graph n={}, d={}, f={}".format(n, d, f))
    print("H0:\n{}\n".format(H0))

    # --- Create graph
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds Python's built-in random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus both need seeding

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, ind = replace_fraction_of_rows(X0, 1 - f)

    # --- M_vec, H_vec statistics
    distance = 5

    print("M_vec:")
    M_vec = M_observed(W, X1, distance=distance)
    for i, M in enumerate(M_vec):
        print("{}:\n{}".format(i, M))

    print("H_vec:")
    H_vec = H_observed(W, X1, distance=distance)
    for i, H in enumerate(H_vec):
        print("{}:\n{}".format(i, H))

    # --- Gradient at multiple points for distance 1
    print("\n= Defining the gradient function with distance 1")
    distance = 1
    weights = [1, 0, 0, 0, 0]
    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)
    energy_H = define_energy_H(weights=weights,
                               distance=1,
                               H_vec_observed=H_vec)

    H_actual = H_vec[0]
    print(
        "1st example point: H_actual (row-stochastic frequencies of neighbors):\n{}"
        .format(H_actual))
    e = energy_H(H_actual)
    g = gradient_energy_H(H_actual)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    H_point = transform_hToH(np.array([0.2, 0.6, 0.2]), 3)
    print("\n2nd example point: H_point:\n{}".format(H_point))
    e = energy_H(H_point)
    g = gradient_energy_H(H_point)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    H_point2 = H_point - 0.45 * g
    print(
        "\n3rd example point in opposite direction of gradient: H_point2=H_point-0.45*gradient:\n{}"
        .format(H_point2))
    e = energy_H(H_point2)
    g = gradient_energy_H(H_point2)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    # --- Gradient at multiple points for distance 5
    distance = 5
    weights = [0, 0, 0, 0, 1]
    print("\n= Defining the gradient function with distance={} and weights={}".
          format(distance, weights))
    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)
    energy_H = define_energy_H(weights=weights,
                               distance=distance,  # BUG FIX: was hardcoded to 1, inconsistent with the distance-5 setup above
                               H_vec_observed=H_vec)

    H_actual = H_vec[0]
    print("1st point: H_actual:\n{}".format(H_actual))
    e = energy_H(H_actual)
    g = gradient_energy_H(H_actual)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    H_point = transform_hToH(np.array([0.2, 0.6, 0.2]), 3)
    print("\n2nd point: H_point:\n{}".format(H_point))
    e = energy_H(H_point)
    g = gradient_energy_H(H_point)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    H_point2 = H_point - 1.5 * g
    print(
        "\n3rd point in opposite direction of gradient: H_point2:\n{}".format(
            H_point2))
    e = energy_H(H_point2)
    g = gradient_energy_H(H_point2)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)
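
Aside (not part of the original example): test_gradient evaluates the energy and its
analytic gradient at several points and checks that stepping against the gradient
lowers the energy. Below is a minimal, self-contained sketch of that same sanity
check on a toy quadratic energy that merely stands in for define_energy_H /
define_gradient_energy_H; all names and values here are illustrative.

import numpy as np

def energy(H, H_obs):
    return 0.5 * np.sum((H - H_obs) ** 2)  # toy energy: squared distance to the observed H

def gradient(H, H_obs):
    return H - H_obs  # analytic gradient of the toy energy

H_obs = np.array([[0.1, 0.8, 0.1],
                  [0.8, 0.1, 0.1],
                  [0.1, 0.1, 0.8]])
H = np.full((3, 3), 1.0 / 3)  # start from the uninformative uniform point

# Finite-difference check of one gradient entry
eps = 1e-6
H_pert = H.copy()
H_pert[0, 0] += eps
num_grad_00 = (energy(H_pert, H_obs) - energy(H, H_obs)) / eps
assert abs(num_grad_00 - gradient(H, H_obs)[0, 0]) < 1e-4

# A step against the gradient decreases the energy, mirroring the
# "3rd example point" checks in the test above
H2 = H - 0.45 * gradient(H, H_obs)
assert energy(H2, H_obs) < energy(H, H_obs)
print("gradient check passed; energy: {:.4f} -> {:.4f}".format(
    energy(H, H_obs), energy(H2, H_obs)))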