Example #1
import numpy as np
from numpy import linalg as LA

# create_parameterized_H and to_centering_beliefs are project-local helpers
# (their imports are omitted in the original listing)
def test_Diff_matrices():
    """Creates two parameterized Hs, shows that standardized versions are identical"""
    print(
        "\n-- test_Diff_matrices(): 'create_parameterized_H', 'to_centering_beliefs', 'np.std', 'LA.norm' --"
    )
    h = 2
    H0 = create_parameterized_H(3, h, symmetric=True)
    print("H0:\n{}\n".format(H0))

    H0c = to_centering_beliefs(H0)
    print("H0c (centered):\n{}\n".format(H0c))

    std_H0 = np.std(H0)
    print("std(H0): {}".format(std_H0))
    std_H0c = np.std(H0c)
    print("std(H0c): {}\n".format(std_H0c))
    H0c_s = H0c.dot(1 / std_H0c)
    print("H0c_s (standardized centered):\n{}\n".format(H0c_s))

    H1 = create_parameterized_H(3, h * 4, symmetric=True)
    print("H1 (4 times stronger potential):\n{}\n".format(H1))
    H1c = to_centering_beliefs(H1)
    H1c_s = H1c.dot(1 / np.std(H1c))
    print("H1c_s (standardized centered):\n{}\n".format(H1c_s))

    diff = LA.norm(H0c_s - H1c_s)
    print("LA.norm(H0c_s - H1c_s) is quasi 0:\n{}\n".format(diff))
Example #2
    def calculate_accuracy(H,
                           X_train,
                           X_test,
                           train_ind,
                           test_ind,
                           W,
                           return_output,
                           s=0.5):  # all that is needed to propagate
        H0c = to_centering_beliefs(H)
        # alpha, beta, gamma, X2, and numMax are closed over from the enclosing scope

        eps_max = eps_convergence_linbp_parameterized(H0c,
                                                      W,
                                                      method='noecho',
                                                      alpha=alpha,
                                                      beta=beta,
                                                      gamma=gamma,
                                                      X=X2)

        eps = s * eps_max

        F, actualIt, actualPercentageConverged = \
            linBP_symmetric_parameterized(X_train, W, H*eps,
                                          method='noecho',
                                          alpha=alpha, beta=beta, gamma=gamma,
                                          numMaxIt=numMax,
                                          convergencePercentage=0.99,
                                          convergenceThreshold=0.99,
                                          debug=2)

        n, k = F.shape
        # Keep predictions only for test rows; zero out all others so they
        # cannot contribute to the accuracy computation below
        for i in range(n):
            if i not in test_ind:
                F[i] = np.zeros(k)

        accuracy_X = matrix_difference(X_test,
                                       F,
                                       ignore_rows=list(train_ind),
                                       similarity='accuracy')

        print("Holdout accuracy: {}".format(accuracy_X))
        return_output.put(accuracy_X)  ## For Parallel
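The return_output.put(...) above is the usual Queue hand-off for multiprocessing. A minimal driver sketch, assuming a fork start method (a nested calculate_accuracy cannot be pickled under spawn); the job tuples and the helper name run_splits_in_parallel are hypothetical:

import multiprocessing

def run_splits_in_parallel(split_jobs):
    # split_jobs: list of (H, X_train, X_test, train_ind, test_ind, W) tuples
    queue = multiprocessing.Queue()
    procs = [multiprocessing.Process(target=calculate_accuracy,
                                     args=job + (queue,))
             for job in split_jobs]
    for p in procs:
        p.start()
    accuracies = [queue.get() for _ in procs]  # one put() per worker
    for p in procs:
        p.join()
    return accuracies

Draining the queue before join() sidesteps the classic deadlock where a child blocks while writing a large result into the pipe.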
Example #3
    def calculate_accuracy(H, X_train, X_test, train_ind, test_ind, W, s=0.5):
        """Propagates from X_train numMax times, calculates accuracy over X_test
        """
        H0c = to_centering_beliefs(H)
        eps_max = eps_convergence_linbp_parameterized(
            H0c,
            W,  # TODO: an optimized version could attempt to calculate the spectral radius fewer times and re-use it for multiple splits
            method='noecho',
            alpha=alpha,
            beta=beta,
            gamma=gamma,
            X=X2)
        eps = s * eps_max
        F, actualIt, actualPercentageConverged = linBP_symmetric_parameterized(
            X_train,
            W,
            H * eps,
            method='noecho',
            alpha=alpha,
            beta=beta,
            gamma=gamma,
            numMaxIt=numMax,
            convergencePercentage=0.99,
            convergenceThreshold=0.99,
            debug=2)
        n, k = F.shape
        for i in range(n):  # zero out non-test rows before scoring
            if i not in test_ind:
                F[i] = np.zeros(k)

        # TODO For label imbalance, better to use CLASSWISE (macro-averaging) here
        accuracy_X = matrix_difference(X_test,
                                       F,
                                       ignore_rows=list(train_ind),
                                       similarity='accuracy')
        # print("accuracy now is {}".format(accuracy_X))
        return accuracy_X
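A hedged sketch of how such a helper might be averaged over several random splits, the way a holdout baseline would use it; mean_holdout_accuracy is a hypothetical name, and replace_fraction_of_rows is assumed to work with its remaining keyword arguments at their defaults:

import numpy as np

def mean_holdout_accuracy(H, X0, W, f, numberOfSplits=5, s=0.5):
    # Hide a (1 - f) fraction of rows, score on the hidden rows, repeat,
    # and average; calculate_accuracy is the helper defined above.
    scores = []
    for _ in range(numberOfSplits):
        X_train, hidden_ind = replace_fraction_of_rows(X0, 1 - f, W=W)
        train_ind = np.setdiff1d(np.arange(X0.shape[0]), hidden_ind)
        scores.append(calculate_accuracy(H, X_train, X0, train_ind,
                                         set(hidden_ind), W, s=s))
    return np.mean(scores)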
Example #4
def _f_worker_(X0, W, f, f_index):
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds the stdlib random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the numpy random generator; both are used and thus needed

    # avoidNeighbors, stratified, and err are globals set elsewhere in the module
    X1, ind = replace_fraction_of_rows(X0,
                                       1 - f,
                                       avoidNeighbors=avoidNeighbors,
                                       W=W,
                                       stratified=stratified)
    X2 = introduce_errors(X1, ind, err)


    for option_index, (label, select_lambda, learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \
            enumerate(zip(labels, select_lambda_vec, learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)):
        learn_time = -1
        # -- Learning
        if learning_method == 'GT':
            H2c = H0c
        elif learning_method == 'Heuristic':
            # print('Heuristic')
            H2c = H_heuristic

        elif learning_method == 'Holdout':
            # print('Holdout')
            H2 = estimateH_baseline_serial(
                X2,
                ind,
                W,
                numMax=numMaxIt,
                # ignore_rows=ind,
                numberOfSplits=numberOfSplits,
                # method=learning_method, variant=1,
                # distance=length,
                EC=EC,
                alpha=alpha,
                beta=beta,
                gamma=gamma,
                doubly_stochastic=doubly_stochastic)
            H2c = to_centering_beliefs(H2)

        else:
            if "DCEr" in learning_method:
                learning_method = "DCEr"
            elif "DCE" in learning_method:
                learning_method = "DCE"

            # -- choose optimal lambda: allows to specify different lambda for different f
            # print("option: ", option_index)
            if select_lambda:
                weight = lambda_vec[f_index]
                # print("weight : ", weight)
            else:
                weight = weights

            # -- learn H
            learn_start = time.time()
            H2 = estimateH(X2,
                           W,
                           method=learning_method,
                           variant=1,
                           distance=length,
                           EC=EC,
                           weights=weight,
                           randomrestarts=num_restarts,
                           randomize=randomize,
                           constraints=constraints,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            learn_time = time.time() - learn_start
            H2c = to_centering_beliefs(H2)

        # if learning_method not in ['GT', 'GS']:

        # print(FILENAMEZ, f, learning_method)
        # print(H2c)

        # -- Propagation
        prop_start = time.time()
        # X2c = to_centering_beliefs(X2, ignoreZeroRows=True)       # try without
        eps_max = eps_convergence_linbp_parameterized(H2c,
                                                      W,
                                                      method='noecho',
                                                      alpha=alpha,
                                                      beta=beta,
                                                      gamma=gamma,
                                                      X=X2)
        eps = s * eps_max
        # print("Max eps: {}, eps: {}".format(eps_max, eps))
        # eps = 1

        try:
            F, actualIt, actualPercentageConverged = \
                linBP_symmetric_parameterized(X2, W, H2c * eps,
                                              method='noecho',
                                              alpha=alpha, beta=beta, gamma=gamma,
                                              numMaxIt=numMaxIt,
                                              convergencePercentage=convergencePercentage_W,
                                              debug=2)
            prop_time = time.time() - prop_start
            if Macro_Accuracy:
                accuracy_X = matrix_difference_classwise(X0,
                                                         F,
                                                         ignore_rows=ind)
                precision = matrix_difference_classwise(X0,
                                                        F,
                                                        similarity='precision',
                                                        ignore_rows=ind)
                recall = matrix_difference_classwise(X0,
                                                     F,
                                                     similarity='recall',
                                                     ignore_rows=ind)
            else:
                accuracy_X = matrix_difference(X0, F, ignore_rows=ind)
                precision = matrix_difference(X0,
                                              F,
                                              similarity='precision',
                                              ignore_rows=ind)
                recall = matrix_difference(X0,
                                           F,
                                           similarity='recall',
                                           ignore_rows=ind)

            result = [str(datetime.datetime.now())]
            text = [
                label, f, accuracy_X, precision, recall, learn_time, prop_time
            ]
            result.extend(text)
            # print("method: {}, f: {}, actualIt: {}, accuracy: {}, precision:{}, recall: {}, learning time: {}, propagation time: {}".format(label, f, actualIt, accuracy_X, precision, recall, learn_time, prop_time))
            save_csv_record(join(data_directory, csv_filename), result)

        except ValueError as e:

            print("ERROR: {} with {}: d={}, h={}".format(
                e, learning_method, d, h))
            raise e

    return 'success'
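Example #5 below hands (X0, W, f, f_index) tuples to pool.map_async via a multi_run_wrapper; its definition is not part of this listing, but the standard Python 2.7 workaround for the missing starmap looks like this:

def multi_run_wrapper(args):
    # Pool.map_async in Python 2.7 passes a single argument to the worker
    # (there is no starmap), so the tuple is unpacked here.
    return _f_worker_(*args)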
Example #5
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    global n
    global d
    global rep_SameGraph
    global FILENAMEZ
    global csv_filename
    global initial_h0
    global exponent
    global length
    global variant

    global alpha_vec
    global beta_vec
    global gamma_vec
    global s_vec
    global clip_on_vec
    global numMaxIt_vec

    # Plotting Parameters
    global xtick_lab
    global xtick_labels
    global ytick_lab
    global xmax
    global xmin
    global ymin
    global ymax
    global labels
    global facecolor_vec
    global draw_std_vec
    global linestyle_vec
    global linewidth_vec
    global marker_vec
    global markersize_vec
    global legend_location

    global option_vec
    global learning_method_vec

    global Macro_Accuracy
    global EC
    global constraints
    global weight_vec
    global randomize_vec
    global k
    global err
    global avoidNeighbors
    global convergencePercentage_W
    global stratified
    global gradient
    global doubly_stochastic
    global num_restarts
    global numberOfSplits
    global H_heuristic

    global select_lambda_vec
    global lambda_vec
    global f_vec
    global H0c

    # -- Setup
    CHOICE = choice
    #300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    experiments = [CHOICE]
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf

    SHOW_FIG = SHOW_PLOT or SHOW_PDF or CREATE_PDF
    STD_FILL = True
    TIMING = False
    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 10  # iterations on same graph

    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1

    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.001, 0.01, 0.1, 1]
    xtick_labels = ['0.1\%', '1\%', '10\%', '100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [False] * 4 + [True]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']

    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10] * 2
    randomize_vec = [False] * 4 + [True] * 2
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True
    num_restarts = None

    draw_std_vec = range(10)
    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    H_heuristic = ""

    def choose(choice):
        global n
        global d
        global rep_SameGraph
        global FILENAMEZ
        global initial_h0
        global exponent
        global length
        global variant

        global alpha_vec
        global beta_vec
        global gamma_vec
        global s_vec
        global clip_on_vec
        global numMaxIt_vec

        # Plotting Parameters
        global xtick_lab
        global xtick_labels
        global ytick_lab
        global xmax
        global xmin
        global ymin
        global ymax
        global labels
        global facecolor_vec
        global draw_std_vec
        global linestyle_vec
        global linewidth_vec
        global marker_vec
        global markersize_vec
        global legend_location

        global option_vec
        global learning_method_vec

        global Macro_Accuracy
        global EC
        global constraints
        global weight_vec
        global randomize_vec
        global k
        global err
        global avoidNeighbors
        global convergencePercentage_W
        global stratified
        global gradient
        global doubly_stochastic
        global num_restarts
        global numberOfSplits
        global H_heuristic

        global select_lambda_vec
        global lambda_vec
        global f_vec

        # -- Default Graph parameters

        if choice == 0:
            pass

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]

        elif choice == 305:  # DCEr Only experiment
            choose(605)
            choose(304)

            select_lambda_vec = [False] * 6

        elif choice == 306:
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 307:  # heuristic comparison
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Heuristic')
            labels.append('Heuristic')
            H_heuristic = np.array([[.476, .0476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- MovieLens dataset
        elif choice == 401:
            FILENAMEZ = 'movielens'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'MovieLens'
            legend_location = 'upper left'

            n = 26850
            d = 25.0832029795

        elif choice == 402:
            choose(401)
            select_lambda_vec = [False] * 3 + [True] * 3  # allow choosing lambda per f in f_vec

            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 403:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 404:
            choose(401)

            select_lambda_vec = [True] * 3  # allow choosing lambda per f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

            labels = ['GS', 'DCEr', 'Homophily']
            facecolor_vec = ['black', "#C44E52", "#64B5CD"]
            draw_std_vec = [False, True, False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 2, 2, 2, 2]
            marker_vec = [None, '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]

            weight_vec = [None, 10, None]
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
            randomize_vec = [False, True, False]
            learning_method_vec = ['GT', 'DHE']  #TODO

        elif choice == 405:  # DCEr ONLY experiment
            choose(605)
            choose(401)
            learning_method_vec += ['Holdout']
            labels += ['Holdout']

        elif choice == 406:  # comparison with a static heuristic matrix
            choose(402)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        elif choice == 407:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [1] * 21  # same length as f_vec

        elif choice == 408:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # DO NOT RUN WITH CREATE_DATA=True, if you do please restore the data from
        # data/sigmod-movielens-fig.csv
        elif choice == 409:
            choose(402)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#8172B2", "#C44E52",
                "#C44E52", "#CCB974", "#64B5CD"
            ]
            labels = [
                'GS', 'LCE', 'MCE', 'DCE1', 'DCE10', 'DCEr1', 'DCEr10',
                'Holdout'
            ]
            draw_std_vec = [False] * 5 + [True] * 2 + [False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [2, 2, 2, 2, 2, 2, 2, 2]
            marker_vec = [None, 'o', 'x', 's', 'p', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8, 8]
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]
            legend_location = 'upper left'
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'

            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)

            select_lambda_vec = [False] * 4 + [True]  # allow choosing lambda per f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)

            select_lambda_vec = [False] * 3 + [True] * 2  # allow choosing lambda per f in f_vec
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 604:  ## with weight = 1
            draw_std_vec = [4]
            choose(603)

            lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 605:
            choose(601)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD", 'orange'
            ]
            draw_std_vec = [False] + [True] * 10
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [3] * 10
            marker_vec = [None, 'o', 'x', '^', 'v', '+', 'o', 'x']
            markersize_vec = [0] + [8] * 10

            randomize_vec = [True] * 8
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]

            learning_method_vec = [
                'GT', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE'
            ]
            select_lambda_vec = [False] * 8
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            weight_vec = [0, 0, 1, 2, 5, 10, 15]

            labels = ['GT'] + [
                i + ' {}'.format(weight_vec[ix])
                for ix, i in enumerate(['DCEr'] * 6)
            ]

        elif choice == 606:  # heuristic experiment
            choose(602)
            labels.append('Heuristic')
            learning_method_vec.append('Heuristic')
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)

            select_lambda_vec = [False] * 4 + [True] * 2  # allow choosing lambda per f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)

            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(605)
            choose(801)
            #learning_method_vec += ['Holdout']
            #labels += ['Holdout']
        elif choice == 806:  # Heuristic experiment
            choose(802)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[0.76, 0.08, 0.08, 0.08],
                                    [0.08, 0.08, 0.76, 0.08],
                                    [0.08, 0.76, 0.08, 0.76],
                                    [0.08, 0.08, 0.76, 0.08]])

        elif choice == 821:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            constraints = True  # True
            gradient = True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [0.2, 0.2]

            randomize_vec = [False] * 4 + [True]
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.7
            labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Enron'
            legend_location = 'lower right'
            n = 46463  # for figures
            d = 23.4  # for figures

            alpha = 0.0
            beta = 0.0
            gamma = 0.0
            s = 0.5
            numMaxIt = 10

            select_lambda_vec = [False] * 3 + [True] * 2
            lambda_vec = [0.2] * 13 + [10] * 8  # same length as f_vec

        # -- Cora dataset
        elif choice == 901:
            FILENAMEZ = 'cora'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.9
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Cora'
            legend_location = 'lower right'
            n = 2708
            d = 7.8

        # -- Citeseer dataset
        elif choice == 1001:
            FILENAMEZ = 'citeseer'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Citeseer'
            legend_location = 'lower right'
            n = 3312
            d = 5.6

        elif choice == 1101:
            FILENAMEZ = 'hep-th'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.1
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Hep-th'
            legend_location = 'lower right'
            n = 27770
            d = 5.6

        elif choice == 1102:
            choose(1101)
            Macro_Accuracy = True

        elif choice == 1204:
            FILENAMEZ = 'pokec-gender'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.000015
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [0, 3, 4, 4, 4, 4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Pokec-Gender'
            legend_location = 'lower right'
            n = 1632803
            d = 54.6

        else:
            raise Warning("Incorrect choice!")

    for choice in experiments:

        choose(choice)
        filename = 'Fig_End-to-End_accuracy_realData_{}_{}'.format(
            choice, FILENAMEZ)
        csv_filename = '{}.csv'.format(filename)

        header = [
            'currenttime', 'method', 'f', 'accuracy', 'precision', 'recall',
            'learntime', 'proptime'
        ]
        if CREATE_DATA:
            save_csv_record(join(data_directory, csv_filename),
                            header,
                            append=False)

        # print("choice: {}".format(choice))

        # --- print data statistics
        if CALCULATE_DATA_STATISTICS:

            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())
            d = (len(W.nonzero()[0]) * 2) / n

            k = len(X0[0])

            print("FILENAMEZ:", FILENAMEZ)
            print("k:", k)
            print("n:", n)
            print("d:", d)

            # -- Graph statistics
            n_vec = calculate_nVec_from_Xd(Xd)
            print("n_vec:\n", n_vec)
            d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
            print("d_vec:\n", d_vec)
            P = calculate_Ptot_from_graph(W, Xd)
            print("P:\n", P)
            for i in range(k):
                Phi = calculate_degree_correlation(W, X0, i, NB=True)
                print("Degree Correlation, Class {}:\n{}".format(i, Phi))

            # -- Various compatibilities
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            print("H0 w/  constraints:\n", np.round(H0, 2))
            #raw_input() # Why?

            H2 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H4 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H5 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H6 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H7 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)

            print()
            # print("H MCE w/o constraints:\n", np.round(H0, 3))
            print("H MCE w/  constraints:\n", np.round(H2, 3))
            # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
            print("H DCE 2 w/  constraints:\n", np.round(H5, 3))
            # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
            print("H DCE 20 w/  constraints:\n", np.round(H7, 3))

            print()
            H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
            print("H_est_1:\n", np.round(H_row_vec[0], 3))
            print("H_est_2:\n", np.round(H_row_vec[1], 3))
            print("H_est_3:\n", np.round(H_row_vec[2], 3))

        # --- Create data
        if CREATE_DATA or ADD_DATA:

            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())  ## number of nodes in graph
            k = len(X0[0])
            d = (len(W.nonzero()[0]) * 2) / n
            #print(n)
            #print(d)
            #print("contraint = {}".format(constraints))
            #print('select lambda: {}'.format(len(select_lambda_vec)))
            #print('learning method: {}'.format(len(learning_method_vec)))
            #print('alpha: {}'.format(len(alpha_vec)))
            #print('beta: {}'.format(len(beta_vec)))
            #print('gamma: {}'.format(len(gamma_vec)))
            #print('s: {}'.format(len(s_vec)))
            #print('maxit: {}'.format(len(numMaxIt_vec)))
            #print('weight: {}'.format(len(weight_vec)))
            #print('randomize: {}'.format(len(randomize_vec)))
            # ---  Calculating True Compatibility matrix
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=constraints,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            # print(H0)
            H0c = to_centering_beliefs(H0)

            num_results = len(f_vec) * len(learning_method_vec) * rep_SameGraph

            # Start a process pool (not threads) with at least 2 workers, and many more on machines with many cores
            pool = multiprocessing.Pool(max(2,
                                            multiprocessing.cpu_count() - 4))

            f_processes = f_vec * rep_SameGraph
            workers = []
            results = [(X0, W, f, ix)
                       for ix, f in enumerate(f_vec)] * rep_SameGraph
            # print('Expected results: {}'.format(num_results))
            try:  # hacky fix due to a bug in 2.7 multiprocessing
                # Distribute work for evaluating accuracy over the thread pool using
                # a hacky method due to python 2.7 multiprocessing not being fully
                # featured
                pool.map_async(multi_run_wrapper, results).get(num_results * 2)
            except multiprocessing.TimeoutError as e:
                continue
            finally:
                pool.close()
                pool.join()

        # -- Read data for all options and plot
        df1 = pd.read_csv(join(data_directory, csv_filename))
        acc_filename = '{}_accuracy_plot.pdf'.format(filename)
        pr_filename = '{}_PR_plot.pdf'.format(filename)
        if TIMING:
            print('=== {} Timing Results ==='.format(FILENAMEZ))
            print('Prop Time:\navg: {}\nstddev: {}'.format(
                np.average(df1['proptime'].values),
                np.std(df1['proptime'].values)))
            for learning_method in labels:
                rs = df1.loc[df1["method"] == learning_method]
                avg = np.average(rs['learntime'])
                std = np.std(rs['learntime'])
                print('{} Learn Time:\navg: {}\nstd: {}'.format(
                    learning_method, avg, std))

        sslhv.plot(df1,
                   join(figure_directory, acc_filename),
                   n=n,
                   d=d,
                   k=k,
                   labels=labels,
                   dataset=FILENAMEZ,
                   line_styles=linestyle_vec,
                   xmin=xmin,
                   ymin=ymin,
                   xmax=xmax,
                   ymax=ymax,
                   marker_sizes=markersize_vec,
                   draw_stds=draw_std_vec,
                   markers=marker_vec,
                   line_colors=facecolor_vec,
                   line_widths=linewidth_vec,
                   legend_location=legend_location,
                   show=SHOW_PDF,
                   save=CREATE_PDF,
                   show_plot=SHOW_PLOT)
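A hypothetical invocation of the driver above; per the choose() table, choice 304 is the Prop37 experiment:

if __name__ == '__main__':
    # Recreate the CSV for choice 304 and write the accuracy figure
    # to PDF without opening a viewer.
    run(304, create_data=True, create_pdf=True, show_pdf=False)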
Example #6
def test_approx_spectral_radius():
    print("\n-- 'approx_spectral_radius' --")

    # --- Create the graph
    n = 1000
    alpha0 = [0.3334, 0.3333, 0.3333]
    h = 5
    P = np.array([[1, h, 1], [h, 1, 1], [1, 1, h]])
    m = 10000
    distribution = 'powerlaw'  # uniform powerlaw
    exponent = -0.3
    backEdgesAllowed = True
    sameInAsOutDegreeRanking = False
    debug = False
    start = time.time()
    W, Xd = planted_distribution_model(
        n,
        alpha=alpha0,
        P=P,
        m=m,
        distribution=distribution,
        exponent=exponent,
        backEdgesAllowed=backEdgesAllowed,
        sameInAsOutDegreeRanking=sameInAsOutDegreeRanking,
        debug=debug)
    print("n: {}".format(n))
    print("Time for graph generation: {}\n".format(time.time() - start))

    # --- Bigger graph with Kronecker
    M = kron(W.transpose(), P)

    # --- Time two variants
    start = time.time()
    rho1 = approx_spectral_radius(M, pyamg=True)
    print("Time for pyamg spectral radius: {}".format(time.time() - start))
    print("rho1: {}".format(rho1))

    start = time.time()
    rho2 = approx_spectral_radius(M, pyamg=False)
    print("Time for scipy spectral radius: {}".format(time.time() - start))
    print("rho2: {}".format(rho2))

    # --- 3 methods for spectral radius, including non-sparse matrices
    H = create_parameterized_H(20, 2, symmetric=True)
    H = to_centering_beliefs(H)
    print("\nH:\n{}".format(H))

    start = time.time()
    rho1 = approx_spectral_radius(H, pyamg=True)
    print("Time for pyamg spectral radius: {}".format(time.time() - start))
    print("rho1: {}".format(rho1))

    start = time.time()
    rho2 = approx_spectral_radius(H, pyamg=False)
    print("Time for scipy spectral radius: {}".format(time.time() - start))
    print("rho2: {}".format(rho2))

    start = time.time()
    rho3 = approx_spectral_radius(H, pyamg=False, sparse=False)
    print("Time for non-sparse numpy spectral radius: {}".format(time.time() -
                                                                 start))
    print("rho3: {}".format(rho3))

    # --- For k=2, scipy is made to default to the non-sparse variant
    H = create_parameterized_H(2, 2, symmetric=True)
    print("\nH (k=2):\n{}".format(H))

    start = time.time()
    rho4 = approx_spectral_radius(H, pyamg=False)
    print("Time for scipy spectral radius: {}".format(time.time() - start))
    print("rho4: {}".format(rho4))
Example #7
def test_matrix_difference():
    print(
        "\n-- 'matrix_difference' (cosine/cosine_ratio/l2), 'to_centering_beliefs' --"
    )
    X0 = np.array([
        [2, 0, 0],
        [2, 0, 2],
        [0, 1, 0],
        [0, 0, 3],
        [0, 0, 3],
        [1, 0, 2],
        [0, 3, 3],
        [0, 0, 0],
        [0, 1, 0],
        [0, 1, 0],
        [9, 9, 9],
        [9, 9, 9],
        [100, 100, 100],
    ])
    X1 = np.array([
        [1, 1, 2],
        [2, 1, 2],
        [3, 4, 0],
        [1, 1, 2],
        [2, 1, 1],
        [1, 2, 2],
        [1, 2, 3],
        [0, 0, 0],
        [1, 0, 0],
        [0, 2, 0],
        [9, 9, 9],
        [8, 9, 9],
        [100, 100, 101],
    ])
    print("X0:\n", X0)
    print("X1:\n", X1)

    result = matrix_difference(X0, X1, similarity='cosine', vector=True)
    print("cosine:\n", result)
    result = matrix_difference(X0, X1, similarity='cosine_ratio', vector=True)
    print("cosine_ratio:\n", result)
    result = matrix_difference(X0, X1, similarity='l2', vector=True)
    print("l2:\n", result)

    X0 = np.array([[1., 0., 0.], [0.30804075, 0.56206462, 0.12989463],
                   [0.32434628, 0.33782686, 0.33782686],
                   [0.30804075, 0.12989463, 0.56206462],
                   [0.14009173, 0.71981654, 0.14009173],
                   [0.32273419, 0.21860539, 0.45866042],
                   [0.33804084, 0.32391832, 0.33804084],
                   [0.45866042, 0.21860539, 0.32273419]])
    X1 = np.array([[1., 0., 0.], [0.22382029, 0.45296374, 0.32321597],
                   [0.32434628, 0.33782686, 0.33782686],
                   [0.22382029, 0.32321597, 0.45296374],
                   [0.2466463, 0.5067074, 0.2466463],
                   [0.32273419, 0.21860539, 0.45866042],
                   [0.33804084, 0.32391832, 0.33804084],
                   [0.45866042, 0.21860539, 0.32273419]])
    print("\nX0:\n", X0)
    print("X1:\n", X1)

    result = matrix_difference(X0, X1, similarity='cosine_ratio', vector=True)
    print("cosine_ratio:\n", result)

    # X0z = row_normalize_matrix(X0, norm='zscores')
    # X1z = row_normalize_matrix(X1, norm='zscores')
    X0z = to_centering_beliefs(X0)
    X1z = to_centering_beliefs(X1)

    print("\nX0z:\n", X0z)
    print("X1z:\n", X1z)

    result = matrix_difference(X0z,
                               X1z,
                               similarity='cosine_ratio',
                               vector=True)
    print("cosine_ratio zscores:\n", result)

    # actualPercentageConverged = matrix_convergence_percentage(X0z, X1z, threshold=convergenceCosineSimilarity)

    X0 = np.array([1, 0, 0])
    X1 = np.array([1, 1, 0])
    print("\nX0:\n", X0)
    print("X1:\n", X1)
    result = matrix_difference(X0, X1, similarity='cosine_ratio', vector=True)
    print("cosine_ratio zscores:\n", result)

    X0 = np.array([-30, -15, 45])
    X1 = np.array([-15, -30, 45])
    print("\nX0:\n", X0)
    print("X1:\n", X1)
    result = matrix_difference(X0, X1, similarity='cosine_ratio', vector=True)
    print("cosine_ratio zscores:\n", result)
Example #8
def test_transform_beliefs():
    print("\n-- 'check_normalized_beliefs', 'to_centering_beliefs' --")
    X = np.array([[1.0001, 0, 0]])
    print("X:", X)
    assert check_normalized_beliefs(X)
    print("X centered:", to_centering_beliefs(X))

    Y = np.array([0.9999, 0, 0])
    print("Y:", Y)
    assert check_normalized_beliefs(Y)
    print("Y centered:", to_centering_beliefs(Y))

    Z = np.array([[1.001, 0, 0]])
    print("Z:", Z)
    assert not check_normalized_beliefs(Z)

    W = np.array([0.999, 0, 0])
    print("W:", W)
    assert not check_normalized_beliefs(W)

    print("\n-- 'check_centered_beliefs', 'from_centering_beliefs'")
    Xc = np.array([[1.0001, -1, 0]])
    print("Xc: ", Xc)
    assert check_centered_beliefs(Xc)
    print("Xc uncentered: ", from_centering_beliefs(Xc))

    Yc = np.array([0.9999, -1, 0])
    print("Yc: ", Yc)
    assert check_centered_beliefs(Yc)
    print("Yc uncentered: ", from_centering_beliefs(Yc))

    Zc = np.array([[1.001, -1, 0]])
    print("Zc: ", Zc)
    assert not check_centered_beliefs(Zc)

    Wc = np.array([0.999, -1, 0])
    print("Wc: ", Wc)
    assert not check_centered_beliefs(Wc)

    print(
        "\n-- 'to_centering_beliefs', 'from_centering_beliefs' for matrices --"
    )
    X = np.array([[1, 0, 0], [0.8, 0.2, 0], [1. / 3, 1. / 3, 1. / 3],
                  [0, 0, 1], [0, 0, 1], [0, 0, 0], [0, 0, 0]])
    print("X original:\n", X)
    print("np.sum(X,1):\n", np.sum(X, 1))
    print("X.sum(axis=1, keepdims=True):\n", X.sum(axis=1, keepdims=True))
    print("X.shape:", X.shape)
    print("len(X.shape): ", len(X.shape))

    Xc = to_centering_beliefs(X, ignoreZeroRows=True)
    print("X centered (ignoringZeroRows=True):\n", Xc)
    Y = from_centering_beliefs(Xc)
    print("X again un-centered:\n", Y)

    fileNameX = join(data_directory, 'Torus_X.csv')
    X, _, _ = load_X(fileNameX, n=8, zeroindexing=False)
    X = X.dot(0.1)
    print("\nCentered X for Torus example as input\n", X)
    Xc = from_centering_beliefs(X)
    print("X un-centered:\n", Xc)

    X = np.array([[1, 0, 0]])
    print("\nX original:\n", X)
    Xc = to_centering_beliefs(X)
    print("X centered:\n", Xc)
    Y = from_centering_beliefs(Xc)
    print("X back non-centered:\n", Y)

    X = np.array([1, 0, 0])
    print("\nX original:\n", X)
    print("np.sum(X,0):", np.sum(X, 0))
    print("X.sum(axis=0, keepdims=True):", X.sum(axis=0, keepdims=True))
    print("X.shape: ", X.shape)
    print("len(X.shape): ", len(X.shape))
Example #9
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False):
    CHOICE = choice

    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf

    STD_FILL = True
    SHORTEN_LENGTH = shorten_length

    fig_filename = 'Fig_homophily_{}.pdf'.format(CHOICE)
    csv_filename = 'Fig_homophily_{}.csv'.format(CHOICE)
    header = ['currenttime',
              'option',
              'f',
              'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)


    # -- Default Graph parameters
    k = 3
    rep_DifferentGraphs = 1
    rep_SameGraph = 2
    initial_h0 = None
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    constraint = True

    variant = 1
    EC = True                   # Non-backtracking for learning
    global f_vec, labels, facecolor_vec

    s = 0.5
    err = 0
    numMaxIt = 10
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True


    clip_on_vec = [True] * 10
    draw_std_vec = range(10)
    ymin = 0.3
    ymax = 1
    xmin = 0.001
    xmax = 1
    xtick_lab = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    xtick_labels = ['1e-5', '0.01\%', '0.1\%', '1\%', '10\%', '100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [5, 2, 3, 3, 3, 3] + [3]*10
    marker_vec = [None, '^', 'v', 'o', '^'] + [None]*10
    markersize_vec = [0, 8, 8, 8, 6, 6] + [6]*10
    facecolor_vec = ['black', "#C44E52",  "#64B5CD"]


    # -- Options with propagation variants
    if CHOICE == 101:
        n = 10000
        h = 3
        d = 15
        f_vec = [0.9 * pow(0.1, 1 / 5) ** x for x in range(21)]
        option_vec = ['opt1', 'opt2', 'opt3']
        learning_method_vec = ['GT','DHE','Homophily']
        weight_vec = [None] + [10] + [None]
        randomize_vec = [None] + [True] + [None]
        xmin = 0.001
        ymin = 0.3
        ymax = 1
        labels = ['GS', 'DCEr', 'Homophily']

    else:
        raise Warning("Incorrect choice!")

    a = 1
    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)

    H0 = create_parameterized_H(k, h, symmetric=True)
    RANDOMSEED = None  # set to a fixed int for repeatability
    random.seed(RANDOMSEED)  # seeds the stdlib random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))


    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters

            W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d, distribution=distribution,
                                                      exponent=exponent, directed=False, debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for j in range(rep_SameGraph):  # repeat several times for same graph
                # print("Graph:{} and j: {}".format(i,j))

                ind = None
                for f in f_vec:
                    X1, ind = replace_fraction_of_rows(X0, 1-f, avoidNeighbors=avoidNeighbors, W=W, ind_prior=ind, stratified=stratified)
                    X2 = introduce_errors(X1, ind, err)

                    for option_index, (option, learning_method,  weights, randomize) in \
                            enumerate(zip(option_vec, learning_method_vec, weight_vec, randomize_vec)):

                        # -- Learning
                        if learning_method == 'GT':
                            H2 = H0
                        elif learning_method == 'Homophily':
                            H2 = np.identity(k)

                        elif learning_method == 'DHE':
                            H2 = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC, weights=weights, randomize=randomize, constraints=constraint)
                            # print("learning_method:", learning_method)
                            # print("H:\n{}".format(H2))

                        # -- Propagation
                        H2c = to_centering_beliefs(H2)
                        X2c = to_centering_beliefs(X2, ignoreZeroRows=True)

                        try:
                            eps_max = eps_convergence_linbp_parameterized(H2c, W,
                                                                          method='noecho',
                                                                          X=X2)
                            eps = s * eps_max

                            F, actualIt, actualPercentageConverged = \
                                linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                              method='noecho',
                                                              numMaxIt=numMaxIt,
                                                              convergencePercentage=convergencePercentage_W,
                                                              debug=2)
                        except ValueError as e:
                            print("ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                        else:
                            accuracy_X = matrix_difference_classwise(X0, F, ignore_rows=ind)


                            row = [str(datetime.datetime.now())]  # renamed from 'tuple' to avoid shadowing the builtin
                            text = [option_vec[option_index],
                                    f,
                                    accuracy_X]
                            row.extend(text)
                            # print("option: {}, f: {}, actualIt: {}, accuracy: {}".format(option_vec[option_index], f, actualIt, accuracy_X))
                            save_csv_record(join(data_directory, csv_filename), row)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))
    desired_decimals = 7
    df1['f'] = df1['f'].apply(lambda x: round(x, desired_decimals))                   # rounding due to different starting points
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))


    # Aggregate repetitions
    df2 = df1.groupby(['option', 'f']).agg \
        ({'accuracy': [np.mean, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(10)))

    # Pivot table
    df3 = pd.pivot_table(df2, index=['f'], columns=['option'], values=['accuracy_mean', 'accuracy_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(10)))

    # Extract values
    X_f = df3['f'].values                     # plot x values
    Y=[]
    Y_std=[]
    for option in option_vec:
        Y.append(df3['accuracy_mean_{}'.format(option)].values)
        if STD_FILL:
            Y_std.append(df3['accuracy_std_{}'.format(option)].values)


    if SHORTEN_LENGTH:
        SHORT_FACTOR = 2        ## KEEP EVERY Nth ELEMENT
        X_f  = np.copy(X_f[list(range(0, len(X_f), SHORT_FACTOR)), ])

        for i in range(len(Y)):
            Y[i] = np.copy(Y[i][list(range(0, len(Y[i]), SHORT_FACTOR)), ])
            if STD_FILL:
                Y_std[i] = np.copy(Y_std[i][list(range(0, len(Y_std[i]), SHORT_FACTOR)),])

    if CREATE_PDF or SHOW_PLOT or SHOW_PDF:

        # -- Setup figure
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['font.size'] = 16
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])


        #  -- Drawing
        if STD_FILL:
            for choice, (option, facecolor) in enumerate(zip(option_vec, facecolor_vec)):
                ax.fill_between(X_f, Y[choice] + Y_std[choice], Y[choice] - Y_std[choice],
                                facecolor=facecolor, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, Y[choice] + Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, Y[choice] - Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')

        for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
                enumerate(zip(option_vec, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):
            ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker,
                    markersize=markersize, markeredgewidth=1, clip_on=clip_on, markeredgecolor='black')

        plt.xscale('log')

        # -- Title and legend
        distribution_label = '$'
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        n_label = '{}k'.format(int(n / 1000))
        if n < 1000:
            n_label='{}'.format(n)
        a_label = ''
        if a != 1:
            a_label = r', a\!=\!{}'.format(a)   # raw string so '\!' reaches LaTeX unmangled

        titleString = r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}{}{}'.format(n_label, d, h, a_label, distribution_label)
        plt.title(titleString)

        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(handles, labels,
                            loc='upper left',     # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)


        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        grid(True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Label Sparsity $(f)$', labelpad=0)      # labelpad=0
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05)    # 'frameon' dropped: removed from savefig in current Matplotlib
        
        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))  # shows actually created PDF
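
# A minimal, self-contained sketch (not part of the original example) of the
# aggregate-then-pivot pattern used above: repetitions are averaged per
# (option, f), then pivoted so that each option becomes one plot-ready column.
# All numbers and option names below are made up for illustration.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'option':   ['opt1', 'opt1', 'opt2', 'opt2'],
                    'f':        [0.01, 0.01, 0.01, 0.01],
                    'accuracy': [0.80, 0.90, 0.60, 0.70]})
agg = toy.groupby(['option', 'f']).agg({'accuracy': [np.mean, np.std, np.size]})
agg.columns = ['_'.join(col).strip() for col in agg.columns.values]    # flatten the column hierarchy
agg.reset_index(inplace=True)                                          # remove the index hierarchy
pivoted = pd.pivot_table(agg, index=['f'], columns=['option'],
                         values=['accuracy_mean', 'accuracy_std'])
pivoted.columns = ['_'.join(col).strip() for col in pivoted.columns.values]
pivoted.reset_index(inplace=True)
print(pivoted)      # one row per f, with columns like 'accuracy_mean_opt1'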
Exemple #10
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False):

    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    STD_FILL = True


    csv_filename = 'Fig_End-to-End_accuracy_VaryK_{}.csv'.format(CHOICE)
    header = ['currenttime',
              'option',
              'k',
              'f',
              'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)


    # -- Default Graph parameters
    rep_SameGraph = 10       # iterations on same graph
    initial_h0 = None           # initial vector to start finding optimal H
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    variant = 1
    EC = True                   # Non-backtracking for learning
    ymin = 0.3
    ymax = 1
    xmax = 8
    xtick_lab = [2,3,4,5,6,7, 8]
    xtick_labels = ['2', '3', '4', '5', '6', '7', '8']
    ytick_lab = np.arange(0, 1.1, 0.1)
    f_vec = [0.9 * pow(0.1, 1 / 5) ** x for x in range(21)]
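    # 21 label fractions, geometrically spaced from 0.9 down to 9e-5 (each step multiplies by 0.1^(1/5) ~ 0.63)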
    k_vec = [3, 4, 5]
    rep_DifferentGraphs = 10   # iterations on different graphs
    err = 0
    avoidNeighbors = False
    gradient = False
    pruneRandom = False
    convergencePercentage_W = None
    stratified = True
    label_vec = ['*'] * 10
    clip_on_vec = [False] * 10
    draw_std_vec = range(10)
    numberOfSplits = 1
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [5, 4, 3, 3] + [3] * 10
    marker_vec = [None, None, 'o', 'x', 'o', '^', 'o', 'x', 'o', '^', 'o', 'x', 'o', '^']
    markersize_vec = [0, 0, 4, 8] + [6] * 10
    facecolor_vec = ["#4C72B0", "#55A868", "#C44E52", "#8172B2", "#CCB974", "#64B5CD"]


    # -- Options with propagation variants
    if CHOICE == 500:     ## 1k nodes
        n = 1000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        learning_method_vec = ['GS', 'MHE', 'DHE']
        weight_vec = [10] * 3
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 2 + [True]
        xmin = 3.
        ymin = 0.
        ymax = 1.
        label_vec = ['GS', 'MCE', 'DCEr']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5, 6]

    elif CHOICE == 501:        ## 10k nodes
        n = 10000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        learning_method_vec = ['GT', 'MHE', 'DHE']
        weight_vec = [10] * 3
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 2 + [True]
        xmin = 2.
        ymin = 0.
        ymax = 1.
        label_vec = ['GT', 'MCE', 'DCEr']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [2, 3, 4, 5]


    elif CHOICE == 502:        ## 10k nodes
        n = 10000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        ymin = 0.6
        ymax = 1.
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]

        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE']
        # k_vec = [2, 3, 4, 5]



    elif CHOICE == 503:        ## 10k nodes
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        ymin = 0.3
        ymax = 0.9
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [6, 7, 8]
        clip_on_vec = [True] * 10

        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE']
        # k_vec = [2, 3, 4, 5]



    elif CHOICE == 504:        ## 10k nodes
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        # k_vec = [2, 3, 4, 5, 6, 7, 8]
        k_vec = [7]
        clip_on_vec = [True] * 10




    elif CHOICE == 505:        ## 10k nodes    with f = 0.005
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.005]
        k_vec = [2, 3, 4, 5, 6, 7]
        # k_vec = [7]
        clip_on_vec = [True] * 10

    # elif CHOICE == 506:        ## 10k nodes    with f = 0.005
    #     n = 10000
    #     h = 3
    #     d = 25
    #     option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
    #     learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
    #     weight_vec = [10] * 10
    #     alpha_vec = [0] * 10
    #     beta_vec = [0] * 10
    #     gamma_vec = [0] * 10
    #     s_vec = [0.5] * 10
    #     numMaxIt_vec = [10] * 10
    #     randomize_vec = [False] * 4 + [True] + [False]
    #     xmin = 2
    #     xmax = 7
    #     ymin = 0.2
    #     ymax = 0.9
    #     label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr']
    #     facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
    #     f_vec = [0.005]
    #     k_vec = [2,3,4,5,6,7]
    #     # k_vec = [7]
    #     clip_on_vec = [True] * 10




    elif CHOICE == 506:        ## 10k nodes
        n = 10000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        label_vec = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.005]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [5]
        clip_on_vec = [True] * 10

        rep_SameGraph = 1       # iterations on same graph
        rep_DifferentGraphs = 1  # iterations on same graph

    elif CHOICE == 507:  ## 10k nodes   with gradient and PruneRandom
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GS', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        ymin = 0.1
        ymax = 0.9
        label_vec = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [6, 7, 8]
        clip_on_vec = [True] * 10

        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE']
        # k_vec = [2, 3, 4, 5]

        gradient = True
        pruneRandom = True


    elif CHOICE == 508:  ## 10k nodes   with gradient and PruneRandom
        n = 1000
        h = 3
        d = 10
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['GS', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True] + [False]
        xmin = 2
        ymin = 0.1
        ymax = 0.9
        label_vec = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [6, 7, 8]
        clip_on_vec = [True] * 10

        # option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        # learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE']
        # k_vec = [2, 3, 4, 5]

        gradient = True
        pruneRandom = True
        rep_DifferentGraphs = 1
        rep_SameGraph = 1



    else:
        raise ValueError("Incorrect choice!")


    RANDOMSEED = None  # set to a fixed int for repeatability; None draws a fresh seed every run
    random.seed(RANDOMSEED)  # seeds Python's built-in random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))


    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            # print("\ni: {}".format(i))

            for k in k_vec:
                # print("\nk: {}".format(k))

                H0 = create_parameterized_H(k, h, symmetric=True)
                H0c = to_centering_beliefs(H0)

                a = [1.] * k
                alpha0 = np.array(a)
                alpha0 = alpha0 / np.sum(alpha0)
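                # uniform class prior alpha0 over the k classes, passed to the planted graph model below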

                W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                          distribution=distribution,
                                                          exponent=exponent,
                                                          directed=False,
                                                          debug=False)
                X0 = from_dictionary_beliefs(Xd)

                for j in range(rep_SameGraph):  # repeat several times for same graph
                    # print("j: {}".format(j))

                    ind = None
                    for f in f_vec:             # remove a (1-f) fraction of rows from X0 (note: this differs from the first implementation)
                        X1, ind = replace_fraction_of_rows(X0, 1-f, avoidNeighbors=avoidNeighbors, W=W, ind_prior=ind, stratified=stratified)
                        X2 = introduce_errors(X1, ind, err)



                        for option_index, (learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \
                                enumerate(zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)):

                            # -- Learning
                            if learning_method == 'GT':
                                H2c = H0c
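                                # ground truth: skip learning and propagate with the true (centered) potential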


                            elif learning_method == 'Holdout':


                                H2 = estimateH_baseline_serial(X2, ind, W, numMax=numMaxIt,
                                                               # ignore_rows=ind,
                                                               numberOfSplits=numberOfSplits,
                                                               # method=learning_method, variant=1, distance=length,
                                                               EC=EC,
                                                               alpha=alpha, beta=beta, gamma=gamma)
                                H2c = to_centering_beliefs(H2)

                            elif learning_method != 'DHE':
                                H2 = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC, weights=weights, randomize=randomize)
                                H2c = to_centering_beliefs(H2)

                            else:
                                H2 = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC, weights=weights, randomize=randomize, gradient=gradient, randomrestarts=pruneRandom)
                                H2c = to_centering_beliefs(H2)


                            # -- Propagation
                            X2c = to_centering_beliefs(X2, ignoreZeroRows=True)       # currently unused: the propagation below runs on the raw X2
                            eps_max = eps_convergence_linbp_parameterized(H2c, W,
                                                                          method='noecho',
                                                                          alpha=alpha, beta=beta, gamma=gamma,
                                                                          X=X2)
                            eps = s * eps_max
                            try:
                                F, actualIt, actualPercentageConverged = \
                                    linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                                  method='noecho',
                                                                  alpha=alpha, beta=beta, gamma=gamma,
                                                                  numMaxIt=numMaxIt,
                                                                  convergencePercentage=convergencePercentage_W,
                                                                  debug=2)
                            except ValueError as e:
                                print("ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                            else:
                                accuracy_X = matrix_difference(X0, F, ignore_rows=ind)

                                record = [str(datetime.datetime.now())]     # 'record' avoids shadowing the builtin 'tuple'
                                text = [option_vec[option_index],
                                        k,
                                        f,
                                        accuracy_X]
                                # text = ['' if v is None else v for v in text]       # TODO: test with vocabularies
                                # text = np.asarray(text)         # without np, entries get ugly format
                                record.extend(text)
                                # print("option: {}, f: {}, actualIt: {}, accuracy: {}".format(option_vec[option_index], f, actualIt, accuracy_X))
                                save_csv_record(join(data_directory, csv_filename), record)


    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # -- Aggregate repetitions
    df2 = df1.groupby(['option', 'k', 'f']).agg({'accuracy': [np.mean, np.std, np.size, np.median]})    # multiple aggregates
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # -- Pivot table
    df3 = pd.pivot_table(df2, index=['f', 'k'], columns=['option'], values=[ 'accuracy_mean', 'accuracy_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(100)))



    # X_f = k_vec
    X_f = df3['k'].values            # read k from values instead

    Y_hash = defaultdict(dict)
    Y_hash_std = defaultdict(dict)
    for f in f_vec:
        for option in option_vec:
            Y_hash[f][option] = df3.loc[df3['f'] == f]['accuracy_mean_{}'.format(option)].values
            Y_hash_std[f][option] = df3.loc[df3['f'] == f]['accuracy_std_{}'.format(option)].values




    if CREATE_PDF or SHOW_PLOT or SHOW_PDF:

        # -- Setup figure
        fig_filename = 'Fig_End-to-End_accuracy_varyK_{}.pdf'.format(CHOICE)
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        opt_f_vecs = [(option, f) for option in option_vec for f in f_vec]

        for ((option, f), color, linewidth, clip_on, linestyle, marker, markersize) in \
            zip(opt_f_vecs, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec):

            # label = learning_method_vec[option_vec.index(option)]
            label = label_vec[option_vec.index(option)]
            # label = label + " " + str(f)

            if STD_FILL:


                # print((X_f))
                # print(Y_hash[f][option])


                ax.fill_between(X_f, Y_hash[f][option] + Y_hash_std[f][option], Y_hash[f][option] - Y_hash_std[f][option],
                                facecolor=color, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, Y_hash[f][option] + Y_hash_std[f][option], linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, Y_hash[f][option] - Y_hash_std[f][option], linewidth=0.5, color='0.8', linestyle='solid')

            ax.plot(X_f, Y_hash[f][option], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker,
                markersize=markersize, markeredgewidth=1, markeredgecolor='black', clip_on=clip_on)

        if CHOICE == 507:
            Y_f = [1 / float(i) for i in X_f]       # random-guessing baseline: expected accuracy 1/k

            ax.plot(X_f, Y_f, linewidth=2, color='black', linestyle='dashed',
                    label='Random', zorder=4, marker='x',
                    markersize=8, markeredgewidth=1, markeredgecolor='black', clip_on=clip_on)

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        if n < 1000:
            n_label='{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        title(r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(n_label, d, h, f, distribution_label))
        handles, label_vec = ax.get_legend_handles_labels()
        legend = plt.legend(handles, label_vec,
                            loc='upper right',  # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8


        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')

        grid(True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Number of Classes $(k)$', labelpad=0)      # labelpad=0
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05)    # 'frameon' dropped: removed from savefig in current Matplotlib

        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))
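
# A minimal sketch (illustrative only, not from the original code) of the
# labeling scheme used in these examples: keep labels for a fraction f of the
# one-hot belief rows, zero out the rest, and remember the indices of the rows
# that kept their labels, so accuracy can later be computed with
# ignore_rows=ind, i.e. on the unlabeled remainder only.
import numpy as np

n, k, f = 10, 4, 0.3
X0 = np.eye(k)[np.random.randint(0, k, size=n)]                    # n nodes, one-hot beliefs
ind = np.random.choice(n, size=int(round(f * n)), replace=False)   # rows that keep labels
X1 = np.zeros_like(X0)
X1[ind] = X0[ind]                                                  # all other rows carry no label information
print("labeled rows:", sorted(ind))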
Exemple #11
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False,
        show_arrows=False):
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf

    SHOW_STD = True         ## FALSE for just scatter plot points
    SHOW_ARROWS = show_arrows


    # -- Default Graph parameters
    rep_SameGraph = 1       # iterations on same graph
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    variant = 1
    EC = False
    numberOfSplits = 1
    scaling_vec = [None]*10
    ymin = 0.3
    ymax = 1
    xmin = 1e-3
    xmax = 1e3
    xtick_lab = [1e-3, 0.01, 0.1, 1, 10, 100, 1000]
    xtick_labels = [r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$10^{2}$', r'$10^{3}$']
    ytick_lab = np.arange(0, 1.1, 0.1)
    k = 3
    a = 1
    rep_DifferentGraphs = 1   # iterations on different graphs
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = 0.99
    facecolor_vec = ["#4C72B0", "#55A868", "#8172B2", "#C44E52", "#CCB974", "#64B5CD"]
    label_vec = ['MCE', 'LCE', 'DCE', 'Holdout']
    linewidth_vec = [4, 3, 1, 2, 2, 1]
    # clip_ons = [True, True, True, True, True, True]
    FILENAME = 'Fig_timing_accuracy_learning'
    marker_vec = ['s', '^', 'v', 'o', 'x', '+', 'None']   #'^'
    length_vec = [5]
    stratified = True
    f = 0.01
    numMaxIt_vec = [10]*7
    alpha_vec = [0] * 7
    beta_vec = [0] * 7  # TODO: LinBP does not use beta. Also SSLH uses alpha, but not beta for W^row! Now fixed
    gamma_vec = [0] * 7
    s_vec = [0.5] * 7


    # -- Main Options
    if CHOICE == 1:         # Main graph
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8]
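        # numbers of holdout splits tried (and timed) by the 'Holdout' baseline below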


    elif CHOICE == 2:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'GS']
        randomize_vec = [False]*3 + [True] + [None]
        scaling_vec = [None]*2 + [10, 100] + [None]


    elif CHOICE == 3:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'GS']
        randomize_vec = [False]*3 + [True] + [None]
        scaling_vec = [None]*2 + [10, 100] + [None]
        f = 0.02


    elif CHOICE == 4:         # TODO: Overnight Wolfgang
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8, 16]


    elif CHOICE == 5:         # Toy graph with 100 nodes
        n = 100
        h = 3
        d = 8
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8]
        f = 0.05


    elif CHOICE == 6:         # To be run by Prakhar on Cluster
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8]
        f = 0.003
        xmin = 1e-2
        xmax = 1e3
        # ymax = 0.9
        ymin = 0.2
        ymax = 0.9



    elif CHOICE == 7:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False]*3 + [True] + [None]*2
        scaling_vec = [None]*2 + [10, 100] + [None]*2
        splits_vec = [1, 2, 4, 8, 16]
        f = 0.009

    # elif CHOICE == 8:       # not working well
    #     n = 1000
    #     h = 3
    #     d = 25
    #     option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    #     learning_method_vec = ['MHE']  + ['LHE'] + ['DHE'] + ['DHE'] + ['Holdout'] + ['GS']
    #     label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
    #     randomize_vec = [False]*3 + [True] + [None]*2
    #     scaling_vec = [None]*2 + [10, 100] + [None]*2
    #     splits_vec = [1, 2, 4, 8, 16]
    #     f=0.005



    else:
        raise ValueError("Incorrect choice!")



    csv_filename = '{}_{}.csv'.format(FILENAME, CHOICE)
    header = ['currenttime',
              'option',
              'lensplit',
              'f',
              'accuracy',
              'timetaken']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    H0c = to_centering_beliefs(H0)


    RANDOMSEED = None  # set to a fixed int for repeatability; None draws a fresh seed every run
    random.seed(RANDOMSEED)  # seeds Python's built-in random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))


    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            # print("\ni: {}".format(i))

            W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                      distribution=distribution,
                                                      exponent=exponent,
                                                      directed=False,
                                                      debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for j in range(rep_SameGraph):  # repeat several times for same graph
                # print("j: {}".format(j))

                ind = None
                X1, ind = replace_fraction_of_rows(X0, 1-f, avoidNeighbors=avoidNeighbors, W=W, ind_prior=ind, stratified=stratified)     # TODO: stratified sampling option = True
                X2 = introduce_errors(X1, ind, err)

                for option_index, (learning_method, alpha, beta, gamma, s, numMaxIt, weight, randomize, option) in \
                        enumerate(zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, scaling_vec, randomize_vec, option_vec)):

                    # weight = np.array([np.power(scaling, i) for i in range(5)])       # TODO: now enough to specify weight as a scalar!
                    H_est_dict = {}
                    timeTaken_dict = {}

                    # -- Learning
                    if learning_method == 'Holdout':
                        for numberOfSplits in splits_vec:
                            prev_time = time.time()
                            H_est_dict[numberOfSplits] = estimateH_baseline_serial(X2, ind, W, numMax=numMaxIt,
                                                                                   # ignore_rows=ind,
                                                                                   numberOfSplits=numberOfSplits,
                                                                                   # method=learning_method, variant=1, distance=length,
                                                                                   EC=EC,
                                                                                   weights=weight, alpha=alpha, beta=beta, gamma=gamma)
                            timeTaken = time.time() - prev_time
                            timeTaken_dict[numberOfSplits] = timeTaken

                    elif learning_method in ['LHE', 'MHE', 'DHE']:      # TODO: no smartInit, just randomization as option
                        for length in length_vec:
                            prev_time = time.time()
                            H_est_dict[length] = estimateH(X2, W, method=learning_method, variant=1, randomize=randomize, distance=length, EC=EC, weights=weight)
                            timeTaken = time.time() - prev_time
                            timeTaken_dict[length] = timeTaken

                    elif learning_method == 'GS':
                        H_est_dict['GS'] = H0
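                        # 'GS' uses the ground-truth H0 directly; its learning time is recorded as 0 further below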

                    for key in H_est_dict:
                        H_est = H_est_dict[key]
                        H2c = to_centering_beliefs(H_est)
                        # print("H_estimated by {} is \n".format(learning_method), H_est)
                        # print("H0 is \n", H0)
                        # print("randomize was: ", randomize)

                        # Propagation
                        X2c = to_centering_beliefs(X2, ignoreZeroRows=True)  # currently unused: the propagation below runs on the raw X2
                        eps_max = eps_convergence_linbp_parameterized(H2c, W,
                                                                      method='noecho',
                                                                      alpha=alpha, beta=beta, gamma=gamma,
                                                                      X=X2)

                        eps = s * eps_max

                        # print("Max Eps ", eps_max)

                        try:
                            F, actualIt, actualPercentageConverged = \
                                linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                              method='noecho',
                                                              alpha=alpha, beta=beta, gamma=gamma,
                                                              numMaxIt=numMaxIt,
                                                              convergencePercentage=convergencePercentage_W,
                                                              convergenceThreshold=0.99,
                                                              debug=2)

                        except ValueError as e:
                            print("ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                        else:
                            accuracy_X = matrix_difference(X0, F, ignore_rows=ind)

                            record = [str(datetime.datetime.now())]     # 'record' avoids shadowing the builtin 'tuple'
                            if learning_method == 'Holdout':
                                text = [option, "split{}".format(key), f, accuracy_X, timeTaken_dict[key]]
                            elif learning_method in ['MHE', 'DHE', 'LHE']:
                                text = [option, "len{}".format(key), f, accuracy_X, timeTaken_dict[key]]
                            elif learning_method == 'GS':
                                text = [option, 0, f, accuracy_X, 0]

                            record.extend(text)
                            # print("option: {}, f: {}, actualIt: {}, accuracy: {}".format(option, f, actualIt, accuracy_X))
                            save_csv_record(join(data_directory, csv_filename), record)





    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # Aggregate repetitions
    df2 = df1.groupby(['option', 'lensplit', 'f']).agg({'accuracy': [np.mean, np.std, np.size]})    # multiple aggregates
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    df3 = df1.groupby(['option', 'lensplit', 'f']).agg({'timetaken': [np.median] })
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # resultdf3 = df3.sort(['timetaken'], ascending=1)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(15)))

    X_time_median_dict = {}
    Y_acc_dict = {}
    Y_std_dict = {}

    for option in option_vec:
        Y_acc_dict[option] = df2.loc[(df2['option'] == option), "accuracy_mean"].values
        Y_std_dict[option] = df2.loc[(df2['option'] == option), "accuracy_std"].values
        X_time_median_dict[option] = df3.loc[(df3['option'] == option), "timetaken_median"].values

        # print("option: ", option)
        # print("Y_acc_dict[option]: ", Y_acc_dict[option])
        # print("Y_std_dict[option]: ", Y_std_dict[option])
        # print("X_time_median_dict[option]: ", X_time_median_dict[option])



    # -- Setup figure
    fig_filename = '{}_{}.pdf'.format(FILENAME, CHOICE)
    mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
    mpl.rcParams['axes.labelsize'] = 18
    mpl.rcParams['xtick.labelsize'] = 16
    mpl.rcParams['ytick.labelsize'] = 16
    mpl.rcParams['axes.titlesize'] = 16
    mpl.rcParams['legend.fontsize'] = 14
    mpl.rcParams['grid.color'] = '777777'  # grid color
    mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
    mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
    mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
    mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
    mpl.rcParams['figure.figsize'] = [4, 4]
    fig = figure()
    ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])


    # SHOW_ARROWS = True      # debug override removed so the 'show_arrows' argument is respected

    for option, color, learning_method, label, linewidth, marker in \
            zip(option_vec, facecolor_vec, learning_method_vec, label_vec, linewidth_vec, marker_vec):      # 'option' avoids shadowing the 'choice' argument

        if learning_method == 'Holdout':
            # Draw std
            X1 = X_time_median_dict[option]
            order = X1.argsort()
            X1 = X1[order]
            Y1 = Y_acc_dict[option][order]
            Y2 = Y_std_dict[option][order]

            if SHOW_STD:
                ax.fill_between(X1, Y1 + Y2, Y1 - Y2, facecolor=color, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X1, Y1 + Y2, linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X1, Y1 - Y2, linewidth=0.5, color='0.8', linestyle='solid')
                ax.set_ylim(bottom=ymin)

                ax.plot(X1, Y1, linewidth=linewidth, color=color, linestyle='solid', label=label, zorder=20, marker='x', markersize=linewidth + 5, markeredgewidth=1)
                ax.annotate(np.round(X1[1], decimals=1), xy=(X1[1], Y1[1] - 0.05), color=color, va='center', annotation_clip=False, zorder=5)

            else:
                ax.scatter(list(X1), list(Y1),
                           color=color, label=label, marker='x', s=42)


        elif learning_method == 'GS':
            ax.plot([1e-4, 1e4], [Y_acc_dict[option], Y_acc_dict[option]],
                    linewidth=1, color='black',
                    linestyle='dashed', zorder=0,
                    marker=None,
                    label=label,
                    )

        else:       # For all other
            if SHOW_STD:
                ax.errorbar(list(X_time_median_dict[option]), list(Y_acc_dict[option]), yerr=Y_std_dict[option],
                            fmt='-o', linewidth=2, color=color,
                            label=label, marker=marker, markersize=8)
                ax.annotate(np.round(X_time_median_dict[option], decimals=2), xy=(X_time_median_dict[option], Y_acc_dict[option]-0.05), color=color, va='center',
                            annotation_clip=False, zorder=5)

            else:
                ax.scatter(list(X_time_median_dict[option]), list(Y_acc_dict[option]),
                           color=color, label=label, marker=marker, s=42)

        if SHOW_ARROWS:
            dce_opt = 'opt4'
            holdout_opt = 'opt5'

            ax.annotate('', xy=(X_time_median_dict[dce_opt], Y_acc_dict[dce_opt]-0.3), xytext=(X_time_median_dict[holdout_opt][2]+0.02, Y_acc_dict[dce_opt]-0.3), arrowprops=dict(arrowstyle='<->'))   # text passed positionally; the 's=' keyword is gone in current Matplotlib
            ax.annotate(str(int(np.round(X_time_median_dict[holdout_opt][2] / X_time_median_dict[dce_opt]))) + 'x', xy=((X_time_median_dict[dce_opt] + X_time_median_dict[holdout_opt][2])/100, Y_acc_dict[dce_opt]-0.28),
                        color='black', va='center',
                        # bbox = dict(boxstyle="round,pad=0.3", fc="w"),
                        annotation_clip=False, zorder=5)






    # -- Title and legend
    title(r'$\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}$'.format(int(n / 1000), d, h, f))
    handles, label_vec = ax.get_legend_handles_labels()
    for i, handle in enumerate(handles):        # remove error bars in legend: show an errorbar container by its main line
        if isinstance(handle, collections.abc.Container):   # 'collections.Container' was removed in Python 3.10
            handles[i] = handle[0]

    # plt.legend(loc='upper left', numpoints=1, ncol=3, fontsize=8, bbox_to_anchor=(0, 0))

    SHOW_STD = False    # forced off: only the scatter-style legend below is built
                        # (the code used to build a std-band legend first and immediately replace it)

    legend = plt.legend(handles, label_vec,
                        loc='upper right',  # 'upper right'
                        handlelength=2,
                        fontsize=10,
                        labelspacing=0.2,  # distance between label entries
                        handletextpad=0.3,  # distance between label and the line representation
                        borderaxespad=0.2,  # distance between legend and the outer axes
                        borderpad=0.3,  # padding inside legend box
                        numpoints=1,  # put the marker only once
                        scatterpoints=1  # display only one scatter point in legend
                        )

    # # legend.set_zorder(1)
    frame = legend.get_frame()
    frame.set_linewidth(0.0)
    frame.set_alpha(0.9)  # 0.8


    # -- Figure settings and save
    plt.xscale('log')
    plt.xticks(xtick_lab, xtick_labels)
    plt.yticks(ytick_lab, ytick_lab)
    ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_ylim(bottom=ymin)

    grid(True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
    grid(True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',

    xlim(xmin, xmax)
    ylim(ymin, ymax)


    xlabel(r'Time Median (sec)', labelpad=0)      # labelpad=0
    ylabel(r'Accuracy', labelpad=0)
    if CREATE_PDF:
        savefig(join(figure_directory, fig_filename), format='pdf',
                dpi=None,
                edgecolor='w',
                orientation='portrait',
                transparent=False,
                bbox_inches='tight',
                pad_inches=0.05)    # 'frameon' dropped: removed from savefig in current Matplotlib

    if SHOW_PDF:
        showfig(join(figure_directory, fig_filename))

    if SHOW_PLOT:
        plt.show()
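
# A minimal sketch (illustrative only, not from the original code) of the
# timing pattern used above: wall-clock each estimator call with time.time()
# and keep the median across repetitions, since the median is robust to
# scheduler noise. 'slow_estimator' is a hypothetical stand-in for estimateH
# or the Holdout baseline.
import time
import numpy as np

def slow_estimator(size):
    return np.linalg.eigvals(np.random.rand(size, size))    # stands in for real work

times = []
for _ in range(5):
    prev_time = time.time()
    slow_estimator(100)
    times.append(time.time() - prev_time)
print("median time taken: {:.4f} sec".format(np.median(times)))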
Exemple #12
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False,
        shorten_length=False):
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf
    SHOW_ARROWS = False
    STD_FILL = False

    CALCULATE_DATA_STATISTICS = False
    csv_filename = 'Fig_timing_VaryK_{}.csv'.format(CHOICE)
    header = ['currenttime', 'option', 'k', 'f', 'time']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # -- Default Graph parameters
    rep_SameGraph = 2  # iterations on same graph
    initial_h0 = None  # initial vector to start finding optimal H
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    variant = 1
    EC = True  # Non-backtracking for learning
    ymin = 0.0
    ymax = 1
    xmin = 2
    xmax = 7.5
    xtick_lab = [2, 3, 4, 5, 6, 7, 8]
    xtick_labels = ['2', '3', '4', '5', '6', '7', '8']
    ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 50]
    ytick_labels = [
        r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$50$'
    ]
    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
    k_vec = [3, 4, 5]
    rep_DifferentGraphs = 1000  # iterations on different graphs
    err = 0
    avoidNeighbors = False
    gradient = False
    convergencePercentage_W = None
    stratified = True
    label_vec = ['*'] * 10
    clip_on_vec = [True] * 15
    draw_std_vec = range(10)
    numberOfSplits = 1
    linestyle_vec = ['solid'] * 15
    linewidth_vec = [3, 2, 4, 2, 3, 2] + [3] * 15
    marker_vec = ['^', 's', 'o', 'x', 'o', '+', 's'] * 3
    markersize_vec = [8, 7, 8, 10, 7, 6] + [10] * 10
    facecolor_vec = [
        "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#64B5CD"
    ]
    legend_location = 'upper right'

    # -- Options with propagation variants
    if CHOICE == 600:  ## 1k nodes
        n = 1000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['GT', 'MHE', 'DHE', 'Holdout']
        weight_vec = [10] * 4
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 4 + [True]
        xmin = 3.
        xmax = 10.
        ymin = 0.
        ymax = 50.
        label_vec = ['GT', 'MCE', 'DCE', 'Holdout']
        facecolor_vec = [
            'black'
        ] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 4
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5, 6]
        ytick_lab = [0, 1e-3, 1e-2, 1e-1, 1, 10, 50]
        ytick_labels = [
            r'$0$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$50$'
        ]

    elif CHOICE == 601:  ## 10k nodes
        n = 10000
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['GT', 'MHE', 'DHE', 'Holdout']
        weight_vec = [10] * 4
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 15 + [True]
        xmin = 3.
        xmax = 8.
        ymin = 0.
        ymax = 500.
        label_vec = ['GT', 'MCE', 'DCE', 'Holdout']
        facecolor_vec = [
            'black'
        ] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 4
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5]
        ytick_lab = [0, 1e-3, 1e-2, 1e-1, 1, 10, 100, 300]
        ytick_labels = [
            r'$0$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$300$'
        ]

    elif CHOICE == 602:  ## 10k nodes
        n = 10000
        h = 8
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 3 + [True] + [False]
        ymin = 0.01
        ymax = 500
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"
        ] * 4
        f_vec = [0.01]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7, 8]

        # option_vec = ['opt2', 'opt3', 'opt6']
        # learning_method_vec = ['MHE', 'DHE', 'LHE']
        # k_vec = [2, 3, 4, 5]

    elif CHOICE == 603:  ## 10k nodes

        n = 10000
        h = 3
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 4 + [True]

        xmin = 1.8
        xmax = 8.2
        ymin = 0.01
        ymax = 500
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"
        ] * 4
        f_vec = [0.01]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7, 8]

        legend_location = 'upper right'

        # option_vec = ['opt2', 'opt3', 'opt6']
        # learning_method_vec = ['MHE', 'DHE', 'LHE']
        # k_vec = [2, 3, 4, 5]

        # option_vec = ['opt4', 'opt3']
        # learning_method_vec = ['MHE', 'MHE']
        # randomize_vec = [True, False]
        # k_vec = [2, 3, 4, 5]

    elif CHOICE == 604:  ## 10k nodes with Gradient
        n = 10000
        h = 3
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 4 + [True]
        ymin = 0.00
        ymax = 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"
        ] * 4
        f_vec = [0.01]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [7, 8]
        gradient = True
        legend_location = 'center right'

    elif CHOICE == 605:  ## 10k nodes with Gradient   with f = 0.005
        n = 10000
        h = 3
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 4 + [True]
        ymin = 0.00
        ymax = 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"
        ] * 4
        f_vec = [0.005]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7]
        # k_vec = [7, 8]
        gradient = True
        legend_location = 'center right'

    elif CHOICE == 606:  ## 10k nodes with Gradient   with f = 0.005 and Gradient and PruneRandom
        n = 10000
        h = 3
        d = 25
        weight_vec = [10] * 20
        alpha_vec = [0] * 20
        beta_vec = [0] * 20
        gamma_vec = [0] * 20
        s_vec = [0.5] * 20
        numMaxIt_vec = [10] * 20
        randomize_vec = [False] * 4 + [True]

        xmin = 1.8
        xmax = 7.2
        ymin = 0.01
        ymax = 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = [
            "#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"
        ] * 4
        f_vec = [0.005]
        k_vec = [3, 4, 5]
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

        option_vec = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
        k_vec = [2, 3, 4, 5, 6, 7]

        gradient = True
        pruneRandom = True
        legend_location = 'upper right'

    elif CHOICE == 607:  ## 10k nodes   with gradient and PruneRandom
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 3 + [True] + [False]

        xmin = 1.8
        xmax = 7.
        ymin = 0.01
        ymax = 800
        label_vec = ['LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = [
            "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"
        ] * 4
        legend_location = 'upper left'
        marker_vec = [None, 's', 'x', 'o', '^', '+'] * 3
        markersize_vec = [8, 7, 10, 8, 7, 6] + [10] * 10
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10
        gradient = True
        pruneRandom = True
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]

    elif CHOICE == 608:  ## 10k nodes   with gradient and PruneRandom
        n = 10000
        h = 3
        d = 25
        option_vec = ['opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
        weight_vec = [10] * 10
        alpha_vec = [0] * 10
        beta_vec = [0] * 10
        gamma_vec = [0] * 10
        s_vec = [0.5] * 10
        numMaxIt_vec = [10] * 10
        randomize_vec = [False] * 3 + [True] + [False]

        xmin = 1.8
        xmax = 7.2
        ymin = 0.01
        ymax = 800
        label_vec = ['LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = [
            "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"
        ] * 4
        legend_location = 'upper left'
        marker_vec = [None, 's', 'x', 'o', '^', '+'] * 3
        markersize_vec = [8, 7, 10, 8, 7, 6] + [10] * 10
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10
        gradient = True
        pruneRandom = True
        ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
        ytick_labels = [
            r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$',
            r'$100$', r'$500$'
        ]
        rep_DifferentGraphs = 10

    else:
        raise Warning("Incorrect choice!")

    RANDOMSEED = None  # set to a fixed int for repeatability; None seeds from the OS
    random.seed(RANDOMSEED)  # seeds Python's own random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the numpy random generator; both generators are used and thus both need seeding
    # print("CHOICE: {}".format(CHOICE))

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            # print("\ni: {}".format(i))

            for k in k_vec:
                # print("\nk: {}".format(k))

                H0 = create_parameterized_H(k, h, symmetric=True)
                H0c = to_centering_beliefs(H0)

                a = [1.] * k
                alpha0 = np.array(a)
                alpha0 = alpha0 / np.sum(alpha0)
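                # alpha0 is the uniform class prior: each of the k classes gets weight 1/k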

                W, Xd = planted_distribution_model_H(n,
                                                     alpha=alpha0,
                                                     H=H0,
                                                     d_out=d,
                                                     distribution=distribution,
                                                     exponent=exponent,
                                                     directed=False,
                                                     debug=False)
                X0 = from_dictionary_beliefs(Xd)

                for j in range(rep_SameGraph):  # repeat several times for same graph
                    # print("j: {}".format(j))

                    ind = None
                    for f in f_vec:  # remove fraction (1-f) of rows from X0 (note: this differs from the first implementation)
                        X1, ind = replace_fraction_of_rows(
                            X0,
                            1 - f,
                            avoidNeighbors=avoidNeighbors,
                            W=W,
                            ind_prior=ind,
                            stratified=stratified)
                        X2 = introduce_errors(X1, ind, err)
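                        # Pipeline: X0 = full labels; X1 keeps only a fraction f of labeled rows; X2 adds label errors at rate err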

                        for option_index, (learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \
                                enumerate(zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)):

                            # -- Learning
                            if learning_method == 'GT':
                                timeTaken = 0.0

                            elif learning_method == 'Holdout':

                                prev_time = time.time()
                                H2 = estimateH_baseline_serial(
                                    X2,
                                    ind,
                                    W,
                                    numMax=numMaxIt,
                                    numberOfSplits=numberOfSplits,
                                    EC=EC,
                                    alpha=alpha,
                                    beta=beta,
                                    gamma=gamma)
                                timeTaken = time.time() - prev_time

                            else:
                                prev_time = time.time()
                                if gradient and pruneRandom:
                                    H2 = estimateH(X2,
                                                   W,
                                                   method=learning_method,
                                                   variant=1,
                                                   distance=length,
                                                   EC=EC,
                                                   weights=weights,
                                                   randomize=randomize,
                                                   gradient=gradient)
                                else:
                                    H2 = estimateH(X2,
                                                   W,
                                                   method=learning_method,
                                                   variant=1,
                                                   distance=length,
                                                   EC=EC,
                                                   weights=weights,
                                                   randomize=randomize)
                                timeTaken = time.time() - prev_time

                            record = [str(datetime.datetime.now())]
                            record.extend([option_vec[option_index], k, f, timeTaken])
                            # print("option: {}, f: {}, timeTaken: {}".format(option_vec[option_index], f, timeTaken))
                            save_csv_record(join(data_directory, csv_filename), record)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # -- Aggregate repetitions
    df2 = df1.groupby(['option', 'k', 'f']).agg({
        'time': [np.mean, np.std, np.size, np.median],  # multiple aggregates
    })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # -- Pivot table
    df3 = pd.pivot_table(df2,
                         index=['f', 'k'],
                         columns=['option'],
                         values=['time_mean', 'time_std',
                                 'time_median'])  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(100)))

    # X_f = k_vec
    X_f = df3['k'].values  # read k from values instead

    Y_hash = defaultdict(dict)
    Y_hash_std = defaultdict(dict)

    for f in f_vec:
        for option in option_vec:
            Y_hash[f][option] = df3.loc[df3['f'] == f]['time_mean_{}'.format(option)].values  # mean
            # Y_hash[f][option] = df3.loc[df3['f'] == f]['time_median_{}'.format(option)].values  # median
            Y_hash_std[f][option] = df3.loc[df3['f'] == f]['time_std_{}'.format(option)].values

    if SHOW_PLOT or SHOW_PDF or CREATE_PDF:

        # -- Setup figure
        fig_filename = 'Fig_Time_varyK_{}.pdf'.format(CHOICE)
        mpl.rc(
            'font', **{
                'family': 'sans-serif',
                'sans-serif': [u'Arial', u'Liberation Sans']
            })
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        opt_f_vecs = [(option, f) for option in option_vec for f in f_vec]

        for ((option, f), color, linewidth, clip_on, linestyle, marker, markersize) in \
            zip(opt_f_vecs, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec):

            label = label_vec[option_vec.index(option)]
            # label = label + " " + str(f)

            if STD_FILL:
                ax.fill_between(X_f,
                                Y_hash[f][option] + Y_hash_std[f][option],
                                Y_hash[f][option] - Y_hash_std[f][option],
                                facecolor=color,
                                alpha=0.2,
                                edgecolor=None,
                                linewidth=0)
                ax.plot(X_f,
                        Y_hash[f][option] + Y_hash_std[f][option],
                        linewidth=0.5,
                        color='0.8',
                        linestyle='solid')
                ax.plot(X_f,
                        Y_hash[f][option] - Y_hash_std[f][option],
                        linewidth=0.5,
                        color='0.8',
                        linestyle='solid')

            ax.plot(X_f,
                    Y_hash[f][option],
                    linewidth=linewidth,
                    color=color,
                    linestyle=linestyle,
                    label=label,
                    zorder=4,
                    marker=marker,
                    markersize=markersize,
                    markeredgecolor='black',
                    markeredgewidth=1,
                    clip_on=clip_on)

        if SHOW_ARROWS:
            for indx in [2, 3]:
                ax.annotate('',
                            xy=(X_f[indx] - 0.05, Y_hash[f]['opt4'][indx]),
                            xytext=(X_f[indx] - 0.05, Y_hash[f]['opt5'][indx]),
                            arrowprops=dict(facecolor='blue',
                                            arrowstyle='<->'))
                ax.annotate(
                    str(
                        int(
                            np.round(Y_hash[f]['opt5'][indx] /
                                     Y_hash[f]['opt4'][indx]))) + 'x',
                    xy=(X_f[indx] - 0.4,
                        (Y_hash[f]['opt5'][indx] + Y_hash[f]['opt4'][indx]) /
                        10),
                    color='black',
                    va='center',
                    annotation_clip=False,
                    zorder=5)

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'  # the '$' closes the math mode opened in the title format string below
        else:
            distribution_label = '$'
        if n < 1000:
            n_label = '{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        title(r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(
            n_label, d, h, f, distribution_label))
        handles, label_vec = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles,
            label_vec,
            loc=legend_location,  # 'upper right'
            handlelength=2,
            labelspacing=0,  # distance between label entries
            handletextpad=0.3,  # distance between label and the line representation
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.3,  # padding inside legend box
            numpoints=1,  # put the marker only once
        )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        # -- Figure settings and save
        plt.yscale('log')
        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_labels)  # use the LaTeX tick labels defined per CHOICE

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)

        grid(b=True,
             which='major',
             axis='both',
             alpha=0.2,
             linestyle='solid',
             linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True,
             which='minor',
             axis='both',
             alpha=0.2,
             linestyle='solid',
             linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Number of Classes $(k)$', labelpad=0)
        ylabel(r'Time [sec]', labelpad=0)

        if CREATE_PDF:
            # frameon is deprecated in newer matplotlib and therefore omitted
            savefig(join(figure_directory, fig_filename),
                    format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05)

        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
def test_planted_distribution_model():
    """ Tests the main graph generator with statistics and visualized degree distribution and edge adjacency matrix
    """
    print("\n--- 'planted_distribution_model_H', 'planted_distribution_model_P', 'number_of_connectedComponents', 'create_blocked_matrix_from_graph' --")
    CHOICE = 21
    print("CHOICE:", CHOICE)
    debug = 0

    directed = True                     # default so every CHOICE runs; !!! TODO: not yet clear what undirected means here, only P accepts directed
    backEdgesAllowed = True             # ??? should be enforced in code
    sameInAsOutDegreeRanking = False
    distribution = 'powerlaw'
    exponent = -0.3
    VERSION_P = True


    # --- AAAI figures ---
    if CHOICE in [1, 2, 3, 4, 5, 6]:
        n = 120
        alpha0 = [1/6, 1/3, 1/2]
        h = 8
        P = np.array([[1, h, 1],
                      [1, 1, h],
                      [h, 1, 1]])

    if CHOICE == 1:                     # P (equivalent to 2), AAAI 2
        m = 1080

    elif CHOICE == 2:                   # H (equivalent to 1)
        H0 = row_normalize_matrix(P)
        d_vec = [18, 9, 6]
        VERSION_P = False

    elif CHOICE == 3:                   # H (equivalent to 4), AAAI 3
        H0 = row_normalize_matrix(P)
        d_vec = 9
        VERSION_P = False

    elif CHOICE == 4:                   # P (equivalent to 3)
        P = np.array([[1, h, 1],
                      [2, 2, 2*h],
                      [3*h, 3, 3]])
        m = 1080

    elif CHOICE == 5:                   # H (equivalent to 2), but backedges=False
        H0 = row_normalize_matrix(P)
        d_vec = [18, 9, 6]
        VERSION_P = False
        backEdgesAllowed = False

    elif CHOICE == 6:                   # P undirected, AAAI 4
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        directed = False
        backEdgesAllowed = False
        m = 540

    # --- AGAIN DIRECTED ---
    if CHOICE == 12:
        n = 1001
        alpha0 = [0.6, 0.2, 0.2]
        P = np.array([[0.1, 0.8, 0.1],
                      [0.8, 0.1, 0.1],
                      [0.1, 0.1, 0.8]])
        m = 3000
        distribution = 'uniform'    # uniform powerlaw
        exponent = None
        backEdgesAllowed = False    # ??? should be enforced in code

    if CHOICE == 13:
        # Nice for block matrix visualization
        n = 1000
        alpha0 = [0.334, 0.333, 0.333]
        h = 2
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        m = 2000
        distribution = 'uniform'    # uniform powerlaw
        exponent = None
        backEdgesAllowed = False    # ??? should be enforced in code

    if CHOICE == 14:
        n = 1000
        alpha0 = [0.3334, 0.3333, 0.3333]
        h = 10
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        m = 10000
        exponent = -0.55


    # --- UNDIRECTED ---
    if CHOICE == 20:
        n = 100
        alpha0 = [0.6, 0.2, 0.2]
        h = 1.4
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        H0 = row_normalize_matrix(P)
        d_vec = 5
        directed = False
        exponent = -0.3
        VERSION_P = False

    elif CHOICE == 21:
        n = 1001
        alpha0 = [0.6, 0.2, 0.2]
        h = 4
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        H0 = row_normalize_matrix(P)
        d_vec = 3.4                   # don't specify vector for undirected
        distribution = 'uniform'    # uniform powerlaw
        exponent = -0.5
        directed = False
        backEdgesAllowed = True             # ignored in code for undirected
        VERSION_P = False
        sameInAsOutDegreeRanking = True     # ignored in code for undirected

    elif CHOICE == 22:
        n = 1000
        m = 3000
        alpha0 = [0.6, 0.2, 0.2]
        h = 4
        P = np.array([[1, 3*h, 1],
                      [2*h, 1, 1],
                      [1, 1, h]])
        distribution = 'uniform'    # uniform powerlaw
        exponent = -0.5
        directed = False
        backEdgesAllowed = False             # ignored in code for undirected
        sameInAsOutDegreeRanking = True     # ignored in code for undirected
        debug = 0

        VERSION_P = True
        H0 = row_normalize_matrix(P)


    # --- Create the graph
    start = time.time()
    if VERSION_P:
        W, Xd = planted_distribution_model(n, alpha=alpha0, P=P, m=m,
                                           distribution=distribution, exponent=exponent,
                                           directed=directed,
                                           backEdgesAllowed=backEdgesAllowed, sameInAsOutDegreeRanking=sameInAsOutDegreeRanking,
                                           debug=debug)
    else:
        W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d_vec,
                                                  distribution=distribution, exponent=exponent,
                                                  directed=directed, backEdgesAllowed=backEdgesAllowed, sameInAsOutDegreeRanking=sameInAsOutDegreeRanking,
                                                  debug=debug)

    time_est = time.time()-start
    print("Time for graph generation: {}".format(time_est))

    # - Undirected degrees: In + Out
    W_und = W.multiply(W.transpose())  # element-wise product keeps only edges present in both directions (possible if backEdgesAllowed)
    # W_und.data[:] = np.sign(W_und.data)  # W contains weighted edges -> unweighted before counting edges with Ptot
    print("Fraction of edges that go in both directions: {}".format(np.sum(W_und.data) / np.sum(W.data)))

    # --- Statistics on created graph
    print("\n- 'calculate_Ptot_from_graph':")
    P_tot = calculate_Ptot_from_graph(W, Xd)
    print("P_tot:\n{}".format(P_tot))
    print("sum(P_tot): {}".format(np.sum(P_tot)))
    print("P (normalized to sum=1):\n{}".format(1. * P_tot / np.sum(P_tot)))           # Potential: normalized sum = 1
    H = row_normalize_matrix(P_tot)
    print("H (row-normalized):\n{}".format(H))

    print("\n- 'calculate_nVec_from_Xd':")
    n_vec = calculate_nVec_from_Xd(Xd)
    print("n_vec: {}".format(n_vec))
    print("alpha: {}".format(1.*n_vec / sum(n_vec)))

    print("\n- Average Out/Indegree 'calculate_average_outdegree_from_graph' (assumes directed for total; for undirected the totals are incorrect):")
    print("Average outdegree: {}".format(calculate_average_outdegree_from_graph(W)))
    print("Average indegree: {}".format(calculate_average_outdegree_from_graph(W.transpose())))
    print("Average total degree: {}".format(calculate_average_outdegree_from_graph(W + W.transpose())))
    print("Average outdegree per class: {}".format(calculate_average_outdegree_from_graph(W, Xd)))
    print("Average indegree per class: {}".format(calculate_average_outdegree_from_graph(W.transpose(), Xd)))
    print("Average total degree per class: {}".format(calculate_average_outdegree_from_graph(W + W.transpose(), Xd)))

    # - Overall degree distribution: In / out
    print("\n- Overall Out/In/Total degree distribution 'calculate_outdegree_distribution_from_graph':")
    print("Overall Out and Indegree distribution:")
    d_out_vec_tot = calculate_outdegree_distribution_from_graph(W, Xd=None)
    d_in_vec_tot = calculate_outdegree_distribution_from_graph(W.transpose(), Xd=None)
    print("Outdegree distribution (degree / number):\n{}".format(np.array([d_out_vec_tot.keys(), d_out_vec_tot.values()])))
    print("Indegree distribution (degree / number):\n{}".format(np.array([d_in_vec_tot.keys(), d_in_vec_tot.values()])))

    # - Overall degree distribution: In + Out
    d_tot_vec_tot = calculate_outdegree_distribution_from_graph(W + W.transpose(), Xd=None)
    print("Total degree distribution (degree / number):\n{}".format(np.array([list(d_tot_vec_tot.keys()), list(d_tot_vec_tot.values())])))

    # - Per-class degree distribution: In / out
    print("\n- Per-class Out/In/Total degree distribution 'calculate_outdegree_distribution_from_graph':")
    print("\nOutdegree distribution per class:")
    d_out_vec = calculate_outdegree_distribution_from_graph(W, Xd)
    for i in range(len(d_out_vec)):
        print("Class {}:".format(i))
        print(np.array([list(d_out_vec[i].keys()), list(d_out_vec[i].values())]))
    print("Indegree distribution per class:")
    d_in_vec = calculate_outdegree_distribution_from_graph(W.transpose(), Xd)
    for i in range(len(d_in_vec)):
        print("Class {}:".format(i))
        print(np.array([list(d_in_vec[i].keys()), list(d_in_vec[i].values())]))

    # - per-class degree distribution: In + out
    print("\nTotal degree distribution per class:")
    d_vec_und = calculate_outdegree_distribution_from_graph(W + W.transpose(), Xd)
    for i in range(len(d_vec_und)):
        print("Class {}:".format(i))
        print(np.array([list(d_vec_und[i].keys()), list(d_vec_und[i].values())]))

    print("\n- number of weakly connected components':")
    print("Number of weakly connected components: {}".format(connected_components(W, directed=True, connection='weak', return_labels=False)))


    # --- convergence boundary
    # print("\n- '_out_eps_convergence_directed_linbp', 'eps_convergence_linbp'")
    # if directed:
    #     eps_noEcho = _out_eps_convergence_directed_linbp(P, W, echo=False)
    #     eps_Echo = _out_eps_convergence_directed_linbp(P, W, echo=True)
    # else:
    Hc = to_centering_beliefs(H)
    eps_noEcho = eps_convergence_linbp(Hc, W, echo=False)
    eps_Echo = eps_convergence_linbp(Hc, W, echo=True)
    print("Eps (w/ echo): {}".format(eps_Echo))
    print("Eps (no echo): {}".format(eps_noEcho))


    # --- Fig1: Draw edge distributions
    print("\n- Fig1: Draw degree distributions")
    params = {'backend': 'pdf',
              'lines.linewidth': 4,
              'font.size': 10,
              'axes.labelsize': 24,  # fontsize for x and y labels (was 10)
              'axes.titlesize': 22,
              'xtick.labelsize': 20,
              'ytick.labelsize': 20,
              'legend.fontsize': 8,
              'figure.figsize': [5, 4],
              'font.family': 'sans-serif'
    }
    mpl.rcdefaults()
    mpl.rcParams.update(params)
    fig = plt.figure(1)
    ax = fig.add_axes([0.15, 0.15, 0.8, 0.8])  # main axes
    ax.xaxis.labelpad = -12
    ax.yaxis.labelpad = -12

    # A: Draw directed degree distribution
    y_vec = []
    for i in range(len(d_out_vec)):
        y = np.repeat(list(d_out_vec[i].keys()), list(d_out_vec[i].values()) )    # !!! np.repeat
        y = -np.sort(-y)
        y_vec.append(y)
        # print ("Class {}:\n{}".format(i,y))
    y_tot = np.repeat(list(d_out_vec_tot.keys()), list(d_out_vec_tot.values()))             # total outdegree
    y_tot = -np.sort(-y_tot)
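    # np.repeat expands a degree histogram {degree: count} into the degree sequence,
    # e.g. np.repeat([5, 2], [1, 3]) -> [5, 2, 2, 2]; sorting descending yields the rank-vs-degree curves plotted below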
    plt.loglog(range(1, len(y_vec[0])+1), y_vec[0], lw=4, color='orange', label=r"A out", linestyle='-')        # rank axis starts at 1 (plot would otherwise index from 0)
    plt.loglog(range(1, len(y_vec[1])+1), y_vec[1], lw=4, color='blue', label=r"B out", linestyle='--')
    plt.loglog(range(1, len(y_vec[2])+1), y_vec[2], lw=4, color='green', label=r"C out", linestyle=':')
    plt.loglog(range(1, len(y_tot)+1), y_tot, lw=1, color='black', label=r"tot out", linestyle='-')

    # B: Draw second edge distribution of undirected degree distribution
    y_vec = []
    for i in range(len(d_vec_und)):
        y = np.repeat(list(d_vec_und[i].keys()), list(d_vec_und[i].values()) )    # !!! np.repeat
        y = -np.sort(-y)
        y_vec.append(y)
        # print ("Class {}:\n{}".format(i,y))
    y_tot = np.repeat(list(d_tot_vec_tot.keys()), list(d_tot_vec_tot.values()))             # total degree (in + out)
    y_tot = -np.sort(-y_tot)
    plt.loglog(range(1, len(y_vec[0])+1), y_vec[0], lw=4, color='orange', label=r"A", linestyle='-')
    plt.loglog(range(1, len(y_vec[1])+1), y_vec[1], lw=4, color='blue', label=r"B", linestyle='--')
    plt.loglog(range(1, len(y_vec[2])+1), y_vec[2], lw=4, color='green', label=r"C", linestyle=':')
    plt.loglog(range(1, len(y_tot)+1), y_tot, lw=1, color='black', label=r"tot", linestyle='-')

    plt.legend(loc='upper right', labelspacing=0)
    filename = 'figs/Fig_test_planted_distribution_model1_{}.pdf'.format(CHOICE)
    # frameon and papertype are deprecated in newer matplotlib and therefore omitted
    plt.savefig(filename, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format='pdf',
                transparent=True, bbox_inches='tight', pad_inches=0.1)
    os.system("open " + filename)


    # --- Fig2: Draw block matrix
    print("\n- Fig2: 'create_blocked_matrix_from_graph'")
    W_new, Xd_new = create_blocked_matrix_from_graph(W, Xd)

    fig = plt.figure(2)
    row, col = W_new.nonzero()                      # transform the sparse W back to row col format
    plt.plot(col, row, 'o', color='r', markersize=2, markeredgewidth=2, lw=0, zorder=3)    # Notice (col, row) because first axis is vertical in matrices
    # plt.matshow(W_new.todense(), cmap=plt.cm.Greys)  # cmap=plt.cm.gray / Blues   # alternative that does not work as well
    plt.gca().invert_yaxis()    # invert the y-axis to start on top and go down

    # Show quadrants
    d1 = alpha0[0] * n
    d2 = (alpha0[0] + alpha0[1]) * n
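    # The gridlines fall on the cumulative class sizes, since create_blocked_matrix_from_graph groups nodes by class:
    # class A occupies rows [0, d1), class B rows [d1, d2), class C rows [d2, n)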
    plt.grid(which='major', color='0.7', linestyle='-', linewidth=1)
    plt.xticks([0, d1, d2, n])
    plt.yticks([0, d1, d2, n])
    plt.xlabel('to', labelpad=-1)
    plt.ylabel('from', rotation=90, labelpad=0)

    frame = plt.gca()
    # frame.axes.xaxis.set_ticklabels([])       # would hide the labels
    # frame.axes.yaxis.set_ticklabels([])
    frame.tick_params(direction='inout', width=1, length=10)

    filename = 'figs/Fig_test_planted_distribution_model2_{}.pdf'.format(CHOICE)
    # papertype is deprecated in newer matplotlib and therefore omitted
    plt.savefig(filename, dpi=None, facecolor='w', edgecolor='w',
            orientation='portrait', format='pdf',
            transparent=True, bbox_inches='tight', pad_inches=0.1)
    os.system("open " + filename)
Exemple #14
0
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    # -- Setup
    CHOICE = choice
    # 300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    experiments = [CHOICE]
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf

    SHOW_FIG = SHOW_PLOT or SHOW_PDF or CREATE_PDF
    STD_FILL = True
    TIMING = False
    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 10  # iterations on same graph

    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1

    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.001, 0.01, 0.1, 1]
    xtick_labels = ['0.1\%', '1\%', '10\%', '100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [False] * 4 + [True]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']

    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10] * 2
    randomize_vec = [False] * 4 + [True] * 2
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True
    num_restarts = None

    raw_std_vec = range(10)
    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
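    # geometric grid of label fractions: five points per decade, from 0.9 down to 0.9 * 10^-4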
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    H_heuristic = ""

    def choose(choice):
        # -- Make the run()-level defaults writable from inside choose()
        nonlocal n
        nonlocal d
        nonlocal rep_SameGraph
        nonlocal FILENAMEZ
        nonlocal initial_h0
        nonlocal exponent
        nonlocal length
        nonlocal variant

        nonlocal alpha_vec
        nonlocal beta_vec
        nonlocal gamma_vec
        nonlocal s_vec
        nonlocal clip_on_vec
        nonlocal numMaxIt_vec

        # Plotting Parameters
        nonlocal xtick_lab
        nonlocal xtick_labels
        nonlocal ytick_lab
        nonlocal xmax
        nonlocal xmin
        nonlocal ymin
        nonlocal ymax
        nonlocal labels
        nonlocal facecolor_vec
        nonlocal draw_std_vec
        nonlocal linestyle_vec
        nonlocal linewidth_vec
        nonlocal marker_vec
        nonlocal markersize_vec
        nonlocal legend_location
        nonlocal fig_label  # was missing: without it, assignments to fig_label in choose() stay local and never reach run()

        nonlocal option_vec
        nonlocal learning_method_vec

        nonlocal Macro_Accuracy
        nonlocal EC
        nonlocal constraints
        nonlocal weight_vec
        nonlocal randomize_vec
        nonlocal k
        nonlocal err
        nonlocal avoidNeighbors
        nonlocal convergencePercentage_W
        nonlocal stratified
        nonlocal gradient
        nonlocal doubly_stochastic
        nonlocal num_restarts
        nonlocal numberOfSplits
        nonlocal H_heuristic

        nonlocal select_lambda_vec
        nonlocal lambda_vec
        nonlocal f_vec

        if choice == 0:
            pass

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]

        elif choice == 305:  # DCEr Only experiment
            choose(605)
            choose(304)

            select_lambda_vec = [False] * 6

        elif choice == 306:
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 307:  # heuristic comparison
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Heuristic')
            labels.append('Heuristic')
            H_heuristic = np.array([[.476, .0476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- MovieLens dataset
        elif choice == 401:
            FILENAMEZ = 'movielens'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'MovieLens'
            legend_location = 'upper left'

            n = 26850
            d = 25.0832029795

        elif choice == 402:
            choose(401)
            select_lambda_vec = [False] * 3 + [True] * 3  # allow choosing lambda for different f in f_vec

            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 403:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 404:
            choose(401)

            select_lambda_vec = [True] * 3  # allow choosing lambda for different f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

            labels = ['GS', 'DCEr', 'Homophily']
            facecolor_vec = ['black', "#C44E52", "#64B5CD"]
            draw_std_vec = [False, True, False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 2, 2, 2, 2]
            marker_vec = [None, '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]

            weight_vec = [None, 10, None]
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
            randomize_vec = [False, True, False]
            learning_method_vec = ['GT', 'DHE']  #TODO

        elif choice == 405:  # DCEr ONLY experiment
            choose(605)
            choose(401)
            learning_method_vec += ['Holdout']
            labels += ['Holdout']

        elif choice == 406:  # comparison with a static heuristic matrix
            choose(402)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        elif choice == 407:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [1] * 21  # same length as f_vec

        elif choice == 408:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # DO NOT RUN WITH CREATE_DATA=True; if you do, restore the data from
        # data/sigmod-movielens-fig.csv
        elif choice == 409:
            choose(402)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#8172B2", "#C44E52",
                "#C44E52", "#CCB974", "#64B5CD"
            ]
            labels = [
                'GS', 'LCE', 'MCE', 'DCE1', 'DCE10', 'DCEr1', 'DCEr10',
                'Holdout'
            ]
            draw_std_vec = [False] * 5 + [True] * 2 + [False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [2, 2, 2, 2, 2, 2, 2, 2]
            marker_vec = [None, 'o', 'x', 's', 'p', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8, 8]
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]
            legend_location = 'upper left'
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'

            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            ymin = 0.3
            ymax = 0.7
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)

            select_lambda_vec = [False] * 4 + [True] * 2  # allow choosing lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)

            select_lambda_vec = [False] * 3 + [True] * 2  # allow choosing lambda for different f in f_vec
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 604:  ## with weight = 1
            choose(603)

            lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 605:
            choose(601)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD", 'orange'
            ]
            draw_std_vec = [False] + [True] * 10
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [3] * 10
            marker_vec = [None, 'o', 'x', '^', 'v', '+', 'o', 'x']
            markersize_vec = [0] + [8] * 10

            randomize_vec = [True] * 8
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]

            learning_method_vec = [
                'GT', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE'
            ]
            select_lambda_vec = [False] * 8
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            weight_vec = [0, 0, 1, 2, 5, 10, 15]

            labels = ['GT'] + [
                i + ' {}'.format(weight_vec[ix])
                for ix, i in enumerate(['DCEr'] * 6)
            ]

        elif choice == 606:  # heuristic experiment
            choose(602)
            labels.append('Heuristic')
            learning_method_vec.append('Heuristic')
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)

            select_lambda_vec = [False] * 4 + [True] * 2  # allow choosing lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)

            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(605)
            choose(801)
            #learning_method_vec += ['Holdout']
            #labels += ['Holdout']
        elif choice == 806:  # Heuristic experiment
            choose(802)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[0.76, 0.08, 0.08, 0.08],
                                    [0.08, 0.08, 0.76, 0.08],
                                    [0.08, 0.76, 0.08, 0.76],
                                    [0.08, 0.08, 0.76, 0.08]])

        # MASC Dataset
        elif choice == 901:
            FILENAMEZ = 'masc'
            Macro_Accuracy = False
            fig_label = 'MASC'
            legend_location = 'lower right'
            n = 0
            d = 0
            ymin = 0
            num_restarts = 100

            select_lambda_vec = [False] * 4 + [True]  # allow choosing lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        # MASC collapsed Dataset
        elif choice == 1001:
            FILENAMEZ = 'masc-collapsed'
            fig_label = 'MASC Collapsed'
            legend_location = 'lower right'
            n = 43724
            d = 7.2
            ymin = 0
            num_restarts = 20
            select_lambda_vec = [False] * 4 + [True]  # allow choosing lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 1002:
            choose(1001)
            Macro_Accuracy = True

        # MASC Reduced dataset
        elif choice == 1101:
            FILENAMEZ = 'masc-reduced'
            fig_label = 'MASC Reduced'
            legend_location = 'lower right'
            n = 31000
            d = 8.3
            ymin = 0
            select_lambda_vec = [False] * 4 + [True]  # allow choosing lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 1102:
            choose(1101)
            Macro_Accuracy = True

        else:
            raise Warning("Incorrect choice!")

    def _f_worker_(X0, W, f, f_index):
        RANDOMSEED = None  # set to a fixed int for repeatability; None seeds from the OS
        random.seed(RANDOMSEED)  # seeds Python's own random generator
        np.random.seed(seed=RANDOMSEED)  # seeds the numpy random generator; both generators are used and thus both need seeding

        X1, ind = replace_fraction_of_rows(X0,
                                           1 - f,
                                           avoidNeighbors=avoidNeighbors,
                                           W=W,
                                           stratified=stratified)
        X2 = introduce_errors(X1, ind, err)


        for option_index, (label, select_lambda, learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \
                enumerate(zip(labels, select_lambda_vec, learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)):
            learn_time = -1
            # -- Learning
            if learning_method == 'GT':
                H2c = H0c
            elif learning_method == 'Heuristic':
                # print('Heuristic')
                H2c = H_heuristic

            elif learning_method == 'Holdout':
                # print('Holdout')
                H2 = estimateH_baseline_serial(
                    X2,
                    ind,
                    W,
                    numMax=numMaxIt,
                    # ignore_rows=ind,
                    numberOfSplits=numberOfSplits,
                    # method=learning_method, variant=1,
                    # distance=length,
                    EC=EC,
                    alpha=alpha,
                    beta=beta,
                    gamma=gamma,
                    doubly_stochastic=doubly_stochastic)
                H2c = to_centering_beliefs(H2)

            else:
                if "DCEr" in learning_method:
                    learning_method = "DCEr"
                elif "DCE" in learning_method:
                    learning_method = "DCE"

                # -- choose optimal lambda: allows to specify different lambda for different f
                # print("option: ", option_index)
                if select_lambda:
                    weight = lambda_vec[f_index]
                    # print("weight : ", weight)
                else:
                    weight = weights

                # -- learn H
                learn_start = time.time()
                H2 = estimateH(X2,
                               W,
                               method=learning_method,
                               variant=1,
                               distance=length,
                               EC=EC,
                               weights=weight,
                               randomrestarts=num_restarts,
                               randomize=randomize,
                               constraints=constraints,
                               gradient=gradient,
                               doubly_stochastic=doubly_stochastic)
                learn_time = time.time() - learn_start
                H2c = to_centering_beliefs(H2)

            # if learning_method not in ['GT', 'GS']:

            # print(FILENAMEZ, f, learning_method)
            # print(H2c)

            # -- Propagation
            prop_start = time.time()
            # X2c = to_centering_beliefs(X2, ignoreZeroRows=True)       # try without
            eps_max = eps_convergence_linbp_parameterized(H2c,
                                                          W,
                                                          method='noecho',
                                                          alpha=alpha,
                                                          beta=beta,
                                                          gamma=gamma,
                                                          X=X2)
            eps = s * eps_max
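            # s < 1 keeps propagation strictly inside the convergence boundary eps_max (s defaults to 0.5)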
            # print("Max eps: {}, eps: {}".format(eps_max, eps))
            # eps = 1

            try:
                F, actualIt, actualPercentageConverged = \
                    linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                  method='noecho',
                                                  alpha=alpha, beta=beta, gamma=gamma,
                                                  numMaxIt=numMaxIt,
                                                  convergencePercentage=convergencePercentage_W,
                                                  debug=2)
                prop_time = time.time() - prop_start
                if Macro_Accuracy:
                    accuracy_X = matrix_difference_classwise(X0,
                                                             F,
                                                             ignore_rows=ind)
                    precision = matrix_difference_classwise(
                        X0, F, similarity='precision', ignore_rows=ind)
                    recall = matrix_difference_classwise(X0,
                                                         F,
                                                         similarity='recall',
                                                         ignore_rows=ind)
                else:
                    accuracy_X = matrix_difference(X0, F, ignore_rows=ind)
                    precision = matrix_difference(X0,
                                                  F,
                                                  similarity='precision',
                                                  ignore_rows=ind)
                    recall = matrix_difference(X0,
                                               F,
                                               similarity='recall',
                                               ignore_rows=ind)

                result = [str(datetime.datetime.now())]
                text = [
                    label, f, accuracy_X, precision, recall, learn_time,
                    prop_time
                ]
                result.extend(text)
                # print("method: {}, f: {}, actualIt: {}, accuracy: {}, precision:{}, recall: {}, learning time: {}, propagation time: {}".format(label, f, actualIt, accuracy_X, precision, recall, learn_time, prop_time))
                save_csv_record(join(data_directory, csv_filename), result)

            except ValueError as e:
                print("ERROR: {} with {} (f={}, d={})".format(e, learning_method, f, d))
                raise

        return 'success'

    def multi_run_wrapper(args):
        """Wrapper that unpacks the argument tuple passed to the pool worker.

        NOTE: This wrapper could be removed on Python >= 3.3, where
        multiprocessing.Pool.starmap_async() passes multiple arguments
        to the worker function directly.
        """

        return _f_worker_(*args)
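
    # A minimal sketch (assuming Python >= 3.3) of the Pool.starmap_async()
    # alternative mentioned in the docstring above; it would make
    # multi_run_wrapper unnecessary, using the same `results` list of
    # argument tuples that is built further below:
    #
    #   pool.starmap_async(_f_worker_, results).get(num_results * 2)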

    for choice in experiments:

        choose(choice)
        filename = 'Fig_End-to-End_accuracy_realData_{}_{}'.format(
            choice, FILENAMEZ)
        csv_filename = '{}.csv'.format(filename)

        header = [
            'currenttime', 'method', 'f', 'accuracy', 'precision', 'recall',
            'learntime', 'proptime'
        ]
        if CREATE_DATA:
            save_csv_record(join(data_directory, csv_filename),
                            header,
                            append=False)

        # print("choice: {}".format(choice))

        # --- print data statistics
        if CALCULATE_DATA_STATISTICS:

            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())
            d = (len(W.nonzero()[0]) * 2) / n

            k = len(X0[0])

            print("FILENAMEZ:", FILENAMEZ)
            print("k:", k)
            print("n:", n)
            print("d:", d)

            # -- Graph statistics
            n_vec = calculate_nVec_from_Xd(Xd)
            print("n_vec:\n", n_vec)
            d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
            print("d_vec:\n", d_vec)
            P = calculate_Ptot_from_graph(W, Xd)
            print("P:\n", P)
            for i in range(k):
                Phi = calculate_degree_correlation(W, X0, i, NB=True)
                print("Degree Correlation, Class {}:\n{}".format(i, Phi))

            # -- Various compatibilities
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            print("H0 w/  constraints:\n", np.round(H0, 2))
            #raw_input() # Why?

            H2 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H4 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H5 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H6 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H7 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)

            print()
            # print("H MCE w/o constraints:\n", np.round(H0, 3))
            print("H MCE w/  constraints:\n", np.round(H2, 3))
            # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
            print("H DCE 2 w/  constraints:\n", np.round(H5, 3))
            # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
            print("H DCE 20 w/  constraints:\n", np.round(H7, 3))

            print()
            H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
            print("H_est_1:\n", np.round(H_row_vec[0], 3))
            print("H_est_2:\n", np.round(H_row_vec[1], 3))
            print("H_est_3:\n", np.round(H_row_vec[2], 3))

        # --- Create data
        if CREATE_DATA or ADD_DATA:

            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())  ## number of nodes in graph
            k = len(X0[0])
            d = (len(W.nonzero()[0]) * 2) / n
            #print(n)
            #print(d)
            #print("contraint = {}".format(constraints))
            #print('select lambda: {}'.format(len(select_lambda_vec)))
            #print('learning method: {}'.format(len(learning_method_vec)))
            #print('alpha: {}'.format(len(alpha_vec)))
            #print('beta: {}'.format(len(beta_vec)))
            #print('gamma: {}'.format(len(gamma_vec)))
            #print('s: {}'.format(len(s_vec)))
            #print('maxit: {}'.format(len(numMaxIt_vec)))
            #print('weight: {}'.format(len(weight_vec)))
            #print('randomize: {}'.format(len(randomize_vec)))
            # ---  Calculating True Compatibility matrix
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=constraints,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            # print(H0)
            H0c = to_centering_beliefs(H0)

            num_results = len(f_vec) * len(learning_method_vec) * rep_SameGraph

            # Start a process pool sized for the machine (one worker per CPU assumed):
            # on a large shared server, leave 10 CPUs free:
            # pool = multiprocessing.Pool(int(multiprocessing.cpu_count() - 10))
            # on a reasonably powerful home computer, use half the CPUs:
            # pool = multiprocessing.Pool(int(multiprocessing.cpu_count() / 2))
            # conservative default for anything else:
            pool = multiprocessing.Pool(2)

            f_processes = f_vec * rep_SameGraph
            workers = []
            results = [(X0, W, f, ix)
                       for ix, f in enumerate(f_vec)] * rep_SameGraph
            # print('Expected results: {}'.format(num_results))
            try:
                # Distribute the accuracy evaluations over the process pool.
                # map_async(...).get(timeout) is used instead of map() to work
                # around a Python 2.7 multiprocessing bug where map() can hang;
                # the timeout is num_results * 2 seconds.
                pool.map_async(multi_run_wrapper, results).get(num_results * 2)
            except multiprocessing.TimeoutError:
                continue
            finally:
                pool.close()
                pool.join()
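            # NOTE: multi_run_wrapper is defined elsewhere in this file; a
            # minimal sketch of the usual pattern (an assumption, not the
            # original code) would simply unpack the (X0, W, f, ix) tuples for
            # a top-level, picklable worker function:
            #
            #     def multi_run_wrapper(args):
            #         return run_experiment(*args)   # run_experiment is hypothetical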

        # -- Read data for all options and plot
        df1 = pd.read_csv(join(data_directory, csv_filename))
        acc_filename = '{}_accuracy_plot.pdf'.format(filename)
        pr_filename = '{}_PR_plot.pdf'.format(filename)
        if TIMING:
            print('=== {} Timing Results ==='.format(FILENAMEZ))
            print('Prop Time:\navg: {}\nstddev: {}'.format(
                np.average(df1['proptime'].values),
                np.std(df1['proptime'].values)))
            for learning_method in labels:
                rs = df1.loc[df1["method"] == learning_method]
                avg = np.average(rs['learntime'])
                std = np.std(rs['learntime'])
                print('{} Learn Time:\navg: {}\nstd: {}'.format(
                    learning_method, avg, std))

        sslhv.plot(df1,
                   join(figure_directory, acc_filename),
                   n=n,
                   d=d,
                   k=k,
                   labels=labels,
                   dataset=FILENAMEZ,
                   line_styles=linestyle_vec,
                   xmin=xmin,
                   ymin=ymin,
                   xmax=xmax,
                   ymax=ymax,
                   marker_sizes=markersize_vec,
                   draw_stds=draw_std_vec,
                   markers=marker_vec,
                   line_colors=facecolor_vec,
                   line_widths=linewidth_vec,
                   legend_location=legend_location,
                   show=SHOW_PDF,
                   save=CREATE_PDF,
                   show_plot=SHOW_PLOT)
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False):

    verbose = False
    repeat_diffGraph = 1000
    SUBSET = True
    NOGT = False        ## if True, omit the ground-truth comparison curves
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf

    STD_FILL = False

    csv_filename = 'Fig_fast_optimal_restarts_Accv2_{}.csv'.format(CHOICE)
    fig_filename = 'Fig_fast_optimal_restarts_Accv2_{}.pdf'.format(CHOICE)
    header = ['currenttime',
              'k',
              'restarts',
              'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)




    # -- Default Graph parameters
    global f_vec, labels, facecolor_vec
    global number_of_restarts



    initial_h0 = None
    distribution = 'powerlaw'
    exponent = -0.3  # for powerlaw
    length = 4  # path length
    constraint = True
    gradient = True
    variant = 1
    EC = True
    delta = 0.001
    numMaxIt = 10
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True

    learning_method = 'DHE'
    weights = 10
    randomize = True
    return_min_energy = True
    number_of_restarts = [8, 6, 5, 4]



    clip_on_vec = [True] * 20
    draw_std_vec = range(10)
    ymin = 0.3
    ymax = 1
    xmin = 0.001
    xmax = 1
    xtick_lab = []
    xtick_labels = []
    ytick_lab = np.arange(0, 1.1, 0.1)
    linestyle_vec = ['solid','solid','solid'] * 20
    linewidth_vec = [4,4,4,4]*10
    marker_vec = ['x', 'v', '^', '+', '>', '<'] *10
    markersize_vec = [10, 8, 8, 8 ,8 ,8 ,8 ]*10
    facecolor_vec = ["#C44E52", "#4C72B0", "#8172B2",  "#CCB974",  "#55A868", "#64B5CD"]*5




    # -- Options mainly change k

    if CHOICE == 101:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 10, 13, 16, 18, 20]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Real restart counts must be listed in DESCENDING ORDER. Sentinel
        # values: 100 = GT (ground truth), 99 = GTr (DCEr initialized with GT),
        # 50 = min{30, GTr}, 1 = uninformative initialization
        number_of_restarts = [30, 20, 10, 7, 5, 4, 3, 2, 1, 50, 99, 100]

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]


    elif CHOICE == 102:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        # number_of_restarts = [30, 20, 10, 7, 5, 4, 3, 2, 1, 50, 99, 100]

        number_of_restarts = [20, 10, 5, 4, 3, 2]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]


    elif CHOICE == 103:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 99]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]


    elif CHOICE == 104:
        n = 10000
        h = 8
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 99]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]



    elif CHOICE == 105:
        n = 10000
        h = 8
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 106:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]


    elif CHOICE == 107:

        n = 10000
        h = 8
        d = 15
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [10, 5, 4, 3, 2, 99]
        # number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['x', 'v', '^', 's', 'o',  's', None] * 10
        markersize_vec = [10, 6, 6, 6, 6, 6, 6] * 10

        labels = [r'$r=$' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 108:

        n = 10000
        h = 8
        d = 15
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [10, 5, 4, 3, 2, 99]
        # number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        marker_vec = ['x', 'v', '^', 's', 'o',  's', None] * 10
        markersize_vec = [10, 6, 6, 6, 6, 6, 6] * 10

        labels = [r'$r=$' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]
        repeat_diffGraph = 10

    else:
        raise ValueError("Incorrect choice: {}".format(CHOICE))

    RANDOMSEED = None  # set to a fixed integer for repeatability; None seeds from system entropy
    random.seed(RANDOMSEED)  # seeds Python's built-in random generator
    np.random.seed(seed=RANDOMSEED)  # seeds numpy's random generator; both generators are used below, so both need seeding
    # print("CHOICE: {}".format(CHOICE))



    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for _ in range(repeat_diffGraph):

            for k in k_vec:
                a = [1.] * k
                k_star = int(k * (k - 1) / 2)
                alpha0 = np.array(a)
                alpha0 = alpha0 / np.sum(alpha0)

                # Generate Graph
                # print("Generating Graph: n={} h={} d={} k={}".format(n, h, d, k))
                H0 = create_parameterized_H(k, h, symmetric=True)
                W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d, distribution=distribution, exponent=exponent, directed=False, debug=False)
                H0_vec = transform_HToh(H0)
                # print("\nGold standard {}".format(np.round(H0_vec, decimals=3)))

                X0 = from_dictionary_beliefs(Xd)
                X2, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W, ind_prior=None, stratified=stratified)

                h0 = [1.] * int(k_star)
                h0 = np.array(h0)
                h0 = h0 / k

                delta = 1. / (3 * k)   # float division (plain 1/(3*k) would be integer division, i.e. 0, in Python 2)
                # print("delta: ", delta)

                # Sample up to number_of_restarts[0] distinct sign-perturbation
                # vectors with entries +-delta; at most 2**k_star distinct
                # vectors exist, so stop once all of them have been found
                perm = []
                while len(perm) < number_of_restarts[0]:
                    temp = [random.choice([-delta, delta]) for _ in range(k_star)]
                    if temp not in perm:
                        perm.append(temp)
                    if len(perm) >= 2 ** k_star:
                        break

                E_list = []   ## format = [[energy, H_vec], []..]
                for vec in perm:
                    H2_vec, energy = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC,
                                               weights=weights, randomize=False, constraints=constraint,
                                               gradient=gradient, return_min_energy=True, verbose=verbose,
                                               initial_h0=h0 + np.array(vec))
                    E_list.append([energy, list(H2_vec)])

                # print("All Optimizaed vector:")
                # [print(i) for i in E_list ]

                # print("Outside Energy:{} optimized vec:{} \n".format(min_energy_vec[0], optimized_Hvec))

                # min_energy_vec = min(E_list)
                # optimized_Hvec = min_energy_vec[1]
                #
                # print("\nEnergy:{} optimized vec:{}  \n\n".format(min_energy_vec[0],optimized_Hvec))
                #
                #

                GTr_optimized_Hvec, GTr_energy = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC,
                                                   weights=weights, randomize=False, constraints=constraint,
                                                   gradient=gradient, return_min_energy=True, verbose=verbose,
                                                   initial_h0=H0_vec)

                uninformative_optimized_Hvec, uninformative_energy = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC,
                                                   weights=weights, randomize=False, constraints=constraint,
                                                   gradient=gradient, return_min_energy=True, verbose=verbose,
                                                   initial_h0=h0)


                iterative_permutations = list(E_list)
                for restartz in number_of_restarts:
                    if k == 2 or (k == 3 and restartz > 8 and restartz < 99):
                        continue    # parentheses make the original operator precedence explicit: k=2 is skipped entirely; k=3 has only 2**3 = 8 distinct vectors, so larger non-sentinel restart counts are skipped

                    if restartz <= number_of_restarts[0]:
                        iterative_permutations = random.sample(iterative_permutations, restartz)    # descending order: each smaller restart count subsamples the previous selection
                    # print("For restart:{}, we have vectors:\n".format(restartz))
                    # [print(i) for i in  iterative_permutations]


                    if restartz == 100:       ## for GT
                        H2c = to_centering_beliefs(H0)
                        # print("\nGT: ", transform_HToh(H0,k))

                    elif restartz == 99:       ## for DCEr init with GT
                        H2c = to_centering_beliefs(transform_hToH(GTr_optimized_Hvec, k))
                        # print("\nGTr: ", GTr_optimized_Hvec)

                    elif restartz == 1:  ## for DCEr with uninformative initial
                        H2c = to_centering_beliefs(transform_hToH(uninformative_optimized_Hvec, k))
                        # print("\nUninformative: ", uninformative_optimized_Hvec)

                    elif restartz == 50:  ## for min{DCEr , GTr}
                        # print("Length:",len(E_list))
                        # [print(i) for i in E_list]
                        mod_E_list = list(E_list)+[[GTr_energy , list(GTr_optimized_Hvec)]]     #Add GTr to list and take min
                        # print("Mod Length:", len(mod_E_list))
                        # [print(i) for i in mod_E_list]
                        min_energy_vec = min(mod_E_list)
                        # print("\nSelected for 50:",min_energy_vec)
                        optimized_Hvec = min_energy_vec[1]

                        H2c = to_centering_beliefs(transform_hToH(optimized_Hvec, k))

                    else:
                        min_energy_vec = min(iterative_permutations)
                        optimized_Hvec = min_energy_vec[1]
                        H2c = to_centering_beliefs(transform_hToH(optimized_Hvec, k))

                    # print("Inside Chosen Energy:{} optimized vec:{} \n".format(min_energy_vec[0], optimized_Hvec))

                    try:
                        eps_max = eps_convergence_linbp_parameterized(H2c, W, method='noecho', X=X2)
                        s = 0.5
                        eps = s * eps_max

                        F, actualIt, actualPercentageConverged = \
                            linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                          method='noecho',
                                                          numMaxIt=numMaxIt,
                                                          convergencePercentage=convergencePercentage_W,
                                                          debug=2)
                    except ValueError as e:
                        print(
                            "ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                    else:
                        acc = matrix_difference_classwise(X0, F, ignore_rows=ind)

                        record = [str(datetime.datetime.now())]  # renamed from 'tuple' to avoid shadowing the builtin
                        text = [k,
                                restartz,
                                acc]
                        record.extend(text)

                        if verbose:
                            print("\nGold standard {}".format(np.round(H0_vec, decimals=3)))
                        # print("k:{}  Restart:{}  OptimizedVec:{}  Energy:{}  Accuracy:{}".format(k, restartz, np.round(min_energy_vec[1], decimals=3), min_energy_vec[0], acc  ))
                        # print("k:{}  Restart:{}   Accuracy:{}".format(k, 1, L2_dist))
                        save_csv_record(join(data_directory, csv_filename), record)



    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(20)))

    # Aggregate repetitions
    df2 = df1.groupby(['k', 'restarts']).agg({'accuracy': [np.mean, np.std, np.size]})
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    df2['restarts'] = df2['restarts'].astype(str)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(20)))

    # Pivot table
    df3 = pd.pivot_table(df2, index=['k'], columns=['restarts'], values=['accuracy_mean', 'accuracy_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(10)))




    df4 = df3.drop('k', axis=1)
    if NOGT:
        df4 = df3.drop(['k', 'accuracy_mean_0', 'accuracy_mean_1', 'accuracy_std_0', 'accuracy_std_1'], axis=1)

    # df4 = df3.drop(['k', 'accuracy_mean_100', 'accuracy_std_100'], axis=1)


    df5 = df4.div(df4.max(axis=1), axis=0)   # normalize each row by its max: accuracy relative to the best restart setting for that k
    df5['k'] = df3['k']
    # print("\n-- df5 (length {}):\n{}".format(len(df5.index), df5.head(100)))

    # df5 = df3     ## for normalization

    X_f = df5['k'].values            # x-axis: the k values
    Y=[]
    Y_std=[]
    for rez in number_of_restarts:
        if NOGT:
            if rez == 100 or rez==99:
                continue
        Y.append(df5['accuracy_mean_{}'.format(rez)].values)
        if STD_FILL:
            Y_std.append(df5['accuracy_std_{}'.format(rez)].values)



    if CREATE_PDF or SHOW_PDF or SHOW_PLOT:

        # -- Setup figure
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '#777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['font.size'] = 16
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])


        #  -- Drawing
        if STD_FILL:
            for choice, (option, facecolor) in enumerate(zip(number_of_restarts, facecolor_vec)):
                if option == 100:  ## GT
                    if NOGT:
                        continue
                    facecolor = 'black'
                elif option == 99:  ## GT-r
                    if NOGT:
                        continue
                    facecolor = 'black'

                ax.fill_between(X_f, Y[choice] + Y_std[choice], Y[choice] - Y_std[choice],
                                facecolor=facecolor, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, Y[choice] + Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, Y[choice] - Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')

        for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
                enumerate(zip(number_of_restarts, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):

            if option == 100:     ## GT
                if NOGT:
                    continue
                linestyle='dashed'
                linewidth=3
                color='black'
                label='GS'
                marker='x'
                markersize=6
            elif option == 99:       ## GT-r
                if NOGT:
                    continue
                linestyle='dashed'
                linewidth=2
                color='black'
                label='Global Minimum'
                marker = None
                markersize = 6
            elif option == 1:     ## uninformative initialization
                color="#CCB974"
                linewidth = 2
                label='Uninfo'
            elif option == 50:       ## min{30, GTr}
                label='min{30,GTr}'

            P = ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker,
                    markersize=markersize, markeredgecolor='black',  markeredgewidth=1, clip_on=clip_on)

        # plt.xscale('log')

        # -- Title and legend
        distribution_label = '$'
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        n_label = '{}k'.format(int(n / 1000))
        if n < 1000:
            n_label='{}'.format(n)

        titleString = r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{} $'.format(n_label, d, h, f)
        title(titleString)

        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(handles, labels,
                            loc='lower left',     # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            # bbox_to_anchor=(1.1, 0)
                            )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        plt.xticks(xtick_lab, xtick_labels)
        # plt.yticks(ytick_lab, ytick_lab)


        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.2f'))
        # ax.xaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.0f'))

        grid(b=True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Number of Classes $(k)$', labelpad=0)      # labelpad=0
        ylabel(r'Relative Accuracy', labelpad=0)

        xlim(2.9, 7.1)
        #
        ylim(0.65, 1.015)

        if CREATE_PDF:
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    frameon=None)
        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))  # shows actually created PDF
Exemple #16
0
def test_PaperExample():
    print("\n-- 'estimateH': Example graph for MHE vs LHE paper example --")
    CHOICE = 1
    if CHOICE == 1:  # graph example
        X = np.array([
            [0, 1],
            [0, 1],
            [1, 0],
            [1, 0],
            [0, 0],
            [0, 0],
            [0, 0],
        ])
    elif CHOICE == 2:  # full graph
        X = np.array([
            [0, 1],
            [0, 1],
            [1, 0],
            [1, 0],
            [0, 1],
            [1, 0],
            [0, 1],
        ])
    elif CHOICE == 3:  # no neighbors connected
        X = np.array([
            [0, 1],
            [0, 0],
            [1, 0],
            [1, 0],
            [0, 0],
            [0, 0],
            [0, 0],
        ])

    Xb = to_explicit_bool_vector(X)
    X2c = to_centering_beliefs(X, ignoreZeroRows=True)    # centered beliefs, ignoring all-zero rows
    X2cf = to_centering_beliefs(X, ignoreZeroRows=False)  # centered beliefs, including all-zero rows

    row = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3]
    col = [1, 4, 5, 2, 3, 5, 6, 4, 6, 6]
    row, col = row + col, col + row    # append the reverse of every edge so Ws becomes symmetric
    Ws = sparse.csr_matrix(([1] * len(row), (row, col)), shape=(7, 7))

    # _out_visualize_Graph(Ws, X, Xb=Xb, colorDisplay='explicit')

    print("W:\n{}".format(Ws.todense()))
    print("X:\n{}\n".format(X))

    start = time.time()
    H = estimateH(X, Ws, method='MHE')
    time_est = time.time() - start
    print("Estimated H (MHE):\n{}".format(H))
    print("Time :{}\n".format(time_est))

    start = time.time()
    H = estimateH(X, Ws, method='LHE')
    time_est = time.time() - start
    print("Estimated H (LHE):\n{}".format(H))
    print("Time :{}\n".format(time_est))

    start = time.time()
    H = estimateH(X, Ws, method='LHE', constraints=True)
    time_est = time.time() - start
    print("Estimated H (LHE) with constraints:\n{}".format(H))
    print("Time :{}\n".format(time_est))

    # start = time.time()
    # H = estimateH(X, Ws, method='LHEregular')
    # time_est = time.time() - start
    # print ("Estimated H (LHEregular):\n{}".format(H))
    # print ("Time :{}\n".format(time_est))
    #
    # start = time.time()
    # H = estimateH(X, Ws, method='LHE2')
    # time_est = time.time() - start
    # print ("Estimated H (LHE2):\n{}".format(H))
    # print ("Time :{}\n".format(time_est))

    print("= Variants with centered X -- ")
    start = time.time()
    H = estimateH(X2c, Ws, method='LHE')
    # print (X2c)
    time_est = time.time() - start
    print("Estimated H (LHE) with centering (while ignoring zero rows):\n{}".
          format(H))
    print("Time :{}\n".format(time_est))

    start = time.time()
    H = estimateH(X2cf, Ws, method='LHE')
    # print (X2cf)
    time_est = time.time() - start
    print("Estimated H (LHE) with centering (and NOT ignoring zero rows):\n{}".
          format(H))
    print("Time :{}\n".format(time_est))
def beliefPropagation(X, W, P,
                      numMaxIt=10,
                      convergencePercentage=None, convergenceThreshold=0.9961947,
                      debug=1, damping=1, clamping=False):
    """Standard belief propagation assuming a directed graph with two variants:
        V1: one directed potential across edge direction: P is one potential, and W contains the weights of edges
        V2: a set of potentials on different edges: P is a tensor, and W indexes the potentials
    Dimensions of P (2 or 3) determines variant.
    Uses message-passing with division: see [Koller,Friedman 2009] Section 10.3.1.
    Uses damping: see [Koller,Friedman 2009] Section 11.1.
    Can be run either for a given maximal number of iterations or until a specified percentage of nodes has converged.
    Convergence of a node is determined by a (variant of) cosine similarity between its *centered beliefs* from two consecutive iterations.
    If the convergence criterion is reached, the iterations stop before the maximal number of iterations.
    Parameter "debug" allows alternative, more detailed outputs, e.g., to get intermediate belief values.
    Checks that every entry in X and P is >= 0.
    Can model undirected graphs by (1) specifying every edge in only one direction, and (2) using symmetric potentials.

    TODO: also implement version without message passing with division

    TODO: future variant with non-constant k and different potential dimensions

    TODO: future variant without echo cancellation

    TODO: alternative convergence condition:
        if np.allclose(x, x_new, atol=1e-10):
            break

    TODO: clamping not necessary: all depends on relative strength of prior beliefs

    Parameters
    ----------
    X : [n x k] np array
        prior (explicit) belief matrix.
        Rows do not have to be row-normalized.
        Rows can be all 0; such rows are later treated as undefined (uninformative) prior beliefs.
    W : [n x n] sparse.csr_matrix
        directed sparse weighted adjacency matrix (thus a directed graph is assumed)
        Also allows undirected graph by simply specifying only symmetric potentials
        V1: weight determines the actual edge weight
        V2: weight determines the index of a potential (from potential tensor P)
    P : V1: [k x k]
        any directed potential (no requirement for normalization or identical row or column sums)
        V2: [num_pot_P x k x k] np array
        set of potentials (as tensor)
    numMaxIt : int (Default = 10)
        number of maximal iterations to perform
    convergencePercentage : float (Default = None)
        percentage of nodes that need to have converged in order to interrupt the iterations.
        Notice that a node with undefined beliefs does not count as converged even if it does not change anymore
        (in order to avoid counting nodes without explicit beliefs as converged in the first few rounds).
        If None, then runs until numMaxIt
    convergenceThreshold : float (Default = 0.9961947)
        cosine similarity (actually, the "cosine_ratio" similarity) between two belief vectors above which they are deemed identical (thus converged).
        If both vectors have the same length, then cos(5 deg) = 0.996194698092 and cos(1 deg) = 0.999847695156.
    debug : int (Default = 1)
        0 : no debugging and just returns F
        1 : tests for correct input, and just returns F
        2 : tests for correct input, and returns (F, actualNumIt, convergenceRatios)
        3 : tests for correct input, and returns (list of F, list of convergenceRatios)
    damping : float   (Default = 1)
        fraction of message values that come from new iteration (if 1, then no re-use of prior iteration)
    clamping : Boolean (Default = False)
        whether or not the explicit beliefs in X should be clamped to the nodes or not

    Returns (if debug == 0 or debug == 1)
    -------------------------------------
    F : [n x k] np array
        final belief matrix, each row normalized to form a label distribution

    Returns (if debug == 2 )
    ------------------------
    F : [n x k] np array
        final belief matrix, each row normalized to form a label distribution
    actualNumIt : int
        actual number of iterations performed
    actualPercentageConverged : float
        percentage of nodes that converged

    Returns (if debug == 3 )
    ------------------------
    List of F : [(actualNumIt+1) x n x k] np array
        list of final belief matrices for each iteration, represented as 3-dimensional numpy array
        Also includes the original beliefs as first entry (0th iteration). Thus has (actualNumIt + 1) entries
    actualNumIt : int
        actual number of iterations performed (not counting the first pass = 0th iteration for initializing)
    List of actualPercentageConverged : list of float (with length actualNumIt)
        list of percentages of nodes that converged in each iteration > 0. Thus has actualNumIt entries
    """

    # --- create variables for convergence checking and debugging
    n, k = X.shape
    dim_pot = len(P.shape)  # dimensions 2 or 3: determines V1 or V2
    Pot = P                 # for case of dim_pot = 2
    if debug >= 1:
        assert (X >= 0).all(), "All explicit beliefs need to be >=0 "
        assert(issparse(W)), "W needs to be sparse"
        n2, n3 = W.shape
        assert type(P).__module__ == "numpy", "P needs to be numpy array (and not a matrix)"
        assert dim_pot in [2, 3], "Input Potentials need to be 2-dimensional or 3-dimensional"
        if dim_pot == 2:
            assert (P >= 0).all(), "All entries in the potentials need to be >=0 "
            k2, k3 = P.shape
        else:
            num_pot_P, k2, k3 = P.shape
            for P_entry in P:
                assert (P_entry >= 0).all(), "All entries in each potential need to be >=0 "
            assert W.dtype == int, "Entries of the weight matrix need to be integers that index the potentials"
            weight = W.data
            set_pot = set(weight)
            max_pot_W = max(set_pot)
            assert max_pot_W <= num_pot_P, "Indices in W referring to P must not exceed the number of potentials"
        assert n == n2 and n2 == n3, "X and W need to have compatible dimensions"
        assert k == k2 and k2 == k3, "X and P need to have compatible dimensions"
    if debug >= 3:
        listF = []          # store the belief matrices for each iteration
        listConverged = []  # store all L2 norms to previous iteration

    # --- create edge dictionaries
    row, col = W.nonzero()
    nodes = set(np.concatenate((row, col)))
    dict_edges_out = {}                         # dictionary: i to all nodes j with edge (i->j)
    for node in nodes:
        dict_edges_out[node] = set()
    dict_edges_in = deepcopy(dict_edges_out)    # dictionary: i to all nodes j with edge (i<-j)

    for (i,j) in zip(row, col):
        dict_edges_out[i].add(j)
        dict_edges_in[j].add(i)

    if dim_pot == 3:
        dict_edges_pot = {}     # Dictionary: for each directed edge (i,j) -> index of the potential in P[index, :, :]
        for (i, j, d) in zip(row, col, weight):
            dict_edges_pot[(i, j)] = d

    # --- X -> X0: replace all-0-rows with all 1s (no need to normalize initial beliefs)
    implicitVector = 1-1*to_explicit_bool_vector(X)             # indicator numpy array with 1s for rows that contain only 0s
    implicitVectorT = np.array([implicitVector]).transpose()    # vertical 1-vector for implicit nodes
    X0 = X + implicitVectorT    # X0: prior beliefs; adding an [n x 1] vector to an [n x k] matrix broadcasts across the k columns

    F1 = X0                 # old F: only needed for convergence checking (when convergencePercentage is not None or debug >= 2)
    F2 = X0.astype(float)   # new F: a copy is necessary so that changing F2 does not also change the original X0 matrix

    # --- Actual loop: each loop calculates (a) the new messages (with damping) and (b) the new beliefs
    converged = False
    actualNumIt = -1    # iterations start with 0th iteration
    while actualNumIt < numMaxIt and not converged:
        actualNumIt += 1

        # --- (a) calculate messages
        if actualNumIt == 0:
            # --- first pass (counts as 0th iteration): create message dictionaries and initialize messages with ones
            dict_messages_along_1 = {}        # dictionary: messages for each edge (i->j) in direction i->j
            dict_messages_against_1 = {}      # dictionary: messages for each edge (i<-j) in direction i->j
            default = np.ones(k)            # first message vector: all 1s
            for (i,j) in zip(row, col):
                dict_messages_along_1[(i,j)] = default
                dict_messages_against_1[(j,i)] = default
        else:
            # --- other iterations: calculate "messages_new" using message-passing with division (from F and messages)
            dict_messages_along_2 = {}            # new dictionary: messages for each edge (i->j) in direction i->j
            dict_messages_against_2 = {}          # new dictionary: messages for each edge (i<-j) in direction i->j
            for (i,j) in dict_messages_along_1.keys():  # also includes following case: "for (j,i) in dict_messages_against_1.keys()"
                if dim_pot == 3:                        # need to reference the correct potential in case dim_pot == 3
                    Pot = P[dict_edges_pot[(i,j)]-1, :, :]
                dict_messages_along_2[(i,j)] = (F2[i] / dict_messages_against_1[(j,i)]).dot(Pot)  # entry-wise division
                dict_messages_against_2[(j,i)] = (F2[j] / dict_messages_along_1[(i,j)]).dot(Pot.transpose())
                # TODO above two lines can contain errors

            # --- assign new to old message dictionaries, and optionally damp messages
            if damping == 1:
                dict_messages_along_1 = dict_messages_along_2.copy()        # requires shallow copy because of later division
                dict_messages_against_1 = dict_messages_against_2.copy()
            else:
                for (i,j) in dict_messages_along_1.keys():
                    dict_messages_along_1[(i,j)] = damping*dict_messages_along_2[(i,j)] + \
                                                   (1-damping)*dict_messages_along_1[(i,j)]
                for (i,j) in dict_messages_against_1.keys():
                    dict_messages_against_1[(i,j)] = damping*dict_messages_against_2[(i,j)] + \
                                                     (1-damping)*dict_messages_against_1[(i,j)]

        # --- (b) create new beliefs by multiplying prior beliefs with all incoming messages (pointing in both directions)
        for (i, f) in enumerate(F2):
            if not clamping or implicitVector[i] == 0:  # only update beliefs if those are not explicit and clamped
                F2[i] = X0[i]        # start multiplying from the explicit beliefs (assigning the full row; referencing it through a separate variable would not update F2 in place)
                for j in dict_edges_out[i]:         # edges pointing away
                    F2[i] *= dict_messages_against_1[(j,i)]
                for j in dict_edges_in[i]:          # edges pointing inwards
                    F2[i] *= dict_messages_along_1[(j,i)]
                    # TODO line can contain errors


        # --- normalize beliefs [TODO: perhaps remove later to optimize except in last round]
        F2 = row_normalize_matrix(F2, norm='l1')

        # --- check convergence and store information if debug
        if convergencePercentage is not None or debug >= 2:
            F1z = to_centering_beliefs(F1)
            F2z = to_centering_beliefs(F2)
            actualPercentageConverged = matrix_convergence_percentage(F1z, F2z, threshold=convergenceThreshold)
            if convergencePercentage is not None \
                    and actualPercentageConverged >= convergencePercentage\
                    and actualNumIt > 0:  # end the loop early
                converged = True
            F1 = F2.copy()  # save for comparing in *next* iteration, make copy since F entries get changed

        if debug == 3:
            listF.append(F2.copy())      # stores (actualNumIt+1) values (copy is important as F2 is later overwritten)
            if actualNumIt > 0:
                listConverged.append(actualPercentageConverged) # stores actualNumIt values

    # --- Various return formats
    if debug <= 1:
        return F2
    elif debug == 2:
        return F2, actualNumIt, actualPercentageConverged
    else:
        return np.array(listF), actualNumIt, listConverged
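
# A minimal usage sketch for beliefPropagation (added for illustration, not part
# of the original example): builds a hypothetical 4-node directed chain with a
# 2-class homophily potential and propagates two explicit beliefs; assumes
# numpy (as np) and scipy's sparse module are imported as elsewhere in this file.
def demo_beliefPropagation():
    P = np.array([[2.0, 1.0],
                  [1.0, 2.0]])       # V1: one shared [k x k] homophily potential
    row = [0, 1, 2]                  # directed chain 0 -> 1 -> 2 -> 3
    col = [1, 2, 3]
    W = sparse.csr_matrix(([1.0] * 3, (row, col)), shape=(4, 4))
    X = np.array([[1., 0.],          # node 0: explicit belief in class 0
                  [0., 0.],          # nodes 1 and 2: no explicit beliefs
                  [0., 0.],
                  [0., 1.]])         # node 3: explicit belief in class 1
    F, numIt, percentConverged = beliefPropagation(X, W, P,
                                                   numMaxIt=20,
                                                   convergencePercentage=0.99,
                                                   debug=2)
    print("Final beliefs after {} iterations ({} converged):\n{}".format(
        numIt, percentConverged, F))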
Exemple #18
0
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    global n
    global d
    global rep_SameGraph
    global FILENAMEZ
    global initial_h0
    global H0c
    global exponent
    global length
    global variant

    global alpha_vec
    global beta_vec
    global gamma_vec
    global s_vec
    global clip_on_vec
    global numMaxIt_vec

    # Plotting Parameters
    global xtick_lab
    global xtick_labels
    global ytick_lab
    global xmax
    global xmin
    global ymin
    global ymax
    global labels
    global facecolor_vec
    global draw_std_vec
    global linestyle_vec
    global linewidth_vec
    global marker_vec
    global markersize_vec
    global legend_location

    global option_vec
    global learning_method_vec

    global Macro_Accuracy
    global EC
    global constraints
    global weight_vec
    global randomize_vec
    global k
    global fig_label
    global err
    global avoidNeighbors
    global convergencePercentage_W
    global stratified
    global gradient
    global doubly_stochastic
    global numberOfSplits

    global select_lambda_vec
    global lambda_vec
    global f_vec
    # -- Setup
    CHOICE = choice
    #500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    STD_FILL = True

    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 3  # iterations on same graph

    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1

    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    xtick_labels = ['0.001\%', '0.01\%', '0.1\%', '1\%', '10\%', '100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [0, 3, 4, 4, 4, 4]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']

    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10]
    randomize_vec = [False] * 4 + [True]
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True

    draw_std_vec = range(10)
    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1. / 5)**x for x in range(21)]  # float division (1/5 would be 0 in Python 2)
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    global exp_backoff
    exp_backoff = [2**i for i in range(6, 12)]  # use i, not n, to avoid clobbering the global n in Python 2

    def choose(choice):
        # -- Default Graph parameters
        global n
        global d
        global rep_SameGraph
        global FILENAMEZ
        global initial_h0
        global exponent
        global length
        global variant

        global alpha_vec
        global beta_vec
        global gamma_vec
        global s_vec
        global clip_on_vec
        global numMaxIt_vec

        # Plotting Parameters
        global xtick_lab
        global xtick_labels
        global ytick_lab
        global xmax
        global xmin
        global ymin
        global ymax
        global labels
        global facecolor_vec
        global draw_std_vec
        global linestyle_vec
        global linewidth_vec
        global marker_vec
        global markersize_vec
        global legend_location

        global option_vec
        global learning_method_vec

        global Macro_Accuracy
        global EC
        global constraints
        global weight_vec
        global randomize_vec
        global k
        global fig_label
        global err
        global avoidNeighbors
        global convergencePercentage_W
        global stratified
        global gradient
        global doubly_stochastic
        global numberOfSplits

        global select_lambda_vec
        global lambda_vec
        global f_vec
        if choice == 0:
            pass  # keep the module-level defaults

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            # select_lambda_vec = [False] * 3 + [True] * 2  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1. / 5)**x for x in range(21)]
            # lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 305:  # test the row-stochastic case
            choose(304)  # inherit all settings from choice 304, then override below
            doubly_stochastic = False

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'

            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)

            select_lambda_vec = [False] * 4 + [
                True
            ]  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1. / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)

            select_lambda_vec = [False] * 3 + [
                True
            ] * 2  # allow to choose lambda for different f in f_vec
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 604:  ## with weight = 1
            draw_std_vec = [4]
            choose(603)

            lambda_vec = [0.5] * 21  # same length as f_vec

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp.txt'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1. / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)

            select_lambda_vec = [False] * 4 + [
                True
            ]  # allow to choose lambda for different f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)

            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [
                1
            ] * 6  # same length as f_vec

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(801)
            doubly_stochastic = False

        elif choice == 821:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            constraints = True  # True
            gradient = True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [0.2, 0.2]

            randomize_vec = [False] * 4 + [True]
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.7
            labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Enron'
            legend_location = 'lower right'
            n = 46463  # for figures
            d = 23.4  # for figures

            alpha = 0.0
            beta = 0.0
            gamma = 0.0
            s = 0.5
            numMaxIt = 10

            select_lambda_vec = [False] * 3 + [True] * 2
            lambda_vec = [0.2] * 13 + [10] * 8  # same length as f_vec
            captionText = "DCE weight=[0.2*13] [10*8], s={}, numMaxIt={}".format(
                s, numMaxIt)

        # -- Cora dataset
        elif choice == 901:
            FILENAMEZ = 'cora'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.9
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Cora'
            legend_location = 'lower right'
            n = 2708
            d = 7.8

        # -- Citeseer dataset
        elif choice == 1001:
            FILENAMEZ = 'citeseer'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Citeseer'
            legend_location = 'lower right'
            n = 3312
            d = 5.6

        elif choice == 1101:
            FILENAMEZ = 'hep-th'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.1
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Hep-th'
            legend_location = 'lower right'
            n = 27770
            d = 5.6

        elif choice == 1204:
            FILENAMEZ = 'pokec-gender'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]

            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.000015
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD"
            ]
            draw_std_vec = [0, 3, 4, 4, 4, 4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Pokec-Gender'
            legend_location = 'lower right'
            n = 1632803
            d = 54.6

        else:
            raise ValueError("Incorrect choice: {}".format(choice))

    choose(CHOICE)

    csv_filename = 'Fig_End-to-End_accuracy_{}_{}.csv'.format(
        CHOICE, FILENAMEZ)
    header = ['currenttime', 'method', 'f', 'precision', 'recall', 'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # print("choice: {}".format(CHOICE))

    # --- print data statistics
    if CALCULATE_DATA_STATISTICS:

        Xd, W = load_Xd_W_from_csv(
            join(realDataDir, FILENAMEZ) + '-classes.csv',
            join(realDataDir, FILENAMEZ) + '-neighbors.csv')

        X0 = from_dictionary_beliefs(Xd)
        n = len(Xd)  # number of nodes in graph
        d = (len(W.nonzero()[0]) * 2) / n

        print("FILENAMEZ:", FILENAMEZ)
        print("n:", n)
        print("d:", d)

        # -- Graph statistics
        n_vec = calculate_nVec_from_Xd(Xd)
        print("n_vec:\n", n_vec)
        d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
        print("d_vec:\n", d_vec)
        P = calculate_Ptot_from_graph(W, Xd)
        print("P:\n", P)

        # -- Various compatibilities
        H0 = estimateH(X0,
                       W,
                       method='MHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=1,
                       randomize=False,
                       constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        print("H0 w/  constraints:\n", np.round(H0, 2))
        input()  # pause until Enter (Python 3; replaces Python-2 raw_input)

        # note: H2 repeats the H0 call above (MHE with constraints); kept for the labeled printout below
        H2 = estimateH(X0,
                       W,
                       method='MHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=1,
                       randomize=False,
                       constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H4 = estimateH(X0,
                       W,
                       method='DHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=2,
                       randomize=False,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H5 = estimateH(X0,
                       W,
                       method='DHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=2,
                       randomize=False,
                       constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H6 = estimateH(X0,
                       W,
                       method='DHE',
                       variant=1,
                       distance=2,
                       EC=EC,
                       weights=10,
                       randomize=False,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H7 = estimateH(X0,
                       W,
                       method='DHE',
                       variant=1,
                       distance=2,
                       EC=EC,
                       weights=10,
                       randomize=False,
                       constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)

        # print("H MCE w/o constraints:\n", np.round(H0, 3))
        print("H MCE w/  constraints:\n", np.round(H2, 3))
        # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
        print("H DCE 2 w/  constraints:\n", np.round(H5, 3))
        # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
        print("H DCE 20 w/  constraints:\n", np.round(H7, 3))

        H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
        print("H_est_1:\n", np.round(H_row_vec[0], 3))
        print("H_est_2:\n", np.round(H_row_vec[1], 3))
        print("H_est_3:\n", np.round(H_row_vec[2], 3))

    # --- Create data
    if CREATE_DATA or ADD_DATA:

        Xd, W = load_Xd_W_from_csv(
            join(realDataDir, FILENAMEZ) + '-classes.csv',
            join(realDataDir, FILENAMEZ) + '-neighbors.csv')

        X0 = from_dictionary_beliefs(Xd)
        n = len(Xd)  # number of nodes in graph

        d = (len(W.nonzero()[0]) * 2) / n
        # print(n)
        # print(d)
        # print("contraint = {}".format(constraints))

        # ---  Calculating True Compatibility matrix
        H0 = estimateH(X0,
                       W,
                       method='MHE',
                       variant=1,
                       distance=1,
                       EC=EC,
                       weights=1,
                       randomize=False,
                       constraints=constraints,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        # print(H0)
        H0c = to_centering_beliefs(H0)

        graph_workers = []
        gq = multiprocessing.Queue()
        for j in range(rep_SameGraph):  # repeat several times for same graph

            # print("Graph: {}".format(j))
            graph_workers.append(
                multiprocessing.Process(target=graph_worker, args=(X0, W, gq)))

        for gw in graph_workers:
            gw.start()

        for gw in graph_workers:
            for t in exp_backoff:
                gw.join(t)
                if gw.exitcode is None:
                    print(
                        "failed to join graph worker {} after {} seconds, retrying"
                        .format(gw, t))
                else:
                    break  # worker finished; stop retrying
            else:  # no break: all backoff attempts exhausted
                print("Failed to join graph worker {}.".format(gw))

        gq.put('STOP')  # sentinel, added only after all workers finished
        for record in iter(gq.get, 'STOP'):  # drain the queue until the sentinel
            save_csv_record(join(data_directory, csv_filename), record)
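        # Hedged, minimal sketch of the worker/queue pattern used above
        # (standalone toy; all names hypothetical):
        #   def worker(q):
        #       q.put(compute_record())          # each worker pushes its result
        #   procs = [multiprocessing.Process(target=worker, args=(q,)) for _ in range(R)]
        #   for p in procs: p.start()
        #   for p in procs: p.join()             # optionally with backoff as above
        #   q.put('STOP')                        # sentinel only AFTER all workers finished
        #   for rec in iter(q.get, 'STOP'):      # drain until the sentinel
        #       save_csv_record(path, rec)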

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    acc_filename = 'Fig_End-to-End_accuracy_realData{}_{}.pdf'.format(
        CHOICE, FILENAMEZ)
    pr_filename = 'Fig_End-to-End_PR_realData{}_{}.pdf'.format(
        CHOICE, FILENAMEZ)
    # generate_figure(data_directory, acc_filename, df1)
    # generate_figure(data_directory, pr_filename, df1, metric='pr')

    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(5)))

    # Aggregate repetitions
    if "option" in df1.columns.values:
        pivot_col = "option"
        pivot_vec = option_vec
    else:
        pivot_col = "method"
        pivot_vec = learning_method_vec

    df2 = df1.groupby([pivot_col, 'f']).agg \
        ({'accuracy': [np.mean, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(500)))

    # Pivot table
    df3 = pd.pivot_table(df2,
                         index='f',
                         columns=pivot_col,
                         values=['accuracy_mean', 'accuracy_std'])  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(5)))

    # Extract values
    X_f = df3['f'].values  # plot x values
    Y = []
    Y_std = []
    for val in pivot_vec:
        Y.append(df3['accuracy_mean_{}'.format(val)].values)
        if STD_FILL:
            Y_std.append(df3['accuracy_std_{}'.format(val)].values)

    if CREATE_PDF or SHOW_PDF or SHOW_PLOT:
        print("Setting up figure...")

        # -- Setup figure
        fig_filename = 'Fig_End-to-End_accuracy_realData{}_{}.pdf'.format(
            CHOICE, FILENAMEZ)
        mpl.rc(
            'font', **{
                'family': 'sans-serif',
                'sans-serif': [u'Arial', u'Liberation Sans']
            })
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14  # 6
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        #  -- Drawing
        if STD_FILL:
            for choice, (option,
                         facecolor) in enumerate(zip(option_vec,
                                                     facecolor_vec)):
                if choice in draw_std_vec:
                    ax.fill_between(X_f,
                                    Y[choice] + Y_std[choice],
                                    Y[choice] - Y_std[choice],
                                    facecolor=facecolor,
                                    alpha=0.2,
                                    edgecolor=None,
                                    linewidth=0)
                    ax.plot(X_f,
                            Y[choice] + Y_std[choice],
                            linewidth=0.5,
                            color='0.8',
                            linestyle='solid')
                    ax.plot(X_f,
                            Y[choice] - Y_std[choice],
                            linewidth=0.5,
                            color='0.8',
                            linestyle='solid')

        for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
                enumerate(zip(option_vec, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):
            ax.plot(X_f,
                    Y[choice],
                    linewidth=linewidth,
                    color=color,
                    linestyle=linestyle,
                    label=label,
                    zorder=4,
                    marker=marker,
                    markersize=markersize,
                    markeredgewidth=1,
                    clip_on=clip_on)

        # -- Title and legend
        if n < 1000:
            n_label = '{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        title(r'$\!\!\!\!\!\!\!${}: $n={}, d={}$'.format(
            fig_label, n_label, np.round(d, 1)))
        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles,
            labels,
            loc=legend_location,  # 'upper right'
            handlelength=2,
            labelspacing=0,  # distance between label entries
            handletextpad=0.3,  # distance between label and the line representation
            # title='Variants',
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.3,  # padding inside legend box
            numpoints=1,  # put the marker only once
        )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8
        plt.xscale('log')

        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        grid(b=True,
             which='both',
             axis='both',
             alpha=0.2,
             linestyle='solid',
             linewidth=0.5)
        xlabel(r'Label Sparsity $(f)$', labelpad=0)  # labelpad=0
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            print("saving PDF of figure...")
            savefig(join(figure_directory, fig_filename),
                    format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    # frameon=None  # removed in newer matplotlib versions
                    )

        if SHOW_PLOT:
            print("Showing plot...")
            plt.show()

        if SHOW_PDF:
            print("Showing pdf...")
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
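
# -- Hedged, standalone sketch (not part of the original figure code above):
# a minimal, self-contained version of the mean-curve-plus-std-band pattern
# (ax.fill_between bounded by thin guide lines). All data and names here are
# synthetic/hypothetical.
def demo_std_band():
    import numpy as np
    import matplotlib.pyplot as plt

    x = np.logspace(-3, 0, 10)          # label fractions f on a log axis
    y = 1 - np.exp(-5 * x)              # synthetic mean accuracy curve
    y_std = 0.05 * np.ones_like(y)      # synthetic standard deviation

    fig, ax = plt.subplots()
    ax.fill_between(x, y + y_std, y - y_std,
                    facecolor="#C44E52", alpha=0.2, linewidth=0)
    ax.plot(x, y + y_std, linewidth=0.5, color='0.8', linestyle='solid')
    ax.plot(x, y - y_std, linewidth=0.5, color='0.8', linestyle='solid')
    ax.plot(x, y, linewidth=2, color="#C44E52", label='mean')
    ax.set_xscale('log')
    ax.legend(loc='lower right')
    return fig

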
def run(choice,
        variant,
        create_data=False,
        add_data=False,
        create_graph=False,
        create_fig=True,
        show_plot=False,
        create_pdf=False,
        show_pdf=False,
        shorten_length=False,
        show_arrows=True):
    """main parameterized method to produce all figures.
    Can be run from external jupyther notebook or method to produce all figures in PDF
    """

    # -- Setup
    CHOICE = choice  # determines the CSV data file to use
    VARIANT = variant  # determines the variant of how the figures are plotted
    CREATE_DATA = create_data  # starts new CSV file and stores experimental timing results
    ADD_DATA = add_data  # adds data to existing file
    CREATE_GRAPH = create_graph  # creates the actual graph for experiments (stores W and X in CSV files)

    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_FIG = create_fig
    CREATE_PDF = create_pdf
    SHORTEN_LENGTH = shorten_length  # to prune certain fraction of data to plot
    SHOW_SCALING_LABELS = True  # first entry in the legend is for the dashed line of scalability
    SHOW_TITLE = True  # show parameters in title of plot
    SHOW_DCER_WITH_BOX = True  # show DCER value in an extra box
    LABEL_FONTSIZE = 16  # size of number labels in figure
    SHOW_LINEAR = True  # show dashed line for linear scaling

    SHOW_ARROWS = show_arrows  # show extra visual comparison of speed-up

    csv_filename = 'Fig_Timing_{}.csv'.format(
        CHOICE)  # CSV filename includes CHOICE
    filename = 'Fig_Timing_{}-{}'.format(
        CHOICE, VARIANT)  # PDF filename includes CHOICE and VARIANT
    header = ['n', 'type', 'time']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # -- Default Graph parameters
    distribution = 'powerlaw'
    exponent = -0.3
    k = 3
    a = 1  # this value was previously (erroneously) set to 5; TODO: fix everywhere else
    # err = 0
    avoidNeighbors = False
    f = 0.1
    est_EC = True  # !!! TODO: for graph estimation
    weights = 10
    pyamg = False
    convergencePercentage_W = None
    alpha = 0
    beta = 0
    gamma = 0
    s = 0.5
    numMaxIt = 10
    xtick_lab = [0.001, 0.01, 0.1, 1]
    ytick_lab = np.arange(0, 1, 0.1)
    xmin = 1e2
    xmax = 1e8
    # xmax = 1e6
    ymin = 1e-3
    ymax = 5e3
    color_vec = [
        "#4C72B0", "#55A868", "#8172B2", "#C44E52", "#CCB974", 'black',
        'black', "#64B5CD", "black"
    ]
    marker_vec = ['s', '^', 'x', 'o', 'None', 'None', 'None', 'None']
    linestyle_vec = ['solid'] * 6 + ['dashed']
    linewidth_vec = [3] * 3 + [4, 3, 4] + [3] * 7
    SHOWMAXNUMBER = True
    show_num_vec = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max']

    # %% -- Main Options
    if CHOICE == 3:
        n_vec = [
            100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400,
            204800, 409600, 819200, 1638400, 3276800, 6553600
        ]
        # n_vec = [1638400]  # graph:  12021 sec = 3.4h, 18600 sec = 5h, 21824 sec (34000 sec old laptop)
        # n_vec = [3276800]  # graph:  49481 sec = 13.8h, 68145 sec (125233 sec old laptop)
        # n_vec = [6553600]  # graph: 145020 sec = 40h
        h = 8
        d = 5

        repeat_vec_vec = [[
            50, 50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 5, 5, 3, 3, 3, 3
        ], [5, 5, 5, 5, 3, 3, 3, 3, 3, 1,
            1], [20, 20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 3, 3, 1, 1, 1, 1]]
        method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']]

        if VARIANT == 1:
            method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop']
            label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop']
            show_num_vec = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop']

        if VARIANT == 2:  # version used for main paper figure
            method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop']
            label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop']
            linestyle_vec = ['solid'] * 5 + ['dashed']
            SHOW_ARROWS = False

        if VARIANT == 3:  # version used for main paper figure
            method_vec_fig = ['DHEr', 'Holdout', 'prop']
            label_vec = [
                'DCEr', 'Holdout', 'Propagation', r'$\epsilon_{\mathrm{max}}$'
            ]
            linestyle_vec = ['solid'] * 2 + ['dashed']
            color_vec = [
                "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black"
            ]
            marker_vec = ['o', 'x', 'None', 'None', 'None']
            linestyle_vec = ['solid'] * 3 + ['dashed']
            linewidth_vec = [4, 3, 4] + [3] * 7
            ymin = 1e-2
            SHOW_ARROWS = True

        if VARIANT == 4:  # figure used in slides
            method_vec_fig = ['prop']
            label_vec = ['Propagation']
            color_vec = ['black']
            marker_vec = ['None']
            linestyle_vec = ['solid'] * 1
            linewidth_vec = [2]
            ymin = 1e-2
            SHOW_ARROWS = False
            SHOW_SCALING_LABELS = False
            SHOW_TITLE = False
            SHOW_DCER_WITH_BOX = False
            LABEL_FONTSIZE = 20
            SHOW_LINEAR = False

        if VARIANT == 5:  # figure used in slides
            method_vec_fig = ['prop', 'Holdout']
            label_vec = ['Propagation', 'Baseline']
            color_vec = ['black', "#CCB974"]
            marker_vec = ['None', '^']
            linestyle_vec = ['solid'] * 2
            linewidth_vec = [2, 4]
            ymin = 1e-2
            SHOW_ARROWS = True
            SHOW_SCALING_LABELS = False
            SHOW_TITLE = False
            SHOW_DCER_WITH_BOX = False
            LABEL_FONTSIZE = 20
            SHOW_LINEAR = False

        if VARIANT == 6:  # figure used in slides
            method_vec_fig = ['prop', 'Holdout', 'DHEr']
            label_vec = ['Propagation', 'Baseline', 'Our method']
            color_vec = ['black', "#CCB974", "#C44E52"]
            marker_vec = ['None', '^', 'o', 'None', 'None']
            linestyle_vec = ['solid'] + ['solid'] * 2
            linewidth_vec = [2, 4, 4]
            ymin = 1e-2
            SHOW_ARROWS = True
            SHOW_SCALING_LABELS = False
            SHOW_TITLE = True
            SHOW_DCER_WITH_BOX = False
            LABEL_FONTSIZE = 20
            SHOW_LINEAR = False

        graph_cvs = 'Fig_Timing_SSLH_1'  # re-use existing large graphs

    elif CHOICE == 4:
        n_vec = [
            200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400,
            204800, 409600, 819200
        ]
        # n_vec = [819200]    # graph: 47905 sec = 13.3h. 90562 sec = 25h (180527 sec old laptop)
        h = 3
        d = 25
        repeat_vec_vec = [[50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 3, 3, 3],
                          [5, 5, 5, 3, 1, 1, 1, 1, 1],
                          [20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 1, 1, 1]]
        method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']]

        VARIANT = 2  # note: hard-coded; overrides the 'variant' function argument for CHOICE == 4

        if VARIANT == 1:
            method_vec_fig = [
                'MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max'
            ]
            label_vec = [
                'MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop',
                r'$\epsilon_{\mathrm{max}}$'
            ]
            show_num_vec = [
                'MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max'
            ]

        if VARIANT == 2:
            method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop']
            label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop']
            linestyle_vec = ['solid'] * 5 + ['dashed']

        if VARIANT == 3:
            method_vec_fig = ['DHEr', 'Holdout', 'prop']

            label_vec = [
                'DCEr', 'Holdout', 'Propagation', r'$\epsilon_{\mathrm{max}}$'
            ]
            linestyle_vec = ['solid'] * 2 + ['dashed']
            color_vec = [
                "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black"
            ]
            marker_vec = ['o', 'x', 'None', 'None', 'None']
            linestyle_vec = ['solid'] * 3 + ['dashed']
            linewidth_vec = [4, 3, 4] + [3] * 7
            ymin = 1e-2

        graph_cvs = 'Fig_Timing_SSLH_2'  # re-use existing large graphs
        xmin = 1e3
        xmax = 5e7
        ymax = 1e3

    elif CHOICE == 2:
        # rep_Estimation = 10
        # n_vec = [200, 400, 800, 1600, 3200, 6400, 12800,
        #          25600, 51200, 102400, 204800, 409600, 819200]
        # repeat_vec = [20, 20, 20, 20, 20, 10, 10,
        #               10, 10, 10, 5, 5, 1]
        # n_vec = [819200]    # graph: 47905 sec = 13.3h. 90562 sec = 25h (180527 sec old laptop)
        n_vec = [1638400]  # !!! not done yet
        repeat_vec = [1]
        h = 3
        d = 25
        xmax = 5e7
        graph_cvs = 'Fig_Timing_SSLH_2'

    elif CHOICE == 10:  # same as 3 but with difference bars
        n_vec = [
            100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400,
            204800, 409600, 819200, 1638400, 3276800, 6553600
        ]
        # n_vec = [1638400]  # graph:  12021 sec = 3.4h, 18600 sec = 5h, 21824 sec (34000 sec old laptop)
        # n_vec = [3276800]  # graph:  49481 sec = 13.8h, 68145 sec (125233 sec old laptop)
        # n_vec = [6553600]  # graph: 145020 sec = 40h
        h = 8
        d = 5

        repeat_vec_vec = [[
            50, 50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 5, 5, 3, 3, 3, 3
        ], [5, 5, 5, 5, 3, 3, 3, 3, 3, 1,
            1], [20, 20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 3, 3, 1, 1, 1, 1]]
        method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']]

        method_vec_fig = ['DHEr', 'Holdout', 'prop']
        label_vec = [
            'DCEr', 'Holdout', 'Propagation', r'$\epsilon_{\mathrm{max}}$'
        ]
        linestyle_vec = ['solid'] * 2 + ['dashed']
        color_vec = [
            "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black"
        ]
        marker_vec = ['o', 'x', 'None', 'None', 'None']
        linestyle_vec = ['solid'] * 3 + ['dashed']
        linewidth_vec = [4, 3, 4] + [3] * 7
        ymin = 1e-2

        graph_cvs = 'Fig_Timing_SSLH_1'  # re-use existing large graphs

    else:
        raise ValueError("Incorrect choice: {}".format(CHOICE))

    # %% -- Common options

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    H0c = to_centering_beliefs(H0)
    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed
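    # Hedged note: both seeds are needed because the stdlib and numpy keep
    # independent generator states, e.g.:
    #   random.seed(0); np.random.seed(0)
    #   random.random()    # reproducible via the stdlib generator
    #   np.random.rand()   # reproducible via the (separate) numpy generator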

    # print("CHOICE: {}".format(CHOICE))

    def save_tuple(n, label, seconds):  # avoid shadowing builtin 'tuple' and module 'time'
        record = [str(datetime.datetime.now()), n, label, seconds]
        print("time {}: {}".format(label, seconds))
        save_csv_record(join(data_directory, csv_filename), record)

    # %% -- Create data
    if CREATE_DATA or ADD_DATA:

        for repeat_vec, method_vec in zip(repeat_vec_vec, method_vec_vec):

            for n, repeat in zip(n_vec, repeat_vec):
                print("\nn: {}".format(n))
                # repeat = repeat_vec[j]

                # -- Graph
                if CREATE_GRAPH:
                    start = time.time()
                    W, Xd = planted_distribution_model(
                        n,
                        alpha=alpha0,
                        P=H0,
                        m=d * n,
                        distribution=distribution,
                        exponent=exponent,
                        directed=False,
                        debug=False)
                    X0 = from_dictionary_beliefs(Xd)
                    time_graph = time.time() - start

                    save_W(join(data_directory,
                                '{}_{}_W.csv'.format(graph_cvs, n)),
                           W,
                           saveWeights=False)
                    save_X(
                        join(data_directory,
                             '{}_{}_X.csv'.format(graph_cvs, n)), X0)
                    save_tuple(n, 'graph', time_graph)

                else:
                    W, _ = load_W(join(data_directory,
                                       '{}_{}_W.csv'.format(graph_cvs, n)),
                                  skiprows=1,
                                  zeroindexing=True,
                                  n=None,
                                  doubleUndirected=False)
                    X0, _, _ = load_X(join(data_directory,
                                           '{}_{}_X.csv'.format(graph_cvs, n)),
                                      n=None,
                                      k=None,
                                      skiprows=1,
                                      zeroindexing=True)

                # -- Repeat loop
                for i in range(repeat):
                    print("\n  repeat: {}".format(i))
                    X2, ind = replace_fraction_of_rows(
                        X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W)
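                    # Hedged toy sketch of what keeping a fraction f of labels
                    # amounts to (hypothetical, not the library call above):
                    #   unlabeled = np.random.choice(n, int((1 - f) * n), replace=False)
                    #   X2 = X0.copy(); X2[unlabeled] = 0   # zero out belief rows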

                    for method in method_vec:

                        if method == 'DHE':
                            start = time.time()
                            H2 = estimateH(X2,
                                           W,
                                           method='DHE',
                                           variant=1,
                                           distance=5,
                                           EC=est_EC,
                                           weights=weights)
                            time_est = time.time() - start
                            save_tuple(n, 'DHE', time_est)

                        elif method == 'DHEr':
                            start = time.time()
                            H2 = estimateH(X2,
                                           W,
                                           method='DHE',
                                           variant=1,
                                           distance=5,
                                           EC=est_EC,
                                           weights=weights,
                                           randomize=True)
                            time_est = time.time() - start
                            save_tuple(n, 'DHEr', time_est)

                        elif method == 'MHE':
                            start = time.time()
                            H2 = estimateH(X2,
                                           W,
                                           method='MHE',
                                           variant=1,
                                           distance=1,
                                           EC=est_EC,
                                           weights=None)
                            time_est = time.time() - start
                            save_tuple(n, 'MHE', time_est)

                        elif method == 'LHE':
                            start = time.time()
                            H2 = estimateH(X2,
                                           W,
                                           method='LHE',
                                           variant=1,
                                           distance=1,
                                           EC=est_EC,
                                           weights=None)
                            time_est = time.time() - start
                            save_tuple(n, 'LHE', time_est)

                        elif method == 'Holdout':
                            start = time.time()
                            H2 = estimateH_baseline_serial(
                                X2,
                                ind,
                                W,
                                numMax=numMaxIt,
                                numberOfSplits=1,
                                # EC=EC,
                                # weights=weight,
                                alpha=alpha,
                                beta=beta,
                                gamma=gamma)
                            time_est = time.time() - start
                            save_tuple(n, 'Holdout', time_est)

                        elif method == 'prop':
                            H2c = to_centering_beliefs(H0)  # note: centers the ground-truth H0; propagation timing does not depend on the estimate
                            X2c = to_centering_beliefs(
                                X2, ignoreZeroRows=True)  # try without
                            start = time.time()
                            eps_max = eps_convergence_linbp_parameterized(
                                H2c,
                                W,
                                method='noecho',
                                alpha=alpha,
                                beta=beta,
                                gamma=gamma,
                                X=X2,
                                pyamg=pyamg)
                            time_eps_max = time.time() - start
                            save_tuple(n, 'eps_max', time_eps_max)

                            # -- Propagate
                            eps = s * eps_max
                            try:
                                start = time.time()
                                F, actualIt, actualPercentageConverged = \
                                    linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                                  method='noecho',
                                                                  alpha=alpha, beta=beta, gamma=gamma,
                                                                  numMaxIt=numMaxIt,
                                                                  convergencePercentage=convergencePercentage_W,
                                                                  debug=2)
                                time_prop = time.time() - start
                            except ValueError as e:
                                print("ERROR: {}: d={}, h={}".format(e, d, h))
                            else:
                                save_tuple(n, 'prop', time_prop)

                        else:
                            raise ValueError("Incorrect method: {}".format(method))

    # %% -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(50)))

    # Aggregate repetitions
    df2 = df1.groupby(['n', 'type']).agg \
        ({'time': [np.mean, np.median, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # Pivot table
    df3 = pd.pivot_table(df2,
                         index=['n'],
                         columns=['type'],
                         values=['time_mean', 'time_median'])  # Pivot
    # df3 = pd.pivot_table(df2, index=['n'], columns=['type'], values=['time_mean', 'time_median', 'time_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))

    # Extract values
    X = df3['n'].values  # plot x values
    X = X * d / 2  # calculate edges (!!! notice dividing by 2 as one edge appears twice in symmetric adjacency matrix)
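    # Hedged sanity check for the edge count (toy example): in a symmetric
    # adjacency matrix every undirected edge appears as two nonzeros, e.g.
    #   W_toy = np.array([[0, 1], [1, 0]])
    #   len(W_toy.nonzero()[0])        # -> 2 nonzeros
    #   len(W_toy.nonzero()[0]) / 2    # -> 1 undirected edge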
    Y = {}
    for method in method_vec_fig:
        # Y[method] = df3['time_mean_{}'.format(method)].values
        Y[method] = df3['time_median_{}'.format(method)].values

    if SHORTEN_LENGTH:
        SHORT_FACTOR = 4  # keep every Nth element
        X = np.copy(X[::SHORT_FACTOR])
        for method in method_vec_fig:
            Y[method] = np.copy(Y[method][::SHORT_FACTOR])

    # %% -- Figure
    if CREATE_FIG:
        fig_filename = '{}.pdf'.format(filename)  # TODO: repeat this pattern in other files
        mpl.rcParams['backend'] = 'agg'
        mpl.rcParams['lines.linewidth'] = 3
        mpl.rcParams['font.size'] = LABEL_FONTSIZE
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 12
        mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['figure.figsize'] = [4, 4]
        mpl.rcParams['xtick.major.pad'] = 4  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4
        fig = plt.figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Draw the plots
        if SHOW_LINEAR:
            ax.plot([1, 1e8], [1e-5, 1e3],
                    linewidth=1,
                    color='gray',
                    linestyle='dashed',
                    label='1sec/100k edges',
                    clip_on=True,
                    zorder=3)
        for i, (method, color, marker, linewidth, linestyle) in enumerate(
                zip(method_vec_fig, color_vec, marker_vec, linewidth_vec,
                    linestyle_vec)):
            ax.plot(X,
                    Y[method],
                    linewidth=linewidth,
                    color=color,
                    linestyle=linestyle,
                    label=label_vec[i],
                    clip_on=True,
                    marker=marker,
                    markersize=6,
                    markeredgewidth=1,
                    markeredgecolor='black',
                    zorder=4)

            # for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
            #         enumerate(zip(option_vec, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):
            #     P = ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker,
            #                 markersize=markersize, markeredgewidth=1, markeredgecolor='black', clip_on=clip_on)

            if SHOWMAXNUMBER and method in show_num_vec:
                j = np.argmax(np.ma.masked_invalid(
                    Y[method]))  # mask nan, then get index of max element
                bbox = dict(boxstyle="round,pad=0.3", fc="w") \
                    if (method == 'DHEr' and SHOW_DCER_WITH_BOX) else None
                ax.annotate(int(np.round(Y[method][j])),
                            xy=(X[j] * 1.5, Y[method][j]),
                            color=color,
                            va='center',
                            bbox=bbox,
                            annotation_clip=False,
                            zorder=5)

        if SHOW_ARROWS:
            dce_opt = 'DHEr'
            holdout_opt = 'Holdout'
            prop_opt = 'prop'

            j_holdout = np.argmax(np.ma.masked_invalid(Y[holdout_opt]))

            if dce_opt in Y:
                j_dce = np.argmax(np.ma.masked_invalid(Y[dce_opt]))
                ax.annotate('',
                            xy=(X[j_dce], Y[prop_opt][j_dce]),
                            xytext=(X[j_dce], Y[dce_opt][j_dce]),
                            arrowprops=dict(arrowstyle='<->'))
                ax.annotate(
                    str(int(np.round(Y[prop_opt][j_dce] / Y[dce_opt][j_dce])))
                    + 'x',
                    xy=(X[j_dce],
                        int(Y[prop_opt][j_dce] + Y[dce_opt][j_dce]) / 6),
                    color='black',
                    va='center',
                    fontsize=14,
                    # bbox = dict(boxstyle="round,pad=0.3", fc="w"),
                    annotation_clip=False,
                    zorder=5)

                ax.annotate('',
                            xy=(X[j_holdout], Y[holdout_opt][j_holdout]),
                            xytext=(X[j_holdout], Y[dce_opt][j_holdout]),
                            arrowprops=dict(arrowstyle='<->'))
                ax.annotate(
                    str(
                        int(
                            np.round(Y[holdout_opt][j_holdout] /
                                     Y[dce_opt][j_holdout]))) + 'x',
                    xy=(X[j_holdout],
                        int(Y[holdout_opt][j_holdout] + Y[dce_opt][j_holdout])
                        / 8),
                    color='black',
                    va='center',
                    fontsize=14,
                    # bbox = dict(boxstyle="round,pad=0.3", fc="w"),
                    annotation_clip=False,
                    zorder=5)

            else:  # in case dce_opt not shown, then show arrow as compared to prop method
                ax.annotate('',
                            xy=(X[j_holdout], Y[holdout_opt][j_holdout]),
                            xytext=(X[j_holdout], Y[prop_opt][j_holdout]),
                            arrowprops=dict(arrowstyle='<->'))
                ax.annotate(
                    str(
                        int(
                            np.round(Y[holdout_opt][j_holdout] /
                                     Y[prop_opt][j_holdout]))) + 'x',
                    xy=(X[j_holdout],
                        int(Y[holdout_opt][j_holdout] + Y[prop_opt][j_holdout])
                        / 8),
                    color='black',
                    va='center',
                    fontsize=14,
                    # bbox = dict(boxstyle="round,pad=0.3", fc="w"),
                    annotation_clip=False,
                    zorder=5)

        if SHOW_TITLE:
            plt.title(r'$\!\!\!d\!=\!{}, h\!=\!{}$'.format(d, h))

        handles, labels = ax.get_legend_handles_labels()
        if not SHOW_SCALING_LABELS and SHOW_LINEAR:
            handles = handles[1:]
            labels = labels[1:]

        legend = plt.legend(
            handles,
            labels,
            loc='upper left',  # 'upper right'
            handlelength=2,
            labelspacing=0,  # distance between label entries
            handletextpad=0.3,  # distance between label and the line representation
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.3,  # padding inside legend box
            numpoints=1,  # put the marker only once
        )
        legend.set_zorder(3)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.2)  # 0.8

        # -- Figure settings and save
        plt.minorticks_on()
        plt.xscale('log')
        plt.yscale('log')
        minorLocator = LogLocator(
            base=10, subs=[0.1 * i for i in range(1, 10)], numticks=40
        )  # needed to create the log locators (otherwise the wanted minor ticks do not show); TODO: discuss with Paul the grid-line trick that helped last time
        #         ax.xaxis.set_minor_locator(minorLocator)
        plt.xticks([1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9])
        plt.grid(True,
                 which='both',
                 axis='both',
                 alpha=0.2,
                 linestyle='-',
                 linewidth=1,
                 zorder=1)  # linestyle='dashed', which='minor', axis='y',
        # grid(b=True, which='minor', axis='x', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.xlabel(r'Number of edges ($m$)', labelpad=0)  # labelpad=0
        plt.ylabel(r'Time [sec]', labelpad=0)
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)
        # print(ax.get_xaxis().get_minor_locator())

        if CREATE_PDF:
            plt.savefig(
                join(figure_directory, fig_filename),
                format='pdf',
                dpi=None,
                edgecolor='w',
                orientation='portrait',
                transparent=False,
                bbox_inches='tight',
                pad_inches=0.05,
                # frameon=None
            )
        if SHOW_PDF:
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
        if SHOW_PLOT:
            plt.show()
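
# -- Hedged, standalone sketch (not part of run() above): the timing-harness
# pattern used in the create-data loop, reduced to its core. Wall-clock each
# method once and collect (n, label, seconds) records; all names here are
# hypothetical.
def demo_timing_harness(method_dict, n):
    import time
    records = []
    for label, fn in method_dict.items():
        start = time.time()
        fn()                                     # the call being timed
        records.append((n, label, time.time() - start))
    return records

# Example usage:
#   demo_timing_harness({'MHE': lambda: sum(range(10**6))}, n=100)

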
def test_linBP_symmetric_Torus():
    # Shows that LinBP diverges for s > 1 and converges for s < 1, on the Torus graph.
    # Interestingly, with H (instead of Hc) and echo=True, just above s=1 the oscillations can start late.
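    # (Hedged note: without echo, the linearized updates form a linear system
    # with matrix eps * (Hc kron W); they converge iff its spectral radius is
    # below 1, so eps_max = 1 / rho(Hc kron W). A standalone sketch appears at
    # the end of this file.)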
    print("\n-- 'linBP_symmetric', 'eps_convergence_linbp', with Torus --")

    # -- Load W, create X and P
    W, n = load_W(join(data_directory, 'Torus_W.csv'), zeroindexing=False)
    X = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0],
                  [0, 0, 0], [0, 0, 0], [0, 0, 0]])
    H = np.array([[0.1, 0.8, 0.1], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8]])
    print("W:\n", W.todense())
    print("X:\n", X)
    Xc = to_centering_beliefs(X, ignoreZeroRows=True)
    print("Xc:\n", Xc)
    Xl = to_explicit_list(X)
    print("Xl:\n", Xl)
    print("H:\n", H)
    Hc = to_centering_beliefs(H)
    print("Hc:\n", Hc)

    # -- Other eps_max for 3 x 2 methods
    print("\neps_max without echo and Hc:")
    print(" eps_max (W):              ", eps_convergence_linbp(Hc, W))
    print("eps_max with echo and Hc:")
    print(" eps_max (W):              ", eps_convergence_linbp(Hc,
                                                               W,
                                                               echo=True))
    print("eps_max with echo and compensation and Hc:")
    print(" eps_max (W):              ",
          eps_convergence_linbp(Hc, W, echo=True, compensation=True))

    print("\neps_max without echo and H:")
    print(" eps_max (W):              ", eps_convergence_linbp(H, W))
    print("eps_max with echo and H:")
    print(" eps_max (W):              ", eps_convergence_linbp(H, W,
                                                               echo=True))
    print("eps_max with echo and compensation and H:")
    print(" eps_max (W):              ",
          eps_convergence_linbp(H, W, echo=True, compensation=True))

    # -- Define parameters and run LinBP
    print("\nActual run with various parameters")
    s = 1.15  # 0.4
    numMaxIt = 200
    echo = True
    convergencePercentage = None  # 0.5
    convergenceThreshold = 0.99
    eps_max = eps_convergence_linbp(H, W, echo=echo)

    print("eps:", s)
    print("echo:", echo)

    listF, actualNumIt, listConverged = linBP_symmetric(
        Xc,
        W,
        H * eps_max * s,
        echo=echo,
        numMaxIt=numMaxIt,
        convergencePercentage=convergencePercentage,
        convergenceThreshold=convergenceThreshold,
        debug=3)

    # # -- Display BP results
    print("\nlinBP results:")
    print(
        "Notice that we get identical results with X or Xc, and for Hc or H (except for convergence)"
    )
    print("\nlast two F:")
    print(listF[-2])
    print(listF[-1])
    print("actualNumIt:", actualNumIt)
    print("listConverged:\n", listConverged)

    # print("\nValues for node 6 (zero indexing):"
    # print listF[:, 6, :]
    # print("all:\n", listF

    # -- Visualize BP results
    filename = join(fig_directory, 'Fig_temp_SSLH_inference.pdf')
    print("\nVisualize values for node 3 (zero indexing):")
    node = 3
    plt.plot(listF[:, node, :], lw=2)
    plt.xlabel('# iterations')
    plt.ylabel('belief')
    plt.xlim(0, numMaxIt)

    print(filename)

    plt.savefig(filename,
                dpi=None,
                facecolor='w',
                edgecolor='w',
                orientation='portrait',
                # papertype='letter',  # removed in newer matplotlib versions
                format='pdf',
                transparent=True,
                bbox_inches='tight',
                pad_inches=0.1)
    os.system("chmod 744 " +
              filename)  # first change permissions in order to open PDF
    os.system("open " + filename)  # open PDF