Example 1
    def assign_no_where_to_go(self, threshold: float = 0.9):
        """
        now we put all entities related to one or more external links to a ta2 cluster,
        and merged the chained no-elink clusters,
        compare each cluster to existing ta2 clusters to merge them,
        otherwise compare each pair of clusters in self.no_where_to_go to decide if merge them
        :return: None
        """
        lefts = []
        for i in range(len(self.no_where_to_go)):
            # try to assign to an existing ta2 cluster; no chaining, to avoid false positives
            cur_i = self.no_where_to_go[i]
            to_go = self.get_best(target=cur_i, threshold=threshold)
            if to_go:
                for mem in cur_i.members.values():
                    to_go.add_member(mem)
            else:
                lefts.append(i)

        # no similar ta2 cluster to go, try to merge with others
        edges = {}
        for i in range(len(lefts) - 1):
            for j in range(i + 1, len(lefts)):
                cur_i = self.no_where_to_go[lefts[i]]
                cur_j = self.no_where_to_go[lefts[j]]
                if cur_i.calc_similarity(cur_j, JARO_CACHE,
                                         threshold) > threshold:
                    if i not in edges:
                        edges[i] = []
                    if j not in edges:
                        edges[j] = []
                    edges[i].append(j)
                    edges[j].append(i)

        groups = []
        visited = set()
        for i in edges:
            if i not in visited:
                # iterative depth-first search over the similarity edges
                to_check = [i]
                added = {i}
                while to_check:
                    cur = to_check.pop()
                    for j in edges[cur]:
                        if j not in added:
                            to_check.append(j)
                            added.add(j)
                visited = visited.union(added)
                groups.append(sorted(added))

        for i in range(len(groups)):
            cur = Cluster([])
            for left_idx in groups[i]:
                for mem in self.no_where_to_go[lefts[left_idx]].members.values():
                    cur.add_member(mem)
            self.ta2_clusters['NO_EXTERNAL_LINK_CLUSTERS_%d' % i] = cur
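
The pairwise-merge step above is a plain connected-components pass over the similarity graph. A minimal self-contained sketch of just that grouping logic, with plain integers standing in for cluster indices (the `edges` adjacency dict mirrors the one built above; everything else is illustrative):

# Standalone sketch of the grouping step (illustrative, not the original class code).
def group_connected(edges):
    """Collect connected components from an adjacency dict {node: [neighbors]}."""
    groups, visited = [], set()
    for start in edges:
        if start in visited:
            continue
        component, stack = {start}, [start]
        while stack:  # iterative depth-first search
            cur = stack.pop()
            for nxt in edges[cur]:
                if nxt not in component:
                    component.add(nxt)
                    stack.append(nxt)
        visited |= component
        groups.append(sorted(component))
    return groups

# Nodes 0-1-2 form one chain, 3-4 another pair:
print(group_connected({0: [1], 1: [0, 2], 2: [1], 3: [4], 4: [3]}))  # [[0, 1, 2], [3, 4]]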
Example 2
    def test_main_cluster_runner(self):
        result = cluster_runner(Cluster(4, 2), [1, 3, 0, 1, 0, 1])
        list_expected = [[[4]], [[3, 4], [4, 4]], [[2, 3], [3, 3]],
                         [[1, 2], [2, 2], [4]], [[1], [1, 1], [3]], [[2, 4]],
                         [[1, 3]], [[2]], [[1]]]
        self.assertEqual(15, result['cost'])
        self.assertListEqual(list_expected, result['logs'])
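
The expected cost of 15 follows from the per-tick charge visible in Example 8 (`cost += len(cluster.stats()) * TICK_VALUE`): the nine expected snapshots hold 1, 2, 2, 3, 3, 1, 1, 1, 1 running servers respectively, which sums to 15 if `TICK_VALUE` is 1 (the constant's actual value is not shown in these excerpts):

# Recomputing the expected cost from the expected logs; TICK_VALUE = 1 is an assumption.
TICK_VALUE = 1
list_expected = [[[4]], [[3, 4], [4, 4]], [[2, 3], [3, 3]],
                 [[1, 2], [2, 2], [4]], [[1], [1, 1], [3]], [[2, 4]],
                 [[1, 3]], [[2]], [[1]]]
print(sum(len(tick_stats) * TICK_VALUE for tick_stats in list_expected))  # 15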
Example 3
    def init_el_based_clusters(self, entity_json, cluster_json):
        """
        create Entity instance for each entity and put in self.entities ;
        create Cluster instance for each external link and put entities with elink to corresponding ta2_clusters;
        go over ta1 clusters and put every no-elink entity to self.no_link, record siblings' elinks or ta1 cluster uri.
        :param entity_json: raw info {entity_uri: [name_or_{translation:[tran1,tran2]}, type, external_link], ... }
        :param cluster_json: raw info {cluster_uri: [[member1, member2], [prototype1]], ... }
        :return: None
        """

        # init all entities;
        # init ta2 clusters, grouped by external link (skip 'others')
        for ent, attr in entity_json.items():
            name, _type, link = attr
            names = self.parse_name(name)
            _type = _type.rsplit('#', 1)[-1]
            link = self.parse_link(link)
            self.entities[ent] = Entity(ent, names, _type, link)
            if link != OTHERS:
                if link not in self.ta2_clusters:
                    self.ta2_clusters[link] = Cluster([])
                self.ta2_clusters[link].add_member(self.entities[ent])
            else:
                self.no_link[ent] = [set(), set()]
        '''
        Now we have:
        self.entities - dict mapping each entity uri to its Entity object
        self.ta2_clusters - dict mapping each real external link to its Cluster object
        self.no_link - dict mapping each no-elink entity uri to two sets:
                one storing the elinks related to the entity, the other storing its ta1 cluster uris
        So every entity is either in one of ta2_clusters' Clusters or among no_link's keys.
        '''

        # process ta1 clusters
        for cluster, mems in cluster_json.items():
            self.cluster_to_ent[cluster] = set()
            cur = Cluster([])
            members, prototypes = mems
            cur_no_link = set()
            for m in members:
                cur.add_member(self.entities[m])
                if self.entities[m].link == OTHERS:
                    cur_no_link.add(m)
            for m in prototypes:
                if m in self.entities:
                    cur.add_member(self.entities[m])
                    if self.entities[m].link == OTHERS:
                        cur_no_link.add(m)
            for elink in cur.links:
                if elink == OTHERS:
                    for m in cur_no_link:
                        self.cluster_to_ent[cluster].add(m)
                        self.no_link[m][1].add(cluster)
                else:
                    for m in cur_no_link:
                        self.cluster_to_ent[cluster].add(m)
                        self.no_link[m][0].add(elink)
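
The documented input shapes are easiest to see on a concrete toy payload. A hypothetical two-entity example (all uris, names, types, and link values are invented for illustration, and the exact sentinel string behind OTHERS is an assumption):

# Hypothetical raw inputs matching the documented shapes.
entity_json = {
    "ent/0001": ["Moscow", "ontology#GPE", "Q649"],      # entity with a real external link
    "ent/0002": [{"translation": ["Moskva", "Moscou"]},  # name given as a translation dict
                 "ontology#GPE", "others"],              # no external link (maps to OTHERS)
}
cluster_json = {
    "ta1/cluster/A": [["ent/0001", "ent/0002"],  # members
                      ["ent/0001"]],             # prototypes
}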
Example 4
    def assign_chained_elink(self):
        """
        At this point self.no_link is filled: {entity_uri: [(some_external_links), (some_ta1_cluster_uris)], ... }
        and so is self.cluster_to_ent: {ta1_cluster_uri: (entities_with_no_link_uris), ... }
        :return: None
        """
        # for each entity in no_link, try to find the best place to go
        visited = set()
        for ent_uri, elink_ta1cluster in self.no_link.items():
            elinks, ta1s = elink_ta1cluster
            cur_ent = self.entities[ent_uri]
            if len(elinks):
                cur_cluster = self.get_best(cur_ent, elinks)
                cur_cluster.add_member(cur_ent)
            else:
                # not directly clustered with an entity carrying an elink:
                # no elinks, so try to find chained elinks, otherwise nowhere to go
                if ent_uri in visited:
                    continue
                visited_no_el_sibling_ent = {ent_uri}
                elinks = set()
                added = set(ta1s)  # copy so the set stored in self.no_link is not mutated
                to_check = list(ta1s)
                # TODO: confidence by hops? for now, entities whose elink-carrying
                # sibling is >= 1 hop away are all treated the same way
                while to_check:
                    cur_cluster = to_check.pop()
                    for sibling in self.cluster_to_ent[cur_cluster]:
                        if len(self.no_link[sibling][0]):
                            # found external links
                            elinks = elinks.union(self.no_link[sibling][0])
                        else:
                            visited_no_el_sibling_ent.add(sibling)
                        for next_hop_cluster in self.no_link[sibling][1]:
                            # add other chained clusters to check
                            if next_hop_cluster not in added:
                                added.add(next_hop_cluster)
                                to_check.append(next_hop_cluster)
                if len(elinks):
                    for covered_ent in visited_no_el_sibling_ent:
                        cur_cluster = self.get_best(self.entities[covered_ent],
                                                    elinks)
                        cur_cluster.add_member(self.entities[covered_ent])
                else:
                    cur_cluster = Cluster([
                        self.entities[covered_ent]
                        for covered_ent in visited_no_el_sibling_ent
                    ])
                    self.no_where_to_go.append(cur_cluster)
                visited = visited.union(visited_no_el_sibling_ent)
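
The chained lookup above is a traversal over ta1 clusters: starting from the entity's own clusters, it hops through no-elink siblings until it reaches a sibling that carries an external link. A minimal standalone sketch of just that traversal, with plain dicts in place of the class state (all names are illustrative):

def chained_elinks(start_clusters, cluster_to_ent, no_link):
    """Collect elinks reachable from start_clusters via no-elink siblings.

    no_link maps entity -> (set_of_elinks, set_of_ta1_cluster_uris);
    cluster_to_ent maps ta1 cluster uri -> set of no-elink entity uris.
    """
    elinks, seen = set(), set(start_clusters)
    stack = list(start_clusters)
    while stack:
        cluster = stack.pop()
        for sibling in cluster_to_ent[cluster]:
            elinks |= no_link[sibling][0]
            for nxt in no_link[sibling][1]:
                if nxt not in seen:
                    seen.add(nxt)
                    stack.append(nxt)
    return elinks

# e1 has no elink but shares cluster c1 with e2; e2 also sits in c2,
# where e3 carries elink "L1" -- so e1 reaches "L1" in two hops.
cluster_to_ent = {"c1": {"e1", "e2"}, "c2": {"e2", "e3"}}
no_link = {"e1": (set(), {"c1"}),
           "e2": (set(), {"c1", "c2"}),
           "e3": ({"L1"}, {"c2"})}
print(chained_elinks({"c1"}, cluster_to_ent, no_link))  # {'L1'}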
Example 5
def run(
        seed: int = None,
        n: int = 100,
        graphs: Iterable[str] = (),
        n_samples: int = None,
        n_features: int = 100,
        dataset: str = None,
        smv_label_flip_prob: float = 0.0,
        error_mean: float = 0.0,
        error_std_dev: float = 0.0,
        node_error_mean: float = 0.0,
        node_error_std_dev: float = 0.0,
        starting_weights_domain: Union[List[float], Tuple[float]] = None,
        max_iter: int = None,
        max_time: float = None,
        method: Union[str, None] = 'classic',
        alpha: float = None,
        learning_rate: str = 'constant',
        spectrum_dependent_learning_rate: bool = False,
        dual_averaging_radius=10,
        time_distr_class: object = statistics.ExponentialDistribution,
        time_distr_param: list = (1,),
        time_distr_param_rule: str = None,
        time_const_weight: float = 0,
        real_y_activation_func: callable = None,
        obj_function: str = 'mse',
        average_model_toggle: bool = False,
        metrics: list = (),
        real_metrics: list = (),
        real_metrics_toggle: bool = False,
        metrics_type: int = 0,
        metrics_nodes: str = 'all',
        shuffle: bool = True,
        batch_size: int = 20,
        epsilon: float = None,
        save_test_to_file: bool = False,
        test_folder_name_struct: list = (
                'u040',
                'shuffle',
                'w_domain',
                'metrics',
                'dataset',
                'distr',
                'error',
                'nodeserror',
                'alpha',
                'nodes',
                'samp',
                'feat',
                'time',
                'iter',
                'c',
                'method'
        ),
        test_parent_folder: str = "",
        instant_plot: bool = False,
        plots: list = ('mse_iter',),
        save_plot_to_file: bool = False,
        plot_global_w: bool = False,
        plot_node_w: Union[bool, int, List[int]] = False,
        verbose_main: int = 0,
        verbose_cluster: int = 0,
        verbose_node: int = 0,
        verbose_task: int = 0,
        verbose_plotter: int = 0
):
    """
    Main method.

    Parameters
    ----------
    seed : int or None
        Random simulation seed. If None, it is taken from the current time.
    n : int
        Number of nodes in the cluster.
    graphs : List[str]
        List of topologies to run the simulation with.
    n_samples : int
        Total number of samples in the generated dataset.
    n_features : int
        Number of features each sample will have.
    dataset : str
        Dataset label:
        - "reg": general customizable linear regression dataset;
        - "unireg": unidimensional regression;
        - "svm": multidimensional classification problem;
        - "unisvm": unidimensional dataset that changes with topology spectral gap;
        - "skreg" : regression dataset from sklearn library,
        - "sloreg" and "susysvm" from UCI's repository.
    smv_label_flip_prob : float
        Probability that a label is flipped during svm dataset generation;
        a kind of noise added to the dataset.
    error_mean : float
        Mean of noise to introduce in regression datasets.
    error_std_dev : float
        Standard deviation of noise introduced in regression datasets.
    node_error_mean : float
        Mean of the per-node noise introduced in each node's sample.
        Be careful because if used with SVM this can change values of labels.
    node_error_std_dev : float
        Standard deviation of the per-node noise introduced in each node's sample.
        Be careful because if used with SVM this can change values of labels.
    starting_weights_domain : List[float]
        In the form [a, b]. Each node's starting w is picked uniformly at random within [a, b].
    max_iter : int
        Maximum number of iterations after which the simulation is stopped.
    max_time : float
        Maximum time value after which the simulation is stopped.
    epsilon : float
        Accuracy threshold for objective function below which the simulation is stopped.
    method : str
        - "classic" : classic gradient descent, batch is equal to the whole dataset;
        - "stochastic" : stochastic gradient descent;
        - "batch" : batch gradient descent;
        - "subgradient" : subgradient projected gradient descent;
        - "dual_averaging" : dual averaging method.
    alpha : float
        Learning rate constant coefficient.
    learning_rate : str
        - 'constant' : the learning rate never changes during the simulation (it is equal to alpha);
        - 'root_decreasing' : learning rate is alpha * 1/math.sqrt(K) where K = #iter.
    spectrum_dependent_learning_rate : bool
        If True the learning rate is also multiplied by math.sqrt(spectral_gap), so it is different for each graph.
    dual_averaging_radius : int
        Radius of the projection on the feasible set.
    time_distr_class : object
        Class of the random time distribution.
    time_distr_param : list or list of list
        Parameters list.
        See Also generate_time_distr_param_list.
    time_distr_param_rule : str
        Parameters distribution rule.
        See Also generate_time_distr_param_list.
    time_const_weight : float
        Weight assigned to constant part of the computation time.
        It is calculated as T_u(t) = E[X_u] * c + (1-c) * X_u(t).
    real_y_activation_func : function
        Activation function applied on real_y calculation.
    obj_function : str
        Identifier of the objective function (one of those declared in metrics.py).
    average_model_toggle : bool
        If True, the time-average of the parameter vector is used instead of just x(k).
    metrics : list of str
        List of additional metrics to compute (objective function is automatically added to this list).
    real_metrics : list of str
        List of real metrics to compute (with regards to the real noiseless model).
    real_metrics_toggle : bool
        If False real metrics are not computed (useful to speed up the computation).
    metrics_type : int
        - 0 : metrics are computed over the whole dataset using model W equal to the avg of nodes' local models;
        - 1 : metrics are computed as AVG of local nodes' metrics;
        - 2 : metrics are computed over the whole dataset using the model only from metrics_nodes (see below).
    metrics_nodes : int or list of int
        If an int is given, it is wrapped in a list and treated as [int].
        Depends on the value of metrics_type:
        - metrics_type == 0 : no effects;
        - metrics_type == 1 : metrics are computed as avg of local metrics of nodes inside metrics_nodes list;
        - metrics_type == 2 : metrics are computed over the whole dataset using the model obtained as mean of
            nodes inside metrics_nodes.
    shuffle : bool
        If True the dataset is shuffled before being split into nodes, otherwise the dataset is untouched.
    batch_size : int
        Useful only for batch gradient descent, is the size of the batch.
    save_test_to_file : bool
        If True the test is saved to the specified folder, otherwise it is stored in the temp folder.
    test_folder_name_struct : list
        See generate_test_subfolder_name.
    test_parent_folder : str
        Parent test folder: the test will be located in ./test_log/{$PARENT_FOLDER}/{$TEST_NAME_FOLDER}.
        Can be more than one folder deep.
    instant_plot : bool
        If True plots will be prompted upon finishing simulation. Be careful since it will pause the thread!
    plots : list of str
        List of plots' names to create / prompt upon finishing simulation.
        See plotter.py.
    save_plot_to_file : bool
        If True plots will be saved into .../{$TEST_FOLDER_NAME}/plots/ folder.
    plot_global_w : bool
        If True global W will be prompted after finishing simulation.
        This plot is never automatically saved, save it by yourself if you need to keep it.
    plot_node_w : list or False
        List of nodes whose w should be plotted. If False, nothing is prompted.
    verbose_main : int
        Verbose policy in simulator.py script.
        - <0 : no print at all except for errors (unsafe);
        -  0 : default messages;
        -  1 : verbose + default messages;
        -  2 : verbose + default messages + input required to continue after each message (simulation will be paused
            after each message and will require pressing ENTER to go on, useful for debugging).
    verbose_cluster : int
        Verbose policy in cluster.py script.
        See verbose_main.
    verbose_node : int
        Verbose policy in node.py script.
        See verbose_main.
    verbose_task : int
        Verbose policy in tasks.py script.
        See verbose_main.
    verbose_plotter : int
        Verbose policy in plotter.py script.
        See verbose_main.

    Returns
    -------
    None
    """

    ### BEGIN SETUP ###

    begin_time = time.time()
    # descriptor text placed at the beginning of _descriptor.txt file within the test folder

    setup_from_file = False
    setup_folder_path = Plotter.get_temp_test_folder_path_by_index()
    setup_file_path = os.path.join(setup_folder_path, ".setup.pkl")

    setup = dict()

    setup['seed'] = int(time.time()) if seed is None else seed
    setup['n'] = n

    setup['graphs'] = generate_n_nodes_graphs_list(setup['n'], graphs)

    # TRAINING SET SETUP

    setup['n_samples'] = n_samples
    setup['n_features'] = n_features
    setup['dataset'] = dataset  # svm, unireg, reg, reg2, skreg
    setup['smv_label_flip_prob'] = smv_label_flip_prob
    setup['error_mean'] = error_mean
    setup['error_std_dev'] = error_std_dev
    setup['node_error_mean'] = node_error_mean
    setup['node_error_std_dev'] = node_error_std_dev

    # r = np.random.uniform(4, 10)
    # c = np.random.uniform(1.1, 7.8) * np.random.choice([-1, 1, 1, 1])
    # starting_weights_domain = [c - r, c + r]
    setup['starting_weights_domain'] = starting_weights_domain

    # TRAINING SET ALMOST FIXED SETUP
    # SETUP USED ONLY BY REGRESSION 'reg':
    setup['domain_radius'] = 8
    setup['domain_center'] = 0

    # CLUSTER SETUP 1
    setup['max_iter'] = max_iter
    setup['max_time'] = max_time  # units of time
    setup['method'] = method
    setup['dual_averaging_radius'] = dual_averaging_radius

    setup['alpha'] = alpha
    setup['learning_rate'] = learning_rate  # constant, root_decreasing
    setup['spectrum_dependent_learning_rate'] = spectrum_dependent_learning_rate

    setup['time_distr_class'] = time_distr_class
    setup['time_distr_param'] = generate_time_distr_param_list(
        setup['n'],
        time_distr_param,
        time_distr_param_rule
    )  # exp[rate], par[a,s], U[a,b]
    setup['time_distr_param_rule'] = time_distr_param_rule
    setup['time_const_weight'] = time_const_weight
    setup['real_y_activation_func'] = real_y_activation_func
    setup['obj_function'] = obj_function  # mse, hinge_loss, edgy_hinge_loss, cont_hinge_loss, score
    setup['average_model_toggle'] = average_model_toggle

    setup['metrics'] = metrics
    setup['real_metrics'] = real_metrics
    setup['real_metrics_toggle'] = real_metrics_toggle  # False to disable real_metrics computation (for better perf.)
    setup['metrics_type'] = metrics_type  # 0: avg w on whole TS, 1: avg errors in nodes, 2: node's on whole TS
    setup['metrics_nodes'] = metrics_nodes  # single node ID, list of IDs, 'all', 'worst', 'best'
    setup['shuffle'] = shuffle  # <--

    # CLUSTER ALMOST FIXED SETUP
    setup['batch_size'] = batch_size
    setup['epsilon'] = epsilon

    # VERBOSE FLAGS
    # verbose <  0: no print at all except from errors
    # verbose == 0: default messages
    # verbose == 1: verbose + default messages
    # verbose == 2: verbose + default messages + input required to continue after each message
    verbose = verbose_main

    if setup_from_file:
        with open(setup_file_path, 'rb') as setup_file:
            setup = pickle.load(setup_file)

    # OUTPUT SETUP
    test_subfolder = generate_test_subfolder_name(
        setup,
        *test_folder_name_struct,
        parent_folder=test_parent_folder
    )

    test_title = test_subfolder

    # OUTPUT ALMOST FIXED SETUP
    test_root = "test_log"  # don't touch this
    temp_test_subfolder = datetime.datetime.now().strftime('%y-%m-%d_%H.%M.%S.%f')
    compress = True
    overwrite_if_already_exists = False  # overwrite the folder if it already exists or create a different one otherwise
    delete_folder_on_errors = True
    save_descriptor = True  # create _descriptor.txt file
    ### END SETUP ###

    np.random.seed(setup['seed'])
    random.seed(setup['seed'])

    if setup['n'] % 2 != 0 and setup['n'] > 1:
        warnings.warn("Amount of nodes is odd (N={}), keep in mind graph generator "
                      "can misbehave in undirected graphs generation with odd nodes amount (it can "
                      "generate directed graphs instead)".format(setup['n']))

    if not save_test_to_file:
        # even if the files are not stored permanently, they are placed inside the temp folder
        # so they can be used for a short, limited period of time (the temp folder may be deleted manually)
        test_subfolder = os.path.join("temp", temp_test_subfolder)
        overwrite_if_already_exists = False

    test_path = os.path.normpath(os.path.join(test_root, test_subfolder))

    if not overwrite_if_already_exists:
        # determine a name for the new folder such that it doesn't coincide with any other folder
        c = 0
        tmp_test_path = test_path
        while os.path.exists(tmp_test_path):
            tmp_test_path = test_path + ".conflict." + str(c)
            c += 1
        test_path = tmp_test_path

    test_path = os.path.normpath(test_path)

    # create dir
    if not os.path.exists(test_path):
        os.makedirs(test_path)

    # define function to delete test folder (in case of errors)
    def delete_test_dir():
        if delete_folder_on_errors:
            shutil.rmtree(test_path)

    # markov_matrix = normalize(__adjacency_matrix, axis=1, norm='l1')

    ### BEGIN TRAINING SET GEN ###
    X, y, w = None, None, None
    # X, y = make_blobs(n_samples=10000, n_features=100, centers=3, cluster_std=2, random_state=20)

    if setup['dataset'] == 'reg':
        X, y, w = datasets.reg_dataset(
            setup['n_samples'], setup['n_features'],
            error_mean=setup['error_mean'],
            error_std_dev=setup['error_std_dev']
        )
    elif setup['dataset'] == 'svm':
        X, y, w = datasets.svm_dual_averaging_dataset(
            setup['n_samples'], setup['n_features'],
            label_flip_prob=setup['smv_label_flip_prob']
        )
    elif setup['dataset'] == 'unireg':
        X, y, w = datasets.unireg_dataset(setup['n'])
    elif setup['dataset'] == 'unisvm':
        X, y, w = datasets.unisvm_dual_averaging_dataset(
            setup['n'],
            label_flip_prob=setup['smv_label_flip_prob']
        )
    elif setup['dataset'] == 'enereg':
        X, y, w = datasets.load_appliances_energy_reg_dataset(setup['n_samples'])
    elif setup['dataset'] == 'sloreg':
        X, y, w = datasets.load_slice_localization_reg_dataset(setup['n_samples'])
    elif setup['dataset'] == 'susysvm':
        X, y, w = datasets.load_susy_svm_dataset(setup['n_samples'])
    elif setup['dataset'] == 'skreg':
        X, y, w = make_regression(
            n_samples=setup['n_samples'],
            n_features=setup['n_features'],
            n_informative=setup['n_features'],
            n_targets=1,
            bias=1,
            effective_rank=None,
            tail_strength=1.0,
            noise=setup['error_std_dev'],
            shuffle=True,
            coef=True,
            random_state=None
        )
    elif setup['dataset'] in ['eigvecsvm', 'alteigvecsvm'] or 'multieigvecsvm' in setup['dataset']:
        pass
    else:
        delete_test_dir()
        raise Exception("{} is not a valid dataset identifier".format(setup['dataset']))

    ### END TRAINING SET GEN ###

    ### BEGIN MAIN STUFFS ###

    # save setup object dump
    with open(os.path.join(test_path, '.setup.pkl'), "wb") as f:
        pickle.dump(setup, f, pickle.HIGHEST_PROTOCOL)

    # setup['string_graphs'] = pprint.PrettyPrinter(indent=4).pformat(setup['graphs']).replace('array([', 'np.array([')

    # Fill descriptor with setup dictionary
    descriptor = """>>> Test Descriptor File
Title: {}
Date: {}
Summary: 

""".format(
        test_title if save_test_to_file else '',
        str(datetime.datetime.fromtimestamp(begin_time))
    )

    for k, v in setup.items():
        descriptor += "{} = {}\n".format(k, v)
    descriptor += "\n"

    # save descriptor file
    if save_descriptor:
        with open(os.path.join(test_path, '.descriptor.txt'), "w") as f:
            f.write(descriptor)

    w_logs = {}
    node_w_logs = {}

    ## SIMULATIONS
    # simulation for each adjacency matrix in setup['graphs'] dict
    for graph, adjmat in setup['graphs'].items():
        # set the seed again (each simulation must perform on the same cluster setup)
        np.random.seed(setup['seed'])
        random.seed(setup['seed'])

        cluster = None
        try:
            cluster = Cluster(adjmat, graph_name=graph, verbose=verbose_cluster)

            if setup['dataset'] in ['eigvecsvm', 'alteigvecsvm', 'multieigvecsvm']:
                # the all-ones (clique) matrix misbehaves with this dataset,
                # so we reuse the highest-degree adjacency matrix for the clique as well
                if 'clique' in graph:
                    max_deg = 0
                    max_deg_adjmat = adjmat
                    for G, A in setup['graphs'].items():
                        if 'clique' in G:
                            continue
                        d = int(G.split('-')[0])
                        if d > max_deg:
                            max_deg_adjmat = A
                            max_deg = d
                    if setup['dataset'] == 'eigvecsvm':
                        X, y, w = datasets.eigvecsvm_dataset_from_adjacency_matrix(max_deg_adjmat)
                    elif setup['dataset'] == 'alteigvecsvm':
                        X, y, w = datasets.eigvecsvm_dataset_from_expander(
                            setup['n'],
                            max_deg,
                            matrix_type='uniform-weighted'
                        )
                    elif setup['dataset'] == 'multieigvecsvm':
                        X, y, w = datasets.multieigvecsvm_dataset_from_expander(
                            setup['n_samples'], setup['n'], max_deg)
                else:
                    if setup['dataset'] == 'eigvecsvm':
                        X, y, w = datasets.eigvecsvm_dataset_from_adjacency_matrix(adjmat)
                    elif setup['dataset'] == 'alteigvecsvm':
                        deg = int(graph.split('-')[0])
                        X, y, w = datasets.eigvecsvm_dataset_from_expander(
                            setup['n'],
                            deg,
                            matrix_type='uniform-weighted'
                        )
                    elif setup['dataset'] == 'multieigvecsvm':
                        deg = int(graph.split('-')[0])
                        X, y, w = datasets.multieigvecsvm_dataset_from_expander(
                            setup['n_samples'], setup['n'], deg)

            elif 'multieigvecsvm' in setup['dataset']:
                deg = int(setup['dataset'].split('-')[0])
                X, y, w = datasets.multieigvecsvm_dataset_from_expander(
                    setup['n_samples'], setup['n'], deg)

            alpha = setup['alpha']
            if spectrum_dependent_learning_rate:
                alpha *= math.sqrt(uniform_weighted_Pn_spectral_gap_from_adjacency_matrix(adjmat))

            cluster.setup(
                X, y, w,
                real_y_activation_function=setup['real_y_activation_func'],
                obj_function=setup['obj_function'],
                average_model_toggle=average_model_toggle,
                method=setup['method'],
                max_iter=setup['max_iter'],
                max_time=setup['max_time'],
                batch_size=setup['batch_size'],
                dual_averaging_radius=setup['dual_averaging_radius'],
                epsilon=setup['epsilon'],
                alpha=alpha,
                learning_rate=setup['learning_rate'],
                metrics=setup['metrics'],
                real_metrics=setup["real_metrics"],
                real_metrics_toggle=setup['real_metrics_toggle'],
                metrics_type=setup['metrics_type'],
                metrics_nodes=setup['metrics_nodes'],
                shuffle=setup['shuffle'],
                time_distr_class=setup['time_distr_class'],
                time_distr_param=setup['time_distr_param'],
                time_const_weight=setup['time_const_weight'],
                node_error_mean=setup['node_error_mean'],
                node_error_std_dev=setup['node_error_std_dev'],
                starting_weights_domain=setup['starting_weights_domain'],
                verbose_node=verbose_node,
                verbose_task=verbose_task
            )

            cluster.run()

        except Exception:
            # if the cluster throws an exception then delete the folder created to host its output files
            # the most common exception in cluster.run() is thrown when the SGD computation diverges
            delete_test_dir()
            print(
                "Exception in cluster object\n",
                "cluster.iteration=" + str(cluster.iteration)
            )
            raise

        extension = '.txt'
        if compress:
            extension += '.gz'

        np.savetxt(
            os.path.join(test_path, "{}_iter_time_log{}".format(graph, extension)),
            cluster.logs["iter_time"],
            delimiter=','
        )

        np.savetxt(
            os.path.join(test_path, "{}_avg_iter_time_log{}".format(graph, extension)),
            cluster.logs["avg_iter_time"],
            delimiter=','
        )
        np.savetxt(
            os.path.join(test_path, "{}_max_iter_time_log{}".format(graph, extension)),
            cluster.logs["max_iter_time"],
            delimiter=','
        )

        # Save metrics logs
        if setup['method'] is not None:
            for metrics_id, metrics_log in cluster.logs["metrics"].items():
                np.savetxt(
                    os.path.join(test_path, "{}_{}_log{}".format(graph, metrics_id, extension)),
                    metrics_log,
                    delimiter=','
                )

            # Save real metrics logs
            for real_metrics_id, real_metrics_log in cluster.logs["real_metrics"].items():
                np.savetxt(
                    os.path.join(test_path, "{}_real_{}_log{}".format(graph, real_metrics_id, extension)),
                    real_metrics_log,
                    delimiter=','
                )

        if plot_global_w:
            w_logs[graph] = cluster.w

        if plot_node_w is not False:
            try:
                node_w_logs[graph] = np.array(cluster.nodes[plot_node_w[0]].training_task.w)
                for i in range(1, len(plot_node_w)):
                    node_w_logs[graph] += np.array(cluster.nodes[plot_node_w[i]].training_task.w)
                node_w_logs[graph] /= len(plot_node_w)
            except Exception:
                plot_node_w = False

        print("Logs of {} simulation created at {}".format(graph, test_path))

    if save_descriptor:
        with open(os.path.join(test_path, '.descriptor.txt'), 'a') as f:
            f.write('\n\n# duration (hh:mm:ss): ' + time.strftime('%H:%M:%S', time.gmtime(time.time() - begin_time)))

    # colors must cover both the global-w and the node-w plots
    colors = Plotter.generate_rainbow_color_dict_from_graph_keys(
        list(set(w_logs.keys()) | set(node_w_logs.keys())), setup['n']
    )

    if plot_global_w:
        plt.suptitle(test_subfolder)
        plt.title("W(it)")
        plt.xlabel("iter")
        plt.ylabel("Global W at iteration")
        plt.yscale('linear')
        for graph in w_logs:
            plt.plot(
                list(range(len(w_logs[graph]))),
                w_logs[graph],
                label=graph,
                color=colors[graph],
                marker='o',
                markersize=2
                # **kwargs
            )
        plt.legend()
        plt.show()
        plt.close()

    if plot_node_w is not False:
        plt.suptitle(test_subfolder)
        plt.title("W_{0}(it) (W of Node {0} at iteration)".format(plot_node_w))
        plt.xlabel("iter")
        plt.ylabel("W_{}(iter)".format(plot_node_w))
        plt.yscale('linear')
        for graph in node_w_logs:
            plt.plot(
                list(range(len(node_w_logs[graph]))),
                [p[0] for p in node_w_logs[graph]],
                label=graph,
                color=colors[graph],
                marker='o',
                markersize=2
            )
        plt.legend()
        plt.show()
        plt.close()

    if save_plot_to_file or instant_plot:
        plot_from_files(
            test_folder_path=test_path,
            save_plots_to_test_folder=save_plot_to_file,
            instant_plot=instant_plot,
            plots=plots,
            verbose=verbose_plotter,
            test_tag=test_subfolder
        )
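
A hypothetical invocation of run, sketching how the knobs documented above fit together. Only the keyword names come from the signature; the topology identifier and the chosen values are assumptions (valid graph names depend on generate_n_nodes_graphs_list, which is not shown here):

# Hypothetical usage sketch; the 'clique' topology name and all values are assumptions.
run(
    seed=42,
    n=100,
    graphs=('clique',),
    n_samples=1000,
    n_features=100,
    dataset='reg',
    error_std_dev=1.0,
    starting_weights_domain=[-5.0, 5.0],
    max_iter=500,
    method='classic',
    alpha=1e-4,
    metrics=['mse'],
    instant_plot=True,
    plots=('mse_iter',),
)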
Example 6
    def setUp(self):
        self.TTASK = 4
        self.UMAX = 2
        self.cluster = Cluster(self.TTASK, self.UMAX)
Example 7
class TestCluster(unittest.TestCase):
    def setUp(self):
        self.TTASK = 4
        self.UMAX = 2
        self.cluster = Cluster(self.TTASK, self.UMAX)

    def test_cluster_add_server(self):
        """Tests server creation in cluster."""
        self.cluster.add_server()
        self.assertEqual(1, len(self.cluster.servers_list))
        self.cluster.add_server(5)
        self.assertEqual(6, len(self.cluster.servers_list))

    def test_cluster_add_task(self):
        """Checks task and server health when creating tasks."""
        self.cluster.add_task(1)
        self.assertEqual(1, len(self.cluster.servers_list))
        self.assertEqual(1, len(self.cluster.servers_list[0].task_list))

        self.cluster.add_task(2)
        self.assertEqual(2, len(self.cluster.servers_list))
        self.assertEqual(2, len(self.cluster.servers_list[0].task_list))
        self.assertEqual(1, len(self.cluster.servers_list[1].task_list))

        self.cluster.add_task(2)
        self.assertEqual(3, len(self.cluster.servers_list))
        self.assertEqual(2, len(self.cluster.servers_list[0].task_list))
        self.assertEqual(2, len(self.cluster.servers_list[1].task_list))
        self.assertEqual(1, len(self.cluster.servers_list[2].task_list))

    def test_cluster_clock_reverberate(self):
        """Checks if cluster clock reverberates in tasks."""
        self.cluster.add_task()
        self.assertEqual(1, len(self.cluster.servers_list[0].task_list))
        self.cluster.clock()
        self.assertEqual(
            3, self.cluster.servers_list[0].task_list[0].missing_ttask)

    def test_cluster_clock_remove_server(self):
        self.cluster.add_task()
        for _ in range(self.TTASK):
            self.cluster.clock()
        self.assertEqual(0, len(self.cluster.servers_list))

    def test_cluster_stats_like_pdf_example(self):
        """Tests cluster status like a PDF example."""
        self.cluster.add_task()
        self.assertListEqual([[4]], self.cluster.stats())
        self.cluster.clock()

        self.cluster.add_task(3)
        self.assertListEqual([[3, 4], [4, 4]], self.cluster.stats())
        self.cluster.clock()

        self.assertListEqual([[2, 3], [3, 3]], self.cluster.stats())
        self.cluster.clock()

        self.cluster.add_task(1)
        self.assertListEqual([[1, 2], [2, 2], [4]], self.cluster.stats())
        self.cluster.clock()

        self.assertListEqual([[1], [1, 1], [3]], self.cluster.stats())
        self.cluster.clock()

        self.cluster.add_task(1)
        self.assertListEqual([[2, 4]], self.cluster.stats())
        self.cluster.clock()

        self.assertListEqual([[1, 3]], self.cluster.stats())
        self.cluster.clock()

        self.assertListEqual([[2]], self.cluster.stats())
        self.cluster.clock()

        self.assertListEqual([[1]], self.cluster.stats())
        self.cluster.clock()

        self.assertListEqual([], self.cluster.stats())
Example 8
        cost += len(cluster.stats()) * TICK_VALUE
        logs.append(cluster.stats())
        cluster.clock()
    return {
        'cost': cost,
        'logs': logs
    }


if __name__ == '__main__':
    # Reads Command Line arguments or raise Exception
    try:
        src, des = sys.argv[1], sys.argv[2]
    except Exception:
        exception_text = 'Invalid arguments. Please, run: ' \
                         '"python {} input_file_path output_file_path"'.format(sys.argv[0])
        raise Exception(exception_text)

    # Initialize variables from the received args
    with open(src) as file:
        _input = text_numbers_to_tuple(file.read())
    ttask, umax, users_per_tick = _input[0], _input[1], _input[2:]

    cluster = Cluster(ttask, umax)

    # Run
    result = cluster_runner(cluster, users_per_tick)
    result_text = custom_joiner(result['logs'], result['cost'])
    with open(des, 'w') as output_file:
        output_file.write(result_text)
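
Example 8 starts mid-function, so only the tail of cluster_runner is visible. Based on that tail and the behavior expected by Example 2 (six input ticks but nine logged snapshots, i.e. the loop keeps ticking until the cluster drains), a plausible reconstruction looks like the sketch below. The loop structure, the add_task call per input tick, and TICK_VALUE = 1 are assumptions, not the author's verbatim code:

TICK_VALUE = 1  # assumed; the constant's value is not shown in these excerpts

def cluster_runner(cluster, users_per_tick):
    """Feed users into the cluster tick by tick, then run until it drains."""
    cost, logs = 0, []
    pending = list(users_per_tick)
    while pending or cluster.stats():
        if pending:
            new_users = pending.pop(0)
            if new_users:
                cluster.add_task(new_users)
        cost += len(cluster.stats()) * TICK_VALUE  # one charge per running server
        logs.append(cluster.stats())
        cluster.clock()
    return {'cost': cost, 'logs': logs}

# With Cluster(4, 2) and [1, 3, 0, 1, 0, 1] this reproduces the nine snapshots
# and the cost of 15 asserted in Example 2.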
Example 9
    def addCluster(self, centroid):
        self.clusters.append(Cluster(centroid, self))
        self.numClusters += 1