def lbfgsb(loss, max_vals, min_vals=None, weights=None, deviation_tol=1.E-6, verbose=True, **kwargs):
    """
    Minimizes a loss function inside a box of parameter bounds with scipy's L-BFGS-B solver.

    Args:
        loss: Callable that receives a parameter vector and returns the value to minimize.
        max_vals: Upper bound of each parameter. Its length determines the number of parameters.
        min_vals: Optional. Lower bound of each parameter. If None (default), zeros are used.
        weights: Optional. Starting point of the search. If None (default), the midpoint
            of each [min, max] interval is used.
        deviation_tol: Optional. Forwarded to the solver as its "ftol" option. Default is 1.E-6.
        verbose: Optional. If True (default), a message is logged before the solver runs and
            the log is reset afterwards.
        **kwargs: Accepted so that optimizers can be called interchangeably; ignored here.

    Returns:
        The `x` field of the scipy optimization result.
    """
    lower = [0] * len(max_vals) if min_vals is None else min_vals
    start = weights
    if start is None:
        # Begin from the center of the search box.
        start = [(low + high) / 2 for low, high in zip(lower, max_vals)]
    import scipy.optimize
    import scipy.sparse
    if verbose:
        utils.log("Optimizing with LBFGSB")
    search_box = list(zip(lower, max_vals))
    solver_options = {"ftol": deviation_tol}
    result = scipy.optimize.minimize(loss,
                                     x0=start,
                                     method='L-BFGS-B',
                                     bounds=search_box,
                                     options=solver_options)
    if verbose:
        utils.log()
    return result.x
def nelder_mead(loss, max_vals, min_vals=None, weights=None, deviation_tol=1.E-6,
                parameter_tol: float = float('inf'), verbose=True, **kwargs):
    """
    Minimizes a loss function inside a box of parameter bounds with scipy's Nelder-Mead solver.

    Args:
        loss: Callable that receives a parameter vector and returns the value to minimize.
        max_vals: Upper bound of each parameter. Its length determines the number of parameters.
        min_vals: Optional. Lower bound of each parameter. If None (default), zeros are used.
        weights: Optional. Starting point of the search. If None (default), the midpoint
            of each [min, max] interval is used.
        deviation_tol: Optional. Forwarded to the solver as its "fatol" option. Default is 1.E-6.
        parameter_tol: Optional. Forwarded to the solver as its "xatol" option. Default is infinity.
        verbose: Optional. If True (default), a message is logged before the solver runs and
            the log is reset afterwards.
        **kwargs: Accepted so that optimizers can be called interchangeably; ignored here.

    Returns:
        The first vertex of the final simplex reported by the solver.
    """
    lower = [0] * len(max_vals) if min_vals is None else min_vals
    start = weights
    if start is None:
        # Begin from the center of the search box.
        start = [(low + high) / 2 for low, high in zip(lower, max_vals)]
    import scipy.optimize
    import scipy.sparse
    if verbose:
        utils.log("Optimizing with Nelder-Mead")
    search_box = list(zip(lower, max_vals))
    solver_options = {"fatol": deviation_tol, "xatol": parameter_tol}
    result = scipy.optimize.minimize(loss,
                                     x0=start,
                                     method='Nelder-Mead',
                                     bounds=search_box,
                                     options=solver_options)
    if verbose:
        utils.log()
    vertices = result.final_simplex[0]
    return vertices[0]
def _gnn_train_torch(model, features, graph, labels, training, validation,
                     optimizer=None, patience=100, epochs=10000, test=None, verbose=False):
    """
    Trains a torch model with early stopping and restores the best parameters found.

    Each epoch performs one optimizer step on the training loss (cross-entropy over the
    *training* indices plus `model.loss`), then scores the same predictions on the
    *validation* indices. Whenever that score improves, the model state is snapshotted
    and the patience counter is reset; training stops once patience runs out or *epochs*
    is reached. At the end the best snapshot is loaded and the model is put in eval mode.

    Args:
        model: Object called as `model(features, graph, training=True)` that exposes
            `parameters()`, `loss`, `state_dict()`, `load_state_dict()` and `eval()`.
        features: Node features; converted with `torch.FloatTensor`.
        graph: Passed to the model unchanged.
        labels: Target labels; converted with `torch.FloatTensor`.
        training: Indices used for the training loss; converted with `torch.LongTensor`.
        validation: Indices used for early stopping; converted with `torch.LongTensor`.
        optimizer: Optional. If None (default), SGD with learning rate 0.01 is used.
        patience: Optional. Epochs without validation improvement before stopping. Default is 100.
        epochs: Optional. Maximum number of epochs. Default is 10000.
        test: Optional. Indices used only for the accuracy shown in verbose logs.
            If None (default), the validation indices are used.
        verbose: Optional. If True, per-epoch progress is logged. Default is False.
    """
    import copy
    import torch
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01) if optimizer is None else optimizer
    remaining_patience = patience
    test = validation if test is None else test
    labels = torch.FloatTensor(labels)
    features = torch.FloatTensor(features)
    training = torch.LongTensor(training)
    test = torch.LongTensor(test)
    validation = torch.LongTensor(validation)
    best_loss = float('inf')
    # Best parameters are kept in memory (as the TensorFlow trainer does) instead of a
    # fixed-name file in the working directory, which leaked on errors and could be
    # clobbered by concurrent runs.
    best_state = None
    for epoch in range(epochs):
        optimizer.zero_grad()
        predictions = model(features, graph, training=True)
        loss = _gnn_cross_entropy_torch(labels, predictions, training) + model.loss
        loss.backward()
        optimizer.step()
        # NOTE(review): validation is scored on the training-mode predictions computed
        # before the optimizer step; the TensorFlow trainer recomputes predictions with
        # training=False. Kept as-is to avoid changing training outcomes - confirm intent.
        loss = _gnn_cross_entropy_torch(labels, predictions, validation)
        remaining_patience -= 1
        if loss < best_loss:
            remaining_patience = patience
            best_loss = loss
            # state_dict() returns references to live tensors, hence the deep copy.
            best_state = copy.deepcopy(model.state_dict())
        if verbose:  # pragma: no cover
            utils.log(
                f"Epoch {epoch} loss {loss} acc {float(_gnn_accuracy_torch(labels, predictions, test)):.3f}"
            )
        if remaining_patience == 0:
            break
    if verbose:
        utils.log()
    # No snapshot exists if no epoch ran or the loss never compared below infinity (e.g. NaN).
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
def _gnn_train_tf(model, features, graph, labels, training, validation,
                  optimizer=None, patience=100, epochs=10000, test=None, verbose=False):
    """
    Trains a tensorflow model with early stopping and restores the best parameters found.

    Each epoch applies one gradient step on the training loss (cross-entropy over the
    *training* indices plus the sum of `model.losses`), then recomputes predictions with
    `training=False` and scores them on the *validation* indices. Whenever that score
    improves, copies of the trainable variables are kept and the patience counter is
    reset; training stops once patience runs out or *epochs* is reached. At the end the
    kept copies are assigned back to the model's trainable variables.

    Args:
        model: Object called as `model(features, graph, training=...)` that exposes
            `losses` and `trainable_variables`.
        features: Passed to the model unchanged.
        graph: Passed to the model unchanged.
        labels: Target labels passed to the loss and accuracy helpers.
        training: Indices used for the training loss.
        validation: Indices used for early stopping.
        optimizer: Optional. If None (default), Adam with learning rate 0.01 is used.
        patience: Optional. Epochs without validation improvement before stopping. Default is 100.
        epochs: Optional. Maximum number of epochs. Default is 10000.
        test: Optional. Indices used only for the accuracy shown in verbose logs.
            If None (default), the validation indices are used.
        verbose: Optional. If True, per-epoch progress is logged. Default is False.
    """
    import tensorflow as tf
    optimizer = tf.optimizers.Adam(learning_rate=0.01) if optimizer is None else optimizer
    best_loss = float('inf')
    best_params = None
    test = validation if test is None else test
    remaining_patience = patience
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            predictions = model(features, graph, training=True)
            loss = _gnn_cross_entropy_tf(labels, predictions, training)
            loss = loss + tf.reduce_sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        # Early stopping is decided on inference-mode predictions after the update.
        predictions = model(features, graph, training=False)
        loss = _gnn_cross_entropy_tf(labels, predictions, validation)
        remaining_patience -= 1
        if loss < best_loss:
            remaining_patience = patience
            best_loss = loss
            # tf.identity yields copies that later variable updates do not affect.
            best_params = [tf.identity(param) for param in model.trainable_variables]
        if verbose:  # pragma: no cover
            utils.log(
                f"Epoch {epoch} loss {loss} acc {float(_gnn_accuracy_tf(labels, predictions, test)):.3f}"
            )
        if remaining_patience == 0:
            break
    if verbose:
        utils.log()
    # No snapshot exists if no epoch ran or the loss never compared below infinity (e.g. NaN);
    # in that case the current parameters are left untouched instead of raising a TypeError.
    if best_params is not None:
        for variable, best_value in zip(model.trainable_variables, best_params):
            variable.assign(best_value)
def optimize(loss,
             max_vals=[1 for _ in range(1)],
             min_vals=None,
             deviation_tol: float = 1.E-9,
             divide_range: float = 1.01,
             partitions=5,
             parameter_tol: float = float('inf'),
             depth: int = 1,
             coarse: float = 0,
             shrink_strategy: str = "divide",
             partition_strategy: str = "split",
             randomize: bool = False,
             weights=None,
             verbose: bool = True,
             validation_loss=None):
    """
    Implements a coordinate descent algorithm for optimizing the argument vector of the given loss function.

    Arguments:
        loss: The loss function. Could be an expression of the form `lambda p: f(p)` where f takes a list as an
            argument.
        max_vals: Optional. The maximum value for each parameter to search for. Helps determine the number of
            parameters. Default is a list of ones for one parameter.
        min_vals: Optional. The minimum value for each parameter to search for. If None (default) it becomes a
            list of zeros and equal length to max_vals.
        deviation_tol: Optional. The numerical tolerance of the loss to optimize to. Default is 1.E-9.
        divide_range: Optional. Value greater than 1 with which to divide the range at each iteration. Default is
            1.01, which guarantees convergence even for difficult-to-optimize functions, but values such as
            1.1, 1.2 or 2 may also be used for much faster, albeit a little coarser, convergence. If the
            *shrink_strategy* argument is set to "shrinking" instead, the search range of the updated parameter
            is set to its full range divided by *(iteration+1)**divide_range * log(iteration+2)*.
        partitions: Optional. In how many pieces to break the search space on each iteration. Default is 5.
        parameter_tol: Optional. The numerical tolerance of parameter values to optimize to. **Both** this and
            deviation_tol need to be met. Default is infinity.
        depth: Optional. Declares the number of times to re-perform the optimization given the previous found
            solution. Default is 1, which only runs the optimization once. Larger depth values can help offset
            coarseness introduced by divide_range.
        coarse: Optional. Snaps solution to this precision. If 0 (default) then this behavior is ignored.
        shrink_strategy: Optional. The shrinking strategy towards convergence. If "divide" (default),
            then the search range is divided by the argument *divide_range*, but if "shrinking" then it is scaled
            based on block coordinate descent.
        partition_strategy: Optional. Strategy with which to traverse partitions. If "split" (default), then the
            partition is split to *partitions* parts. If "step", then the *partitions* argument is used as a fixed
            step and however many splits are needed to achieve this are performed. This last strategy helps
            force block coordinate descent traverse a finite set of values, as long as it holds that
            **coarse==partitions**.
        randomize: Optional. If True, then a random parameter is updated each time instead of moving
            through them in a cyclic order. Default is False.
        weights: Optional. An estimation of parameters to start optimization from. The algorithm tries to center
            solution search around these - hence the usefulness of *depth* as an iterative scheme.
            If None (default), the center of the search range (max_vals+min_vals)/2 is used as a starting
            estimation.
        verbose: Optional. If True (default), optimization outputs its intermediate steps.
        validation_loss: Optional. If provided, candidates are still selected with *loss*, but the selected
            candidate is scored with this function and replaces the best solution only when that score
            improves. If None (default), the most recently selected candidate is always kept as the solution.

    Returns:
        The list of parameter values of the solution that was kept.

    Example:
        >>> import pygrank as pg
        >>> p = pg.optimize(loss=lambda p: (1.5-p[0]+p[0]*p[1])**2+(2.25-p[0]+p[0]*p[1]**2)**2+(2.625-p[0]+p[0]*p[1]**3)**2, max_vals=[4.5, 4.5], min_vals=[-4.5, -4.5])
        >>> # desired optimization point for the Beale function of this example is [3, 0.5]
        >>> print(p)
        [3.000000052836577, 0.5000000141895036]
    """
    if min_vals is None:
        min_vals = [0 for _ in max_vals]
    for min_val, max_val in zip(min_vals, max_vals):
        if min_val > max_val:
            raise Exception("Empty parameter range [" + str(min_val) + "," + str(max_val) + "]")
    if str(divide_range) != "shrinking" and divide_range <= 1:
        raise Exception(
            "divide_range should be greater than 1, otherwise the search space never shrinks."
        )
    if weights is None:
        weights = [(min_val + max_val) / 2 for min_val, max_val in zip(min_vals, max_vals)]
    # Half-width of the interval searched around the current estimate of each parameter.
    range_search = [(max_val - min_val) / 2 for min_val, max_val in zip(min_vals, max_vals)]
    curr_variable = 0
    iteration = 0
    # Last observed change of the best loss when each parameter was updated.
    range_deviations = [float('inf')] * len(max_vals)
    best_weights = weights
    best_loss = float('inf')
    evals = 0
    while True:
        if randomize:
            curr_variable = int(random() * len(weights))
        if max(range_search) == 0:
            # Every search interval has collapsed; nothing is left to explore.
            break
        if shrink_strategy == "shrinking":
            range_search[curr_variable] = (max_vals[curr_variable] - min_vals[curr_variable]) / (
                (iteration + 1)**divide_range * log(iteration + 2))
        elif shrink_strategy == "divide":
            range_search[curr_variable] /= divide_range
        else:
            raise Exception("Invalid shrink strategy: either shrinking or divide expected")
        if range_search[curr_variable] == 0:
            # This parameter cannot move any more; mark it converged and go to the next one.
            range_deviations[curr_variable] = 0
            curr_variable += 1
            if curr_variable >= len(max_vals):
                curr_variable -= len(max_vals)
            continue
        if partition_strategy == "split":
            candidate_weights = [
                __add(weights,
                      curr_variable,
                      range_search[curr_variable] * (part * 2. / (partitions - 1) - 1),
                      max_vals[curr_variable],
                      min_vals[curr_variable],
                      coarse=coarse)
                for part in range(partitions)
            ]
        elif partition_strategy == "step":
            candidate_weights = [
                __add(weights,
                      curr_variable,
                      part * partitions,
                      max_vals[curr_variable],
                      min_vals[curr_variable],
                      coarse=coarse)
                for part in range(-int(range_search[curr_variable] / partitions),
                                  1 + int(range_search[curr_variable] / partitions))
            ]
        else:
            raise Exception("Invalid partition strategy: either split or step expected")
        loss_pairs = [(w, loss(w)) for w in candidate_weights if w is not None]
        evals += len(loss_pairs)
        weights, weights_loss = min(loss_pairs, key=lambda pair: pair[1])
        prev_best_loss = best_loss
        if validation_loss is not None:
            # The search keeps moving by `loss`, but the reported solution only
            # changes when the validation score improves.
            weights_loss = validation_loss(weights)
            if weights_loss < best_loss:
                best_loss = weights_loss
                best_weights = weights
        else:
            best_loss = weights_loss
            best_weights = weights
        range_deviations[curr_variable] = abs(prev_best_loss - best_loss)
        if verbose:
            utils.log(
                f"Tuning evaluations {evals} loss {best_loss:.8f} +- {max(range_deviations):.8f}"
            )
        if max(range_deviations) <= deviation_tol and max(range_search) <= parameter_tol:
            break
        # Move to the next parameter in cyclic order.
        iteration += 1
        curr_variable += 1
        if curr_variable >= len(max_vals):
            curr_variable -= len(max_vals)
    weights = best_weights
    if verbose:
        utils.log()
    if depth > 1:
        # Restart the search centered on the solution found so far.
        return optimize(loss, max_vals, min_vals, deviation_tol, divide_range,
                        partitions, parameter_tol, depth - 1, coarse,
                        shrink_strategy, partition_strategy, randomize,
                        weights, verbose, validation_loss)
    return weights
def import_snap_format_dataset(dataset: str,
                               path: Union[Iterable[str], str] = (os.path.join(os.path.expanduser('~'), '.pygrank/data'), ".", "data"),
                               pair_file: str = 'pairs.txt',
                               group_file: str = 'groups.txt',
                               directed: bool = False,
                               min_group_size: float = 0.01,
                               max_group_number: int = 20,
                               graph_api=nx,
                               verbose=True):
    """
    Imports a dataset of the SNAP format.

    Args:
        dataset: The name of the dataset to be loaded. It is passed to download_dataset before loading.
        path: The dataset's path in which *dataset* is a folder, or a list of paths in which to search.
            The path to use is resolved by _select_path. Default is the tuple
            ("~/.pygrank/data" under the user's home directory, ".", "data").
        pair_file: Optional. The rows of the file *[path]/[dataset]/pair_file* should contain pairs of
            whitespace-separated node names. Rows starting with "#" are ignored. Default is "pairs.txt".
        group_file: Optional. The rows of the file *[path]/[dataset]/group_file* should contain lists of
            whitespace-separated node names. Rows starting with "#" are ignored. If None or if the file
            does not exist, no groups are loaded. Default is "groups.txt".
        directed: Whether a directed or undirected graph should be returned. Default is False.
        min_group_size: Optional. The minimum group length to be considered for inclusion in groups. Can be
            either a number less than 1 to indicate group size as a fraction of the graph's nodes or a larger
            number to denote an absolute minimum number of members. Only members found in the graph are
            counted. Default is 0.01, meaning that groups comprising at least 1% of graph nodes are considered.
        max_group_number: Limits the numbers of found groups to be up to that number. Default is 20.
        graph_api: The library used to construct the graph. Either `networkx` or `pygrank.fastgraph` are
            supported.
        verbose: Whether to show intermediate status for lengthy loading. These messages use carriage return to
            eventually disappear. Default is True.

    Returns:
        graph: A graph of node relations.
        groups: A dictionary whose values are lists of group node members.
    """
    path = _select_path(path, dataset)
    download_dataset(dataset, path=path)
    if verbose:
        utils.log(f"Loading {dataset} graph")
    if directed:
        # Some graph APIs expose a dedicated DiGraph class, others take a flag.
        G = graph_api.DiGraph() if hasattr(graph_api, "DiGraph") else graph_api.Graph(directed)
    else:
        G = graph_api.Graph()
    groups = {}
    dataset_dir = os.path.join(path, dataset)
    with open(os.path.join(dataset_dir, pair_file), 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('#'):
                continue
            # split() already discards the trailing newline; slicing off the last character
            # would truncate the final node of a file that lacks a terminating newline.
            splt = line.split()
            if len(splt) > 1:
                G.add_edge(splt[0], splt[1])
    if min_group_size < 1:
        # Fractions are interpreted relative to the number of graph nodes.
        min_group_size *= len(G)
    if verbose:
        utils.log(f"Loading {dataset} communities")
    if group_file is not None and os.path.isfile(os.path.join(dataset_dir, group_file)):
        with open(os.path.join(dataset_dir, group_file), 'r', encoding='utf-8') as file:
            for line in file:
                if line.startswith('#'):
                    continue
                group = [item for item in line.split() if item in G]
                if len(group) >= min_group_size:
                    groups[len(groups)] = group
                    if verbose:
                        utils.log(f"Loaded {dataset} communities {len(groups)}/{max_group_number}")
                    if len(groups) >= max_group_number:
                        break
    if verbose:
        utils.log()
    return G, groups
def download_dataset(dataset, path: str = os.path.join(os.path.expanduser('~'), '.pygrank/data'), verbose=True):  # pragma: no cover
    """
    Downloads a registered dataset into *path*/*dataset* and converts it to pairs/groups/features text files.

    The dataset name is lower-cased and looked up in the module-level `datasets` registry. A citation
    notice is printed to stderr on every call for registered datasets. Files are fetched and processed
    only if the dataset's folder does not exist yet; which steps run depends on the keys of the
    registry entry ("all", "script", "pairs", "pair_process", "node2group", "features",
    "feature_process", "groups", "labels", "remove").

    Args:
        dataset: The name of the dataset (case-insensitive).
        path: The folder in which a sub-folder named after the dataset is created.
            Default is ".pygrank/data" under the user's home directory.
        verbose: Whether to log progress. Default is True.

    Returns:
        The citation notice string, or None if the dataset is not in the registry.
    """
    dataset = dataset.lower()
    if dataset not in datasets:
        return
    source = datasets[dataset] if isinstance(dataset, str) else dataset
    credentials = "REQUIRED CITATION: Please visit the url "+source["url"]\
                  + " for instructions on how to cite the dataset "+dataset+" in your research"
    print(credentials, file=sys.stderr)
    sys.stderr.flush()
    if verbose:
        utils.log("Downloading " + dataset + " into " + path)
    # makedirs also creates missing parents (e.g. ~/.pygrank for the default path).
    os.makedirs(path, exist_ok=True)
    download_path = os.path.join(path, dataset)
    # An existing dataset folder is treated as "already downloaded".
    if not os.path.isdir(download_path):
        os.mkdir(download_path)
        if "all" in source:
            all_path = download_path + "/all." + source["all"].split(".")[-1]
            wget.download(source["all"], all_path)
            try:
                # NOTE(review): extractall trusts member paths inside the archive; the urls come
                # from the registry, but consider an extraction filter if they may be untrusted.
                with tarfile.open(all_path, 'r') as archive:
                    archive.extractall(download_path + "/")
            except tarfile.ReadError:
                # Not a tar archive: treat the download as a single gzip-compressed file.
                with gzip.open(all_path, 'rb') as f_in:
                    with open(download_path + "/all.txt", 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
            os.remove(all_path)
        if "script" in source:
            source["script"](path)
        if "pairs" in source:
            if source["pairs"].startswith("http"):
                pairs_path = download_path + "/pairs." + source["pairs"].split(".")[-1]
                wget.download(source["pairs"], pairs_path)
                if pairs_path.split(".")[-1] not in ["txt", "csv"]:
                    with gzip.open(pairs_path, 'rb') as f_in:
                        with open(download_path + "/pairs.txt", 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
                    os.remove(pairs_path)
            else:
                # The pairs file was already produced by a previous step (e.g. archive extraction).
                shutil.move(download_path + "/" + source["pairs"], download_path + "/pairs.txt")
        if "pair_process" in source:
            pairs = list()
            with open(download_path + "/pairs.txt", "r") as file:
                for line in file:
                    # split() drops the newline itself; slicing would truncate an unterminated last line.
                    pair = source["pair_process"](line.split())
                    if pair is not None:
                        pairs.append(pair)
            os.remove(download_path + "/pairs.txt")
            with open(download_path + "/pairs.txt", "w") as file:
                for pair in pairs:
                    file.write(pair[0] + "\t" + pair[1] + "\n")
        if "node2group" in source:
            groups = dict()
            with open(download_path + "/pairs.txt", "r") as file:
                for line in file:
                    pair = line.split()
                    # Only the first node of each row is assigned to a group.
                    for node in pair[0:1]:
                        group = source["node2group"](node)
                        if group is not None:
                            if group not in groups:
                                groups[group] = list()
                            groups[group].append(node)
            with open(download_path + "/groups.txt", "w") as file:
                for group in groups.values():
                    if len(group) > 1:
                        file.write(" ".join(group) + "\n")
        if "features" in source and "groups" not in source:
            # Rows are read as: node id, feature values..., group label.
            features_path = download_path + "/" + source["features"]
            groups = dict()
            features = dict()
            with open(features_path) as features_file:
                for line in features_file:
                    line = line.split()
                    if "feature_process" in source:
                        line = source["feature_process"](line)
                        if line is None:
                            continue
                    node_id = line[0]
                    group = line[-1]
                    if group not in groups:
                        groups[group] = list()
                    groups[group].append(node_id)
                    features[node_id] = [val.strip() for val in line[1:-1]]
            groups = {group: nodes for group, nodes in groups.items() if len(nodes) > 1}
            with open(download_path + '/groups.txt', 'w', encoding='utf-8') as file:
                for g in groups.values():
                    for uid in g:
                        file.write(str(uid) + '\t')
                    file.write('\n')
            with open(download_path + '/features.txt', 'w', encoding='utf-8') as file:
                for p in features:
                    file.write(str(p) + '\t' + '\t'.join(features[p]) + '\n')
        if "features" in source and "groups" in source:
            if source["features"].startswith("http"):
                features_download = download_path + "/features." + source["features"].split(".")[-1]
                wget.download(source["features"], features_download)
                if features_download.split(".")[-1] not in ["txt", "csv"]:
                    with gzip.open(features_download, 'rb') as f_in:
                        with open(download_path + "/features.txt", 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
                    os.remove(features_download)
            else:
                shutil.move(download_path + "/" + source["features"], download_path + "/features.txt")
        if "groups" in source:
            groups_path = download_path + "/groups." + source["groups"].split(".")[-1]
            wget.download(source["groups"], groups_path)
            if groups_path.split(".")[-1] not in ["txt", "csv"]:
                with gzip.open(groups_path, 'rb') as f_in:
                    with open(download_path + "/groups.txt", 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                os.remove(groups_path)
        elif "labels" in source:
            labels_path = download_path + "/labels." + source["labels"].split(".")[-1]
            wget.download(source["labels"], labels_path)
            with gzip.open(labels_path, 'rb') as f_in:
                with open(download_path + "/labels.txt", 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            os.remove(labels_path)
            # Rows are read as: node, label. Nodes sharing a label form one group.
            groups = dict()
            with open(download_path + "/labels.txt", 'r', encoding='utf-8') as file:
                for line in file:
                    if line[0] != '#':
                        splt = line.split()
                        if len(splt) >= 2:
                            if splt[1] not in groups:
                                groups[splt[1]] = list()
                            groups[splt[1]].append(splt[0])
            with open(download_path + "/groups.txt", 'w', encoding='utf-8') as file:
                for group in groups.values():
                    file.write((" ".join(group)) + "\n")
        if "remove" in source:
            shutil.rmtree(download_path + "/" + source["remove"])
    if verbose:
        utils.log()
    return credentials