def factorial(n, log=True):  # pragma: no cover
    """
    Calculates the factorial of n, automatically switching
    to Stirling's approximation for n > 20.

    Parameters
    ----------
    n: int
        The value n for which the factorial should be calculated.
    log: bool
        Whether or not to return the (natural) logarithm of the factorial.
        Default is True.

    Returns
    -------
    float
        ln(n!) if ``log`` is True, otherwise ``exp(ln(n!))``.
    """
    # Imported locally: ``np.math`` was merely an alias of the standard
    # ``math`` module and has been removed in NumPy 2.0.
    import math

    f = np.float64(0)
    n_ = np.float64(n)
    if n > 20:  # use Stirling's approximation (with two correction terms)
        try:
            f = (n_ * np.log(n_) - n_ + 0.5 * np.log(2.0 * np.pi * n_) +
                 1.0 / (12.0 * n_) - 1 / (360.0 * n_**3.0))
        except Warning as w:
            # Only reachable if warnings are escalated to exceptions;
            # in that case the result stays at its initial value 0.
            msg = 'Factorial calculation for n={}: {}'.format(n, w)
            Log.add(msg, severity=Severity.WARNING)
    else:
        # exact value for small n
        f = np.log(math.factorial(n))

    if log:
        return f
    return np.exp(f)
def fit_markov_model(self, k=1):
    """
    Fits a k-th order Markov model to ``self.sequence``.

    Transition counts from each memory (a tuple of the k preceding
    symbols) to the following symbol are collected and normalised per
    memory. The result is stored in ``self.P[k]`` as a nested mapping
    ``memory -> symbol -> relative frequency``.

    Parameters
    ----------
    k: int
        order of the Markov model (length of the memory)
    """
    # TODO: Add support for k=0
    assert self.sequence, "Error: Empty sequence"

    # maximum likelihood estimate of the transition probabilities
    transitions = defaultdict(lambda: defaultdict(lambda: 0.0))
    self.P[k] = transitions

    Log.add('Fitting Markov model with order k = ' + str(k))

    # the first k symbols form the initial memory
    memory = tuple(self.sequence[:k])

    # count transitions while sliding the memory window by one symbol
    for symbol in self.sequence[k:]:
        transitions[memory][symbol] += 1.0
        memory = memory[1:] + (symbol, )

    # turn counts into relative frequencies, separately for each memory
    for successors in transitions.values():
        total = float(sum(successors.values()))
        for symbol in successors:
            successors[symbol] /= total

    Log.add('finished.')
def generate_walk(tempnet, l=100, start_node=None):
    """
    DEPRECATED. Logs a deprecation warning and forwards the call to
    ``random_walk.generate_walk``.

    Parameters
    ----------
    tempnet:
        forwarded unchanged as first argument
    l: int
        forwarded unchanged as second argument (default 100)
    start_node:
        forwarded unchanged as third argument (default None)

    Returns
    -------
    whatever ``random_walk.generate_walk`` returns
    """
    # Adjacent string literals are used instead of a backslash continuation
    # *inside* the literal, which embedded the indentation of the source
    # file into the logged message.
    Log.add('The temporal_walk.generate_walk function is deprecated. '
            'Please use random_walk.generate_walk instead.',
            Severity.WARNING)
    return random_walk.generate_walk(tempnet, l, start_node)
def paths_to_origin_destination(paths):
    """
    Aggregates the weights of all paths by their (origin, destination)
    pair, where origin is the first and destination the last node of a path.

    Only the second count entry of each path is used, and only if it is
    positive. The result can e.g. be used to create shortest path models
    that preserve the origin-destination statistics of real path data.

    Parameters
    ----------
    paths: Paths
        collection of weighted paths based on which origin destination
        statistics shall be computed

    Returns
    -------
    list of tuples (o, d, w) where o is origin, d is destination,
    and w is the accumulated weight
    """
    totals = defaultdict(lambda: 0.0)
    Log.add('Calculating origin/destination statistics from paths ...')

    # accumulate weights over paths of every length
    for paths_of_length in paths.paths.values():
        for path, counts in paths_of_length.items():
            origin, destination = path[0], path[-1]
            weight = counts[1]
            if weight > 0:
                totals[origin, destination] += weight

    result = [(pair[0], pair[1], weight) for pair, weight in totals.items()]
    Log.add('finished.')
    return result
def add_edges(self, edges):
    """Adds several edges at once.

    Self-loops and edges that are already contained in ``self.edges`` are
    skipped; a warning with the number of skipped edges is logged
    afterwards.

    Parameters
    ----------
    edges: list
        a list of edges [(s_1, t_1), (s_1, t_2), ...]
    """
    n_self_loops = 0
    n_redundant = 0

    for edge in edges:
        source, target = edge[0], edge[1]

        is_self_loop = False
        if source == target:
            is_self_loop = True
            n_self_loops += 1

        already_present = False
        if (source, target) in self.edges:
            already_present = True
            n_redundant += 1

        if is_self_loop or already_present:
            continue
        self.add_edge(source, target)

    if n_self_loops > 0:
        Log.add('Warning: omitted %d self-loops' % n_self_loops,
                Severity.WARNING)
    if n_redundant > 0:
        Log.add('Warning: omitted %d redundant edges' % n_redundant,
                Severity.WARNING)
def betweenness(network, normalized=False):
    """Calculates the betweenness of all nodes based on the shortest paths
    returned by ``shortest_paths(network)``.

    Every node strictly inside a shortest path from s to d receives a
    contribution of 1 / (number of shortest paths from s to d).

    Parameters
    ----------
    network: Network
    normalized: bool
        if True, rescale the values such that the largest one is 1.0

    Returns
    -------
    dict
        mapping from node to betweenness; nodes that do not occur inside
        any shortest path are contained with value zero
    """
    assert isinstance(network, Network), \
        "network must be an instance of Network"

    Log.add('Calculating betweenness centralities ...', Severity.INFO)

    all_paths = shortest_paths(network)
    node_centralities = defaultdict(lambda: 0)

    for s in all_paths:
        for d in all_paths[s]:
            candidates = all_paths[s][d]
            for p in candidates:
                # only inner nodes of the path count, not its end points
                for x in p[1:-1]:
                    if s != d != x:
                        node_centralities[x] += 1.0 / len(candidates)

    # At this point the dict holds positive entries only. It is empty if
    # no shortest path has an inner node, in which case max() would raise
    # a ValueError, so normalisation is skipped.
    if normalized and node_centralities:
        max_centr = max(node_centralities.values())
        for v in node_centralities:
            node_centralities[v] /= max_centr

    # assign zero values to nodes not occurring on shortest paths
    for v in network.nodes:
        node_centralities[v] += 0

    return node_centralities
def node_traversals(paths):
    """Sums, for every node, the counts stored for the corresponding
    zero-length path in ``paths.paths[0]``.

    Parameters
    ----------
    paths: Paths

    Returns
    -------
    dict
        mapping from node to the summed count entries of its zero-length
        path

    Raises
    ------
    PathpyNotImplemented
        if ``paths`` is not an instance of Paths
    """
    if not isinstance(paths, Paths):
        raise PathpyNotImplemented("`paths` must be an instance of Paths")

    Log.add('Calculating node traversals...', Severity.INFO)

    # A zero-length path consists of a single node, so the summed counts
    # of these paths tell how often each node is "visited by paths".
    visit_counts = defaultdict(lambda: 0)
    for single_node_path, counts in paths.paths[0].items():
        visit_counts[single_node_path[0]] += counts.sum()

    Log.add('finished.', Severity.INFO)
    return visit_counts
def _cl(higher_order_net, normalized=False):
    """Calculates a closeness value for the nodes in
    ``higher_order_net.paths.nodes``.

    The value of node x is the sum of ``1 / distances[d][x]`` over all
    other nodes d with finite distance, where ``distances`` is the result
    of ``distance_matrix(higher_order_net)``.

    Parameters
    ----------
    higher_order_net: HigherOrderNetwork
    normalized: bool
        if True, rescale the values such that the largest one is 1.0.
        Nothing is rescaled if all values are zero.

    Returns
    -------
    dict
        mapping from node to closeness; nodes without any finite distance
        are contained with value zero

    Raises
    ------
    PathpyNotImplemented
        if ``higher_order_net`` is not an instance of HigherOrderNetwork
    """
    if not isinstance(higher_order_net, HigherOrderNetwork):
        raise PathpyNotImplemented(
            "`higher_order_net` must be an instance of HigherOrderNetwork")

    distances = distance_matrix(higher_order_net)
    node_centralities = defaultdict(lambda: 0)
    nodes = higher_order_net.paths.nodes

    Log.add('Calculating closeness (k = %s) ...' % higher_order_net.order,
            Severity.INFO)

    for x in nodes:
        # calculate closeness centrality of x
        for d in nodes:
            if x != d and distances[d][x] < _np.inf:
                node_centralities[x] += 1.0 / distances[d][x]

    # assign centrality zero to those nodes for which no higher-order
    # path exists
    for v in nodes:
        node_centralities[v] += 0

    # Guard against an empty node set (max() would raise ValueError) and
    # against a maximum of zero (division would raise ZeroDivisionError).
    if normalized and node_centralities:
        m = max(node_centralities.values())
        if m > 0:
            for v in nodes:
                node_centralities[v] /= m

    Log.add('finished.', Severity.INFO)
    return node_centralities
def sequence(self, stop_char='|'):
    """
    Concatenates all paths into one flat sequence of nodes.

    Every path is repeated as often as given by the (integer part of the)
    second entry of its count and is, unless ``stop_char`` is the empty
    string, followed by ``stop_char`` each time.

    Parameters
    ----------
    stop_char : str
        the character used to separate paths; pass '' to omit separators

    Returns
    -------
    list
        a single sequence in which all paths have been concatenated
    """
    Log.add('Concatenating paths to sequence ...')

    concatenated = []
    for paths_of_length in self.paths.values():
        for path, counts in paths_of_length.items():
            segment = list(path)
            if stop_char != '':
                segment.append(stop_char)
            # repeat the (terminated) path once per observation
            concatenated.extend(segment * int(counts[1]))

    Log.add('finished')
    return concatenated
def filter_edges(self, edge_filter):
    """Creates a new temporal network that only contains those time-stamped
    edges of this instance that pass the given filter.

    This can be used, e.g., to create time slice networks by filtering
    edges within certain time windows, or to reduce a temporal network to
    interactions between a subset of nodes.

    Parameters
    ----------
    edge_filter: callable
        a function of the form filter_func(v, w, time) that returns True
        for time-stamped edges that shall be kept and False for those
        that shall be dropped

    Returns
    -------
    TemporalNetwork
        a new instance constructed from the edges that passed the filter
    """
    Log.add('Starting filtering ...', Severity.INFO)

    kept = [(v, w, t) for (v, w, t) in self.tedges if edge_filter(v, w, t)]
    n_dropped = self.ecount() - len(kept)

    Log.add(
        'finished. Filtered out {} time-stamped edges.'.format(n_dropped),
        Severity.INFO)

    return TemporalNetwork(tedges=kept)
def visitation_probabilities(paths):
    """Calculates relative node visitation frequencies.

    The traversal counts returned by ``node_traversals(paths)`` are divided
    by their total, so that the returned values sum to one. This measure
    can be interpreted as ground truth for the notion of importance
    captured by PageRank applied to a graphical abstraction of the paths.

    Parameters
    ----------
    paths: Paths

    Returns
    -------
    dict
        mapping from node to its share of all node traversals

    Raises
    ------
    PathpyNotImplemented
        if ``paths`` is not an instance of Paths
    """
    if not isinstance(paths, Paths):
        raise PathpyNotImplemented("`paths` must be an instance of Paths")

    Log.add('Calculating visitation probabilities...', Severity.INFO)

    # start from absolute traversal counts and normalise them in place
    probabilities = node_traversals(paths)
    total_visits = sum(probabilities.values(), 0.0)
    for node in probabilities:
        probabilities[node] /= total_visits

    Log.add('finished.', Severity.INFO)
    return probabilities
def read_origin_destination(filename, separator=','):
    """Reads origin/destination statistics from a csv file
    with the following structure:

    origin1,destination1,weight
    origin2,destination2,weight
    origin3,destination3,weight

    Blank lines are skipped. Leading and trailing whitespace around each
    field is removed.

    Parameters
    ----------
    filename: str
        path to the file containing the origin/destination statistics
    separator: str
        arbitrary separation character (default: ',')

    Returns
    -------
    list
        list of tuples (origin, destination, weight), where origin and
        destination are strings and weight is a float
    """
    origin_destination_list = []
    Log.add('Reading origin/destination statistics from file ...')

    with open(filename, 'r') as f:
        for line in f:
            # A blank line (e.g. at the end of the file) would otherwise
            # fail with an IndexError when accessing the second field.
            if not line.strip():
                continue
            fields = line.rstrip().split(separator)
            origin_destination_list.append(
                (fields[0].strip(), fields[1].strip(),
                 float(fields[2].strip())))

    Log.add('Finished.')
    return origin_destination_list
def _dm(paths):
    """
    Calculates shortest path distances between all pairs of nodes
    based on the observed paths.

    The distance from a to b is the length of the shortest stored path
    that starts in a and ends in b. Pairs that are never read or written
    have the default value infinity.

    Parameters
    ----------
    paths: Paths

    Returns
    -------
    defaultdict
        nested mapping ``start -> end -> distance``
    """
    dist = defaultdict(lambda: defaultdict(lambda: _np.inf))

    Log.add('Calculating distance matrix based on empirical paths ...',
            Severity.INFO)

    # every node has distance zero to itself
    for node in paths.nodes:
        dist[node][node] = 0

    # keep the smallest length seen for each (start, end) pair
    for length, paths_of_length in paths.paths.items():
        for path in paths_of_length:
            row = dist[path[0]]
            target = path[-1]
            if length < row[target]:
                row[target] = length

    Log.add('finished.', Severity.INFO)
    return dist
def read_edges(filename, separator=',', weight=False, undirected=False,
               maxlines=None):
    """
    Read path in edgelist format

    Reads data from a file containing multiple lines of *edges* of the
    form "v,w,frequency,X" (where frequency is optional and X are
    arbitrary additional columns). The default separating character ','
    can be changed.

    Parameters
    ----------
    filename : str
        path to edgelist file
    separator : str
        character separating the nodes
    weight : bool
        is a weight given? if ``True`` it is read as an integer from the
        third column (i.e. ``a,b,2``), otherwise every edge counts once
    undirected : bool
        if ``True``, every edge is additionally counted in the reverse
        direction
    maxlines : int
        number of lines to read (useful to test large files).
        None means the entire file is read

    Returns
    -------
    Paths
        a ``Paths`` object obtained from the edgelist
    """
    p = Paths()
    p.separator = separator

    with open(filename, 'r') as f:
        Log.add('Reading edge data ... ')
        for n, line in enumerate(f):
            # n is zero-based, so the limit has to be checked *before*
            # line n is processed. Checking afterwards read one line more
            # than requested.
            if maxlines is not None and n >= maxlines:
                break

            fields = line.rstrip().split(separator)
            assert len(fields) >= 2, 'Error: malformed line: {0}'.format(
                line)

            path = (fields[0], fields[1])
            frequency = int(fields[2]) if weight else 1

            # edges are paths of length one; the frequency is added to the
            # second count entry only
            p.paths[1][path] += (0, frequency)
            if undirected:
                reverse_path = (fields[1], fields[0])
                p.paths[1][reverse_path] += (0, frequency)

    p.expand_subpaths()
    Log.add('finished.')
    return p
def parallel(order_k):  # pragma: no cover
    """Builds the network layer of order ``order_k`` together with its
    transition matrix.

    NOTE(review): ``paths`` is not a parameter; it is taken from the
    enclosing scope - confirm against the surrounding code.

    Parameters
    ----------
    order_k: int
        order of the layer to generate

    Returns
    -------
    list
        ``[order_k, layer, transition matrix of the layer]``
    """
    Log.add('Generating %s-th order network layer ...' % (order_k,))
    layer = HigherOrderNetwork(paths, k=order_k, null_model=False)

    # Sub paths are always included in order to use the maximally
    # available statistics for the transition matrix.
    matrix = layer.transition_matrix(include_subpaths=True)

    Log.add('... finished')
    return [order_k, layer, matrix]
def eigenvalue_gap(network, include_sub_paths=True, lanczos_vectors=15,
                   maxiter=20):
    """Computes the two eigenvalues of largest magnitude of the transition
    matrix of ``network`` and returns the smaller of the two magnitudes.

    Parameters
    ----------
    network: HigherOrderNetwork
    include_sub_paths: bool
        passed to ``network.transition_matrix``; whether or not to include
        subpath statistics in the calculation of transition probabilities.
    lanczos_vectors: int
        number of Lanczos vectors to be used in the approximate
        calculation of eigenvalues. This maps to the ncv parameter of
        scipy's underlying function eigs.
    maxiter: int
        passed unchanged as the maxiter parameter of scipy's underlying
        function eigs.

    Returns
    -------
    float
    """
    assert isinstance(network, HigherOrderNetwork), \
        "network must be an instance of HigherOrderNetwork"

    Log.add('Calculating eigenvalue gap ... ', Severity.INFO)

    transition_matrix = network.transition_matrix(include_sub_paths)

    # NOTE: ncv sets additional auxiliary eigenvectors that are computed
    # NOTE: in order to be more confident to actually find the one with
    # NOTE: the largest magnitude,
    # NOTE: see https://github.com/scipy/scipy/issues/4987
    solver_options = dict(which="LM", k=2, ncv=lanczos_vectors,
                          return_eigenvectors=False, maxiter=maxiter)
    leading = sla.eigs(transition_matrix, **solver_options)

    # sorting the negated magnitudes puts the largest magnitude first
    negated_magnitudes = _np.sort(-_np.absolute(leading))

    Log.add('finished.', Severity.INFO)
    return _np.abs(negated_magnitudes[1])
def __add_layers_sequential(self, orders):
    """Generates the layers for the given orders one after another.

    For each order k (in ascending order) a ``HigherOrderNetwork`` is
    built from ``self.paths`` and stored in ``self.layers[k]``; its
    transition matrix is stored in ``self.transition_matrices[k]``.

    Parameters
    ----------
    orders: iterable of int
        the orders for which layers shall be generated
    """
    source_paths = self.paths

    for order in sorted(orders):
        Log.add('Generating %d-th order layer ...' % order)
        layer = HigherOrderNetwork(source_paths, order, null_model=False)
        self.layers[order] = layer

        # Sub paths are always included in order to use the maximally
        # available statistics for the transition matrix.
        self.transition_matrices[order] = layer.transition_matrix(
            include_subpaths=True)

    Log.add('finished.')
def paths_from_temporal_network(tempnet, delta=1, max_length=sys.maxsize,
                                max_subpath_length=sys.maxsize):
    """
    Warning: This function is deprecated. Calls are rerouted to
    paths_from_temporal_network_dag. If you intended to calculate paths
    with a single continuing edge, use paths_from_temporal_network_single
    instead (see documentation of this function for details).

    Only ``tempnet``, ``delta`` and ``max_subpath_length`` are forwarded;
    ``max_length`` is accepted but not used.
    """
    deprecation_notice = (
        'This function is deprecated. Rerouting call to '
        'paths_from_temporal_network_dag. If you intended to calculate '
        'paths with a single continuing edge, use '
        'paths_from_temporal_network_single instead.')
    Log.add(deprecation_notice, Severity.WARNING)

    return paths_from_temporal_network_dag(
        tempnet, delta, max_subpath_length=max_subpath_length)
def expand_subpaths(self): """ This function implements the sub path expansion, i.e. for a four-gram a,b,c,d, the paths a->b, b->c, c->d of length one and the paths a->b->c and b->c->d of length two will be counted. This process will consider restrictions to the maximum sub path length defined in self.max_subpath_length """ # nothing to see here ... if not self.paths: return Log.add('Calculating sub path statistics ... ') # the expansion of all subpaths in paths with a maximum path length of maxL # necessarily generates paths of *any* length up to MaxL. # Forcing the generation of all these indices here, prevents us # from mutating indices during subpath creation. The fact that indices are # immutable allows us to use efficient iterators and prevent unnecessarily copying # Thanks to the use of defaultdict, the following trick will prevent us from # repeatedly testing whether l already exists as a key for p_length in range(max(self.paths)): self.paths[p_length] = self.paths[p_length] # expand subpaths in paths of any length ... for path_length in self.paths: for path, value in self.paths[path_length].items(): # The frequency is given by the number of occurrences as longest # path, which is stored in the second entry of the numpy array frequency = value[1] # compute the maximum length of sub paths to consider # (maximum up to pathLength) max_length = min(self.max_subpath_length + 1, path_length) # Generate all subpaths of length k for k = 0 to k = max_len-1 (inclusive) for k in range(max_length): # Generate subpaths of length k for all start indices s # for s = 0 to s = pathLength-k (inclusive) for s in range(path_length - k + 1): # Add frequency as a subpath to *first* entry of array path_slice = path[s:s + k + 1] self.paths[k][path_slice][0] += frequency
def closeness(network, normalized=False):
    """Calculates the closeness of all nodes.

    For every node x the distances ``distances[d, x]`` from all other
    nodes d with finite distance are summed up, where ``distances`` is the
    result of ``distance_matrix(network)`` and rows/columns follow the
    order of ``network.nodes``. The closeness of x is (n - 1) divided by
    this sum, with n the number of nodes, or zero if the sum is zero.

    Parameters
    ----------
    network: Network
    normalized: bool
        if True, rescale the values such that the largest one is 1.0.
        Nothing is rescaled if all values are zero.

    Returns
    -------
    dict
        mapping from node to closeness

    Raises
    ------
    PathpyNotImplemented
        if ``network`` is not an instance of Network
    """
    if not isinstance(network, Network):
        raise PathpyNotImplemented("`network` must be an instance of Network")

    distances = distance_matrix(network)
    node_centralities = defaultdict(lambda: 0)
    # translate matrix indices back to nodes
    mapping = dict(enumerate(network.nodes))

    Log.add('Calculating closeness in network ...', Severity.INFO)

    n = network.ncount()

    # sum up all finite distances towards each node
    for d in range(n):
        for x in range(n):
            if d != x and distances[d, x] < _np.inf:
                node_centralities[mapping[x]] += distances[d, x]

    # turn distance sums into closeness values; nodes without any finite
    # distance keep the value zero
    for v in network.nodes:
        node_centralities[v] += 0.0
        if node_centralities[v] > 0.0:
            node_centralities[v] = (n - 1.0) / node_centralities[v]

    # Guard against an empty network (max() would raise ValueError) and
    # against a maximum of zero (division would raise ZeroDivisionError).
    if normalized and node_centralities:
        max_centr = max(node_centralities.values())
        if max_centr > 0:
            for v in network.nodes:
                node_centralities[v] /= max_centr

    Log.add('finished.', Severity.INFO)
    return node_centralities
def make_acyclic(self):
    """Makes the graph acyclic by removing all edges classified as
    'back' links, followed by another topological sorting.

    If ``self.is_acyclic`` is still None, a topological sorting is
    performed first. Nothing is removed if the graph is already acyclic.
    """
    if self.is_acyclic is None:
        self.topsort()

    removed_links = 0
    if self.is_acyclic:
        return

    # iterate over a snapshot of the keys, as edges are removed meanwhile
    for edge in list(self.edge_classes):
        if self.edge_classes[edge] == 'back':
            self.remove_edge(*edge)
            removed_links += 1

    # re-run the sorting to refresh the acyclicity flag
    self.topsort()
    assert self.is_acyclic, "Error: make_acyclic did not generate acyclic graph!"

    Log.add(
        'Removed ' + str(removed_links) + ' back links to make graph acyclic',
        Severity.INFO)
def from_sqlite(cls, cursor, directed=True):
    r"""Creates a new instance from links obtained from an SQLite cursor.

    Each row delivered by the cursor must provide the two columns

            source target

    which are converted to strings and added as one edge. Other columns
    are not read by this function.

    Since columns are accessed by name this function requires that a
    row factory object is set for the SQLite connection prior to cursor
    creation, i.e. you should set

            connection.row_factory = sqlite3.Row

    Parameters
    ----------
    cursor:
        The SQLite cursor to fetch rows from.
    directed: bool
        Whether or not links should be interpreted as directed. Default
        is True. Not passed on if ``cls`` is DAG.

    Returns
    -------
    Network
        A new instance of ``cls`` created from the SQLite database.
    """
    from pathpy.classes import DAG

    # DAG is constructed without the ``directed`` argument
    network = cls() if cls == DAG else cls(directed=directed)

    assert cursor.connection.row_factory, \
        'Cannot access columns by name. Please set ' \
        'connection.row_factory = sqlite3.Row before creating DB cursor.'

    Log.add('Retrieving links from database ...')

    for record in cursor:
        network.add_edge(str(record['source']), str(record['target']))

    return network
def algebraic_connectivity(network, lanczos_vectors=15, maxiter=20):
    """Computes the two eigenvalues of smallest magnitude of the Laplacian
    matrix of ``network`` and returns the larger of the two magnitudes.

    Parameters
    ----------
    network: HigherOrderNetwork
    lanczos_vectors: int
        number of Lanczos vectors to be used in the approximate
        calculation of eigenvalues. This maps to the ncv parameter of
        scipy's underlying function eigs.
    maxiter: int
        passed unchanged as the maxiter parameter of scipy's underlying
        function eigs.

    Returns
    -------
    float
    """
    assert isinstance(network, HigherOrderNetwork), \
        "network must be an instance of HigherOrderNetwork"

    Log.add('Calculating algebraic connectivity ... ', Severity.INFO)

    laplacian = network.laplacian_matrix()

    # NOTE: ncv sets additional auxiliary eigenvectors that are computed
    # NOTE: in order to be more confident to find the requested ones,
    # NOTE: see https://github.com/scipy/scipy/issues/4987
    solver_options = dict(which="SM", k=2, ncv=lanczos_vectors,
                          return_eigenvectors=False, maxiter=maxiter)
    smallest = sla.eigs(laplacian, **solver_options)
    magnitudes = _np.sort(_np.absolute(smallest))

    Log.add('finished.', Severity.INFO)

    # TODO: result is unstable, it looks like it depends on a "warm start"
    # (i.e. run after other eigen value calculations)
    # see test_algebraic_connectivity, problems with order k=3
    return _np.abs(magnitudes[1])
def random_walk(network, l, n=1, start_node=None):
    """
    [DEPRECATED] Logs a deprecation warning and forwards all arguments
    to ``paths_from_random_walk``.

    Parameters
    ----------
    network: Network, TemporalNetwork, HigherOrderNetwork
        The network structure on which the random walks will be simulated.
    l: int
        The (maximum) length of each random walk path.
    n: int
        The number of random walk paths to generate.
    start_node:
        forwarded unchanged (default None)

    Returns
    -------
    whatever ``paths_from_random_walk`` returns
    """
    deprecation_notice = (
        'The path_extraction.random_walk function is deprecated. '
        'Please use paths_from_random_walk instead.')
    Log.add(deprecation_notice, Severity.WARNING)

    return paths_from_random_walk(network, l, n, start_node)
def write_file(self, filename, separator=','):
    """Writes the time-stamped edges of this temporal network to a CSV
    file.

    The first line is the header ``source,target,time`` (using the given
    separator), followed by one line per time-stamped edge, in the order
    of ``self.ordered_times``.

    Parameters
    ----------
    filename: str
        name of CSV file to save data to
    separator: str
        character used to separate columns in generated CSV file

    Returns
    -------
    None
    """
    Log.add(
        'Writing {0} time-stamped edges to file {1}'.format(
            self.ecount(), filename),
        Severity.INFO)

    with open(filename, 'w+') as out:
        out.write(separator.join(('source', 'target', 'time')) + '\n')
        for timestamp in self.ordered_times:
            for (v, w, t) in self.time[timestamp]:
                out.write(separator.join((str(v), str(w), str(t))) + '\n')
def __init__(self, tedges=None):
    """Creates a temporal network instance and builds its index
    structures.

    Parameters
    ----------
    tedges:
        an optional list of directed time-stamped edges (source, target,
        time) from which to construct the temporal network. For the
        default value None an empty temporal network is created.
    """
    # list of time-stamped edges
    self.tedges = []

    # list of nodes, in order of first appearance
    self.nodes = []

    # time-stamped links, indexed by time stamp
    self.time = defaultdict(lambda: [])

    # time-stamped links, indexed by time stamp and target node
    self.targets = defaultdict(lambda: {})

    # time-stamped links, indexed by time stamp and source node
    self.sources = defaultdict(lambda: {})

    # sorted time stamps at which links (v,*;t) originate from node v
    self.activities = defaultdict(lambda: [])

    # the same time stamps as sets; inserting into a set is much faster
    # than repeatedly checking whether an element is already in a list
    self.activities_sets = defaultdict(lambda: set())

    # ordered list of time stamps
    self.ordered_times = []

    if tedges is None:
        return

    Log.add('Building index data structures ...')

    # remembers nodes in the order in which they are first seen
    first_seen = {}
    for edge in tedges:
        source, target, timestamp = edge[0], edge[1], edge[2]

        self.activities_sets[source].add(timestamp)
        self.time[timestamp].append(edge)
        self.targets[timestamp].setdefault(target, []).append(edge)
        self.sources[timestamp].setdefault(source, []).append(edge)

        first_seen.setdefault(source, True)
        first_seen.setdefault(target, True)

    self.tedges = tedges
    self.nodes = list(first_seen)

    Log.add('Sorting time stamps ...')
    self.ordered_times = sorted(self.time)
    for node in self.nodes:
        self.activities[node] = sorted(self.activities_sets[node])

    Log.add('finished.')
def _bw(paths, normalized=False):
    """Calculates the betweenness of nodes based on observed shortest paths
    between all pairs of nodes

    Every node strictly inside a shortest path from s to d receives a
    contribution of 1 / (number of shortest paths from s to d).

    Parameters
    ----------
    paths:
        Paths object
    normalized: bool
        normalize such that largest value is 1.0

    Returns
    -------
    dict
        mapping from node to betweenness; nodes that do not occur inside
        any shortest path are contained with value zero
    """
    assert isinstance(paths, Paths), "argument must be an instance of pathpy.Paths"
    node_centralities = defaultdict(lambda: 0)

    Log.add('Calculating betweenness in paths ...', Severity.INFO)

    all_paths = shortest_paths(paths)

    for s in all_paths:
        for d in all_paths[s]:
            candidates = all_paths[s][d]
            for p in candidates:
                # only inner nodes of the path count, not its end points
                for x in p[1:-1]:
                    if s != d != x:
                        node_centralities[x] += 1.0 / len(candidates)

    # At this point the dict holds positive entries only. It is empty if
    # no shortest path has an inner node, in which case max() would raise
    # a ValueError, so normalisation is skipped.
    if normalized and node_centralities:
        max_centr = max(node_centralities.values())
        for v in node_centralities:
            node_centralities[v] /= max_centr

    # assign zero values to nodes not occurring on shortest paths
    nodes = paths.nodes
    for v in nodes:
        node_centralities[v] += 0

    Log.add('finished.')
    return node_centralities
def read_file(cls, filename, separator=',', weighted=False, directed=False,
              header=False):
    r"""Reads a network from an edge list file.

    Reads data from a file containing multiple lines of *edges* of the
    form "v,w,frequency,X" (where frequency is optional and X are
    arbitrary additional columns). The default separating character ','
    can be changed. Whitespace around each field is removed.

    Lines with fewer than two fields (or, if ``weighted`` is set, fewer
    than three fields) are skipped and reported with a warning.

    Parameters
    ----------
    filename : str
        path to edgelist file
    separator : str
        character separating the nodes
    weighted : bool
        is a weight given? if ``True`` it is read as an integer from the
        third column (i.e. ``a,b,2``)
    directed : bool
        are the edges directed or undirected
    header : bool
        if true skip the first row, useful if header row in file

    Returns
    -------
    Network
        an instance of ``cls`` obtained from the edgelist
    """
    net = cls(directed)

    with open(filename, 'r') as f:
        Log.add('Reading edge list ... ')

        header_offset = 0
        if header:
            f.readline()
            header_offset = 1

        # a weighted edge needs a third column for the weight
        required_fields = 3 if weighted else 2

        for n, line in enumerate(f):
            fields = [field.strip()
                      for field in line.rstrip().split(separator)]
            if len(fields) < required_fields:
                # The offset belongs to the line number. It was formerly
                # added to the line text, which raised a TypeError.
                Log.add('Ignoring malformed line {0}: {1}'.format(
                    n + header_offset, line), Severity.WARNING)
            elif weighted:
                net.add_edge(fields[0], fields[1], weight=int(fields[2]))
            else:
                net.add_edge(fields[0], fields[1])

    Log.add('finished.')
    return net
def read_file(cls, filename, separator=',', maxlines=None, mapping=None,
              header=False):
    """
    Reads a directed acyclic graph from a file
    containing an edge list of the form

        source,target

    where ',' can be an arbitrary separator character

    Parameters
    ----------
    filename: str
        path to the edge list file
    separator: str
        character separating source and target
    maxlines: int
        maximum number of edge lines to read (the header line is not
        counted). None or 0 means the entire file is read.
    mapping:
        optional container; if given, only edges whose source and target
        are both contained in it are kept
    header: bool
        if True, the first line of the file is skipped

    Returns
    -------
    an instance of ``cls`` created with ``cls(edges=edges)``
    """
    with open(filename, 'r') as f:
        edges = []

        if mapping is not None:
            Log.add('Filtering mapped edges')

        Log.add('Reading edge list ...')

        if header:
            # Read header
            f.readline()

        for i, line in enumerate(f):
            # i is zero-based: stop as soon as maxlines lines have been
            # consumed. The former test ``i > maxlines`` read one line
            # more than requested.
            if maxlines and i >= maxlines:
                break

            fields = line.rstrip().split(separator)
            try:
                if mapping is None or (fields[0] in mapping and
                                       fields[1] in mapping):
                    edges.append((fields[0], fields[1]))
            except (IndexError, ValueError):  # pragma: no cover
                msg = 'Ignoring malformed data in ' \
                      'line {}: "{}"'.format((i+header), line.strip())
                Log.add(msg, Severity.WARNING)

    return cls(edges=edges)
def pagerank(network, alpha=0.85, max_iter=100, tol=1.0e-6,
             projection='scaled', include_sub_paths=True, weighted=False):
    """Calculates the PageRank of higher-order nodes based on a
    power iteration. If the order of the higher-order network is larger
    than one, the PageRank calculated based on the higher-order topology
    will automatically be projected back to first-order nodes.

    If the power iteration does not converge within ``max_iter``
    iterations, a warning is logged and the last iterate is used.

    Parameters
    ----------
    network: HigherOrderNetwork
    alpha: float
        damping factor
    max_iter: int
        maximum number of iterations in solver
    tol: float
        accepted tolerance for convergence check
    projection: str
        Indicates how the projection from k-th-order nodes
        (v1, v2, ... , v{k-1}) shall be performed. One of 'all', 'last',
        'first' and 'scaled'. For the method 'scaled' (default), the
        PageRank of higher-order nodes will be assigned proportionally
        to first-order nodes, i.e. each of the three nodes in the
        third-order node (a,b,c) will receive one third of the PageRank
        of (a,b,c). For the method 'all', every first-order node on the
        path of a higher-order node receives a share of its PageRank as
        well. For the method 'last' ('first'), the PageRank of the
        higher-order node is only assigned to the last (first)
        first-order node on its path. For all methods other than
        'scaled' the projected values are finally rescaled to sum to one.
    include_sub_paths: bool
        whether or not to use subpath statistics in the PageRank
        calculation
    weighted: bool
        use path weights in the calculation

    Returns
    -------
    dict
        mapping from node to PageRank. For a first-order network this is
        the result of the power iteration itself.
    """
    # NumPy is imported locally: the NumPy functions re-exported by the
    # top-level scipy namespace (sp.array, sp.where, sp.absolute) are
    # deprecated.
    import numpy as np

    assert isinstance(network, HigherOrderNetwork), \
        "network must be an instance of HigherOrderNetwork"
    assert projection in ['all', 'last', 'first', 'scaled'], \
        'Invalid projection method'

    Log.add(
        'Calculating PageRank in ' + str(network.order) +
        '-th order network...', Severity.INFO)

    n_nodes = float(len(network.nodes))
    assert n_nodes > 0, "Number of nodes is zero"

    # entries A[s,t] give directed link s -> t
    adj_mat = network.adjacency_matrix(include_subpaths=include_sub_paths,
                                       weighted=weighted, transposed=False)

    # sum of outgoing node degrees; forced to float, as the reciprocals
    # written back below would be truncated in an integer array
    row_sums = np.array(adj_mat.sum(axis=1), dtype=float).flatten()

    # replace non-zero entries x by 1/x
    non_zero = row_sums != 0
    row_sums[non_zero] = 1.0 / row_sums[non_zero]

    # indices of nodes with zero out-degree
    dangling = np.where(row_sums == 0)[0]

    # create sparse matrix with row_sums as diagonal elements
    q_mat = sparse.spdiags(row_sums.T, 0, adj_mat.shape[0],
                           adj_mat.shape[1], format='csr')

    # with this, we have divided elements in non-zero rows in A by the
    # row sum
    q_mat = q_mat * adj_mat

    # vector with n entries 1/n
    uniform = np.array([1.0 / n_nodes] * int(n_nodes))

    p_rank = uniform
    converged = False

    # Power iteration
    for _ in range(max_iter):
        last = p_rank

        # the PageRank of nodes with zero out-degree is redistributed
        # uniformly over all nodes
        p_rank = (alpha * (p_rank * q_mat +
                           p_rank[dangling].sum() * uniform) +
                  (1 - alpha) * uniform)

        if np.absolute(p_rank - last).sum() < n_nodes * tol:
            converged = True
            break

    if not converged:
        # Formerly the result was silently left empty in this case, i.e.
        # every node ended up with PageRank zero.
        Log.add('PageRank power iteration did not converge within '
                '%d iterations, using last iterate.' % max_iter,
                Severity.WARNING)

    higher_order_pr = dict(zip(network.nodes, map(float, p_rank)))

    if network.order == 1:
        return higher_order_pr

    # project PageRank of higher-order nodes to first-order network
    first_order_pr = defaultdict(lambda: 0.0)

    for v in network.nodes:
        # turns node a-b-c in path tuple (a,b,c)
        path = network.higher_order_node_to_path(v)
        if projection == 'all':
            # every first-order node on the path gets a share; the
            # result is rescaled to sum to one below
            for x in path:
                first_order_pr[x] += higher_order_pr[v] / len(path)
        elif projection == 'scaled':
            for x in path:
                # each node on e.g. a 4-th-order path a-b-c-d receives
                # one fourth of the PageRank value, to ensure that the
                # resulting first-order PageRank sums to one
                first_order_pr[x] += higher_order_pr[v] / float(len(path))
        elif projection == 'last':
            # assign PR of higher-order node to last first-order node
            first_order_pr[path[-1]] += higher_order_pr[v]
        elif projection == 'first':
            # assign PR of higher-order node to first first-order node
            first_order_pr[path[0]] += higher_order_pr[v]

    # for projection method 'scaled', the values sum to one anyway
    if projection != 'scaled':
        # The total has to be computed once, up front: recomputing it
        # inside the loop would divide later entries by a sum that already
        # contains rescaled values.
        total = sum(first_order_pr.values())
        if total > 0:
            for v in first_order_pr:
                first_order_pr[v] /= total

    # assign centrality zero to nodes not occurring in higher-order PR
    nodes = network.paths.nodes
    for v in nodes:
        first_order_pr[v] += 0

    Log.add('finished.', Severity.INFO)
    return first_order_pr