def getEigenValueGap(self, includeSubPaths=True, lanczosVecs=15, maxiter=20):
    """
    Returns the eigenvalue gap of the transition matrix.

    @param includeSubPaths: whether or not to include subpath statistics in the
        calculation of transition probabilities.
    @param lanczosVecs: number of Lanczos vectors to be used in the approximate
        calculation of eigenvectors and eigenvalues. This maps to the ncv
        parameter of scipy's underlying function eigs.
    @param maxiter: maximum number of iterations to be used in the approximate
        calculation of eigenvectors and eigenvalues.
    """
    # NOTE: most of the runtime is spent constructing the second-order null
    # NOTE: graph and the second-order null transition matrix

    Log.add('Calculating eigenvalue gap ... ', Severity.INFO)

    # Build transition matrix
    T = self.getTransitionMatrix(includeSubPaths)

    # Compute the two eigenvalues with the largest magnitude
    # NOTE: ncv sets the number of additional auxiliary eigenvectors that are
    # NOTE: computed in order to be more confident to actually find the one
    # NOTE: with the largest magnitude,
    # NOTE: see https://github.com/scipy/scipy/issues/4987
    w2 = _sla.eigs(T, which="LM", k=2, ncv=lanczosVecs,
                   return_eigenvectors=False, maxiter=maxiter)
    evals2_sorted = _np.sort(-_np.absolute(w2))

    Log.add('finished.', Severity.INFO)

    return _np.abs(evals2_sorted[1])
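# Minimal self-contained sketch of the spectral computation above, using a toy
# row-stochastic transition matrix (illustrative data, not part of the
# library): the returned quantity is the magnitude of the second-largest
# eigenvalue of T.
#
#   import numpy as np
#   import scipy.sparse as sparse
#   import scipy.sparse.linalg as sla
#
#   T = sparse.csr_matrix(np.array([[0.0, 1.0, 0.0, 0.0],
#                                   [0.0, 0.0, 1.0, 0.0],
#                                   [0.5, 0.0, 0.0, 0.5],
#                                   [1.0, 0.0, 0.0, 0.0]]))
#   w = sla.eigs(T, which="LM", k=2, return_eigenvectors=False)
#   print(np.sort(np.absolute(w))[0])  # |lambda_2|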
def fitMarkovModel(self, k=1):
    """
    Generates a k-th order Markov model for the underlying sequence.

    @param k: the order of the Markov model to fit
    """
    # TODO: Add support for k=0
    assert len(self.sequence) > 0, "Error: Empty sequence"

    # MLE fit of transition probabilities
    self.P[k] = _co.defaultdict(lambda: _co.defaultdict(lambda: 0.0))

    Log.add('Fitting Markov model with order k = ' + str(k))

    # Generate initial memory prefix
    mem = ()
    for s in self.sequence[:k]:
        mem += (s, )

    # Count state transitions
    for s in self.sequence[k:]:
        self.P[k][mem][s] += 1.0
        # shift memory by one element
        mem = mem[1:] + (s, )

    # Normalize transitions
    for m in self.P[k]:
        S = float(sum(self.P[k][m].values()))
        for s in self.P[k][m]:
            self.P[k][m][s] /= S

    Log.add('finished.')
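# Self-contained sketch of the MLE fit above for k = 1, on a toy sequence
# (illustrative data, not part of the library):
#
#   import collections as co
#   sequence = ['a', 'b', 'a', 'b', 'b', 'a']
#   k = 1
#   P = co.defaultdict(lambda: co.defaultdict(float))
#   mem = tuple(sequence[:k])
#   for s in sequence[k:]:
#       P[mem][s] += 1.0
#       mem = mem[1:] + (s, )
#   for m in P:
#       S = sum(P[m].values())
#       for s in P[m]:
#           P[m][s] /= S
#   print(dict(P[('b', )]))  # {'a': 0.666..., 'b': 0.333...}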
def __init__(self, paths, maxOrder=1):
    """
    Generates a hierarchy of higher-order models for the given path statistics,
    up to a given maximum order.

    @param paths: the paths instance for which the model should be created
    @param maxOrder: the maximum order of the multi-order model
    """
    ## A dictionary containing the layers of HigherOrderNetworks, where
    ## layers[k] contains the network of order k
    self.layers = {}

    ## the maximum order of this multi-order model
    self.maxOrder = maxOrder

    ## the paths object from which this multi-order model was created
    self.paths = paths

    ## a dictionary of transition matrices for all layers of the model
    self.T = {}

    for k in range(maxOrder + 1):
        Log.add('Generating ' + str(k) + '-th order network layer ...')
        self.layers[k] = HigherOrderNetwork(paths, k, paths.separator, False)

        # compute transition matrices for all layers. In order to use the
        # maximally available statistics, we always use sub paths in the
        # calculation
        self.T[k] = self.layers[k].getTransitionMatrix(includeSubPaths=True)

    Log.add('finished.')
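# Usage sketch (hedged): given a Paths instance `p` created elsewhere, a
# multi-order model up to order 2 would be built as
#
#   m = MultiOrderModel(p, maxOrder=2)
#
# after which m.layers[k] holds the k-th order network and m.T[k] its
# transition matrix, for k = 0, 1, 2.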
def factorial(self, n, log=True):
    """
    Calculates (or approximates) the (log of the) factorial n!. The function
    applies Stirling's approximation if n > 20.

    @param n: computes factorial of n
    @param log: whether or not to return the (natural) logarithm of the factorial
    """
    f = _np.float64(0)
    n_ = _np.float64(n)
    if n > 20:  # use Stirling's approximation
        try:
            f = n_ * _np.log(n_) - n_ + 0.5 * _np.log(2.0 * _np.pi * n_) \
                + 1.0 / (12.0 * n_) - 1.0 / (360.0 * n_**3.0)
        except Warning as w:
            Log.add('Factorial calculation for n = ' + str(n) + ': ' + str(w),
                    severity=Severity.WARNING)
    else:
        f = _np.log(_np.math.factorial(n))

    if log:
        return f
    return _np.exp(f)
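# Self-contained check of the Stirling approximation used above
# (illustrative): math.lgamma(n + 1) gives the exact log-factorial.
#
#   import math
#   import numpy as np
#   n = 50.0
#   approx = n * np.log(n) - n + 0.5 * np.log(2.0 * np.pi * n) \
#            + 1.0 / (12.0 * n) - 1.0 / (360.0 * n**3.0)
#   print(approx, math.lgamma(n + 1))  # the two values agree closely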
def ClosenessCentrality(self):
    """
    Calculates the closeness centralities of all nodes. If the order of the
    higher-order network is larger than one, centralities calculated based on
    the higher-order topology will automatically be projected back to
    first-order nodes.
    """
    dist_first = self.getDistanceMatrixFirstOrder()
    node_centralities = _co.defaultdict(lambda: 0)

    Log.add('Calculating closeness centralities (k = ' + str(self.order) +
            ') ...', Severity.INFO)

    # Calculate closeness values
    for v1 in dist_first:
        for w1 in dist_first[v1]:
            if v1 != w1 and dist_first[v1][w1] < _np.inf:
                node_centralities[v1] += 1.0 / dist_first[v1][w1]

    # assign centrality zero to nodes not occurring on higher-order shortest paths
    nodes = self.paths.getNodes()
    for v in nodes:
        node_centralities[v] += 0

    Log.add('finished.', Severity.INFO)

    return node_centralities
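# Self-contained sketch of the closeness accumulation above, starting from a
# precomputed distance dictionary (illustrative data):
#
#   import collections as co
#   import numpy as np
#   dist = {'a': {'a': 0, 'b': 1, 'c': 2},
#           'b': {'a': np.inf, 'b': 0, 'c': 1},
#           'c': {'a': np.inf, 'b': np.inf, 'c': 0}}
#   centralities = co.defaultdict(float)
#   for v in dist:
#       for w in dist[v]:
#           if v != w and dist[v][w] < np.inf:
#               centralities[v] += 1.0 / dist[v][w]
#   print(dict(centralities))  # {'a': 1.5, 'b': 1.0}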
def BetweennessCentrality(self, normalized=False):
    """
    Calculates the betweenness centralities of all nodes. If the order of the
    higher-order network is larger than one, centralities calculated based on
    the higher-order topology will automatically be projected back to
    first-order nodes.

    @param normalized: If set to True, betweenness centralities of nodes will
        be scaled by the maximum value (default False)
    """
    shortest_paths = self.getShortestPaths()
    node_centralities = _co.defaultdict(lambda: 0)
    shortest_paths_firstorder = _co.defaultdict(
        lambda: _co.defaultdict(lambda: set()))

    Log.add('Calculating betweenness centralities (k = ' + str(self.order) +
            ') ...', Severity.INFO)

    for sk in shortest_paths:
        for dk in shortest_paths:
            s1 = self.HigherOrderNodeToPath(sk)[0]
            d1 = self.HigherOrderNodeToPath(dk)[-1]

            # we consider a path in a k-th order network
            # connecting first-order node s1 to d1
            for pk in shortest_paths[sk][dk]:
                # convert k-th order path to first-order path and add
                shortest_paths_firstorder[s1][d1].add(
                    self.HigherOrderPathToFirstOrder(pk))

    for s1 in shortest_paths_firstorder:
        for d1 in shortest_paths_firstorder[s1]:
            for p1 in shortest_paths_firstorder[s1][d1]:
                # increase betweenness centrality of all intermediary nodes
                # on path from s1 to d1
                for v in p1[1:-1]:
                    if s1 != v != d1:
                        node_centralities[v] += 1.0 / (
                            len(shortest_paths_firstorder[s1][d1]) +
                            self.order - 1)

    if normalized:
        m = max(node_centralities.values())
        for v in node_centralities:
            node_centralities[v] /= m

    # assign centrality zero to nodes not occurring on higher-order shortest paths
    nodes = self.paths.getNodes()
    for v in nodes:
        node_centralities[v] += 0

    Log.add('finished.', Severity.INFO)

    return node_centralities
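# Sketch of the accumulation step above on toy data (illustrative): for
# order k = 1 the denominator reduces to the number of shortest paths
# between s and d.
#
#   import collections as co
#   shortest_paths = {'a': {'d': {('a', 'b', 'd'), ('a', 'c', 'd')}}}
#   order = 1
#   centralities = co.defaultdict(float)
#   for s in shortest_paths:
#       for d in shortest_paths[s]:
#           for p in shortest_paths[s][d]:
#               for v in p[1:-1]:
#                   if s != v != d:
#                       centralities[v] += 1.0 / (len(shortest_paths[s][d]) + order - 1)
#   print(dict(centralities))  # {'b': 0.5, 'c': 0.5}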
def getDistanceMatrix(self):
    """
    Calculates shortest path distances between all pairs of higher-order nodes
    using the Floyd-Warshall algorithm.
    """
    Log.add('Calculating distance matrix in higher-order network (k = ' +
            str(self.order) + ') ...', Severity.INFO)

    dist = _co.defaultdict(lambda: _co.defaultdict(lambda: _np.inf))

    for v in self.nodes:
        dist[v][v] = 0

    for e in self.edges:
        dist[e[0]][e[1]] = 1

    # Floyd-Warshall relaxation: the loop over intermediate nodes k must be
    # the outermost loop for the algorithm to be correct
    for k in self.nodes:
        for v in self.nodes:
            for w in self.nodes:
                if dist[v][w] > dist[v][k] + dist[k][w]:
                    dist[v][w] = dist[v][k] + dist[k][w]

    Log.add('finished.', Severity.INFO)

    return dist
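# Self-contained Floyd-Warshall sketch on a toy edge list (illustrative
# data), mirroring the relaxation above:
#
#   import collections as co
#   import numpy as np
#   nodes = ['a', 'b', 'c']
#   edges = [('a', 'b'), ('b', 'c')]
#   dist = co.defaultdict(lambda: co.defaultdict(lambda: np.inf))
#   for v in nodes:
#       dist[v][v] = 0
#   for v, w in edges:
#       dist[v][w] = 1
#   for k in nodes:
#       for v in nodes:
#           for w in nodes:
#               if dist[v][w] > dist[v][k] + dist[k][w]:
#                   dist[v][w] = dist[v][k] + dist[k][w]
#   print(dist['a']['c'])  # 2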
def getAlgebraicConnectivity(self, lanczosVecs=15, maxiter=20):
    """
    Returns the algebraic connectivity of the higher-order network.

    @param lanczosVecs: number of Lanczos vectors to be used in the approximate
        calculation of eigenvectors and eigenvalues. This maps to the ncv
        parameter of scipy's underlying function eigs.
    @param maxiter: maximum number of iterations to be used in the approximate
        calculation of eigenvectors and eigenvalues. This maps to the maxiter
        parameter of scipy's underlying function eigs.
    """
    Log.add('Calculating algebraic connectivity ... ', Severity.INFO)

    L = self.getLaplacianMatrix()

    # NOTE: ncv sets the number of additional auxiliary eigenvectors that are
    # NOTE: computed in order to be more confident to find the one with the
    # NOTE: largest magnitude, see https://github.com/scipy/scipy/issues/4987
    w = _sla.eigs(L, which="SM", k=2, ncv=lanczosVecs,
                  return_eigenvectors=False, maxiter=maxiter)
    evals_sorted = _np.sort(_np.absolute(w))

    Log.add('finished.', Severity.INFO)

    return _np.abs(evals_sorted[1])
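# Dense sketch of the quantity above (illustrative): the algebraic
# connectivity is the second-smallest eigenvalue of the graph Laplacian.
#
#   import numpy as np
#   L = np.array([[1., -1., 0.],    # Laplacian of the path graph a - b - c
#                 [-1., 2., -1.],
#                 [0., -1., 1.]])
#   print(np.sort(np.linalg.eigvalsh(L))[1])  # 1.0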
def EvCent(self, projection='scaled', includeSubPaths=True):
    """
    Calculates the eigenvector centralities of higher-order nodes. If the
    order of the HigherOrderNetwork is larger than one, the centralities will
    be projected to the first-order nodes.

    @param projection: Indicates how the projection from k-th-order nodes
        (v1, v2, ..., v{k-1}) shall be performed. For the method 'all', the
        eigenvector centrality of the higher-order node will be added to *all*
        first-order nodes on the path corresponding to the higher-order node.
        For the method 'last', the centrality of the higher-order node will
        only be assigned to the *last* first-order node v{k-1}. For the method
        'first', it will only be assigned to the *first* first-order node v1.
        For the method 'scaled' (default), the eigenvector centrality of
        higher-order nodes will be assigned proportionally to first-order
        nodes, i.e. each of the three nodes in the third-order node (a,b,c)
        will receive one third of the eigenvector centrality of (a,b,c).
    @param includeSubPaths: whether or not to include subpath statistics in
        the calculation (default True)
    """
    A = self.getAdjacencyMatrix(includeSubPaths=includeSubPaths,
                                weighted=False, transposed=True)

    # calculate leading eigenvector of A
    w, v = _sla.eigs(A, k=1, which="LM", ncv=13)

    v = v.reshape(v.size, )

    higher_order_evcent = dict(zip(self.nodes, map(_np.abs, v)))

    # project evcent of higher-order nodes to first-order network
    first_order_evcent = _co.defaultdict(lambda: 0.0)

    # sum evcent values based on higher-order nodes
    # and normalize the result
    for v in self.nodes:
        # turns node a-b-c into path tuple (a,b,c)
        p = self.HigherOrderNodeToPath(v)
        if projection == 'all':
            # assign evcent of higher-order node to all first-order nodes
            for x in p:
                first_order_evcent[x] += higher_order_evcent[v]
        elif projection == 'scaled':
            for x in p:
                first_order_evcent[x] += higher_order_evcent[v] / float(len(p))
        elif projection == 'last':
            # assign evcent of higher-order node to last first-order node
            first_order_evcent[p[-1]] += higher_order_evcent[v]
        elif projection == 'first':
            # assign evcent of higher-order node to first first-order node
            first_order_evcent[p[0]] += higher_order_evcent[v]

    # for projection method 'scaled', the values sum to one anyway; for the
    # other methods, compute the normalization constant *before* the loop,
    # since the sum changes while values are being rescaled
    if projection != 'scaled':
        S = sum(first_order_evcent.values())
        for v in first_order_evcent:
            first_order_evcent[v] /= S

    Log.add('finished.', Severity.INFO)

    return first_order_evcent
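# Self-contained sketch of the leading-eigenvector step above on a toy
# adjacency matrix (illustrative data):
#
#   import numpy as np
#   import scipy.sparse as sparse
#   import scipy.sparse.linalg as sla
#
#   A = sparse.csr_matrix(np.array([[0., 1., 1., 0.],
#                                   [1., 0., 1., 0.],
#                                   [1., 1., 0., 1.],
#                                   [0., 0., 1., 0.]]))
#   w, v = sla.eigs(A, k=1, which="LM")
#   v = np.abs(v.reshape(v.size, ))
#   print(v / v.sum())  # eigenvector centralities, normalized to sum to one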
def filterEdges(self, edge_filter):
    """
    Filter time-stamped edges according to a given filter expression.

    @param edge_filter: an arbitrary filter function of the form
        filter_func(v, w, time) that returns True for time-stamped edges that
        shall pass the filter, and False for all edges that shall be filtered
        out.
    """
    Log.add('Starting filtering ...', Severity.INFO)
    new_t_edges = []

    for (v, w, t) in self.tedges:
        if edge_filter(v, w, t):
            new_t_edges.append((v, w, t))

    Log.add('finished. Filtered out ' + str(self.ecount() - len(new_t_edges)) +
            ' time-stamped edges.', Severity.INFO)

    return TemporalNetwork(tedges=new_t_edges)
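# Sketch of the filter semantics above on a plain edge list (illustrative
# data): the filter function receives (v, w, t) and returns a boolean.
#
#   tedges = [('a', 'b', 1), ('b', 'c', 2), ('a', 'c', 7)]
#   edge_filter = lambda v, w, t: t < 5   # keep edges observed before time 5
#   filtered = [e for e in tedges if edge_filter(*e)]
#   print(filtered)  # [('a', 'b', 1), ('b', 'c', 2)]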
def getShortestPaths(self):
    """
    Calculates all shortest paths between all pairs of higher-order nodes
    using the Floyd-Warshall algorithm.
    """
    Log.add('Calculating shortest paths in higher-order network (k = ' +
            str(self.order) + ') ...', Severity.INFO)

    dist = _co.defaultdict(lambda: _co.defaultdict(lambda: _np.inf))
    shortest_paths = _co.defaultdict(lambda: _co.defaultdict(lambda: set()))

    for e in self.edges:
        dist[e[0]][e[1]] = 1
        shortest_paths[e[0]][e[1]].add(e)

    # Floyd-Warshall relaxation: the loop over intermediate nodes k must be
    # the outermost loop for the algorithm to be correct
    for k in self.nodes:
        for v in self.nodes:
            for w in self.nodes:
                if v != w:
                    if dist[v][w] > dist[v][k] + dist[k][w]:
                        # found a strictly shorter path via k: replace the
                        # current set of shortest paths
                        dist[v][w] = dist[v][k] + dist[k][w]
                        shortest_paths[v][w] = set()
                        for p in list(shortest_paths[v][k]):
                            for q in list(shortest_paths[k][w]):
                                shortest_paths[v][w].add(p + q[1:])
                    elif dist[v][w] == dist[v][k] + dist[k][w]:
                        # found an equally short path via k: add it to the set
                        for p in list(shortest_paths[v][k]):
                            for q in list(shortest_paths[k][w]):
                                shortest_paths[v][w].add(p + q[1:])

    for v in self.nodes:
        dist[v][v] = 0
        shortest_paths[v][v].add((v, ))

    Log.add('finished.', Severity.INFO)

    return shortest_paths
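# Sketch of the path-concatenation step above (illustrative): two shortest
# paths through an intermediate node k are joined as p + q[1:], so that k is
# not repeated in the combined path.
#
#   p = ('a', 'b')       # shortest path from a to b
#   q = ('b', 'c')       # shortest path from b to c
#   print(p + q[1:])     # ('a', 'b', 'c')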
def __init__(self, tedges=None):
    """
    Constructor that generates a temporal network instance.

    @param tedges: an optional list of (possibly unordered time-stamped) links
        from which to construct a temporal network instance. For the default
        value None an empty temporal network will be created.
    """
    ## A list of time-stamped edges of this temporal network
    self.tedges = []

    ## A list of nodes of this temporal network
    self.nodes = []

    ## A dictionary storing all time-stamped links, indexed by time-stamps
    self.time = _co.defaultdict(lambda: list())

    ## A dictionary storing all time-stamped links, indexed by time and target node
    self.targets = _co.defaultdict(lambda: dict())

    ## A dictionary storing all time-stamped links, indexed by time and source node
    self.sources = _co.defaultdict(lambda: dict())

    ## A dictionary storing time stamps at which links (v,*;t) originate from node v
    self.activities = _co.defaultdict(lambda: list())

    ## A dictionary storing sets of time stamps at which links (v,*;t) originate
    ## from node v. Note that insertion into a set is much faster than repeatedly
    ## checking whether an element already exists in a list!
    self.activities_sets = _co.defaultdict(lambda: set())

    ## An ordered list of time-stamps
    self.ordered_times = []

    nodes_seen = _co.defaultdict(lambda: False)

    if tedges is not None:
        Log.add('Building index data structures ...')

        for e in tedges:
            self.activities_sets[e[0]].add(e[2])
            self.time[e[2]].append(e)
            self.targets[e[2]].setdefault(e[1], []).append(e)
            self.sources[e[2]].setdefault(e[0], []).append(e)
            if not nodes_seen[e[0]]:
                nodes_seen[e[0]] = True
            if not nodes_seen[e[1]]:
                nodes_seen[e[1]] = True

        self.tedges = tedges
        self.nodes = list(nodes_seen.keys())

        Log.add('Sorting time stamps ...')

        self.ordered_times = sorted(self.time.keys())
        for v in self.nodes:
            self.activities[v] = sorted(self.activities_sets[v])

        Log.add('finished.')
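# Self-contained sketch of the indexing above on a toy edge list
# (illustrative data): links are indexed by time stamp, and activity time
# stamps are collected per source node.
#
#   import collections as co
#   tedges = [('a', 'b', 1), ('b', 'c', 2), ('a', 'c', 2)]
#   time = co.defaultdict(list)
#   activities_sets = co.defaultdict(set)
#   for v, w, t in tedges:
#       time[t].append((v, w, t))
#       activities_sets[v].add(t)
#   print(sorted(time.keys()))            # [1, 2]
#   print(sorted(activities_sets['a']))   # [1, 2]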
def likeliHoodRatioTest(self, paths, maxOrderNull=0, maxOrder=1,
                        assumption='paths', significanceThreshold=0.01):
    """
    Performs a likelihood-ratio test between two multi-order models with given
    maximum orders, where maxOrderNull serves as null hypothesis and maxOrder
    serves as alternative hypothesis. The null hypothesis is rejected if the
    p-value for the observed paths under the null hypothesis is smaller than
    the given significance threshold.

    Applying this test makes the assumption that we have nested models, i.e.
    that the null model is contained as a special case in the parameter space
    of the more complex model. If we assume that the path constraint holds,
    this is not true for the test of the first- against the zero-order model
    (since some sequences of the zero-order model cannot be generated in the
    first-order model). However, since the set of possible higher-order
    transitions is generated based on the first-order model, the nestedness
    property holds for all higher-order models.

    @param paths: the path data to be used in the likelihood ratio test
    @param maxOrderNull: maximum order of the multi-order model to be used as
        a null hypothesis
    @param maxOrder: maximum order of the multi-order model to be used as
        alternative hypothesis
    @param assumption: paths or ngrams
    @param significanceThreshold: the threshold for the p-value below which to
        accept the alternative hypothesis
    @returns: a tuple of the format (reject, p) which captures whether or not
        the null hypothesis is rejected in favor of the alternative
        hypothesis, as well as the p-value that led to the decision
    """
    assert maxOrderNull < maxOrder, \
        'Error: order of null hypothesis must be smaller than order of alternative hypothesis'

    # let L0 be the likelihood for the null model and L1 be the likelihood
    # for the alternative model; we first compute the test statistic
    # x = -2 * log(L0/L1) = -2 * (log L0 - log L1)
    x = -2 * (self.getLikelihood(paths, maxOrder=maxOrderNull, log=True) -
              self.getLikelihood(paths, maxOrder=maxOrder, log=True))

    # we calculate the additional degrees of freedom in the alternative model
    dof_diff = self.getDegreesOfFreedom(maxOrder=maxOrder, assumption=assumption) - \
        self.getDegreesOfFreedom(maxOrder=maxOrderNull, assumption=assumption)

    Log.add('Likelihood ratio test for K_opt = ' + str(maxOrder) + ', x = ' + str(x))
    Log.add('Likelihood ratio test, d_1-d_0 = ' + str(dof_diff))

    # if the p-value is *below* the significance threshold, we reject the null hypothesis
    p = 1 - chi2.cdf(x, dof_diff)

    Log.add('Likelihood ratio test, p = ' + str(p))

    return (p < significanceThreshold), p
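# Self-contained sketch of the test statistic above with hypothetical
# numbers: x = -2 * (log L0 - log L1) is compared against a chi-squared
# distribution with dof_diff degrees of freedom.
#
#   from scipy.stats import chi2
#   logL0, logL1 = -1520.4, -1496.2   # hypothetical log-likelihoods
#   dof_diff = 12                     # hypothetical extra degrees of freedom
#   x = -2 * (logL0 - logL1)
#   p = 1 - chi2.cdf(x, dof_diff)
#   print(x, p)  # reject the null model if p < significanceThreshold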
def readFile(filename, sep=',', timestampformat="%Y-%m-%d %H:%M", maxlines=_sys.maxsize):
    """
    Reads time-stamped links from a file and returns a new instance of the
    class TemporalNetwork. The file is assumed to have a header

        source target time

    where columns can be in arbitrary order and separated by arbitrary
    characters. Each time-stamped link must occur in a separate line and links
    are assumed to be directed.

    The time column can be omitted, in which case all links are assumed to
    occur at consecutive time stamps (that have a distance of one). Time
    stamps can be simple integers, or strings to be converted to UNIX time
    stamps via a custom timestamp format. For this, the python function
    datetime.strptime will be used.

    @param filename: path of the file to read from
    @param sep: the character that separates columns
    @param timestampformat: used to convert string timestamps to UNIX
        timestamps. This parameter is ignored if the timestamps are digit
        types (like a simple int).
    @param maxlines: limit reading of file to a certain number of lines,
        default sys.maxsize
    """
    assert filename != '', 'Empty filename given'

    # Read header
    with open(filename, 'r') as f:
        tedges = []

        header = f.readline()
        header = header.split(sep)

        # If header columns are included, arbitrary column orders are supported
        time_ix = -1
        source_ix = -1
        target_ix = -1
        for i in range(len(header)):
            header[i] = header[i].strip()
            if header[i] == 'node1' or header[i] == 'source':
                source_ix = i
            elif header[i] == 'node2' or header[i] == 'target':
                target_ix = i
            elif header[i] == 'time' or header[i] == 'timestamp':
                time_ix = i

        assert source_ix >= 0 and target_ix >= 0, \
            "Detected invalid header columns: %s" % header

        if time_ix < 0:
            Log.add('No time stamps found in data, assuming consecutive links',
                    Severity.WARNING)

        Log.add('Reading time-stamped links ...')

        line = f.readline()
        n = 1
        while line and n <= maxlines:
            fields = line.rstrip().split(sep)
            try:
                if time_ix >= 0:
                    timestamp = fields[time_ix]
                    # if the timestamp is a number, we use it as it is
                    if timestamp.isdigit():
                        t = int(timestamp)
                    else:
                        # otherwise, we use the timestamp format to convert
                        # the string to a UNIX timestamp
                        x = _dt.datetime.strptime(timestamp, timestampformat)
                        t = int(_t.mktime(x.timetuple()))
                else:
                    t = n
                if t >= 0:
                    tedge = (fields[source_ix], fields[target_ix], t)
                    tedges.append(tedge)
                else:
                    Log.add('Ignoring negative timestamp in line ' + str(n + 1) +
                            ': "' + line.strip() + '"', Severity.WARNING)
            except (IndexError, ValueError):
                Log.add('Ignoring malformed data in line ' + str(n + 1) +
                        ': "' + line.strip() + '"', Severity.WARNING)
            line = f.readline()
            n += 1
    # end of with open()

    return TemporalNetwork(tedges=tedges)
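# Self-contained sketch of the timestamp handling above (illustrative):
# digit strings are used directly, other strings are parsed via
# datetime.strptime and converted to UNIX time stamps.
#
#   import datetime as dt
#   import time as tm
#   for timestamp in ('1045', '2019-04-01 12:30'):
#       if timestamp.isdigit():
#           t = int(timestamp)
#       else:
#           x = dt.datetime.strptime(timestamp, "%Y-%m-%d %H:%M")
#           t = int(tm.mktime(x.timetuple()))
#       print(t)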
def PageRank(self, alpha=0.85, maxIterations=100, convergenceThres=1.0e-6,
             projection='scaled', includeSubPaths=True):
    """
    Calculates the PageRank of higher-order nodes based on a power iteration.
    If the order of the higher-order network is larger than one, the PageRank
    calculated based on the higher-order topology will automatically be
    projected back to first-order nodes.

    @param alpha: damping factor of the PageRank iteration (default 0.85)
    @param maxIterations: maximum number of power iterations (default 100)
    @param convergenceThres: threshold for the total change of PageRank values
        below which the power iteration is considered converged
    @param projection: Indicates how the projection from k-th-order nodes
        (v1, v2, ..., v{k-1}) shall be performed. For the method 'all', the
        PageRank value of the higher-order node will be added to *all*
        first-order nodes on the path corresponding to the higher-order node.
        For the method 'last', the PageRank value of the higher-order node
        will only be assigned to the *last* first-order node v{k-1}. For the
        method 'first', it will only be assigned to the *first* first-order
        node v1. For the method 'scaled' (default), the PageRank of
        higher-order nodes will be assigned proportionally to first-order
        nodes, i.e. each of the three nodes in the third-order node (a,b,c)
        will receive one third of the PageRank of (a,b,c).
    @param includeSubPaths: whether or not to use subpath statistics in the
        PageRank calculation
    """
    assert projection in ('all', 'last', 'first', 'scaled'), \
        'Invalid projection method'

    Log.add('Calculating PageRank in ' + str(self.order) +
            '-th order network...', Severity.INFO)

    higher_order_PR = _co.defaultdict(lambda: 0)

    n = float(len(self.nodes))
    assert n > 0, "Number of nodes is zero"

    # entries A[s,t] give directed link s -> t
    A = self.getAdjacencyMatrix(includeSubPaths=includeSubPaths,
                                weighted=False, transposed=False)

    # sum of outgoing node degrees; d collects the indices of dangling nodes
    # with zero out-degree
    row_sums = _sp.array(A.sum(axis=1)).flatten()
    row_sums[row_sums != 0] = 1.0 / row_sums[row_sums != 0]
    d = _sp.where(row_sums == 0)[0]

    Q = _sparse.spdiags(row_sums.T, 0, *A.shape, format='csr')
    A = Q * A

    p = _sp.array([1.0 / n] * int(n))
    pr = p

    # Power iteration
    for i in range(maxIterations):
        last = pr
        pr = alpha * (pr * A + sum(pr[d]) * p) + (1 - alpha) * p
        if _sp.absolute(pr - last).sum() < n * convergenceThres:
            higher_order_PR = dict(zip(self.nodes, map(float, pr)))
            break

    if self.order == 1:
        return higher_order_PR

    # project PageRank of higher-order nodes to first-order network
    first_order_PR = _co.defaultdict(lambda: 0.0)

    # sum PageRank values based on higher-order nodes
    # and normalize the result
    for v in self.nodes:
        # turns node a-b-c into path tuple (a,b,c)
        p = self.HigherOrderNodeToPath(v)
        if projection == 'all':
            # assign PR of higher-order node to all first-order nodes
            for x in p:
                first_order_PR[x] += higher_order_PR[v]
        elif projection == 'scaled':
            for x in p:
                # each node on e.g. a fourth-order path a-b-c-d receives one
                # fourth of the PageRank value, to ensure that the resulting
                # first-order PageRank sums to one
                first_order_PR[x] += higher_order_PR[v] / float(len(p))
        elif projection == 'last':
            # assign PR of higher-order node to last first-order node
            first_order_PR[p[-1]] += higher_order_PR[v]
        elif projection == 'first':
            # assign PR of higher-order node to first first-order node
            first_order_PR[p[0]] += higher_order_PR[v]

    # for projection method 'scaled', the values sum to one anyway; for the
    # other methods, compute the normalization constant *before* the loop,
    # since the sum changes while values are being rescaled
    if projection != 'scaled':
        S = sum(first_order_PR.values())
        for v in first_order_PR:
            first_order_PR[v] /= S

    # assign centrality zero to nodes not occurring in higher-order PR
    nodes = self.paths.getNodes()
    for v in nodes:
        first_order_PR[v] += 0

    Log.add('finished.', Severity.INFO)

    return first_order_PR
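# Self-contained sketch of the power iteration above on a dense toy
# adjacency matrix (illustrative data): rows are normalized by out-degree,
# and dangling nodes redistribute their rank uniformly.
#
#   import numpy as np
#   alpha, n_iter, eps = 0.85, 100, 1.0e-6
#   A = np.array([[0., 1., 1.],
#                 [0., 0., 1.],
#                 [0., 0., 0.]])   # node 2 is dangling
#   n = A.shape[0]
#   row_sums = A.sum(axis=1)
#   d = np.where(row_sums == 0)[0]
#   row_sums[row_sums == 0] = 1.0
#   P = A / row_sums[:, None]
#   p = np.full(n, 1.0 / n)
#   pr = p.copy()
#   for _ in range(n_iter):
#       last = pr
#       pr = alpha * (pr @ P + pr[d].sum() * p) + (1 - alpha) * p
#       if np.abs(pr - last).sum() < n * eps:
#           break
#   print(pr, pr.sum())  # PageRank vector; sums to one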