Exemple #1
0
    def factorial(n, log=True):  # pragma: no cover
        """
        Calculates the factorial of n, automatically switching to
        Stirling's approximation for n>20.

        Parameters
        ----------
        n: int
            The value n for which the factorial should be calculated.
        log: bool
            Whether or not to return the (natural) logarithm of the factorial. Default is True.

        Returns
        -------
        float
            ln(n!) if ``log`` is True, otherwise exp(ln(n!)).
        """
        # Imported locally: the ``np.math`` alias used previously is not
        # available in current NumPy releases, the stdlib module always is.
        import math

        f = np.float64(0)
        n_ = np.float64(n)
        if n > 20:  # use Stirling's approximation
            try:
                # Stirling series for ln(n!) including the 1/(12n) and
                # 1/(360n^3) correction terms
                f = (n_ * np.log(n_) - n_ + 0.5 * np.log(2.0 * np.pi * n_) +
                     1.0 / (12.0 * n_) - 1 / (360.0 * n_**3.0))
            except Warning as w:
                # Only reachable when warnings are escalated to exceptions;
                # f then keeps its initial value of 0.
                msg = 'Factorial calculation for n={}: {}'.format(n, w)
                Log.add(msg, severity=Severity.WARNING)

        else:
            # exact value for small n
            f = np.log(math.factorial(n))

        if log:
            return f
        else:
            return np.exp(f)
Exemple #2
0
    def fit_markov_model(self, k=1):
        """Fits a k-th order Markov model to ``self.sequence``.

        Transition probabilities are estimated by maximum likelihood: every
        observed transition from a memory of k symbols to the next symbol is
        counted, and the counts of each memory are normalised to sum to one.
        The result is stored in ``self.P[k]`` as a nested mapping
        ``memory tuple -> next symbol -> probability``.

        Parameters
        ----------
        k: int
            order (memory length) of the model. Default is 1.
        """

        # TODO: Add support for k=0

        assert self.sequence, "Error: Empty sequence"

        transitions = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.P[k] = transitions

        Log.add('Fitting Markov model with order k = ' + str(k))

        # the first k symbols form the initial memory
        memory = tuple(self.sequence[:k])

        # count transitions while sliding the memory window along the sequence
        for symbol in self.sequence[k:]:
            transitions[memory][symbol] += 1.0
            memory = memory[1:] + (symbol, )

        # turn counts into relative frequencies per memory
        for successors in transitions.values():
            total = float(sum(successors.values()))
            for symbol in successors:
                successors[symbol] /= total
        Log.add('finished.')
Exemple #3
0
def generate_walk(tempnet, l=100, start_node=None):
    """
    DEPRECATED

    Logs a deprecation warning and forwards all arguments unchanged to
    random_walk.generate_walk, whose result is returned.
    """
    # Adjacent literals instead of a backslash continuation inside the string:
    # the continuation pulled the next line's indentation into the message.
    Log.add('The temporal_walk.generate_walk function is deprecated. '
            'Please use random_walk.generate_walk instead.', Severity.WARNING)
    return random_walk.generate_walk(tempnet, l, start_node)
Exemple #4
0
def paths_to_origin_destination(paths):
    """
    Aggregates the paths in a Paths object into origin/destination
    frequencies. The result can e.g. be used to create shortest path models
    that preserve the origin-destination statistics of real path data.

    Parameters
    ----------
    paths: Paths
        collection of weighted paths based on which origin destination
        statistics shall be computed

    Returns
    -------
    list of tuples (o, d, w) where o is the first node of a path, d its last
    node, and w the summed second count entry of all paths sharing (o, d)
    """
    totals = defaultdict(lambda: 0.0)

    Log.add('Calculating origin/destination statistics from paths ...')
    for same_length_paths in paths.paths.values():
        for path, counts in same_length_paths.items():
            origin = path[0]
            destination = path[-1]
            weight = counts[1]
            # only paths with a positive second count entry contribute
            if weight > 0:
                totals[origin, destination] += weight
    result = [(o, d, w) for (o, d), w in totals.items()]
    Log.add('finished.')
    return result
Exemple #5
0
    def add_edges(self, edges):
        """Adds several edges at once, skipping self-loops and duplicates.

        An edge is skipped when both endpoints are equal or when it is
        already contained in ``self.edges``. The numbers of skipped
        self-loops and redundant edges are each reported through a warning.

        Parameters
        ----------
        edges: list
            a list of edges [(s_1, t_1), (s_1, t_2), ...]

        """
        n_loops = 0
        n_duplicates = 0
        for edge in edges:
            source, target = edge[0], edge[1]
            is_loop = source == target
            is_duplicate = (source, target) in self.edges
            if is_loop:
                n_loops += 1
            if is_duplicate:
                n_duplicates += 1
            if not (is_loop or is_duplicate):
                self.add_edge(source, target)
        if n_loops > 0:
            Log.add('Warning: omitted %d self-loops' % n_loops,
                    Severity.WARNING)
        if n_duplicates > 0:
            Log.add('Warning: omitted %d redundant edges' % n_duplicates,
                    Severity.WARNING)
Exemple #6
0
def betweenness(network, normalized=False):
    """Calculates the betweenness centrality of all nodes of a network.

    For every pair (s, d) returned by ``shortest_paths`` each interior node
    of each shortest path receives 1 / (number of shortest paths from s to d).

    Parameters
    ----------
    network: Network
    normalized: bool
        if True, all values are divided by the largest centrality. When no
        node lies in the interior of any shortest path there is nothing to
        normalize and all nodes keep the value zero.

    Returns
    -------
    dict
        maps every node in ``network.nodes`` to its centrality
    """
    assert isinstance(network, Network), \
        "network must be an instance of Network"

    Log.add('Calculating betweenness centralities ...', Severity.INFO)

    all_paths = shortest_paths(network)
    node_centralities = defaultdict(lambda: 0)

    for s in all_paths:
        for d in all_paths[s]:
            paths_sd = all_paths[s][d]
            for p in paths_sd:
                # p[1:-1] are the interior nodes of the path
                for x in p[1:-1]:
                    # chained comparison: s != d and d != x
                    if s != d != x:
                        node_centralities[x] += 1.0 / len(paths_sd)

    # max() raises ValueError on an empty mapping, so only normalize when at
    # least one node received a contribution (all such values are positive)
    if normalized and node_centralities:
        max_centr = max(node_centralities.values())
        for v in node_centralities:
            node_centralities[v] /= max_centr

    # assign zero values to nodes not occurring on shortest paths
    for v in network.nodes:
        node_centralities[v] += 0

    return node_centralities
Exemple #7
0
def node_traversals(paths):
    """Counts how often each node is traversed by the given paths.

    The count of a node is the sum over the count array stored for the
    corresponding zero-length path in ``paths.paths[0]``.

    Parameters
    ----------
    paths: Paths

    Returns
    -------
    dict
        maps each node to its traversal count

    Raises
    ------
    PathpyNotImplemented
        if ``paths`` is not a Paths instance
    """
    if not isinstance(paths, Paths):
        raise PathpyNotImplemented("`paths` must be an instance of Paths")

    Log.add('Calculating node traversals...', Severity.INFO)

    # zero-length paths are single nodes; their counts tell how often a node
    # is "visited by paths"
    counts_per_node = defaultdict(lambda: 0)
    for single_node_path, counts in paths.paths[0].items():
        counts_per_node[single_node_path[0]] += counts.sum()

    Log.add('finished.', Severity.INFO)

    return counts_per_node
Exemple #8
0
def _cl(higher_order_net, normalized=False):
    """Calculates closeness centralities from the distances of a
    higher-order network.

    The closeness of node x is the sum of 1 / distance(d, x) over all other
    nodes d with a finite distance to x.

    Parameters
    ----------
    higher_order_net: HigherOrderNetwork
    normalized: bool
        if True, all values are divided by the largest centrality. If the
        largest centrality is zero (or there are no nodes) the values are
        left unchanged instead of dividing by zero.

    Returns
    -------
    dict
        maps every node in ``higher_order_net.paths.nodes`` to its closeness

    Raises
    ------
    PathpyNotImplemented
        if ``higher_order_net`` is not a HigherOrderNetwork instance
    """

    if not isinstance(higher_order_net, HigherOrderNetwork):
        raise PathpyNotImplemented(
            "`higher_order_net` must be an instance of HigherOrderNetwork")

    distances = distance_matrix(higher_order_net)
    node_centralities = defaultdict(lambda: 0)
    nodes = higher_order_net.paths.nodes

    Log.add('Calculating closeness (k = %s) ...' % higher_order_net.order,
            Severity.INFO)

    for x in nodes:
        # calculate closeness centrality of x
        for d in nodes:
            if x != d and distances[d][x] < _np.inf:
                node_centralities[x] += 1.0 / distances[d][x]

    # assign centrality zero to those nodes for which no higher-order path exists
    for v in nodes:
        node_centralities[v] += 0

    if normalized and node_centralities:
        m = max(node_centralities.values())
        # an all-zero result cannot be rescaled
        if m > 0:
            for v in nodes:
                node_centralities[v] /= m

    Log.add('finished.', Severity.INFO)

    return node_centralities
Exemple #9
0
    def sequence(self, stop_char='|'):
        """Concatenates all paths into one flat sequence.

        Every path is written out node by node, followed by the stop
        character (unless it is the empty string), and repeated as often as
        given by the integer part of the second entry of its count array.

        Parameters
        ----------
        stop_char : str
            the character used to separate paths; '' disables the separator

        Returns
        -------
        list:
            a single sequence in which all paths have been concatenated.
        """
        Log.add('Concatenating paths to sequence ...')
        concatenated = []
        for paths_of_length in self.paths.values():
            for path, counts in paths_of_length.items():
                chunk = list(path)
                if stop_char != '':
                    chunk.append(stop_char)
                # repeat the chunk once per occurrence of the path
                concatenated.extend(chunk * int(counts[1]))

        Log.add('finished')
        return concatenated
Exemple #10
0
    def filter_edges(self, edge_filter):
        """Builds a new temporal network from the time-stamped edges that pass
        a filter. This can be used, e.g., to create time slice networks by
        keeping only edges within certain time windows, or to restrict a
        temporal network to interactions between a subset of nodes.

        Parameters
        ----------
        edge_filter: callable
            function of the form filter_func(v, w, time) that returns True for
            time-stamped edges that shall be kept and False for those that
            shall be dropped.

        Returns
        -------
        TemporalNetwork
            a new instance constructed from the edges that passed the filter
        """
        Log.add('Starting filtering ...', Severity.INFO)

        kept = [(v, w, t) for (v, w, t) in self.tedges if edge_filter(v, w, t)]

        dropped = self.ecount() - len(kept)
        Log.add(
            'finished. Filtered out {} time-stamped edges.'.format(dropped),
            Severity.INFO)

        return TemporalNetwork(tedges=kept)
Exemple #11
0
def visitation_probabilities(paths):
    """Calculates, for each node, its share of all node traversals.

    The traversal counts returned by ``node_traversals`` are divided by
    their total, so the returned values sum to one. This measure can be
    interpreted as ground truth for the notion of importance captured by
    PageRank applied to a graphical abstraction of the paths.

    Parameters
    ----------
    paths: Paths

    Returns
    -------
    dict
        maps each node to its visitation probability

    Raises
    ------
    PathpyNotImplemented
        if ``paths`` is not a Paths instance
    """
    if not isinstance(paths, Paths):
        raise PathpyNotImplemented("`paths` must be an instance of Paths")
    Log.add('Calculating visitation probabilities...', Severity.INFO)

    # start from absolute traversal counts and rescale them in place
    probabilities = node_traversals(paths)

    total_visits = sum(probabilities.values(), 0.0)

    for node in probabilities:
        probabilities[node] /= total_visits

    Log.add('finished.', Severity.INFO)

    return probabilities
Exemple #12
0
def read_origin_destination(filename, separator=','):
    """Reads origin/destination statistics from a csv file
    with the following structure:

    origin1,destination1,weight
    origin2,destination2,weight
    origin3,destination3,weight

    Blank lines (e.g. a trailing empty line) are skipped.

    Parameters
    ----------
    filename: str
        path to the file containing the origin/destination statistics
    separator: str
        arbitrary separation character (default: ',')

    Returns
    -------
    list
        tuples (origin, destination, weight) with origin and destination as
        stripped strings and weight as float

    Raises
    ------
    IndexError
        if a non-blank line has fewer than three fields
    ValueError
        if the third field cannot be converted to float
    """
    origin_destination_list = []
    Log.add('Reading origin/destination statistics from file ...')

    with open(filename, 'r') as f:
        for line in f:
            # a whitespace-only line carries no record and would otherwise
            # fail on the missing destination/weight fields
            if not line.strip():
                continue
            fields = line.rstrip().split(separator)
            origin_destination_list.append(
                (fields[0].strip(), fields[1].strip(),
                 float(fields[2].strip())))
    Log.add('Finished.')

    return origin_destination_list
Exemple #13
0
def _dm(paths):
    """
    Derives distances between all pairs of nodes from the observed paths:
    the distance from a to b is the smallest length key under which a path
    starting in a and ending in b is stored. Pairs without such a path keep
    the default distance infinity; every node has distance zero to itself.
    """
    distances = defaultdict(lambda: defaultdict(lambda: _np.inf))

    Log.add('Calculating distance matrix based on empirical paths ...',
            Severity.INFO)

    # each node is at distance zero from itself
    for node in paths.nodes:
        distances[node][node] = 0

    for length, same_length_paths in paths.paths.items():
        for path in same_length_paths:
            source, target = path[0], path[-1]
            # keep the shortest length seen so far for this pair
            if length < distances[source][target]:
                distances[source][target] = length

    Log.add('finished.', Severity.INFO)

    return distances
Exemple #14
0
    def read_edges(filename,
                   separator=',',
                   weight=False,
                   undirected=False,
                   maxlines=None):
        """
        Read path in edgelist format

        Reads data from a file containing multiple lines of *edges* of the
        form "v,w,frequency,X" (where frequency is optional and X are
        arbitrary additional columns). The default separating character ','
        can be changed.

        Parameters
        ----------
        filename : str
            path to edgelist file
        separator : str
            character separating the nodes
        weight : bool
            is a weight given? if ``True`` it is read as integer from the
            third column (i.e. ``a,b,2``)
        undirected : bool
            if ``True`` every edge is additionally counted in the reverse
            direction
        maxlines : int
            number of lines to read (useful to test large files). None means the entire file is
            read
        Returns
        -------
        Paths
            a ``Paths`` object obtained from the edgelist
        """
        p = Paths()

        p.separator = separator

        with open(filename, 'r') as f:
            Log.add('Reading edge data ... ')
            for n, line in enumerate(f):
                # n counts the lines already processed; stopping *before*
                # handling line n ensures that exactly maxlines lines are read
                if maxlines is not None and n >= maxlines:
                    break

                fields = line.rstrip().split(separator)
                assert len(fields) >= 2, 'Error: malformed line: {0}'.format(
                    line)
                path = (fields[0], fields[1])

                frequency = int(fields[2]) if weight else 1

                # add the frequency to the second entry of the count array
                p.paths[1][path] += (0, frequency)
                if undirected:
                    reverse_path = (fields[1], fields[0])
                    p.paths[1][reverse_path] += (0, frequency)
        p.expand_subpaths()
        Log.add('finished.')

        return p
Exemple #15
0
        def parallel(order_k):  # pragma: no cover
            """Builds the network layer of order ``order_k`` together with its
            transition matrix and returns ``[order_k, layer, matrix]``."""
            start_msg = 'Generating ' + str(order_k) + \
                '-th order network layer ...'
            Log.add(start_msg)
            layer = HigherOrderNetwork(paths, k=order_k, null_model=False)

            # sub paths are always included so that the transition matrix is
            # based on the maximally available statistics
            layer_matrix = layer.transition_matrix(include_subpaths=True)

            Log.add('... finished')
            return [order_k, layer, layer_matrix]
Exemple #16
0
def eigenvalue_gap(network,
                   include_sub_paths=True,
                   lanczos_vectors=15,
                   maxiter=20):
    """Returns the magnitude of the second largest eigenvalue (by magnitude)
    of the network's transition matrix.

    Parameters
    ----------
    network: HigherOrderNetwork
    include_sub_paths: bool
        passed to ``network.transition_matrix``; whether or not to include
        subpath statistics in the calculation of transition probabilities.
    lanczos_vectors: int
        number of Lanczos vectors to be used in the approximate
        calculation of eigenvalues. This is passed as the ncv parameter
        of scipy's eigs function.
    maxiter: int
        passed unchanged as the maxiter parameter of scipy's eigs function.

    Returns
    -------
    float
    """
    assert isinstance(network, HigherOrderNetwork), \
        "network must be an instance of HigherOrderNetwork"
    # NOTE to myself: most of the time goes for construction of the 2nd order
    # NOTE            null graph, then for the 2nd order null transition matrix

    Log.add('Calculating eigenvalue gap ... ', Severity.INFO)

    transition = network.transition_matrix(include_sub_paths)

    # Request the two eigenvalues of largest magnitude.
    # NOTE: ncv asks for additional auxiliary vectors to be more confident
    # NOTE: that the eigenvalue of largest magnitude is actually found,
    # NOTE: see https://github.com/scipy/scipy/issues/4987
    top_two = sla.eigs(trans_mat := transition,
                       which="LM",
                       k=2,
                       ncv=lanczos_vectors,
                       return_eigenvectors=False,
                       maxiter=maxiter) if False else sla.eigs(
                           transition,
                           which="LM",
                           k=2,
                           ncv=lanczos_vectors,
                           return_eigenvectors=False,
                           maxiter=maxiter)

    # negating the magnitudes makes an ascending sort rank them descending,
    # so index 1 holds the (negated) second largest magnitude
    negated_magnitudes = -_np.absolute(top_two)
    ranked = _np.sort(negated_magnitudes)

    Log.add('finished.', Severity.INFO)

    return _np.abs(ranked[1])
Exemple #17
0
    def __add_layers_sequential(self, orders):
        """Creates, one after another and in ascending order, the
        higher-order network layer and its transition matrix for every
        order in ``orders`` and stores them in ``self.layers`` and
        ``self.transition_matrices``."""
        paths = self.paths

        for order in sorted(orders):
            Log.add('Generating %d-th order layer ...' % order)
            self.layers[order] = HigherOrderNetwork(
                paths, order, null_model=False)

            # sub paths are always included so that the transition matrices
            # are based on the maximally available statistics
            layer = self.layers[order]
            self.transition_matrices[order] = layer.transition_matrix(
                include_subpaths=True)

        Log.add('finished.')
Exemple #18
0
def paths_from_temporal_network(tempnet,
                                delta=1,
                                max_length=sys.maxsize,
                                max_subpath_length=sys.maxsize):
    """
    Warning: This function is deprecated. Calls are rerouted to
    paths_from_temporal_network_dag.

    If you intended to calculate paths with a single continuing edge, use
    paths_from_temporal_network_single instead (see documentation of this
    function for details).

    Only ``tempnet``, ``delta`` and ``max_subpath_length`` are forwarded;
    ``max_length`` is accepted but not passed on.
    """
    deprecation_msg = (
        'This function is deprecated. '
        'Rerouting call to paths_from_temporal_network_dag. '
        'If you intended to calculate paths with a single continuing edge, '
        'use paths_from_temporal_network_single instead.')
    Log.add(deprecation_msg, Severity.WARNING)
    return paths_from_temporal_network_dag(
        tempnet, delta, max_subpath_length=max_subpath_length)
Exemple #19
0
    def expand_subpaths(self):
        """
        This function implements the sub path expansion, i.e.
        for a four-gram a,b,c,d, the paths a->b, b->c, c->d of
        length one and the paths a->b->c and b->c->d of length
        two will be counted (as will the single nodes a, b, c, d
        as paths of length zero).

        For every stored path, its frequency (second entry of its
        count array) is added to the first entry of the count array
        of each of its strictly shorter subpaths.

        This process will consider restrictions to the maximum
        sub path length defined in self.max_subpath_length

        Returns
        -------
        None
            self.paths is updated in place.
        """

        # nothing to do if no paths are stored
        if not self.paths:
            return

        Log.add('Calculating sub path statistics ... ')

        # Expanding all subpaths of paths with a maximum length of maxL
        # necessarily generates paths of *any* length below maxL.
        # Touching all these length keys up front means that no new keys are
        # added to self.paths while it is being iterated below.

        # NOTE(review): this relies on self.paths creating a default entry on
        # read access (presumably a defaultdict) - confirm against the class
        # constructor.
        for p_length in range(max(self.paths)):
            self.paths[p_length] = self.paths[p_length]

        # expand subpaths in paths of any length ...
        for path_length in self.paths:
            for path, value in self.paths[path_length].items():

                # The frequency is given by the number of occurrences as longest
                # path, which is stored in the second entry of the count array
                frequency = value[1]

                # subpath lengths considered are 0 .. max_length - 1, i.e. at
                # most self.max_subpath_length and always shorter than the
                # path itself
                max_length = min(self.max_subpath_length + 1, path_length)

                # Generate all subpaths of length k for k = 0 to k = max_length-1 (inclusive)
                for k in range(max_length):
                    # Generate subpaths of length k for all start indices s
                    # for s = 0 to s = path_length-k (inclusive)
                    for s in range(path_length - k + 1):
                        # a subpath of length k spans k + 1 nodes;
                        # add frequency to the *first* entry of its count array.
                        # Since k < path_length this never modifies the bucket
                        # that is currently being iterated.
                        path_slice = path[s:s + k + 1]
                        self.paths[k][path_slice][0] += frequency
Exemple #20
0
def closeness(network, normalized=False):
    """Calculates the closeness of all nodes.

    The closeness of a node x is (n - 1) divided by the sum of all finite
    distances from other nodes to x, where n is the number of nodes. Nodes
    that cannot be reached from any other node get closeness zero.

    Parameters
    ----------
    network: Network
    normalized: bool
        if True, all values are divided by the largest closeness. If the
        largest closeness is zero (or the network has no nodes) the values
        are left unchanged instead of dividing by zero.

    Returns
    -------
    dict
        maps every node in ``network.nodes`` to its closeness

    Raises
    ------
    PathpyNotImplemented
        if ``network`` is not a Network instance
    """
    if not isinstance(network, Network):
        raise PathpyNotImplemented("`network` must be an instance of Network")

    distances = distance_matrix(network)
    node_centralities = defaultdict(lambda: 0)

    # row/column index of the distance matrix -> node;
    # assumes the matrix follows the iteration order of network.nodes
    mapping = {idx: v for idx, v in enumerate(network.nodes)}

    Log.add('Calculating closeness in network ...', Severity.INFO)
    n = network.ncount()
    # accumulate, per target node x, the finite distances from all other nodes
    for d in range(n):
        for x in range(n):
            if d != x and distances[d, x] < _np.inf:
                node_centralities[mapping[x]] += distances[d, x]

    # turn distance sums into closeness values; nodes without any finite
    # incoming distance keep the value zero
    for v in network.nodes:
        node_centralities[v] += 0.0
        if node_centralities[v] > 0.0:
            node_centralities[v] = (n - 1.0) / node_centralities[v]

    if normalized and node_centralities:
        max_centr = max(node_centralities.values())
        # an all-zero result cannot be rescaled
        if max_centr > 0:
            for v in network.nodes:
                node_centralities[v] /= max_centr

    Log.add('finished.', Severity.INFO)

    return node_centralities
Exemple #21
0
 def make_acyclic(self):
     """Turns the graph into an acyclic one by removing every edge classified
     as 'back', then runs the topological sorting again. A graph that is
     already acyclic is left untouched.
     """
     # the acyclicity flag is unknown until a topological sort has been run
     if self.is_acyclic is None:
         self.topsort()
     if self.is_acyclic:
         return

     n_removed = 0
     # iterate over a snapshot of the keys since edges are removed on the way
     for edge in list(self.edge_classes):
         if self.edge_classes[edge] != 'back':
             continue
         self.remove_edge(*edge)
         n_removed += 1
     self.topsort()
     assert self.is_acyclic, "Error: make_acyclic did not generate acyclic graph!"
     Log.add(
         'Removed {} back links to make graph acyclic'.format(n_removed),
         Severity.INFO)
Exemple #22
0
    def from_sqlite(cls, cursor, directed=True):
        r"""Creates a new instance from links obtained from an SQLite cursor.

        The cursor must yield rows with at least the two columns

                source target

        where each row describes one link. Columns are accessed by name, so a
        row factory must be set for the SQLite connection before the cursor is
        created, i.e. you should set

                connection.row_factory = sqlite3.Row

        NOTE(review): only the ``source`` and ``target`` columns are read
        here; further columns are not passed on to ``add_edge``.

        Parameters
        ----------
        cursor :
            The SQLite cursor to fetch rows from.
        directed : bool
            Whether or not links should be interpreted as directed. Default is
            True. Not passed to the constructor when ``cls`` is DAG.

        Returns
        -------
        Network
            An instance of ``cls`` containing one edge per row; source and
            target are converted to strings.

        """
        from pathpy.classes import DAG
        # the DAG constructor is called without the ``directed`` argument
        network = cls() if cls == DAG else cls(directed=directed)

        assert cursor.connection.row_factory, \
            'Cannot access columns by name. Please set ' \
            'connection.row_factory = sqlite3.Row before creating DB cursor.'

        Log.add('Retrieving links from database ...')

        for record in cursor:
            source = str(record['source'])
            target = str(record['target'])
            network.add_edge(source, target)

        return network
Exemple #23
0
def algebraic_connectivity(network, lanczos_vectors=15, maxiter=20):
    """Returns the second smallest eigenvalue magnitude of the network's
    Laplacian matrix.

    Parameters
    ----------
    network: HigherOrderNetwork
    lanczos_vectors: int
        number of Lanczos vectors to be used in the approximate calculation of
        eigenvalues. This is passed as the ncv parameter of scipy's eigs
        function.
    maxiter: int
        passed unchanged as the maxiter parameter of scipy's eigs function.

    Returns
    -------
    float
    """
    assert isinstance(network, HigherOrderNetwork), \
        "network must be an instance of HigherOrderNetwork"
    Log.add('Calculating algebraic connectivity ... ', Severity.INFO)

    laplacian = network.laplacian_matrix()
    # Request the two eigenvalues of smallest magnitude.
    # NOTE: ncv asks for additional auxiliary vectors to make the search for
    # NOTE: the requested eigenvalues more reliable,
    # NOTE: see https://github.com/scipy/scipy/issues/4987
    smallest_two = sla.eigs(laplacian,
                            which="SM",
                            k=2,
                            ncv=lanczos_vectors,
                            return_eigenvectors=False,
                            maxiter=maxiter)
    magnitudes = _np.absolute(smallest_two)
    ascending = _np.sort(magnitudes)

    Log.add('finished.', Severity.INFO)

    # TODO: result is unstable, it looks like it depends on a "warm start"
    # (i.e. run after other eigenvalue calculations) see test_algebraic_connectivity
    # problems with order k=3

    return _np.abs(ascending[1])
Exemple #24
0
def random_walk(network, l, n=1, start_node=None):
    """
    [DEPRECATED]
    Logs a deprecation warning and forwards all arguments unchanged to
    paths_from_random_walk, whose result is returned.

    Parameters
    ----------
    network: Network, TemporalNetwork, HigherOrderNetwork
        The network structure on which the random walks will be simulated.
    l: int
        The (maximum) length of each random walk path.
    n: int
        The number of random walk paths to generate.
    start_node:
        forwarded to paths_from_random_walk.
    """
    deprecation_msg = (
        'The path_extraction.random_walk function is deprecated. '
        'Please use paths_from_random_walk instead.')
    Log.add(deprecation_msg, Severity.WARNING)
    return paths_from_random_walk(network, l, n, start_node)
Exemple #25
0
    def write_file(self, filename, separator=','):
        """Writes the time-stamped edges of this temporal network to a CSV file.

        The file starts with the header ``source``, ``target``, ``time``
        followed by one line per time-stamped edge, in the order given by
        ``self.ordered_times``.

        Parameters
        ----------
        filename: str
            name of CSV file to save data to
        separator: str
            character used to separate columns in generated CSV file

        Returns
        -------
        None
        """
        Log.add(
            'Writing {0} time-stamped edges to file {1}'.format(
                self.ecount(), filename),
            Severity.INFO)
        with open(filename, 'w+') as out:
            out.write(separator.join(('source', 'target', 'time')) + '\n')
            for timestamp in self.ordered_times:
                for (source, target, t) in self.time[timestamp]:
                    row = (str(source), str(target), str(t))
                    out.write(separator.join(row) + '\n')
Exemple #26
0
    def __init__(self, tedges=None):
        """Creates a temporal network, optionally from time-stamped edges.

        Parameters
        ----------
        tedges:
            an optional list of directed time-stamped edges (v, w, t) from
            which to build the index structures. For the default value None
            an empty temporal network is created.
        """
        # list of time-stamped edges
        self.tedges = []

        # list of nodes, in order of first appearance in the edge list
        self.nodes = []

        # time stamp -> list of edges at that time
        self.time = defaultdict(list)

        # time stamp -> target node -> list of edges
        self.targets = defaultdict(dict)

        # time stamp -> source node -> list of edges
        self.sources = defaultdict(dict)

        # node v -> sorted list of time stamps at which links (v,*;t) originate
        self.activities = defaultdict(list)

        # node v -> set of time stamps at which links (v,*;t) originate;
        # a set is used because insertion is much cheaper than repeatedly
        # checking a list for membership
        self.activities_sets = defaultdict(set)

        # sorted list of all time stamps
        self.ordered_times = []

        if tedges is None:
            return

        Log.add('Building index data structures ...')

        # dict keys keep insertion order, which yields the node order
        first_seen = {}
        for edge in tedges:
            source, target, timestamp = edge[0], edge[1], edge[2]
            self.activities_sets[source].add(timestamp)
            self.time[timestamp].append(edge)
            self.targets[timestamp].setdefault(target, []).append(edge)
            self.sources[timestamp].setdefault(source, []).append(edge)
            first_seen[source] = True
            first_seen[target] = True
        self.tedges = tedges
        self.nodes = list(first_seen)

        Log.add('Sorting time stamps ...')

        self.ordered_times = sorted(self.time)
        for node in self.nodes:
            self.activities[node] = sorted(self.activities_sets[node])
        Log.add('finished.')
Exemple #27
0
def _bw(paths, normalized=False):
    """Calculates the betweenness of nodes based on observed shortest paths
    between all pairs of nodes

    For every pair (s, d) returned by ``shortest_paths`` each interior node
    of each shortest path receives 1 / (number of shortest paths from s to d).

    Parameters
    ----------
    paths:
        Paths object
    normalized: bool
        normalize such that largest value is 1.0. When no node lies in the
        interior of any shortest path there is nothing to normalize and all
        nodes keep the value zero.

    Returns
    -------
    dict
        maps every node in ``paths.nodes`` to its betweenness
    """
    assert isinstance(paths,
                      Paths), "argument must be an instance of pathpy.Paths"
    node_centralities = defaultdict(lambda: 0)

    Log.add('Calculating betweenness in paths ...', Severity.INFO)

    all_paths = shortest_paths(paths)

    for s in all_paths:
        for d in all_paths[s]:
            paths_sd = all_paths[s][d]
            for p in paths_sd:
                # p[1:-1] are the interior nodes of the path
                for x in p[1:-1]:
                    # chained comparison: s != d and d != x
                    if s != d != x:
                        node_centralities[x] += 1.0 / len(paths_sd)

    # max() raises ValueError on an empty mapping, so only normalize when at
    # least one node received a contribution (all such values are positive)
    if normalized and node_centralities:
        max_centr = max(node_centralities.values())
        for v in node_centralities:
            node_centralities[v] /= max_centr

    # assign zero values to nodes not occurring on shortest paths
    nodes = paths.nodes
    for v in nodes:
        node_centralities[v] += 0
    Log.add('finished.')
    return node_centralities
Exemple #28
0
    def read_file(cls, filename, separator=',', weighted=False, directed=False, header=False):
        r"""Reads a network from an edge list file.

        Reads data from a file containing multiple lines of *edges* of the
        form "v,w,frequency,X" (where frequency is optional and X are
        arbitrary additional columns). The default separating character ','
        can be changed. Lines with fewer than two fields are reported with a
        warning and skipped.

        Parameters
        ----------
        filename : str
            path to edgelist file
        separator : str
            character separating the nodes
        weighted : bool
            is a weight given? if ``True`` it is read as integer from the
            third column (i.e. ``a,b,2``)
        directed : bool
            passed as first argument to the constructor of ``cls``
        header : bool
            if true skip the first row, useful if header row in file

        Returns
        -------
        Network
            a ``Network`` object obtained from the edgelist
        """
        net = cls(directed)

        with open(filename, 'r') as f:
            Log.add('Reading edge list ... ')
            header_offset = 0
            if header:
                f.readline()
                header_offset = 1

            for n, line in enumerate(f):
                fields = [field.strip()
                          for field in line.rstrip().split(separator)]
                if len(fields) < 2:
                    # the offset belongs to the line *number*; adding it to
                    # the line text raised a TypeError instead of warning
                    Log.add(
                        'Ignoring malformed line {0}: {1}'.format(
                            n + header_offset, line.strip()),
                        Severity.WARNING)
                elif weighted:
                    net.add_edge(fields[0], fields[1], weight=int(fields[2]))
                else:
                    net.add_edge(fields[0], fields[1])

        Log.add('finished.')

        return net
Exemple #29
0
    def read_file(cls,
                  filename,
                  separator=',',
                  maxlines=None,
                  mapping=None,
                  header=False):
        """
        Reads a directed acyclic graph from a file
        containing an edge list of the form

        source,target

        where ',' can be an arbitrary separator character

        Parameters
        ----------
        filename : str
            path to the edge list file
        separator : str
            character separating source and target
        maxlines : int
            maximum number of edge lines to read; None (or 0) reads the
            entire file
        mapping :
            optional container of node names; if given, only edges whose
            source and target are both contained in it are kept
        header : bool
            if true, the first line of the file is skipped

        Returns
        -------
        an instance of ``cls`` constructed from the collected edges
        """
        with open(filename, 'r') as f:
            edges = []

            if mapping is not None:
                Log.add('Filtering mapped edges')

            Log.add('Reading edge list ...')

            if header:  # Read header
                f.readline()
            for i, line in enumerate(f):
                # i counts the lines already processed, so stopping at
                # i == maxlines reads exactly maxlines lines
                if maxlines and i >= maxlines:
                    break
                fields = line.rstrip().split(separator)
                try:
                    if mapping is None or (fields[0] in mapping
                                           and fields[1] in mapping):
                        edges.append((fields[0], fields[1]))

                except (IndexError, ValueError):  # pragma: no cover
                    msg = 'Ignoring malformed data in ' \
                          'line {}: "{}"'.format((i+header), line.strip())
                    Log.add(msg, Severity.WARNING)

        return cls(edges=edges)
Exemple #30
0
def pagerank(network,
             alpha=0.85,
             max_iter=100,
             tol=1.0e-6,
             projection='scaled',
             include_sub_paths=True,
             weighted=False):
    """Calculates the PageRank of higher-order nodes based on a power iteration.

    If the order of the higher-order network is larger than one, the PageRank calculated
    based on the higher-order topology will automatically be projected back to first-order
    nodes.

    If the power iteration does not reach the requested tolerance within ``max_iter``
    iterations, a warning is logged and the last iterate is used as the result.

    Parameters
    ----------
    network: HigherOrderNetwork
    alpha: float
        damping factor
    max_iter: int
        maximum number of iterations in solver
    tol: float
        accepted tolerance for convergence check; the iteration stops once the summed
        absolute change of the PageRank vector is below ``len(network.nodes) * tol``
    projection: str
        Indicates how the projection from higher-order nodes to the first-order nodes
        on the corresponding path shall be performed. One of 'scaled' (default), 'all',
        'last' or 'first'. For 'scaled', every first-order node on the path receives an
        equal share of the PageRank of the higher-order node, i.e. each of the three
        nodes in the third-order node (a,b,c) will receive one third of the PageRank of
        (a,b,c). For 'all', the same per-node shares are accumulated and the result is
        then normalised to sum to one. For 'last' ('first'), the PageRank of the
        higher-order node is assigned to the last (first) first-order node on the path
        only, and the result is then normalised to sum to one.
    include_sub_paths: bool
        whether or not to use subpath statistics in the PageRank calculation
    weighted: bool
        use path weights in the calculation

    Returns
    -------
    dict
        Maps nodes to PageRank values. For a first-order network the keys are the
        entries of ``network.nodes``; otherwise the keys are first-order nodes.

    Raises
    ------
    AssertionError
        If ``network`` is not a HigherOrderNetwork, if ``projection`` is not one of the
        supported methods, or if the network has no nodes.
    """
    # The NumPy aliases in the top-level scipy namespace (sp.array, sp.where,
    # sp.absolute) are deprecated, so NumPy is used directly. The import is local so
    # that this function does not rely on the module's import list.
    import numpy as np

    assert isinstance(network, HigherOrderNetwork), \
        "network must be an instance of HigherOrderNetwork"
    assert projection in ['all', 'last', 'first',
                          'scaled'], 'Invalid projection method'

    Log.add(
        'Calculating PageRank in ' + str(network.order) +
        '-th order network...', Severity.INFO)

    n_nodes = float(len(network.nodes))

    assert n_nodes > 0, "Number of nodes is zero"

    # entries A[s,t] give directed link s -> t
    adj_mat = network.adjacency_matrix(include_subpaths=include_sub_paths,
                                       weighted=weighted,
                                       transposed=False)

    # sum of outgoing node degrees; forced to float so that the in-place reciprocal
    # below cannot be truncated if the matrix happens to have an integer dtype
    row_sums = np.array(adj_mat.sum(axis=1), dtype=float).flatten()

    # replace non-zero entries x by 1/x
    has_out_links = row_sums != 0
    row_sums[has_out_links] = 1.0 / row_sums[has_out_links]

    # indices of zero entries in row_sums, i.e. nodes without outgoing links
    d = np.where(row_sums == 0)[0]

    # create sparse matrix with the inverted row sums as diagonal elements
    q_mat = sparse.spdiags(row_sums.T,
                           0,
                           adj_mat.shape[0],
                           adj_mat.shape[1],
                           format='csr')

    # with this, every non-zero row of A is divided by its row sum
    q_mat = q_mat * adj_mat

    # vector with n entries 1/n
    uniform = np.array([1.0 / n_nodes] * int(n_nodes))

    p_rank = uniform
    converged = False

    # Power iteration
    for _ in range(max_iter):
        last = p_rank

        # p_rank[d].sum() is the total PageRank of nodes with zero out-degree,
        # which is redistributed uniformly over all nodes
        p_rank = (alpha * (p_rank * q_mat + p_rank[d].sum() * uniform) +
                  (1 - alpha) * uniform)

        if np.absolute(p_rank - last).sum() < n_nodes * tol:
            converged = True
            break

    if not converged:
        # Fall back to the last iterate instead of silently returning an empty
        # result (which would also break the normalisation further below).
        Log.add(
            'PageRank power iteration did not converge within ' +
            str(max_iter) + ' iterations, using last iterate.',
            Severity.WARNING)

    higher_order_pr = dict(zip(network.nodes, map(float, p_rank)))

    if network.order == 1:
        return higher_order_pr

    # project PageRank of higher-order nodes to first-order network
    first_order_pr = defaultdict(lambda: 0.0)

    # sum PageRank values based on higher-order nodes
    for v in network.nodes:
        # turns node a-b-c in path tuple (a,b,c)
        path = network.higher_order_node_to_path(v)
        if projection in ('all', 'scaled'):
            # each node on e.g. a 4-th-order path a-b-c-d receives one fourth of the
            # PageRank value of the higher-order node
            share = higher_order_pr[v] / float(len(path))
            for x in path:
                first_order_pr[x] += share
        elif projection == 'last':
            # assign PR of higher-order node to last first-order node
            first_order_pr[path[-1]] += higher_order_pr[v]
        elif projection == 'first':
            # assign PR of higher-order node to first first-order node
            first_order_pr[path[0]] += higher_order_pr[v]

    # for projection method 'scaled' no normalisation is applied
    if projection != 'scaled':
        # The total must be computed once, before any entry is rescaled; otherwise
        # the divisor changes while the values are being divided.
        total = sum(first_order_pr.values())
        if total > 0:
            for v in first_order_pr:
                first_order_pr[v] /= total

    # assign centrality zero to nodes not occurring in higher-order PR
    for v in network.paths.nodes:
        first_order_pr[v] += 0

    Log.add('finished.', Severity.INFO)

    return first_order_pr