Ejemplo n.º 1
0
def _dwpc_general_case(graph, metapath, damping=0, dtype=numpy.float64):
    """
    A slow but general function to compute the degree-weighted
    path count. Works by splitting the metapath at junctions
    where one node is joined to multiple nodes over a metaedge.

    For every start node a one-hot search vector is pushed through the
    metapath one metaedge at a time with _node_to_children. Each call
    returns one child vector per reachable node, and every child is
    expanded independently at the next metaedge. The vectors that remain
    after the last metaedge are summed to form that start node's row.

    Parameters
    ----------
    graph : hetio.hetnet.Graph
    metapath : hetio.hetnet.MetaPath
    damping : float
    dtype : dtype object

    Returns
    -------
    tuple
        (start_nodes, fin_nodes, dwpc_matrix), where start_nodes are the
        row identifiers of the first metaedge's adjacency matrix,
        fin_nodes are the column identifiers of the last metaedge's
        adjacency matrix, and dwpc_matrix is a numpy.ndarray of `dtype`.
    """
    dwpc_step = functools.partial(_node_to_children,
                                  graph=graph,
                                  metapath=metapath,
                                  damping=damping,
                                  dtype=dtype)

    start_nodes, cols, adj = metaedge_to_adjacency_matrix(graph, metapath[0])
    rows, fin_nodes, adj = metaedge_to_adjacency_matrix(graph, metapath[-1])
    number_start = len(start_nodes)
    number_end = len(fin_nodes)

    if len(metapath) > 1:
        dwpc_matrix = []
        for i in range(number_start):
            search = numpy.zeros(number_start, dtype=dtype)
            search[i] = 1
            frontier = [
                dwpc_step(node=search, metapath_index=0, history=None)
            ]
            # The first metaedge was consumed above; take one step for
            # each of the remaining len(metapath) - 1 metaedges.
            for _ in range(1, len(metapath)):
                next_frontier = []
                for group in frontier:
                    for child in group['children']:
                        # Every branch gets an independent copy of the
                        # history so that sibling branches cannot affect
                        # each other's visited-node bookkeeping.
                        hist = copy.deepcopy(group['history'])
                        out = dwpc_step(node=child,
                                        metapath_index=group['next_index'],
                                        history=hist)
                        # Dead ends (no children) are dropped here.
                        if out['children']:
                            next_frontier.append(out)
                frontier = next_frontier

            end_vectors = [
                child for group in frontier
                for child in group['children']
            ]
            if end_vectors:
                end_nodes = sum(end_vectors)
            else:
                # No path survived from this start node: emit a zero row.
                end_nodes = numpy.zeros(number_end, dtype=dtype)
            dwpc_matrix.append(end_nodes)
    else:
        # Single-metaedge metapath: metapath[0] is metapath[-1], so `adj`
        # is that one metaedge's adjacency matrix.
        dwpc_matrix = _degree_weight(adj, damping=damping, dtype=dtype)
    dwpc_matrix = numpy.array(dwpc_matrix, dtype=dtype)
    return start_nodes, fin_nodes, dwpc_matrix
Ejemplo n.º 2
0
def metapath_to_degree_dicts(graph, metapath):
    """
    Build degree lookups for the source and target ends of a metapath.

    The metapath is first resolved through graph.metagraph.get_metapath.
    Source degrees are the axis-1 sums of the first metaedge's adjacency
    matrix; target degrees are the axis-0 sums of the last metaedge's
    adjacency matrix. Each flattened degree sequence is then handed to
    degrees_to_degree_to_ind.

    Returns
    -------
    tuple
        (source_degree_to_ind, target_degree_to_ind)
    """
    resolved = graph.metagraph.get_metapath(metapath)
    _, _, first_adj = metaedge_to_adjacency_matrix(
        graph, resolved[0], dense_threshold=0.7)
    _, _, last_adj = metaedge_to_adjacency_matrix(
        graph, resolved[-1], dense_threshold=0.7)
    row_sums = first_adj.sum(axis=1).flat
    col_sums = last_adj.sum(axis=0).flat
    by_source_degree = degrees_to_degree_to_ind(row_sums)
    by_target_degree = degrees_to_degree_to_ind(col_sums)
    return by_source_degree, by_target_degree
Ejemplo n.º 3
0
def _node_to_children(graph,
                      metapath,
                      node,
                      metapath_index,
                      damping=0,
                      history=None,
                      dtype=numpy.float64):
    """
    Returns a history adjusted list of child nodes. Used in _dwpc_general_case.

    The search vector `node` is multiplied by the degree-weighted adjacency
    matrix of the metaedge at `metapath_index`. For metanodes that occur
    more than once in the metapath, a per-metanode mask (the "history") is
    kept: entries of the current source vector are subtracted from the
    source metanode's mask, and the resulting target vector is multiplied
    by the target metanode's mask.

    The history passed in is not modified; an updated copy is returned.

    Parameters
    ----------
    graph : hetio.hetnet.Graph
    metapath : hetio.hetnet.MetaPath
    node : numpy.ndarray
        Search vector over the source nodes of the selected metaedge.
    metapath_index : int
        Index into `metapath` of the metaedge to traverse.
    damping : float
    history : dict or None
        Maps each repeated metanode to a numpy.ndarray mask. When None, a
        mask of ones is created for every repeated target metanode.
    dtype : dtype object

    Returns
    -------
    dict
        'children' : list of numpy.ndarray, one vector per nonzero entry
        of the masked result; 'history' : the updated history dict;
        'next_index' : metapath_index + 1.
    """
    metaedge = metapath[metapath_index]
    freq = collections.Counter(metapath.get_nodes())
    repeated = {metanode for metanode, count in freq.items() if count > 1}

    if history is None:
        history = {
            i.target:
            numpy.ones(len(metaedge_to_adjacency_matrix(graph, i)[1]),
                       dtype=dtype)
            for i in metapath if i.target in repeated
        }
    history = history.copy()
    if metaedge.source in history:
        # dict.copy() is shallow, so an in-place `-=` would also change the
        # array held by the caller's dict. Rebind to a new array instead.
        visited = numpy.array(node != 0, dtype=dtype)
        history[metaedge.source] = history[metaedge.source] - visited

    rows, cols, adj = metaedge_to_adjacency_matrix(graph,
                                                   metaedge,
                                                   dtype=dtype)
    adj = _degree_weight(adj, damping, dtype=dtype)
    vector = node @ adj

    if metaedge.target in history:
        vector *= history[metaedge.target]

    # numpy.diag spreads the vector over a diagonal matrix; each row with a
    # nonzero entry becomes one child vector.
    children = [i for i in numpy.diag(vector) if i.any()]
    return {
        'children': children,
        'history': history,
        'next_index': metapath_index + 1
    }
Ejemplo n.º 4
0
def dwwc_sequential(graph,
                    metapath,
                    damping=0.5,
                    dense_threshold=0,
                    dtype=numpy.float64):
    """
    Compute the degree-weighted walk count (DWWC), where a walk may visit
    the same node more than once.

    The degree-weighted adjacency matrices of the metaedges are multiplied
    together from left to right, one metaedge at a time. After every
    multiplication the running product is passed through
    sparsify_or_densify.

    Parameters
    ----------
    graph : hetio.hetnet.Graph
    metapath : hetio.hetnet.MetaPath
    damping : float
    dense_threshold : float (0 <= dense_threshold <= 1)
        sets the density threshold at which a sparse matrix will be
        converted to a dense automatically.
    dtype : dtype object

    Returns
    -------
    tuple
        (row names of the first metaedge, column names of the last
        metaedge, DWWC matrix)
    """
    source_ids = None
    product = None
    for metaedge in metapath:
        edge_rows, target_ids, weighted = metaedge_to_adjacency_matrix(
            graph, metaedge, dense_threshold=dense_threshold, dtype=dtype)
        weighted = _degree_weight(weighted, damping, dtype=dtype)
        if product is not None:
            product = product @ weighted
            product = sparsify_or_densify(product, dense_threshold)
            continue
        # First metaedge: it seeds the product and fixes the row labels.
        source_ids = edge_rows
        product = weighted
    return source_ids, target_ids, product
Ejemplo n.º 5
0
def dwpc_to_degrees(graph, metapath, damping=0.5, ignore_zeros=False):
    """
    Generate one ordered record per cell of a stored DWPC matrix.

    Each record carries the source/target identifiers and names, the source
    and target degrees, the path count (the matrix read with damping 0.0)
    and the DWPC value. The DWPC matrix is divided by its mean and passed
    through arcsinh before being reported. With ignore_zeros set, cells
    whose transformed DWPC equals 0 are skipped.

    Yields
    ------
    collections.OrderedDict
        Keys, in order: source_id, target_id, source_name, target_name,
        source_degree, target_degree, path_count, dwpc.
    """
    metapath = graph.metagraph.get_metapath(metapath)

    # Degrees come from the first (row sums) and last (column sums)
    # metaedge adjacency matrices.
    _, _, first_adj = metaedge_to_adjacency_matrix(
        graph, metapath[0], dense_threshold=0.7)
    _, _, last_adj = metaedge_to_adjacency_matrix(
        graph, metapath[-1], dense_threshold=0.7)
    source_degrees = first_adj.sum(axis=1).flat
    target_degrees = last_adj.sum(axis=0).flat
    del first_adj, last_adj

    def _read_names(metanode):
        # Node names are read from the 'name' column of the nodes TSV.
        nodes_path = graph.get_nodes_path(metanode, file_format='tsv')
        return list(pandas.read_table(nodes_path)['name'])

    source_node_names = _read_names(metapath.source())
    target_node_names = _read_names(metapath.target())

    row_names, col_names, dwpc_matrix = graph.read_path_counts(
        metapath, 'dwpc', damping)
    dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())
    if scipy.sparse.issparse(dwpc_matrix):
        dwpc_matrix = dwpc_matrix.toarray()

    _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0)
    if scipy.sparse.issparse(path_count):
        path_count = path_count.toarray()

    n_rows, n_cols = len(row_names), len(col_names)
    for row_ind in range(n_rows):
        for col_ind in range(n_cols):
            dwpc_value = dwpc_matrix[row_ind, col_ind]
            if ignore_zeros and dwpc_value == 0:
                continue
            yield collections.OrderedDict([
                ('source_id', row_names[row_ind]),
                ('target_id', col_names[col_ind]),
                ('source_name', source_node_names[row_ind]),
                ('target_name', target_node_names[col_ind]),
                ('source_degree', source_degrees[row_ind]),
                ('target_degree', target_degrees[col_ind]),
                ('path_count', path_count[row_ind, col_ind]),
                ('dwpc', dwpc_value),
            ])
Ejemplo n.º 6
0
def _multi_dot(metapath, order, i, j, graph, damping, dense_threshold, dtype):
    """
    Recursively multiply the degree-weighted adjacency matrices of
    metapath[i] through metapath[j], parenthesised as dictated by `order`.

    order[i, j] is the index at which the chain i..j is split: the left
    factor covers i..order[i, j] and the right factor covers
    order[i, j] + 1..j. A single metaedge (i == j) is the base case.

    Modified from numpy.linalg.linalg._multi_dot (https://git.io/vh31f)
    which is released under a 3-Clause BSD License (https://git.io/vhCDC).
    """
    if i != j:
        split = order[i, j]
        left = _multi_dot(metapath, order, i, split,
                          graph, damping, dense_threshold, dtype)
        right = _multi_dot(metapath, order, split + 1, j,
                           graph, damping, dense_threshold, dtype)
        return left @ right

    # Base case: a single metaedge's degree-weighted adjacency matrix.
    _, _, adjacency = metaedge_to_adjacency_matrix(
        graph, metapath[i], dense_threshold=dense_threshold, dtype=dtype)
    return _degree_weight(adjacency, damping=damping, dtype=dtype)
Ejemplo n.º 7
0
def metaedge_to_data_array(graph, metaedge, dtype=numpy.bool_):
    """
    Wrap a metaedge's adjacency matrix in a labelled xarray.DataArray.

    The first dimension is named after the source metanode's identifier
    and labelled with the source node ids; the second dimension is named
    after the target metanode's identifier and labelled with the target
    node ids. The array's name is metaedge.get_unicode_str().
    """
    row_ids, col_ids, matrix = metaedge_to_adjacency_matrix(
        graph, metaedge, dtype=dtype)
    return xarray.DataArray(
        matrix,
        coords=(row_ids, col_ids),
        dims=(metaedge.source.identifier, metaedge.target.identifier),
        name=metaedge.get_unicode_str(),
    )
Ejemplo n.º 8
0
def _dwpc_short_repeat(graph,
                       metapath,
                       damping=0.5,
                       dense_threshold=0,
                       dtype=numpy.float64):
    """
    DWPC for a metapath in which one metanode is repeated 3 or fewer times
    (A-A-A), not (A-A-A-A). Other, non-repeated metanodes may be
    interspersed. Examples: (A-B-A-A), (A-B-A-C-D-E-F-A).

    The metapath is split with get_segments into at most three segments.
    The segment whose source and target metanode are equal is the "repeat"
    segment; a different segment in first position is the head, any other
    is the tail. The repeat segment is multiplied up to the second
    occurrence of the repeated metanode and its diagonal is removed. If
    there is a third occurrence, the remainder is handled the same way and
    the diagonal of the combined product is removed again. Head and tail
    DWPC matrices (from dwpc) are then multiplied on the left and right.

    Returns
    -------
    tuple
        (row_names, col_names, dwpc_matrix)
    """
    segments = get_segments(graph.metagraph, metapath)
    assert len(segments) <= 3

    # Classify each segment as repeat, head or tail.
    head_segment = None
    tail_segment = None
    for position, segment in enumerate(segments):
        if segment.source() == segment.target():
            repeat_segment = segment
        elif position == 0:
            head_segment = segment
        else:
            tail_segment = segment

    # Positions of the repeated metanode within the repeat segment.
    repeated_metanode = repeat_segment.source()
    index_of_repeats = [
        idx for idx, metanode in enumerate(repeat_segment.get_nodes())
        if metanode == repeated_metanode
    ]

    # Multiply from the first to the second occurrence of the repeat.
    dwpc_matrix = None
    for metaedge in repeat_segment[:index_of_repeats[1]]:
        edge_rows, cols, weighted = metaedge_to_adjacency_matrix(
            graph, metaedge, dtype=dtype, dense_threshold=dense_threshold)
        weighted = _degree_weight(weighted, damping, dtype=dtype)
        if dwpc_matrix is None:
            row_names = edge_rows
            dwpc_matrix = weighted
        else:
            dwpc_matrix = dwpc_matrix @ weighted
    dwpc_matrix = remove_diag(dwpc_matrix, dtype=dtype)

    # With a third occurrence, process the rest of the repeat segment the
    # same way and strip the diagonal of the combined product as well.
    if len(index_of_repeats) == 3:
        dwpc_tail = None
        for metaedge in repeat_segment[index_of_repeats[1]:]:
            _, cols, weighted = metaedge_to_adjacency_matrix(
                graph, metaedge, dtype=dtype, dense_threshold=dense_threshold)
            weighted = _degree_weight(weighted, damping, dtype=dtype)
            if dwpc_tail is None:
                dwpc_tail = weighted
            else:
                dwpc_tail = dwpc_tail @ weighted
        dwpc_tail = remove_diag(dwpc_tail, dtype=dtype)
        dwpc_matrix = remove_diag(dwpc_matrix @ dwpc_tail, dtype=dtype)
    col_names = cols

    # Attach the non-repeating head and tail, which also supply the final
    # row and column names respectively.
    if head_segment:
        row_names, _, head_dwpc = dwpc(graph,
                                       head_segment,
                                       damping=damping,
                                       dense_threshold=dense_threshold,
                                       dtype=dtype)
        dwpc_matrix = head_dwpc @ dwpc_matrix
    if tail_segment:
        _, col_names, tail_dwpc = dwpc(graph,
                                       tail_segment,
                                       damping=damping,
                                       dense_threshold=dense_threshold,
                                       dtype=dtype)
        dwpc_matrix = dwpc_matrix @ tail_dwpc

    return row_names, col_names, dwpc_matrix