Example 1
def blend_svd(mats, factors=None, k=50):
    '''
    Special optimized version of blend for doing just an SVD.

    Like matrix.svd, returns a triple of:

    - U as a dense labeled matrix
    - S, a dense vector representing the diagonal of Sigma
    - V as a dense labeled matrix

    '''
    
    if factors is None:
        factors = [blend_factor(mat) for mat in mats]

    # Align matrices.
    # FIXME: only works for fully labeled matrices right now.
    # TODO: could micro-optimize by using the first ordered set's indices.
    from csc_utils.ordered_set import OrderedSet
    row_labels, row_mappings = OrderedSet(), []
    for mat in mats:
        row_mappings.append(np.array([row_labels.add(item) for item in mat.row_labels], dtype=np.uint64))
    col_labels, col_mappings = OrderedSet(), []
    for mat in mats:
        col_mappings.append(np.array([col_labels.add(item) for item in mat.col_labels], dtype=np.uint64))
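    # At this point row_mappings[i][j] (and col_mappings[i][j]) holds the
    # position of row (or column) j of mats[i] in the combined label set;
    # OrderedSet.add returns the item's index, adding the item if needed.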

    # Elide zero row tests, etc.

    from divisi2._svdlib import svd_sum
    from divisi2 import DenseMatrix
    Ut, S, Vt = svd_sum(mats, k, factors, row_mappings, col_mappings)
    U = DenseMatrix(Ut.T, row_labels, None)
    V = DenseMatrix(Vt.T, col_labels, None)
    return U, S, V
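
A minimal usage sketch (not part of the original source). It assumes that
divisi2.make_sparse builds a labeled SparseMatrix from (value, row_label,
col_label) triples -- an assumption about the divisi2 API -- and that
blend_svd is in scope as defined above; the example data is illustrative::

    import divisi2

    # Two small labeled matrices that share their row labels ('dog', 'cat').
    # (make_sparse and the example data are assumptions, not from the source.)
    mat_1 = divisi2.make_sparse([
        (1.0, 'dog', 'IsA/pet'),
        (1.0, 'cat', 'IsA/pet'),
    ])
    mat_2 = divisi2.make_sparse([
        (1.0, 'dog', 'CapableOf/bark'),
        (1.0, 'cat', 'CapableOf/purr'),
    ])

    # blend_svd aligns the row and column labels of the two matrices and
    # factors their weighted sum in one pass, without building the blended
    # matrix explicitly.
    U, S, V = blend_svd([mat_1, mat_2], k=2)
    print U.row_labels   # the union of the two matrices' row labels
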
Example 2
def sparse_triples(graph, row_labeler, col_labeler, cutoff=1, filter=None):
    """
    Build the sparse triples to put into a matrix, given a NetworkX graph.
    Each edge of the graph yields two entries of the matrix.
    
    `row_labeler` and `col_labeler` are functions that are called with each
    edge's (source, target, data) and choose two rows and two columns for
    the matrix. The first row is paired with the second column and vice versa.

    In practice, you don't need to worry about that, because `row_labeler`
    and `col_labeler` can also be strings choosing a predefined function.
    To get an adjacency matrix that relates nodes to nodes, for example, use::

        divisi2.network.sparse_triples(graph, 'nodes', 'nodes')

    To get an AnalogySpace concept-by-feature matrix::

        divisi2.network.sparse_triples(graph, 'nodes', 'features')

    To get a pair-relation matrix, as in Latent Relational Analysis::

        divisi2.network.sparse_triples(graph, 'pairs', 'relations')
    
    `cutoff` specifies the minimum degree of nodes to include.
    
    The edge weights should be expressed in one of two forms:

    - The standard way for NetworkX, as the entry named 'weight' in the edge
      data dictionaries.
    - As ConceptNet-style 'score' and 'freq' values in the edge data
      dictionaries, which will be transformed into appropriate weights.

    If no edge weights can be found, the edges will be given a default weight
    of 1.
    """
    first_edge = graph.edges_iter(data=True).next()
    first_data = first_edge[2]
    if 'score' in first_data and 'weight' not in first_data:
        set_conceptnet_weights(graph)

    try:
        if isinstance(row_labeler, basestring):
            row_labeler = LABELERS[row_labeler]
        if isinstance(col_labeler, basestring):
            col_labeler = LABELERS[col_labeler]
    except KeyError:
        raise KeyError("Unknown row or column type. The valid types are: %s"
          % sorted(LABELERS.keys()))
    
    row_labels = OrderedSet()
    if row_labeler == col_labeler:
        col_labels = row_labels
    else:
        col_labels = OrderedSet()
    
    subgraph = prune(graph, cutoff=cutoff)
    values = []
    rows = []
    cols = []
    for edge in subgraph.edges_iter(data=True):
        if filter is not None and not filter(*edge): continue
        rownames = row_labeler(*edge)
        colnames = col_labeler(*edge)
        weight = edge[2].get('weight', 1)
        row0 = row_labels.add(rownames[0])
        row1 = row_labels.add(rownames[1])
        col0 = col_labels.add(colnames[0])
        col1 = col_labels.add(colnames[1])
        values.append(weight)
        rows.append(row0)
        cols.append(col1)
        values.append(weight)
        rows.append(row1)
        cols.append(col0)
    return values, rows, cols, row_labels, col_labels
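
A minimal usage sketch (not part of the original source). It assumes
NetworkX 1.x, which still provides edges_iter as the code above requires,
and uses scipy only as one convenient way to assemble the returned triples
into a matrix::

    import networkx as nx
    from scipy.sparse import coo_matrix
    import divisi2.network

    # A tiny undirected graph; 'weight' is the standard NetworkX edge weight.
    graph = nx.Graph()
    graph.add_edge('dog', 'animal', weight=1.0)
    graph.add_edge('cat', 'animal', weight=1.0)

    # 'nodes', 'nodes' asks for a node-by-node adjacency matrix (see above).
    triples = divisi2.network.sparse_triples(graph, 'nodes', 'nodes')
    values, rows, cols, row_labels, col_labels = triples

    # One way (not divisi2-specific) to turn the triples into a matrix.
    adjacency = coo_matrix((values, (rows, cols)),
                           shape=(len(row_labels), len(col_labels)))
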
Example 3
# import code from Divisi2, a sparse matrix machine-learning library
# from the Media Lab
# (http://csc.media.mit.edu/docs/divisi2)
import divisi2
import numpy as np
import sys
from csc_utils.ordered_set import OrderedSet

thebands = OrderedSet([])

file = open('band-graph.txt')
for line in file:
    band, fan = line.strip().split()
    thebands.add(band)

NBANDS = len(thebands)
NBITS = 12
MODULO = 1<<NBITS
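# Fan names will be hashed into MODULO (= 4096) columns -- the "hashing
# trick" -- so the matrix keeps a fixed width no matter how many distinct
# fans appear; occasional hash collisions just merge a few fans per column.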
print NBANDS

matrix = divisi2.DenseMatrix(
    np.zeros((NBANDS, MODULO)),
    row_labels = thebands
)

file.seek(0)
counter = 0
for line in file:
    band, fan = line.strip().split()
    row = matrix.row_labels.index(band)
    col = hash(fan) % MODULO
    # The original excerpt is cut off at this point; the natural continuation
    # (an assumption, not in the source) is to record the pair as a count:
    matrix[row, col] += 1
    counter += 1
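
With the counts accumulated (the final two lines of the loop above are an
assumed continuation, since the original excerpt is cut off there), a natural
next step is to factor the band-by-hashed-fan matrix. A sketch using plain
NumPy rather than any particular divisi2 method::

    # Hypothetical continuation: a truncated SVD of the count matrix via NumPy.
    U, S, Vt = np.linalg.svd(np.asarray(matrix), full_matrices=False)
    k = 20                             # number of latent dimensions to keep
    band_vectors = U[:, :k] * S[:k]    # one k-dimensional vector per band
    # Bands whose fans overlap end up with nearby vectors, so cosine
    # similarity between rows of band_vectors ranks related bands.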