def blend_svd(mats, factors=None, k=50):
    '''
    SVD-only variant of blend: decompose the factor-weighted combination of
    `mats` without handing back the blended matrix itself.

    Like matrix.svd, the result is a triple of:

    - U as a dense labeled matrix
    - S, a dense vector representing the diagonal of Sigma
    - V as a dense labeled matrix

    `factors` defaults to one `blend_factor(mat)` per input matrix. `k` is
    forwarded unchanged to `svd_sum` (presumably the number of singular
    values to compute -- confirm against divisi2._svdlib).
    '''
    if factors is None:
        factors = [blend_factor(mat) for mat in mats]

    # Align matrices.
    # FIXME: only works for fully labeled matrices right now.
    # TODO: could micro-optimize by using the first ordered set's indices.
    from csc_utils.ordered_set import OrderedSet

    def merge_labels(axis):
        # Pool the labels every matrix carries on `axis` into one ordered
        # set, remembering for each matrix what `add` returned for each of
        # its labels (assumed to be the label's index in the pooled set).
        merged = OrderedSet()
        index_maps = []
        for mat in mats:
            positions = [merged.add(label) for label in getattr(mat, axis)]
            index_maps.append(np.array(positions, dtype=np.uint64))
        return merged, index_maps

    row_labels, row_mappings = merge_labels('row_labels')
    col_labels, col_mappings = merge_labels('col_labels')

    # Elide zero row tests, etc.
    from divisi2._svdlib import svd_sum
    from divisi2 import DenseMatrix

    Ut, S, Vt = svd_sum(mats, k, factors, row_mappings, col_mappings)
    # svd_sum's outer results are transposed before being labeled.
    U = DenseMatrix(Ut.T, row_labels, None)
    V = DenseMatrix(Vt.T, col_labels, None)
    return U, S, V
def blend_svd(mats, factors=None, k=50):
    '''
    Run an SVD over several labeled matrices blended together, skipping the
    construction of the blended matrix.

    Like matrix.svd, returns a triple of:

    - U as a dense labeled matrix
    - S, a dense vector representing the diagonal of Sigma
    - V as a dense labeled matrix

    If `factors` is None, each matrix gets the factor computed by
    `blend_factor`. `mats`, `k`, and the factors are passed straight to
    `svd_sum`.
    '''
    if factors is None:
        factors = [blend_factor(mat) for mat in mats]

    # Align matrices.
    # FIXME: only works for fully labeled matrices right now.
    # TODO: could micro-optimize by using the first ordered set's indices.
    from csc_utils.ordered_set import OrderedSet

    # One shared label set per axis; each matrix gets an array holding the
    # value `add` returned for each of its own labels.
    row_labels = OrderedSet()
    row_mappings = [
        np.array([row_labels.add(label) for label in mat.row_labels],
                 dtype=np.uint64)
        for mat in mats
    ]

    col_labels = OrderedSet()
    col_mappings = [
        np.array([col_labels.add(label) for label in mat.col_labels],
                 dtype=np.uint64)
        for mat in mats
    ]

    # Elide zero row tests, etc.
    from divisi2._svdlib import svd_sum
    from divisi2 import DenseMatrix

    left_t, sigma, right_t = svd_sum(
        mats, k, factors, row_mappings, col_mappings)
    return (
        DenseMatrix(left_t.T, row_labels, None),
        sigma,
        DenseMatrix(right_t.T, col_labels, None),
    )
def sparse_triples(graph, row_labeler, col_labeler, cutoff=1, filter=None):
    """
    Collect the sparse entries to put into a matrix, given a NetworkX graph.

    It is assumed that each edge of the graph yields two entries of the
    matrix.

    `row_labeler` and `col_labeler` are functions that are given each edge as
    a tuple of (source, target, data), and choose two rows and columns for
    the matrix. The first row is paired with the second column and vice
    versa.

    In practice, you don't need to worry about that, because `row_labeler`
    and `col_labeler` can also be strings choosing a predefined function
    from `LABELERS`.

    To get an adjacency matrix that relates nodes to nodes, for example,
    use::

        divisi2.network.sparse_triples(graph, 'nodes', 'nodes')

    To get an AnalogySpace concept-by-feature matrix::

        divisi2.network.sparse_triples(graph, 'nodes', 'features')

    To get a pair-relation matrix, as in Latent Relational Analysis::

        divisi2.network.sparse_triples(graph, 'pairs', 'relations')

    `cutoff` specifies the minimum degree of nodes to include; it is passed
    to `prune`.

    `filter`, if given, is called as `filter(source, target, data)` for each
    edge of the pruned graph; edges for which it returns a false value are
    skipped.

    The edge weights should be expressed in one of two forms:

    - The standard way for NetworkX, as the entry named 'weight' in the
      edge data dictionaries.
    - As ConceptNet-style 'score' and 'freq' values in the edge data
      dictionaries, which will be transformed into appropriate weights.

    If no edge weights can be found, the edges will be given a default
    weight of 1.

    Returns a tuple `(values, rows, cols, row_labels, col_labels)`.
    `values`, `rows` and `cols` are parallel lists with two entries per
    accepted edge; `rows` and `cols` hold whatever `OrderedSet.add` returned
    for the corresponding label (presumably its index). `row_labels` and
    `col_labels` are the label sets; they are the same object when the two
    labelers compare equal.

    Raises KeyError if a labeler name is not a key of `LABELERS`.
    """
    # Only the first edge is inspected to decide whether the graph uses
    # ConceptNet-style scores. A graph with no edges has nothing to inspect,
    # so skip the check instead of leaking StopIteration to the caller.
    first_edge = next(graph.edges_iter(data=True), None)
    if first_edge is not None:
        first_data = first_edge[2]
        if 'score' in first_data and 'weight' not in first_data:
            set_conceptnet_weights(graph)

    try:
        if isinstance(row_labeler, basestring):
            row_labeler = LABELERS[row_labeler]
        if isinstance(col_labeler, basestring):
            col_labeler = LABELERS[col_labeler]
    except KeyError:
        raise KeyError("Unknown row or column type. The valid types are: %s"
                       % sorted(LABELERS.keys()))

    # When rows and columns are labeled the same way they share one label
    # set, so a given label gets the same index on both axes.
    row_labels = OrderedSet()
    if row_labeler == col_labeler:
        col_labels = row_labels
    else:
        col_labels = OrderedSet()

    subgraph = prune(graph, cutoff=cutoff)
    values = []
    rows = []
    cols = []
    for edge in subgraph.edges_iter(data=True):
        if filter is not None and not filter(*edge):
            continue
        rownames = row_labeler(*edge)
        colnames = col_labeler(*edge)
        weight = edge[2].get('weight', 1)

        row0 = row_labels.add(rownames[0])
        row1 = row_labels.add(rownames[1])
        col0 = col_labels.add(colnames[0])
        col1 = col_labels.add(colnames[1])

        # Each edge contributes two entries: (row0, col1) and (row1, col0).
        values.append(weight)
        rows.append(row0)
        cols.append(col1)
        values.append(weight)
        rows.append(row1)
        cols.append(col0)
    return values, rows, cols, row_labels, col_labels
# import code from Divisi2, a sparse matrix machine-learning library
# from the Media Lab
# (http://csc.media.mit.edu/docs/divisi2)
import divisi2
import numpy as np
import sys
from csc_utils.ordered_set import OrderedSet

# First pass over band-graph.txt: every line is split into exactly two
# whitespace-separated fields, a band and a fan. Collect the distinct bands
# in first-seen order so they can label the matrix rows.
thebands = OrderedSet([])
file = open('band-graph.txt')
for line in file:
    band, fan = line.strip().split()
    thebands.add(band)

NBANDS = len(thebands)
NBITS = 12
# Number of matrix columns; fans are bucketed into them by hash below.
MODULO = 1<<NBITS
# Parenthesized so the line parses on Python 3 as well; with a single
# argument the output on Python 2 is unchanged.
print(NBANDS)

matrix = divisi2.DenseMatrix(
    np.zeros((NBANDS, MODULO)),
    row_labels = thebands
)

# Second pass over the same handle: map each (band, fan) line to a
# (row, col) position in `matrix`.
# NOTE(review): the handle is intentionally left open and the names `file`
# and `counter` are kept as-is, because this loop appears to continue beyond
# the visible excerpt -- `row` and `col` are computed but not yet used here.
# NOTE(review): on Python 3, hash() of a str varies between processes unless
# PYTHONHASHSEED is fixed, so column assignments would not be reproducible
# across runs -- confirm which interpreter this script targets.
file.seek(0)
counter = 0
for line in file:
    band, fan = line.strip().split()
    row = matrix.row_labels.index(band)
    col = hash(fan) % MODULO