def createSparseMatrix(assertions, path, use_left_features = True):
    """
    Build a labeled sparse matrix from assertions and save it to `path`.

    Each assertion is unpacked as (concept1, relation, concept2) and sets the
    cell at row `concept1`, column ('right', relation, concept2) to 1.0.
    When `use_left_features` is true it additionally sets the cell at row
    `concept2`, column ('left', relation, concept1) to 1.0.

    assertions: iterable of 3-item (concept1, relation, concept2) sequences.
    path: destination passed unchanged to divisi2.save.
    use_left_features: also emit the mirrored 'left' feature.

    Returns None; the matrix is only written out through divisi2.save.
    """
    def cells_for(assertion):
        # One (value, row label, column label) triple per generated feature.
        concept1, relation, concept2 = assertion
        cells = [(1.0, concept1, ('right', relation, concept2))]
        if use_left_features:
            cells.append((1.0, concept2, ('left', relation, concept1)))
        return cells

    entries = []
    for assertion in assertions:
        entries.extend(cells_for(assertion))

    # The matrix is sized by the number of distinct row / column labels.
    row_labels = set(row for _value, row, _col in entries)
    col_labels = set(col for _value, _row, col in entries)
    shape = (len(row_labels), len(col_labels))
    sparseMatrix = SparseMatrix(shape,
                                row_labels=row_labels,
                                col_labels=col_labels)

    # TODO: more explicit handling of multiple entries for same cell
    # (every entry, repeats included, is handed to set_entry_named).
    for value, row, col in entries:
        sparseMatrix.set_entry_named(row, col, value)

    divisi2.save(sparseMatrix, path)
def create(self, data, row_labels=None, col_labels=None, foldin=False, truncate=False):
    """
    Store `data` as parallel value/row/column lists and build the divisi2
    sparse matrix from them.

    data: iterable of entries whose items 0, 1 and 2 are the value, the row
        label and the column label. It is traversed exactly once, so a
        one-shot iterator is fine.
    row_labels, col_labels: label collections forwarded unchanged to
        divisiSparseMatrix.from_named_lists. With `foldin` they also define
        which labels are already known.
    foldin: when true, self._additional_elements receives (one item per
        entry, in entry order) the labels that are not already known:
        column labels missing from `col_labels` when `col_labels` is truthy,
        otherwise row labels missing from `row_labels`.
    truncate: together with `foldin`, drop the entries carrying those
        unknown labels from the stored lists before the matrix is built.

    Raises TypeError when `foldin` is set, `col_labels` is falsy,
    `row_labels` is None and there is at least one entry to classify.
    """
    # Walk `data` once and keep real lists: the fold-in logic below needs
    # sequences it can iterate repeatedly, which a lazy map object (or a
    # generator already consumed by an earlier pass) would not provide.
    values, rows, cols = [], [], []
    for entry in data:
        values.append(entry[0])
        rows.append(entry[1])
        cols.append(entry[2])
    self._values, self._rows, self._cols = values, rows, cols

    if foldin:
        # Only one axis is folded in per call: the matrix is created
        # normally, while the labels to be folded in are remembered.
        if col_labels:
            # Known columns were given, so a row is being folded in.
            axis, known = cols, col_labels
        else:
            # Otherwise a column is being folded in.
            axis, known = rows, row_labels
        if known is None and axis:
            raise TypeError(
                "foldin requires row_labels or col_labels to decide which "
                "labels are new")
        # Hash-based membership keeps the scan linear in the entry count.
        known = frozenset(known if known is not None else ())
        is_new = [label not in known for label in axis]
        self._additional_elements = [
            label for label, new in zip(axis, is_new) if new
        ]
        if truncate:
            # Single-pass filter; equivalent to deleting the first matching
            # index once per additional element, without the quadratic cost.
            self._values = [v for v, new in zip(values, is_new) if not new]
            self._rows = [r for r, new in zip(rows, is_new) if not new]
            self._cols = [c for c, new in zip(cols, is_new) if not new]

    self._matrix = divisiSparseMatrix.from_named_lists(
        self._values, self._rows, self._cols, row_labels, col_labels)
def blend(mats, factors=None, symmetric=False, post_weights=None):
    """
    Merge several labeled matrices into one sparse matrix holding the
    scaled entries of every input.

    mats: non-empty list of matrices to blend.
    factors: one scaling factor per matrix. When None, each factor is
        obtained from blend_factor(mat) (described upstream as the
        reciprocal of the first singular value).
    post_weights: optional extra weight per matrix, multiplied into the
        matching factor -- for example to make one matrix count twice as
        much as another. None means no post-weighting.
    symmetric: build the result with square_from_named_lists instead of
        from_named_lists.

    A single input matrix is returned as-is (or multiplied by factors[0]
    when factors is given); post_weights is not applied in that case.
    """
    assert len(mats) > 0
    if len(mats) == 1:
        lone = mats[0]
        return lone if factors is None else lone * factors[0]

    if factors is None:
        factors = [blend_factor(mat) for mat in mats]
    if post_weights is not None:
        factors = [f * w for f, w in zip(factors, post_weights)]

    all_values, all_rows, all_cols = [], [], []
    for mat, factor in zip(mats, factors):
        # FIXME: using bare find(), multiplying in numpy form, and
        # translating the labels manually would be a bit faster
        values, rows, cols = mat.named_lists()
        all_values.extend(value * factor for value in values)
        all_rows.extend(rows)
        all_cols.extend(cols)

    if symmetric:
        build = SparseMatrix.square_from_named_lists
    else:
        build = SparseMatrix.from_named_lists
    return build(all_values, all_rows, all_cols)
def blend(mats, factors=None, symmetric=False, post_weights=None):
    """
    Combine multiple labeled matrices into one, weighting the data taken
    from each of them.

    mats: list of matrices to blend; must not be empty.
    factors: list with one scaling factor per matrix. If None, the factors
        come from blend_factor (described upstream as the reciprocal of the
        first singular value).
    post_weights: list of weights applied on top of the factors, e.g. to
        declare one matrix twice as important as another. If None, no
        post-weighting happens.
    symmetric: use SparseMatrix.square_from_named_lists to build the result
        rather than SparseMatrix.from_named_lists.

    With exactly one matrix the function short-circuits: it returns that
    matrix, multiplied by factors[0] if factors was supplied, and ignores
    post_weights.
    """
    assert len(mats) > 0

    if len(mats) == 1:
        if factors is not None:
            return mats[0] * factors[0]
        return mats[0]

    scales = factors
    if scales is None:
        scales = [blend_factor(mat) for mat in mats]
    if post_weights is not None:
        scales = [scale * weight
                  for scale, weight in zip(scales, post_weights)]

    merged_values = []
    merged_rows = []
    merged_cols = []
    for mat, scale in zip(mats, scales):
        # FIXME: using bare find(), multiplying in numpy form, and
        # translating the labels manually would be a bit faster
        values, row_labels, col_labels = mat.named_lists()
        merged_values += [value * scale for value in values]
        merged_rows += row_labels
        merged_cols += col_labels

    constructor = (SparseMatrix.square_from_named_lists if symmetric
                   else SparseMatrix.from_named_lists)
    return constructor(merged_values, merged_rows, merged_cols)
def update(self, matrix, is_batch=False):
    """
    Append the stored entries of `matrix` to this object's own lists.

    matrix: object exposing `_values`, `_rows` and `_cols`; each is
        appended to the attribute of the same name on self.
    is_batch: when true, only the lists are extended, so a caller can
        collect everything first and construct the final matrix at the end.
        When false, self._matrix is rebuilt immediately from the combined
        lists (per the original note, to retain the zeroes).
    """
    for attr in ("_values", "_rows", "_cols"):
        getattr(self, attr).extend(getattr(matrix, attr))

    if is_batch:
        return

    self._matrix = divisiSparseMatrix.from_named_lists(
        self._values, self._rows, self._cols)
def index_sparseMatrix(self):
    """
    Build self._matrix as a divisi2 sparse matrix from the values, rows
    and cols lists already stored on this object.
    """
    build = divisiSparseMatrix.from_named_lists
    self._matrix = build(self._values, self._rows, self._cols)
mat[j, k] = divisi2.dot(user_mat[i,:], movie_mat[j,:]) print "Learning process complete." start_time = time.time() predictions = divisi2.reconstruct(user_mat, axis_weights, movie_mat) print "Matrix reconstruction (elapsed time: %f s)." % (time.time() - start_time) return predictions def predict(mat): f_testing = open(TESTING_FILENAME, 'r') f_out = open(OUTPUT_FILENAME, 'w') print "Making %d predictions..." % NUM_TESTING start_time = time.time() i = 0 j = 0 for line in f_testing: user, movie, date = line.strip().split() f_out.write(str(mat.entry_named(int(user), int(movie))) + '\n') i += 1 if i % (NUM_TESTING / INCR) == 0: j += 100.0 / INCR sys.stdout.write("\r%.1f%% done (elapsed time: %f s)." % (j, time.time() - start_time)) sys.stdout.flush() f_testing.close() print "Predictions complete (elapsed time: %f s)." % (time.time() - start_time) f_out.close() if __name__=='__main__': training_mat = SparseMatrix((NUM_USERS, NUM_MOVIES), range(1,NUM_USERS+1), range(1,NUM_MOVIES+1)) add_data_to_matrix(training_mat) predictions = learn_iter(training_mat) predict(predictions)
def create(self, data):
    """
    Build self._matrix as a divisi2 sparse matrix from `data`.

    data: entries whose items 0, 1 and 2 are passed, in that order, as the
        values, rows and cols arguments of
        divisiSparseMatrix.from_named_lists.
    """
    columns = [map(itemgetter(position), data) for position in (0, 1, 2)]
    self._matrix = divisiSparseMatrix.from_named_lists(*columns)
from divisi2.sparse import SparseMatrix
from divisi2.reconstructed import ReconstructedMatrix
from divisi2.operators import dot
import numpy as np

# Shared fixture: six named entries covering four distinct second items
# (apple, orange, celery, banana) and three distinct third items
# (red, orange, green) -- presumably (value, row, column) triples.
mat_4x3 = SparseMatrix.from_named_entries([
    (2, "apple", "red"),
    (2, "orange", "orange"),
    (1, "apple", "green"),
    (1, "celery", "green"),
    (-1, "apple", "orange"),
    (-1, "banana", "orange")
])


def test_incremental_svd():
    """
    Train a random rank-2 ReconstructedMatrix with hebbian_step on every
    cell of mat_4x3 and check that it ends up within 0.1 (numpy norm of the
    difference) of the rank-2 SVD reconstruction, with the same labels.
    """
    left, weights, right = mat_4x3.svd(2)
    rec = dot(left * weights, right.T)
    rec2 = ReconstructedMatrix.make_random(
        mat_4x3.row_labels, mat_4x3.col_labels, 2, learning_rate = 0.01)

    # Every (row, col) index of the 4x3 fixture, visited in row-major order.
    cells = [(row, col) for row in xrange(4) for col in xrange(3)]
    for sweep in xrange(1000):
        for row, col in cells:
            rec2.hebbian_step(row, col, mat_4x3[row, col])
        # Report the remaining error after this sweep.
        # NOTE(review): the source's indentation was lost; per-sweep
        # placement of this print is assumed -- confirm.
        print(np.linalg.norm(rec2.to_dense() - rec))

    dense = rec2.to_dense()
    assert rec.same_labels_as(rec2)
    assert np.linalg.norm(rec2.to_dense() - rec) < 0.1