def compute_ratings_matrix(ratings_matrix_file):
    """
    Computes the rating matrix and stores it in a compressed HDF5 file.

    The file holds one extendable array named ``data`` under the HDF5
    root. Its first row is a header: a -1 placeholder followed by every
    product id. Each following row describes one customer: the customer
    id in column 0 and, in column k, the value of that customer's rating
    for the product whose id is in header column k (0 when the customer
    has no rating for that product).

    Input:
        ratings_matrix_file: Filename output rating matrix

    Output:
        The filename that was passed in.

    Raises:
        IndexError: a rating references a product id that is not among
            the product ids fetched at the start of the run.
    """
    mongo = Mongo('Acme-Supermarket')
    mongo.connect()
    try:
        hdf5_matrix = tables.openFile(ratings_matrix_file, mode='w')
        try:
            filters = tables.Filters(complevel=5, complib='blosc')
            database = mongo.database

            product_ids = [
                p['_id'] for p in database.products.find({}, {'_id': 1})
            ]
            customer_ids = [
                c['_id']
                for c in database.actors.find({'_type': 'Customer'},
                                              {'_id': 1})
            ]

            # Sizes come from the lists actually fetched rather than from
            # separate count() queries, so the row width always matches
            # the header even if the collections change mid-run.
            row_width = len(product_ids) + 1

            # NOTE(review): the -1 placeholder is written into an unsigned
            # 32-bit atom; confirm readers expect whatever value the cast
            # produces in header column 0.
            header = numpy.concatenate((numpy.array([-1]), product_ids))

            # Product id -> matrix column (column 0 is the customer id).
            # Built once so each rating is placed with a dict lookup
            # instead of a scan over every product.
            column_of = dict(
                (product_id, column)
                for column, product_id in enumerate(product_ids, 1)
            )

            data_storage = hdf5_matrix.createEArray(
                hdf5_matrix.root, 'data', tables.UInt32Atom(),
                shape=(0, row_width),
                filters=filters,
                expectedrows=len(customer_ids))
            data_storage.append(header[None])

            for customer_id in customer_ids:
                # Column 0: customer id
                # Columns 1+: product ratings
                row = numpy.zeros((row_width, ))
                row[0] = customer_id
                ratings = database.rates.find(
                    {'customer_id': customer_id},
                    {'product_id': 1, 'value': 1})
                for rating in ratings:
                    product_id = rating['product_id']
                    try:
                        column = column_of[product_id]
                    except KeyError:
                        # Same exception type the positional lookup used
                        # to raise for an unknown product, with context.
                        raise IndexError(
                            'rating references unknown product_id %r'
                            % (product_id,))
                    row[column] = rating['value']
                data_storage.append(row[None])
        finally:
            hdf5_matrix.close()
    finally:
        mongo.disconnect()
    return ratings_matrix_file
class AcmeSupermarket:
    """
    Exports purchase transactions from the
    'Acme-Supermarket-Recommendations' Mongo schema to a NumPy file and
    forwards rules to the Mongo wrapper for storage.
    """

    def __init__(self, transactions_filepath):
        """
        Input:
            transactions_filepath: Filename the padded transaction matrix
                is saved to by load()
        """
        self.schema = 'Acme-Supermarket-Recommendations'
        self.transactions_filepath = transactions_filepath
        self.database = Mongo(self.schema)

    @staticmethod
    def _pad_transactions(rows):
        """
        Builds a 2-D int32 array with one row per transaction, each row
        right-padded with -1 up to the length of the longest transaction.

        Input:
            rows: List of sequences of product ids, one per transaction

        Output:
            Array of shape (len(rows), longest transaction length);
            (0, 0) when rows is empty.
        """
        width = max(len(row) for row in rows) if rows else 0
        padded = numpy.empty((len(rows), width), dtype='i4')
        padded.fill(-1)
        for index, row in enumerate(rows):
            if len(row):
                padded[index, :len(row)] = row
        return padded

    def load(self):
        """
        Reads every purchase and its purchase lines from the database and
        saves them to self.transactions_filepath with numpy.save.

        The saved array has exactly one row per purchase holding that
        purchase's product ids as int32, padded on the right with -1.
        No extra padding-only row is appended, and an empty purchases
        collection yields an array with zero rows instead of an error.
        The database handle is released even when reading or saving fails.
        """
        self.database.connect()
        try:
            database = self.database.database
            rows = []
            for purchase in database.purchases.find():
                purchase_lines = database.purchase_lines.find(
                    {'purchase_id': purchase['_id']})
                rows.append(numpy.array(
                    [line['product_id'] for line in purchase_lines],
                    dtype='i4'))
            numpy.save(self.transactions_filepath,
                       self._pad_transactions(rows))
        finally:
            # NOTE(review): compute_ratings_matrix releases its Mongo
            # handle with disconnect(); this class calls close(). Confirm
            # the Mongo wrapper really exposes both.
            self.database.close()

    def save_rules(self, rules):
        """
        Forwards rules to the Mongo wrapper's save_rules().

        Input:
            rules: Passed through unchanged
        """
        self.database.save_rules(rules)