X = X + tmp print("%s %d to %d finished" % (prefix, lo, hi)) sys.stdout.flush() return X BOOLEAN_LOAD_PP_COOCC_FROM_FILE = True X, Y = None, None if BOOLEAN_LOAD_PP_COOCC_FROM_FILE: print 'Loading project project negative_co-occurrence matrix' t1 = time.time() start_idx = range(0, n_users, batch_size) end_idx = start_idx[1:] + [n_users] X = _load_coord_matrix(start_idx, end_idx, n_projects, n_projects, prefix = 'project') #project project co-occurrence matrix print 'dumping matrix ...' text_utils.save_pickle(X, os.path.join(DATA_DIR,'negative_pro_pro_cooc_fold%d.dat'%FOLD)) t2 = time.time() print 'Time : %d seconds'%(t2-t1) else: print 'test loading model from pickle file' t1 = time.time() X = text_utils.load_pickle(os.path.join(DATA_DIR,'negative_pro_pro_cooc_fold%d.dat'%FOLD)) t2 = time.time() print '[INFO]: sparse matrix size of project project negative_co-occurrence matrix: %d mb\n' % ( (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024)) print 'Time : %d seconds'%(t2-t1) #X = None BOOLEAN_LOAD_UU_COOCC_FROM_FILE = True if BOOLEAN_LOAD_UU_COOCC_FROM_FILE: print 'Loading user user negative_co-occurrence matrix'
sys.stdout.flush() return X BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE = True X_neg, Y_neg = None, None if BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE: print 'Loading negative project project co-occurrence matrix' t1 = time.time() start_idx = range(0, n_users, batch_size) end_idx = start_idx[1:] + [n_users] X_neg = _load_negative_coord_matrix(start_idx, end_idx, n_projects, n_projects, prefix='%s-project'%NEGATIVE_SELECTION_MODE) # project project co-occurrence matrix print X_neg print 'dumping matrix ...' text_utils.save_pickle(X_neg, os.path.join(DATA_DIR, '%s_negative_pro_pro_cooc.dat'%NEGATIVE_SELECTION_MODE)) t2 = time.time() print 'Time : %d seconds' % (t2 - t1) else: print 'test loading model from pickle file' t1 = time.time() X_neg = text_utils.load_pickle(os.path.join(DATA_DIR, '%s_negative_pro_pro_cooc.dat'%NEGATIVE_SELECTION_MODE)) t2 = time.time() print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % ( (X_neg.data.nbytes + X_neg.indices.nbytes + X_neg.indptr.nbytes) / (1024 * 1024)) print 'Time : %d seconds' % (t2 - t1) # X = None BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE = False if BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE: print 'Loading negative user user co-occurrence matrix'
def produce_neg_embeddings(DATA_DIR, train_data, n_users, n_items, batch_size=5000, iter=0):
    """Build, pickle and return the item-item negative co-occurrence matrix.

    The work is done in user batches ``[lo, hi)`` of ``batch_size`` users:

    1. Stale ``*.npy`` files in ``DATA_DIR/negative-co-temp`` are removed.
    2. ``_coord_batch`` is invoked once per batch (sequentially, ``n_jobs=1``).
       NOTE(review): it is presumably what writes the per-batch
       ``negative_item_coo_<lo>_<hi>.npy`` files read in step 3 -- confirm
       against its definition.
    3. Every per-batch coordinate file is loaded and accumulated into a single
       ``(n_items, n_items)`` float32 CSR matrix.
    4. The matrix is pickled to
       ``DATA_DIR/negative_item_item_cooc_iter<iter>.dat`` via
       ``text_utils.save_pickle``.
    5. The temporary ``*.npy`` files are removed again.

    Args:
        DATA_DIR: Directory holding the ``negative-co-temp`` folder and
            receiving the pickled matrix.
        train_data: Passed through unchanged to ``_coord_batch``.
        n_users: Number of users; defines the batch boundaries.
        n_items: Number of items; the result has shape ``(n_items, n_items)``.
        batch_size: Number of users handled per batch.
        iter: Iteration number embedded in the pickle file name (the name
            shadows the builtin but is kept for caller compatibility).

    Returns:
        A tuple ``(X, None)`` where ``X`` is the accumulated sparse matrix.
        The second slot is always ``None``.
    """
    # Single-argument print(...) calls are used throughout so the function
    # parses and prints identically under Python 2 and Python 3.
    print("%s %s" % (n_users, n_items))

    temp_pattern = os.path.join(DATA_DIR, 'negative-co-temp', '*.npy')

    def _clear_temp_files():
        # glob.glob simply yields nothing when the folder does not exist,
        # so no separate existence check is required.
        for stale_file in glob.glob(temp_pattern):
            os.remove(stale_file)

    def _load_coord_matrix(start_idx, end_idx, nrow, ncol, prefix='item'):
        # Sum the per-batch coordinate files into one (nrow, ncol) CSR matrix.
        X = sparse.csr_matrix((nrow, ncol), dtype='float32')
        for lo, hi in zip(start_idx, end_idx):
            coords = np.load(
                os.path.join(DATA_DIR, 'negative-co-temp',
                             'negative_%s_coo_%d_%d.npy' % (prefix, lo, hi)))
            # Column 0 holds row indices, column 1 holds column indices.
            rows = coords[:, 0]
            cols = coords[:, 1]
            # Each coordinate pair contributes 1; repeated pairs are summed
            # when the COO matrix is converted to CSR.
            tmp = sparse.coo_matrix((np.ones_like(rows), (rows, cols)),
                                    shape=(nrow, ncol), dtype='float32').tocsr()
            X = X + tmp
            print("%s %d to %d finished" % (prefix, lo, hi))
            sys.stdout.flush()
        return X

    # Start from a clean temp folder so leftovers of a previous run are not
    # lying around while the new batch files are produced.
    _clear_temp_files()

    # Batch boundaries: [0, bs), [bs, 2*bs), ..., [k*bs, n_users).
    # list(...) is required because a Python 3 range cannot be concatenated
    # with a list.
    start_idx = list(range(0, n_users, batch_size))
    end_idx = start_idx[1:] + [n_users]

    t1 = time.time()
    print('Generating item item negative_co-occurrence matrix')
    Parallel(n_jobs=1)(
        delayed(_coord_batch)(DATA_DIR, lo, hi, train_data, prefix='item')
        for lo, hi in zip(start_idx, end_idx))
    t2 = time.time()
    print('Time : %d seconds' % (t2 - t1))

    print('Loading item item negative_co-occurrence matrix')
    t1 = time.time()
    X = _load_coord_matrix(start_idx, end_idx, n_items, n_items, prefix='item')
    print('dumping matrix ...')
    text_utils.save_pickle(
        X, os.path.join(DATA_DIR, 'negative_item_item_cooc_iter%d.dat' % (iter)))
    t2 = time.time()
    print('Time : %d seconds' % (t2 - t1))

    # The batch files are only an intermediate product; drop them once the
    # combined matrix has been pickled.
    _clear_temp_files()
    return X, None
return X BOOLEAN_LOAD_PP_COOCC_FROM_FILE = False X, Y = None, None if BOOLEAN_LOAD_PP_COOCC_FROM_FILE: print 'Loading project project co-occurrence matrix' t1 = time.time() start_idx = range(0, n_users, batch_size) end_idx = start_idx[1:] + [n_users] X = _load_coord_matrix( start_idx, end_idx, n_projects, n_projects, prefix='project') #project project co-occurrence matrix print X print 'dumping matrix ...' text_utils.save_pickle(X, os.path.join(DATA_DIR, 'pro_pro_cooc.dat')) t2 = time.time() print 'Time : %d seconds' % (t2 - t1) else: print 'test loading model from pickle file' t1 = time.time() X = text_utils.load_pickle(os.path.join(DATA_DIR, 'pro_pro_cooc.dat')) t2 = time.time() print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % ( (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024)) print 'Time : %d seconds' % (t2 - t1) #X = None BOOLEAN_LOAD_UU_COOCC_FROM_FILE = False if BOOLEAN_LOAD_UU_COOCC_FROM_FILE: print 'Loading user user co-occurrence matrix'
for lo, hi in zip(start_idx, end_idx): coords = np.load( os.path.join(DATA_DIR, 'negative-co-temp', 'negative_%s_coo_%d_%d.npy' % (prefix, lo, hi))) rows = coords[:, 0] cols = coords[:, 1] tmp = sparse.coo_matrix((np.ones_like(rows), (rows, cols)), shape=(nrow, ncol), dtype='float32').tocsr() X = X + tmp print("%s %d to %d finished" % (prefix, lo, hi)) sys.stdout.flush() return X X, Y = None, None print 'Loading item item negative_co-occurrence matrix and saving to pickle file for fast loading' t1 = time.time() start_idx = range(0, n_users, batch_size) end_idx = start_idx[1:] + [n_users] X = _load_coord_matrix(start_idx, end_idx, n_items, n_items, prefix='item') #item item co-occurrence matrix print 'dumping matrix ...' text_utils.save_pickle(X, os.path.join(DATA_DIR, 'negative_item_item_cooc.dat')) t2 = time.time() print 'Time : %d seconds' % (t2 - t1)
print("%s %d to %d finished" % (prefix, lo, hi)) sys.stdout.flush() return X BOOLEAN_LOAD_PP_COOCC_FROM_FILE = True X, Y = None, None if BOOLEAN_LOAD_PP_COOCC_FROM_FILE: print 'Loading item item co-occurrence matrix' t1 = time.time() start_idx = range(0, n_users, batch_size) end_idx = start_idx[1:] + [n_users] X = _load_coord_matrix(start_idx, end_idx, n_items, n_items, prefix='item') #item item co-occurrence matrix print 'dumping matrix ...' text_utils.save_pickle(X, os.path.join(DATA_DIR, 'item_item_cooc.dat')) t2 = time.time() print 'Time : %d seconds' % (t2 - t1) else: print 'test loading model from pickle file' t1 = time.time() X = text_utils.load_pickle(os.path.join(DATA_DIR, 'item_item_cooc.dat')) t2 = time.time() print '[INFO]: sparse matrix size of item-item co-occurrence matrix: %d mb\n' % ( (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024)) print 'Time : %d seconds' % (t2 - t1) #X = None BOOLEAN_LOAD_UU_COOCC_FROM_FILE = True if BOOLEAN_LOAD_UU_COOCC_FROM_FILE: print 'Loading user user co-occurrence matrix'
sys.stdout.flush() return X BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE = True X_neg, Y_neg = None, None if BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE: print 'Loading negative project project co-occurrence matrix' t1 = time.time() start_idx = range(0, n_users, batch_size) end_idx = start_idx[1:] + [n_users] X_neg = _load_negative_coord_matrix(start_idx, end_idx, n_projects, n_projects, prefix='cate-project') # project project co-occurrence matrix print X_neg print 'dumping matrix ...' text_utils.save_pickle(X_neg, os.path.join(DATA_DIR, 'cate_negative_pro_pro_cooc_%d.dat'%NEGATIVE_NEIGHBOR_WORDS)) t2 = time.time() print 'Time : %d seconds' % (t2 - t1) else: print 'test loading model from pickle file' t1 = time.time() X_neg = text_utils.load_pickle(os.path.join(DATA_DIR, 'cate_negative_pro_pro_cooc_%d.dat'%NEGATIVE_NEIGHBOR_WORDS)) t2 = time.time() print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % ( (X_neg.data.nbytes + X_neg.indices.nbytes + X_neg.indptr.nbytes) / (1024 * 1024)) print 'Time : %d seconds' % (t2 - t1) # X = None BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE = True if BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE: print 'Loading negative user user co-occurrence matrix'