# Build (or reload) the fold-specific negative co-occurrence matrices.
# NOTE(review): line breaks/indentation were reconstructed for this fragment;
# confirm the nesting against the original script.
#
# Developer switch: True sums the per-batch coordinate files into a matrix and
# pickles it; False reloads the previously pickled matrix instead.
BOOLEAN_LOAD_PP_COOCC_FROM_FILE = True
X, Y = None, None
if BOOLEAN_LOAD_PP_COOCC_FROM_FILE:
    print 'Loading project project negative_co-occurrence matrix'
    t1 = time.time()
    # Batch boundaries over users: lower bounds 0, batch_size, 2*batch_size, ...
    # Python 2 `range` returns a list, so slicing and `+ [n_users]` yield the
    # matching list of upper bounds.
    start_idx = range(0, n_users, batch_size)
    end_idx = start_idx[1:] + [n_users]
    X = _load_coord_matrix(start_idx, end_idx, n_projects, n_projects, prefix = 'project') #project project co-occurrence matrix
    print 'dumping matrix ...'
    # Persist the result under a per-fold file name.
    text_utils.save_pickle(X, os.path.join(DATA_DIR,'negative_pro_pro_cooc_fold%d.dat'%FOLD))
    t2 = time.time()
    print 'Time : %d seconds'%(t2-t1)
else:
    print 'test loading model from pickle file'
    t1 = time.time()
    X = text_utils.load_pickle(os.path.join(DATA_DIR,'negative_pro_pro_cooc_fold%d.dat'%FOLD))
    t2 = time.time()
    # Report the footprint of the three sparse-matrix buffers in megabytes.
    print '[INFO]: sparse matrix size of project project negative_co-occurrence matrix: %d mb\n' % (
        (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds'%(t2-t1)

#X = None
# Same procedure for the user-user matrix: batches run over projects and the
# result is n_users x n_users. Unlike X, Y is not pickled in this fragment.
BOOLEAN_LOAD_UU_COOCC_FROM_FILE = True
if BOOLEAN_LOAD_UU_COOCC_FROM_FILE:
    print 'Loading user user negative_co-occurrence matrix'
    t1 = time.time()
    start_idx = range(0, n_projects, batch_size)
    end_idx = start_idx[1:] + [n_projects]
    Y = _load_coord_matrix(start_idx, end_idx, n_users, n_users, prefix = 'backer') #user user co-occurrence matrix
    t2 = time.time()
def produce_neg_embeddings(DATA_DIR, train_data, n_users, n_items, batch_size=5000, iter=0):
    """Build and pickle the item-item negative co-occurrence matrix.

    Steps performed:
      1. Delete stale ``*.npy`` files under ``<DATA_DIR>/negative-co-temp``.
      2. Call ``_coord_batch`` once per user batch (presumably each call
         writes ``negative_item_coo_<lo>_<hi>.npy`` into that folder --
         TODO confirm against ``_coord_batch``).
      3. Sum the per-batch coordinate files into one ``n_items x n_items``
         float32 CSR matrix.
      4. Pickle the matrix to
         ``<DATA_DIR>/negative_item_item_cooc_iter<iter>.dat``.
      5. Delete the temporary ``*.npy`` files again.

    Args:
        DATA_DIR: directory holding the temp folder and the pickled output.
        train_data: training interactions, forwarded untouched to
            ``_coord_batch``.
        n_users: number of users; batches are cut over ``[0, n_users)``.
        n_items: side length of the resulting square matrix.
        batch_size: number of users per batch.
        iter: iteration index used in the output file name. (The name shadows
            the builtin but is kept for caller compatibility.)

    Returns:
        ``(X, None)`` where ``X`` is the item-item matrix; the second slot is
        always ``None``.
    """
    # Single-argument print(...) produces identical output on Python 2 and 3.
    print('%s %s' % (n_users, n_items))

    temp_dir = os.path.join(DATA_DIR, 'negative-co-temp')
    cooc_path = os.path.join(DATA_DIR, 'negative_item_item_cooc_iter%d.dat' % (iter))

    def _clear_temp_files():
        # Remove every per-batch coordinate file; a missing folder is a no-op.
        if os.path.exists(temp_dir):
            for npy_path in glob.glob(os.path.join(temp_dir, '*.npy')):
                os.remove(npy_path)

    def _load_coord_matrix(start_idx, end_idx, nrow, ncol, prefix='item'):
        # Sum all per-batch coordinate files into one (nrow x ncol) CSR matrix.
        X = sparse.csr_matrix((nrow, ncol), dtype='float32')
        for lo, hi in zip(start_idx, end_idx):
            coords = np.load(
                os.path.join(temp_dir, 'negative_%s_coo_%d_%d.npy' % (prefix, lo, hi)))
            rows = coords[:, 0]
            cols = coords[:, 1]
            # Every coordinate pair contributes 1; repeated pairs accumulate
            # when the COO matrix is converted to CSR.
            tmp = sparse.coo_matrix((np.ones_like(rows), (rows, cols)),
                                    shape=(nrow, ncol), dtype='float32').tocsr()
            X = X + tmp
            print("%s %d to %d finished" % (prefix, lo, hi))
            sys.stdout.flush()
        return X

    # Start from a clean temp folder so old batches cannot leak into the sum.
    _clear_temp_files()

    # Batch boundaries over users. list(...) keeps the slice-and-concatenate
    # below valid when range() is lazy (Python 3).
    start_idx = list(range(0, n_users, batch_size))
    end_idx = start_idx[1:] + [n_users]

    # Developer switch: set to False to skip regenerating the batch files.
    GENERATE_ITEM_ITEM_COOCCURENCE_FILE = True
    if GENERATE_ITEM_ITEM_COOCCURENCE_FILE:
        t1 = time.time()
        print('Generating item item negative_co-occurrence matrix')
        Parallel(n_jobs=1)(
            delayed(_coord_batch)(DATA_DIR, lo, hi, train_data, prefix='item')
            for lo, hi in zip(start_idx, end_idx))
        t2 = time.time()
        print('Time : %d seconds' % (t2 - t1))

    # Developer switch: True builds the matrix from the batch files and pickles
    # it; False reloads a previously pickled matrix instead.
    BOOLEAN_LOAD_PP_COOCC_FROM_FILE = True
    X = None
    if BOOLEAN_LOAD_PP_COOCC_FROM_FILE:
        print('Loading item item negative_co-occurrence matrix')
        t1 = time.time()
        X = _load_coord_matrix(start_idx, end_idx, n_items, n_items,
                               prefix='item')  # item item co-occurrence matrix
        print('dumping matrix ...')
        text_utils.save_pickle(X, cooc_path)
        t2 = time.time()
        print('Time : %d seconds' % (t2 - t1))
    else:
        print('test loading model from pickle file')
        t1 = time.time()
        # NOTE(review): presumably unpickles the file; only load files this
        # pipeline wrote itself -- confirm against text_utils.load_pickle.
        X = text_utils.load_pickle(cooc_path)
        t2 = time.time()
        # Footprint of the three sparse buffers in whole megabytes.
        print('[INFO]: sparse matrix size of item item negative_co-occurrence matrix: %d mb\n' % (
            (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) // (1024 * 1024)))
        print('Time : %d seconds' % (t2 - t1))

    # The batch files are only intermediates; drop them once X exists.
    _clear_temp_files()
    return X, None
# return df U, V = None, None vad_data, vad_raw, vad_df = load_data( os.path.join(DATA_DIR, 'validation.csv')) train_data, train_raw, train_df = load_data( os.path.join(DATA_DIR, 'train.csv')) test_data, test_raw, test_df = load_data(os.path.join( DATA_DIR, 'test.csv')) U, V = wmf.decompose(train_data, vad_data, num_factors=n_components) VT = V.T iter, max_iter = 0, 10 #load postivie information X = text_utils.load_pickle(os.path.join(DATA_DIR, 'item_item_cooc.dat')) Y = text_utils.load_pickle(os.path.join(DATA_DIR, 'user_user_cooc.dat')) X_sppmi = convert_to_SPPMI_matrix(X, max_row=n_items, shifted_K=SHIFTED_K_VALUE) Y_sppmi = convert_to_SPPMI_matrix(Y, max_row=n_users, shifted_K=SHIFTED_K_VALUE) best_ndcg100 = 0.0 best_iter = 1 early_stopping = False while (iter < max_iter and not early_stopping): ################ Expectation step: ###################### user_slices = rec_eval.user_idx_generator(n_users, batch_users=5000) print 'GENERATING NEGATIVE INSTANCES ...'
# Build (or reload) the negative co-occurrence matrices for the selection mode
# named by NEGATIVE_SELECTION_MODE.
# NOTE(review): line breaks/indentation were reconstructed for this fragment;
# the flag tested below is defined outside of it.
if BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE:
    print 'Loading negative project project co-occurrence matrix'
    t1 = time.time()
    # Batch boundaries over users. Python 2 `range` returns a list, so slicing
    # and `+ [n_users]` yield the matching list of upper bounds.
    start_idx = range(0, n_users, batch_size)
    end_idx = start_idx[1:] + [n_users]
    X_neg = _load_negative_coord_matrix(start_idx, end_idx, n_projects, n_projects, prefix='%s-project'%NEGATIVE_SELECTION_MODE)  # project project co-occurrence matrix
    print X_neg
    print 'dumping matrix ...'
    # Persist under a file name prefixed with the selection mode.
    text_utils.save_pickle(X_neg, os.path.join(DATA_DIR, '%s_negative_pro_pro_cooc.dat'%NEGATIVE_SELECTION_MODE))
    t2 = time.time()
    print 'Time : %d seconds' % (t2 - t1)
else:
    print 'test loading model from pickle file'
    t1 = time.time()
    X_neg = text_utils.load_pickle(os.path.join(DATA_DIR, '%s_negative_pro_pro_cooc.dat'%NEGATIVE_SELECTION_MODE))
    t2 = time.time()
    # Report the footprint of the three sparse-matrix buffers in megabytes.
    print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % (
        (X_neg.data.nbytes + X_neg.indices.nbytes + X_neg.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds' % (t2 - t1)

# X = None
# The user-user branch is switched off here, so Y_neg is not assigned by this
# fragment.
BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE = False
if BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE:
    print 'Loading negative user user co-occurrence matrix'
    t1 = time.time()
    # Batches run over projects; the result is n_users x n_users.
    start_idx = range(0, n_projects, batch_size)
    end_idx = start_idx[1:] + [n_projects]
    # NOTE(review): the item branch uses a mode-prefixed file prefix
    # ('%s-project') while this one uses plain 'backer' -- confirm this is
    # intended before enabling the flag.
    Y_neg = _load_negative_coord_matrix(start_idx, end_idx, n_users, n_users, prefix='backer')  # user user co-occurrence matrix
    t2 = time.time()
# NOTE(review): this fragment starts inside the body of an `if` whose header is
# not visible, and its last statement is cut off mid-call. Line breaks and
# indentation were reconstructed; confirm against the original script.
    print 'Loading project project co-occurrence matrix'
    t1 = time.time()
    # Batch boundaries over users. Python 2 `range` returns a list, so slicing
    # and `+ [n_users]` yield the matching list of upper bounds.
    start_idx = range(0, n_users, batch_size)
    end_idx = start_idx[1:] + [n_users]
    X = _load_coord_matrix(
        start_idx, end_idx, n_projects, n_projects,
        prefix='project')  #project project co-occurrence matrix
    print X
    print 'dumping matrix ...'
    text_utils.save_pickle(X, os.path.join(DATA_DIR, 'pro_pro_cooc.dat'))
    t2 = time.time()
    print 'Time : %d seconds' % (t2 - t1)
else:
    print 'test loading model from pickle file'
    t1 = time.time()
    X = text_utils.load_pickle(os.path.join(DATA_DIR, 'pro_pro_cooc.dat'))
    t2 = time.time()
    # Report the footprint of the three sparse-matrix buffers in megabytes.
    print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % (
        (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds' % (t2 - t1)

#X = None
# The user-user branch is switched off here.
BOOLEAN_LOAD_UU_COOCC_FROM_FILE = False
if BOOLEAN_LOAD_UU_COOCC_FROM_FILE:
    print 'Loading user user co-occurrence matrix'
    t1 = time.time()
    # Batches run over projects for the user-user matrix.
    start_idx = range(0, n_projects, batch_size)
    end_idx = start_idx[1:] + [n_projects]
    # The remaining arguments of this call lie beyond the fragment.
    Y = _load_coord_matrix(start_idx, end_idx, n_users,
# Build (or reload) the 'cate'-prefixed negative co-occurrence matrices.
# NOTE(review): line breaks/indentation were reconstructed for this fragment;
# the flag tested below is defined outside of it.
if BOOLEAN_NEGATIVE_LOAD_PP_COOCC_FROM_FILE:
    print 'Loading negative project project co-occurrence matrix'
    t1 = time.time()
    # Batch boundaries over users. Python 2 `range` returns a list, so slicing
    # and `+ [n_users]` yield the matching list of upper bounds.
    start_idx = range(0, n_users, batch_size)
    end_idx = start_idx[1:] + [n_users]
    X_neg = _load_negative_coord_matrix(start_idx, end_idx, n_projects, n_projects, prefix='cate-project')  # project project co-occurrence matrix
    print X_neg
    print 'dumping matrix ...'
    # The output file name carries the value of NEGATIVE_NEIGHBOR_WORDS.
    text_utils.save_pickle(X_neg, os.path.join(DATA_DIR, 'cate_negative_pro_pro_cooc_%d.dat'%NEGATIVE_NEIGHBOR_WORDS))
    t2 = time.time()
    print 'Time : %d seconds' % (t2 - t1)
else:
    print 'test loading model from pickle file'
    t1 = time.time()
    X_neg = text_utils.load_pickle(os.path.join(DATA_DIR, 'cate_negative_pro_pro_cooc_%d.dat'%NEGATIVE_NEIGHBOR_WORDS))
    t2 = time.time()
    # Report the footprint of the three sparse-matrix buffers in megabytes.
    print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % (
        (X_neg.data.nbytes + X_neg.indices.nbytes + X_neg.indptr.nbytes) / (1024 * 1024))
    print 'Time : %d seconds' % (t2 - t1)

# X = None
# Same procedure for the user-user matrix: batches run over projects and the
# result is n_users x n_users. Y_neg is not pickled in this fragment.
BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE = True
if BOOLEAN_LOAD_NEGATIVE_UU_COOCC_FROM_FILE:
    print 'Loading negative user user co-occurrence matrix'
    t1 = time.time()
    start_idx = range(0, n_projects, batch_size)
    end_idx = start_idx[1:] + [n_projects]
    Y_neg = _load_negative_coord_matrix(start_idx, end_idx, n_users, n_users, prefix='cate-backer')  # user user co-occurrence matrix
    t2 = time.time()
#train_data, train_raw, train_df = load_data(os.path.join(DATA_DIR, 'train.num.sub.csv')) LOAD_NEGATIVE_MATRIX = True #for i in range(10): for i in [2]: # for i in range(9,-1,-1): FOLD = i print '*************************************FOLD %d ******************************************'%FOLD # train_data, train_raw, train_df = load_data(os.path.join(DATA_DIR, 'train_fold%d.csv'%FOLD)) vad_data, vad_raw, vad_df = load_data(os.path.join(DATA_DIR, 'vad.num.sub.fold%d.csv'%FOLD)) test_data, test_raw, test_df = load_data(os.path.join(DATA_DIR, 'test.num.sub.fold%d.csv'%FOLD)) train_data, train_raw, train_df = load_data(os.path.join(DATA_DIR, 'train.num.sub.fold%d.csv'%FOLD)) print 'loading pro_pro_cooc_fold%d.dat'%FOLD t1 = time.time() X = text_utils.load_pickle(os.path.join(DATA_DIR,'pro_pro_cooc_fold%d.dat'%FOLD)) t2 = time.time() print '[INFO]: sparse matrix size of project project co-occurrence matrix: %d mb\n' % ( (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / (1024 * 1024)) print 'Time : %d seconds'%(t2-t1) print 'loading user_user_cooc_fold%d.dat'%FOLD t1 = time.time() Y = text_utils.load_pickle(os.path.join(DATA_DIR, 'user_user_cooc_fold%d.dat'%FOLD)) t2 = time.time() print '[INFO]: sparse matrix size of user user co-occurrence matrix: %d mb\n' % ( (Y.data.nbytes + Y.indices.nbytes + Y.indptr.nbytes) / (1024 * 1024)) print 'Time : %d seconds'%(t2-t1) ################# LOADING NEGATIVE CO-OCCURRENCE MATRIX ######################################## if LOAD_NEGATIVE_MATRIX:
# Load the three data splits, initialise the factors with wmf.decompose, then
# start the iterative training loop.
# NOTE(review): line breaks/indentation were reconstructed for this fragment;
# confirm the loop nesting against the original script.
vad_data, vad_raw, vad_df = load_data(os.path.join(DATA_DIR, 'validation.csv'))
train_data, train_raw, train_df = load_data(os.path.join(DATA_DIR, 'train.csv'))
test_data, test_raw, test_df = load_data(os.path.join(DATA_DIR, 'test.csv'))
# presumably U holds user factors and V item factors -- confirm against
# wmf.decompose.
U, V = wmf.decompose(train_data, vad_data, num_factors= n_components)
VT = V.T
# NOTE(review): `iter` shadows the builtin of the same name from here on.
iter, max_iter = 0, 10
# load positive information
X = text_utils.load_pickle(os.path.join(DATA_DIR, 'item_item_cooc.dat'))
Y = text_utils.load_pickle(os.path.join(DATA_DIR, 'user_user_cooc.dat'))
# Convert both co-occurrence matrices with the shift SHIFTED_K_VALUE.
X_sppmi = convert_to_SPPMI_matrix(X, max_row=n_items, shifted_K=SHIFTED_K_VALUE)
Y_sppmi = convert_to_SPPMI_matrix(Y, max_row=n_users, shifted_K=SHIFTED_K_VALUE)
# Loop state; the code that updates these values lies beyond this fragment.
best_ndcg100 = 0.0
best_iter = 1
early_stopping = False
# Run at most max_iter rounds, stopping sooner once early_stopping is set.
while (iter < max_iter and not early_stopping):
    ################ Expectation step: ######################
    # Users are processed in slices of 5000.
    user_slices = rec_eval.user_idx_generator(n_users, batch_users=5000)
    print 'GENERATING NEGATIVE INSTANCES ...'
    t1 = time.time()
    # One gen_neg_instances call per user slice, fanned out over 16 joblib
    # workers; `df` collects the per-slice results.
    df = Parallel(n_jobs=16)(delayed(gen_neg_instances)(train_data, U, VT, user_idx, neg_ratio = NEGATIVE_SAMPLE_RATIO, iter = iter) for user_idx in user_slices)
    t2 = time.time()