def evaluate(self, train, val, test, dim, area):
    """
    Evaluates the smoothed memory component on the test data.

    The component is trained on train + val (nothing is tuned on val here).

    OUTPUT:
    --------
        1. results: <dict> single key 'S_MEMORY' holding the value returned
                    by self._compute_erank for the smoothed memory scores.
    """
    fit_data = train + val
    smoothed_scores = self._train_mfs(['s_memory'], fit_data, dim, area)[0]

    log.info('Evaluating smoothed memory')
    results = {'S_MEMORY': self._compute_erank(test, smoothed_scores)}

    self.pretty_print(results)
    return results
def get_factorized_mat(self, data, dim, area):
    """
    Factorizes data with sk_NMF and returns the dense reconstruction W * H.

    INPUT:
    -------
        1. data: object exposing toarray() (presumably a scipy sparse matrix -- confirm with caller)
        2. dim:  <int> number of components
        3. area: not used by this implementation

    OUTPUT:
    --------
        1. mf: dense product of the fitted transform and model.components_
    """
    log.info('Running sklearn NMF')
    started = time.time()

    nmf = sk_NMF(n_components=dim, init='random', random_state=0)
    # The model is fitted on a dense copy, not on the sparse representation.
    dense = data.toarray()
    weights = nmf.fit_transform(dense)
    reconstruction = np.dot(weights, nmf.components_)

    log.info('Factorizing took %d seconds' % (time.time() - started))
    return reconstruction
def get_factorized_mat(self, data, dim, area):
    """
    Runs sk_NMF with `dim` components on the densified data and returns
    the product of the two factors.

    `area` is accepted but not used here.
    """
    log.info('Running sklearn NMF')
    t0 = time.time()

    factorizer = sk_NMF(n_components=dim, init='random', random_state=0)
    # Fit on the dense array rather than on the sparse input.
    left = factorizer.fit_transform(data.toarray())
    right = factorizer.components_
    product = np.dot(left, right)

    elapsed = time.time() - t0
    log.info('Factorizing took %d seconds' % elapsed)
    return product
def learn_mix_mult_on_individual(alpha, mem_mult, mf_mult, val_mat, num_em_iter=10000, tol=0.00001):
    """
    Learning the mixing weights for mixture of two multinomials. Each individual learns mixing weights.

    NOTE: In order for the algorithm to work, there can be no location that can get 0 probability by both the
    mem_mult and the mf_mult. The original author used MPE to estimate the mf_mult while using MLE for the
    mem_mult, so that the mf_mult has no 0 values.

     INPUT:
    -------
        1. alpha:       <float / (2, ) ndarray>   Dirichlet prior for the pi learning. If <float> is given it is
                                                  treated as a flat prior. Has to be bigger than 1.
        2. mem_mult:    <(I, L) ndarray>    each row is the multinomial parameter according to the "self" data
        3. mf_mult:     <(I, L) ndarray>    each row is the multinomial parameter according to the matrix factorization
        4. val_mat:     <(I, L) ndarray>    counts matrix to optimize on
        5. num_em_iter: <int>               number of em iterations
        6. tol:         <float>             convergence threshold

     OUTPUT:
    --------
        1. pis:     <(I, 2) ndarray>    each row is mixing weights for the i'th individual.
                                        An empty (0, 2) array is returned when there are no individuals.

     RAISE:
    -------
        1. ValueError (documented by the original author; presumably raised by _learn_mix_mult -- confirm):
                a. alphas are not bigger than 1
                b. the multinomial's rows don't sum to 1
                c. There is a location with both mults 0 (see NOTE)
    """
    I = mem_mult.shape[0]
    pis = np.zeros([I, 2])

    start = time.time()
    for i in range(I):
        # The shared EM routine is called with a single individual's validation row, so the
        # mixing weights it returns are a function of that individual only.
        i_val_data = convert_sparse_to_coo(val_mat[i])

        # The learning method treats the multinomials as matrices, so each row is wrapped in an array.
        i_mem_mult = np.array([mem_mult[i]])
        i_mf_mult = np.array([mf_mult[i]])

        pis[i] = _learn_mix_mult(alpha, i_mem_mult, i_mf_mult, i_val_data, num_em_iter, tol)

    total_time = time.time() - start
    # Guard the average: with zero individuals the division would raise ZeroDivisionError.
    per_user = total_time / I if I else 0.0
    log.info('Finished EM on users. Total time = %d secs -- %.3f per user' % (total_time, per_user))
    return pis
def get_factorized_mat(self, data, dim, area):
    """
    Returns the rank-`dim` truncated SVD reconstruction of data.

    The column means are removed before calling svds and added back to the
    reconstruction afterwards.

    INPUT:
    -------
        1. data: object exposing toarray() (presumably a scipy sparse counts matrix -- confirm with caller)
        2. dim:  <int> number of singular values/vectors requested from svds
        3. area: not used by this implementation

    OUTPUT:
    --------
        1. mf: dense float ndarray with the same shape as data
    """
    log.info('Running numpy svds')
    start = time.time()

    # Work on a float copy: subtracting the (float) column means in place
    # from an integer counts matrix raises a casting error.
    tmp = np.array(data.toarray(), dtype=float)
    m = np.mean(tmp, axis=0)
    tmp -= m

    u, s, v = svds(tmp, dim)
    mf = np.dot(u, np.dot(np.diag(s), v))
    mf += m

    log.info('Factorizing took %d seconds' % (time.time() - start))
    return mf
def evaluate(self, train, val, test, dim, area):
    """
    Evaluates a per-individual mixture of the memory and popularity components.

    Both components are trained on train only; the mixing weights are learned
    on val with learn_mix_mult_on_individual.

    OUTPUT:
    --------
        1. results: <dict> single key 'MEMORY+POPULARITY' holding the value
                    returned by self._compute_erank.
    """
    memory_raw = self._train_mfs(['memory'], train, dim, area)[0]
    popularity_raw = self._train_mfs(['popularity'], train, dim, area)[0]

    mem_mult = normalize_mat_row(memory_raw)
    # A small constant is added to the popularity scores before normalizing.
    popularity_mult = normalize_mat_row(popularity_raw + 0.001)

    user_weights = learn_mix_mult_on_individual(1.1, mem_mult, popularity_mult, val)

    log.info('Evaluating memory with popularity')
    erank = self._compute_erank(test, mem_mult, popularity_mult, user_weights)

    results = {'MEMORY+POPULARITY': erank}
    self.pretty_print(results)
    return results
def get_factorized_mat(self, data, dim, area):
    """
    Mean-centers data column-wise, runs svds with `dim` components and
    returns the reconstruction with the column means added back.

    `area` is accepted but not used here. The result is a dense float
    ndarray with the same shape as data.
    """
    log.info('Running numpy svds')
    start = time.time()

    # A float copy is required: the in-place mean subtraction below fails
    # with a casting error when the dense matrix has an integer dtype.
    centered = np.array(data.toarray(), dtype=float)
    col_means = np.mean(centered, axis=0)
    centered -= col_means

    u, s, v = svds(centered, dim)
    W = u
    H = np.dot(np.diag(s), v)
    mf = np.dot(W, H)
    mf += col_means

    log.info('Factorizing took %d seconds' % (time.time() - start))
    return mf
def load_data(area):
    """
    Loads train, validation and test data for the given area.

    When testing, train and val should be combined (train += val).

    Each file is a comma separated list of (individual, location, count) rows
    with 0-based ids.

     OUTPUT:
    --------
        1. train:   <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.
        2. val:     <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.
        3. test:    <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.

     RAISE:
    -------
        1. IOError: Area or one of the files does not exist (raised by np.loadtxt).
    """
    root_folder = ''
    log.info('Loading all data for area %s' % area)

    train_file = join(root_folder, area, 'train.csv')
    val_file = join(root_folder, area, 'val.csv')
    test_file = join(root_folder, area, 'test.csv')

    # ndmin=2 keeps a single-row file two dimensional so the column slicing below works.
    train_data = np.loadtxt(train_file, delimiter=',', ndmin=2)
    val_data = np.loadtxt(val_file, delimiter=',', ndmin=2)
    test_data = np.loadtxt(test_file, delimiter=',', ndmin=2)

    # All individuals are expected to have data in train, val and test, so the
    # number of individuals is taken from train alone.
    I = int(np.unique(train_data[:, 0]).shape[0])

    # Locations may be missing from any one split, so take the maximum across all 3.
    # loadtxt yields floats; the matrix shape has to be made of plain ints.
    L = int(max(np.max(train_data[:, 1]), np.max(val_data[:, 1]), np.max(test_data[:, 1]))) + 1  # 0 based

    def _to_csr(triples):
        # Row / column ids are cast to int explicitly instead of relying on implicit float handling.
        rows = triples[:, 0].astype(int)
        cols = triples[:, 1].astype(int)
        return coo_matrix((triples[:, 2], (rows, cols)), shape=(I, L)).tocsr()

    return _to_csr(train_data), _to_csr(val_data), _to_csr(test_data)
def evaluate(self, train, val, test, dim, area):
    """
    Evaluates a mixture of the memory and popularity components with
    globally learned mixing weights.

    Both components are trained on train only; one pair of mixing weights is
    learned on val with learn_mix_mult_global.

    OUTPUT:
    --------
        1. results: <dict> single key 'MEMORY+POPULARITY' holding the value
                    returned by self._compute_erank.
    """
    mem_scores = self._train_mfs(['memory'], train, dim, area)[0]
    popularity_scores = self._train_mfs(['popularity'], train, dim, area)[0]

    mem_mult = normalize_mat_row(mem_scores)
    # A small constant is added to the popularity scores before normalizing.
    popularity_mult = normalize_mat_row(popularity_scores + 0.001)

    pi_mem_pop = learn_mix_mult_global(1.1, mem_mult, popularity_mult, val)
    log.info('Global mixing weight is %f and %f' % (pi_mem_pop[0], pi_mem_pop[1]))
    # print as a function call: valid on both Python 2 and Python 3
    # (the print statement form is a SyntaxError on Python 3).
    print(sum((pi_mem_pop).astype(float)))

    # The flat prior won't change the ranking so there's no need to add it here.
    log.info('Evaluating memory with popularity')
    mem_pop_erank = self._compute_erank(test, mem_mult, popularity_mult, pi_mem_pop)

    results = {'MEMORY+POPULARITY': mem_pop_erank}
    self.pretty_print(results)
    return results
def get_factorized_mat(self, data, dim, area):
    """
    Loads precomputed hierarchical bayes NMF factors for `area` and returns
    their product.

    Nothing is fitted here: htheta.tsv and hbeta.tsv are read from a fixed
    directory, passed through self._fix_projection and multiplied.

    INPUT:
    -------
        1. data: matrix whose shape (I, L) is handed to self._fix_projection
        2. dim:  <int> handed to self._fix_projection
        3. area: <string> name of the area directory; must not be None

    OUTPUT:
    --------
        1. mf: htheta.dot(hbeta.T)

    RAISE:
    -------
        1. AssertionError: area is None
    """
    log.info('Loading hierarchical bayes NMF')
    start = time.time()

    I, L = data.shape
    assert area is not None
    root_dir = '/extra/disij0/data/person_mf/%s/hier_nmf' % area

    # Raw string: same delimiter value, without the invalid '\s' escape
    # sequence that newer Python versions warn about.
    delimiter = r'\s\s\s\s'

    htheta = fu.load_np_txt(join(root_dir, 'htheta.tsv'), delimiter=delimiter)
    htheta = self._fix_projection(htheta, I, dim)

    hbeta = fu.load_np_txt(join(root_dir, 'hbeta.tsv'), delimiter=delimiter)
    hbeta = self._fix_projection(hbeta, L, dim)

    mf = htheta.dot(hbeta.T)
    log.info('Factorizing took %d seconds' % (time.time() - start))
    return mf
def load_data(area):
    """
    Loads train, validation and test data for the given area.

    When testing, train and val should be combined (train += val).

    Each file is a comma separated list of (individual, location, count) rows
    with 0-based ids.

     OUTPUT:
    --------
        1. train:   <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.
        2. val:     <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.
        3. test:    <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.

     RAISE:
    -------
        1. IOError: Area or one of the files does not exist (raised by np.loadtxt).
    """
    root_folder = ''
    log.info('Loading all data for area %s' % area)

    # Loaded in train, val, test order. ndmin=2 keeps single-row files two dimensional.
    train_data, val_data, test_data = [
        np.loadtxt(join(root_folder, area, file_name), delimiter=',', ndmin=2)
        for file_name in ('train.csv', 'val.csv', 'test.csv')
    ]

    # All individuals are expected to appear in every split, so train alone gives the row count.
    I = int(np.unique(train_data[:, 0]).shape[0])

    # That is not the case for locations: use the largest id across all three splits.
    # The ids come back from loadtxt as floats and must be ints to be used as a shape.
    L = 1 + int(max(np.max(split[:, 1]) for split in (train_data, val_data, test_data)))

    matrices = []
    for split in (train_data, val_data, test_data):
        # Explicit int indices instead of relying on implicit float handling.
        rows = split[:, 0].astype(int)
        cols = split[:, 1].astype(int)
        matrices.append(coo_matrix((split[:, 2], (rows, cols)), shape=(I, L)).tocsr())

    train, val, test = matrices
    return train, val, test
def evaluate(self, train, val, test, dim, area):
    """
    Evaluates log probability of the memory and popularity components
    trained on train + val, next to a "ground truth" memory component that
    is trained on the test data itself.

    OUTPUT:
    --------
        1. results: <dict> keys 'MEMORY', 'GROUNDTRUTH' and 'POPULARITY',
                    each holding the value returned by self._compute_logp.
    """
    fit_data = train + val

    # The ground truth scores come from the memory component fitted on test.
    truth_scores = self._train_mfs(['memory'], test, dim, area)[0]
    memory_scores = self._train_mfs(['memory'], fit_data, dim, area)[0]
    pop_scores = self._train_mfs(['popularity'], fit_data, dim, area)[0]

    log.info('Evaluating popularity')
    pop_logp = self._compute_logp(test, pop_scores)

    log.info('Evaluating memory')
    memory_logp = self._compute_logp(test, memory_scores)

    log.info('Evaluating ground truth')
    truth_logp = self._compute_logp(test, truth_scores)

    results = {}
    results['MEMORY'] = memory_logp
    results['GROUNDTRUTH'] = truth_logp
    results['POPULARITY'] = pop_logp

    self.pretty_print(results)
    return results
def get_factorized_mat(self, data, dim, area):
    """
    Returns the product of the hierarchical bayes NMF factors stored on disk
    for `area`.

    The factors are read from htheta.tsv and hbeta.tsv under a fixed root
    directory, adjusted by self._fix_projection (with the row count I and the
    column count L of data, respectively) and multiplied as
    htheta.dot(hbeta.T).

    RAISE:
    -------
        1. AssertionError: area is None
    """
    log.info('Loading hierarchical bayes NMF')
    start = time.time()

    I, L = data.shape
    assert area is not None
    root_dir = '/extra/disij0/data/person_mf/%s/hier_nmf' % area

    # The delimiter is written as a raw string: the value is unchanged but the
    # invalid '\s' escape sequence (a warning on newer Pythons) is gone.
    htheta = fu.load_np_txt(join(root_dir, 'htheta.tsv'), delimiter=r'\s\s\s\s')
    htheta = self._fix_projection(htheta, I, dim)

    hbeta = fu.load_np_txt(join(root_dir, 'hbeta.tsv'), delimiter=r'\s\s\s\s')
    hbeta = self._fix_projection(hbeta, L, dim)

    mf = htheta.dot(hbeta.T)
    log.info('Factorizing took %d seconds' % (time.time() - start))
    return mf
def learn_mix_mult_global(alpha, mem_mult, mf_mult, val_mat, num_em_iter=100000, tol=0.0001):
    """
    Learning the mixing weights for mixture of two multinomials globally for all users. Each observation is
    a point in the model.

    NOTE: In order for the algorithm to work, there can be no location that can get 0 probability by both the
    mem_mult and the mf_mult. The original author used MPE to estimate the mf_mult while using MLE for the
    mem_mult, so that the mf_mult has no 0 values.

     INPUT:
    -------
        1. alpha:       <float / (2, ) ndarray>   Dirichlet prior for the pi learning. If <float> is given it is
                                                  treated as a flat prior. Has to be bigger than 1.
        2. mem_mult:    <(I, L) ndarray>    each row is the multinomial parameter according to the "self" data
        3. mf_mult:     <(I, L) ndarray>    each row is the multinomial parameter according to the matrix factorization
        4. val_mat:     <(I, L) ndarray>    counts matrix to optimize on
        5. num_em_iter: <int>               number of em iterations
        6. tol:         <float>             convergence threshold

     OUTPUT:
    --------
        1. pi:  the value returned by _learn_mix_mult when run once on all of val_mat
                (presumably one pair of mixing weights shared by all individuals -- confirm).

     RAISE:
    -------
        1. ValueError (documented by the original author; presumably raised by _learn_mix_mult -- confirm):
                a. alphas are not bigger than 1
                b. the multinomial's rows don't sum to 1
                c. There is a location with both mults 0 (see NOTE)
    """
    log.info('Learning global mixing weights for all points')
    began = time.time()

    val_coo = convert_sparse_to_coo(val_mat)
    weights = _learn_mix_mult(alpha, mem_mult, mf_mult, val_coo, num_em_iter, tol)

    elapsed = time.time() - began
    log.info('Finished EM on all data. Total time = %d secs' % elapsed)
    return weights
def evaluate(self, train, val, test, dim, area):
    """
    Evaluates the ranking of the memory and popularity components trained on
    train + val, next to a "ground truth" memory component that is trained
    on the test data itself.

    OUTPUT:
    --------
        1. results: <dict> keys 'MEMORY', 'GROUNDTRUTH' and 'POPULARITY',
                    each holding the value returned by self._compute_erank.
    """
    fit_data = train + val

    # The ground truth scores come from the memory component fitted on test.
    truth_scores = self._train_mfs(['memory'], test, dim, area)[0]
    memory_scores = self._train_mfs(['memory'], fit_data, dim, area)[0]
    pop_scores = self._train_mfs(['popularity'], fit_data, dim, area)[0]

    log.info('Evaluating popularity')
    pop_erank = self._compute_erank(test, pop_scores)

    log.info('Evaluating memory')
    memory_erank = self._compute_erank(test, memory_scores)

    log.info('Evaluating ground truth')
    truth_erank = self._compute_erank(test, truth_scores)

    results = {}
    results['MEMORY'] = memory_erank
    results['GROUNDTRUTH'] = truth_erank
    results['POPULARITY'] = pop_erank

    self.pretty_print(results)
    return results
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Evaluating logp on a train+val without smoothing'
    log.info(description)
def print_methods():
    """
    Logs the names of the available methods (the keys of _mfs_factory).
    """
    method_names = list(_mfs_factory.keys())
    log.info('Available MF methods: %s' % method_names)
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Evaluating ranking on a single component'
    log.info(description)
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Evaluating logp with global learned mixing weights'
    log.info(description)
def evaluate(self, train, val, test, dim, area):
    """
    Evaluates per-individual mixtures of memory with NMF and of memory with
    hierarchical bayes NMF.

    The mixing weights are learned on val from components trained on train
    only. For the final evaluation the components are retrained on
    train + val and combined with those weights.

    OUTPUT:
    --------
        1. results: <dict> keys 'mem_nmf' and 'mem_hb_nmf', each holding the
                    value returned by self._compute_erank.
    """
    component_names = ['nmf', 'hbnmf', 'memory']

    log.info('Learning Memory, NMF and hb NMF mfs on train only for mixing weights optimization')
    nmf_raw, hb_raw, mem_raw = self._train_mfs(component_names, train, dim, area)

    log.info('Learning mix for MEM and NMF')
    mem_mult = normalize_mat_row(mem_raw)
    # Small flat prior to avoid 0.
    nmf_mult = normalize_mat_row(nmf_raw + 0.001)
    weights_nmf = learn_mix_mult_on_individual(1.1, mem_mult, nmf_mult, val)

    log.info('Learning mix for MEM and hb NMF')
    # Small flat prior to avoid 0.
    hb_mult = normalize_mat_row(hb_raw + 0.001)
    weights_hb = learn_mix_mult_on_individual(1.1, mem_mult, hb_mult, val)

    log.info('Learning Memory NMF and hier NMF mfs on train+val for evaluation')
    fit_data = train + val
    nmf_eval, hb_eval, mem_eval = self._train_mfs(component_names, fit_data, dim, area)

    # The flat prior won't change the ranking so there's no need to add it here.
    # NOTE(review): the weights were learned on row-normalized multinomials but the
    # raw scores are passed below -- confirm that _compute_erank accounts for this.
    log.info('Evaluating memory with NMF')
    erank_nmf = self._compute_erank(test, mem_eval, nmf_eval, weights_nmf)

    log.info('Evaluating memory with hb_NMF')
    erank_hb = self._compute_erank(test, mem_eval, hb_eval, weights_hb)

    results = {'mem_nmf': erank_nmf, 'mem_hb_nmf': erank_hb}
    self.pretty_print(results)
    return results
def evaluate(self, train, val, test, dim, area):
    """
    Evaluates the ranking of every single component on the test data.

    There is no mixing weights optimization in this code, therefore val is
    added to train. The "ground truth" scores come from the memory component
    trained on the test data itself.

    OUTPUT:
    --------
        1. results: <dict> keys 'MEMORY', 'SVD', 'NMF', 'HBPF', 'GROUNDTRUTH'
                    and 'S_MEMORY', each holding the value returned by
                    self._compute_erank.
    """
    eval_train = train + val

    gt_scores = self._train_mfs(['memory'], test, dim, area)[0]
    s_mem_scores = self._train_mfs(['s_memory'], eval_train, dim, area)[0]
    mem_scores = self._train_mfs(['memory'], eval_train, dim, area)[0]

    log.info('Evaluating memory')
    mem_erank = self._compute_erank(test, mem_scores)

    log.info('Evaluating smoothed memory')
    s_mem_erank = self._compute_erank(test, s_mem_scores)

    log.info('Evaluating ground truth')
    gt_erank = self._compute_erank(test, gt_scores)

    svd_scores, nmf_scores, hb_nmf_scores = self._train_mfs(['svd', 'nmf', 'hbnmf'], eval_train, dim, area)

    log.info('Evaluating SVD')
    svd_erank = self._compute_erank(test, svd_scores)

    log.info('Evaluating sklearn NMF')
    nmf_erank = self._compute_erank(test, nmf_scores)

    log.info('Evaluating Hierarchical Bayes NMF')
    hb_nmf_erank = self._compute_erank(test, hb_nmf_scores)

    # Intermediate printout of the two factorization results, kept from the original.
    self.pretty_print({'SVD': svd_erank, 'NMF': nmf_erank})

    # 'MEMORY' used to be listed twice in this literal; the duplicate entry
    # (same value) is dropped, the resulting dict is unchanged.
    results = {
        'MEMORY': mem_erank,
        'SVD': svd_erank,
        'NMF': nmf_erank,
        'HBPF': hb_nmf_erank,
        'GROUNDTRUTH': gt_erank,
        'S_MEMORY': s_mem_erank,
    }
    self.pretty_print(results)
    return results
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Evaluating ranking with individual learned mixing weights'
    log.info(description)
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Mem and popularity learnt from training data; searching alpha on validation set'
    log.info(description)
def evaluate(self, train, val, test, dim, area):
    """
    Compares six ways of combining the memory and popularity components.

    Memory and popularity are trained on train only; every hyper parameter is
    chosen on val (grid search or EM) and the chosen model is evaluated on
    test. The six methods are: a mutual information based translation model
    with JM smoothing, the same with a Dirichlet prior, EM with global mixing
    weights, EM with per-individual mixing weights, smoothed memory and a
    Dirichlet prior.

    The per-method values returned by self._compute_erank_logp are only
    printed through self.pretty_print; they are not returned.

    OUTPUT:
    --------
        1. logP_p:     <DataFrame> one row per test event, one column per method.
        2. logP_indiv: <DataFrame> one row per individual, one column per method.
        3. mix_alpha:  <DataFrame> one row per individual, one column per method,
                       holding the weight recorded for the memory component.
    """

    def logP(score_mat, test):
        # Returns (log prob of every single test event under the globally
        # normalized scores, per-individual log prob under the row-normalized
        # scores divided by the individual's number of test events).
        logp_p = np.zeros(int(test.sum()))
        logp_indiv = np.zeros(test.shape[0])
        test_data = coo_matrix(test)

        temp = score_mat / np.sum(score_mat)
        idx = 0
        for i, j, v in zip(test_data.row, test_data.col, test_data.data):
            # An entry with count v fills v consecutive slots.
            logp_p[int(idx):int(idx + v)] = np.log(temp[i, j])
            idx += v

        temp = normalize_mat_row(score_mat)
        for i, j, v in zip(test_data.row, test_data.col, test_data.data):
            logp_indiv[i] += v * np.log(temp[i, j])

        # Row sums are computed once instead of once per individual.
        # `I` is read from the enclosing scope.
        test_row_sums = test.sum(axis=1)
        n_test = np.array([int(test_row_sums[i][0]) for i in range(I)])
        # NOTE(review): individuals without test events divide by zero here.
        logp_indiv /= n_test
        return logp_p, logp_indiv

    ALPHA = np.arange(0.1, 1.1, 0.1)

    mem_scores = self._train_mfs(['memory'], train, dim, area)[0]
    popularity_scores = self._train_mfs(['popularity'], train, dim, area)[0] + 0.0001
    mem_mult = normalize_mat_row(mem_scores)
    popularity_mult = normalize_mat_row(popularity_scores)

    N = int(np.sum(mem_scores))
    I, L = train.shape
    train_row_sums = train.sum(axis=1)
    n_train = np.array([int(train_row_sums[i][0]) for i in range(I)])

    results = dict()
    headers = [
        'EM global', 'EM indiv', 'S_mem', 'Dirichlet', 'Translation_JM',
        'Translation_Dirichlet'
    ]
    logP_p = DataFrame(np.zeros((int(test.sum()), 6)), columns=headers)
    logP_indiv = DataFrame(np.zeros((I, 6)), columns=headers)
    mix_alpha = DataFrame(np.zeros((I, 6)), columns=headers)

    log.info('#####learning statistical translation model#######')
    log.info('computing sparse mutual information')
    binary = (train > 0) * 1  # I*L
    count_1d = binary.sum(axis=0)  # 1*L
    count_2d = np.dot(binary.T, binary)  # L*L
    P_1d = count_1d / I  # exists zeros
    P_2d = count_2d / I
    temp = P_2d / np.outer(P_1d, P_1d)
    temp[~np.isfinite(temp)] = 1  # zero / zero = zero
    temp[temp == 0] = 1  # avoid log_zero
    PPMI = np.log2(temp)
    PPMI[PPMI < 0] = 0

    # For every location keep (at most) the k locations with the largest
    # positive PPMI. This is a plain list of lists: the rows have different
    # lengths and are appended to below, which an ndarray does not support.
    k = 50
    idx = [[
        j for j in np.asarray(PPMI[i].argsort().T).reshape(-1)[-k:][::-1]
        if PPMI[i, j] > 0
    ] for i in range(L)]
    for u in range(L):
        if u not in idx[u]:
            idx[u].append(u)

    binary = (np.array(train.toarray()) > 0) * 1  # I*L
    MI = np.zeros((L, L))
    from sklearn import metrics
    for u in range(L):
        for w in idx[u]:
            # MI is symmetric; a pair is computed once (while still 0) and mirrored.
            if MI[u, w] == 0:
                MI[u, w] = metrics.mutual_info_score(
                    None,
                    None,
                    contingency=np.histogram2d(binary[:, u], binary[:, w])[0])
                MI[w, u] = MI[u, w]
    MI = normalize_mat_row(MI)
    MI[~np.isfinite(MI)] = 1 / L

    ##########and self transition probability########
    log.info(
        'gridsearching on validation set (can be optimized) with JM smoothing'
    )
    val_result = dict()
    for alpha in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for mu in [0, 0.1, 0.2, 0.3, 0.4, 0.5]:
            trans = MI * (1 - alpha) + np.identity(L) * alpha
            pref = np.dot(mem_mult, trans)  # consider each trans[i] as a base vector
            temp = pref * mu + popularity_mult * (1 - mu)
            val_result[(alpha, mu)] = self._compute_logp_point(val, temp)

    #####choose alpha and mu that achieves best avg. point logP
    alpha, mu = max(val_result, key=val_result.get)
    trans = MI * (1 - alpha) + np.identity(L) * alpha
    pref = np.dot(mem_mult, trans)
    stm_scores = pref * mu + popularity_mult * (1 - mu)

    log.info('Evaluating MI based translation model with JM smoothing')
    stm_result = self._compute_erank_logp(test, stm_scores)
    results['Translation_JM'] = stm_result
    log.info("self transition weight and popularity weight: %f, %f" %
             (alpha, 1 - mu))

    #####record results and mixture parameters########
    logP_p['Translation_JM'], logP_indiv['Translation_JM'] = logP(
        stm_scores, test)
    mix_alpha['Translation_JM'] = np.zeros(I) + mu * alpha

    ##########and self transition probability########
    log.info(
        'gridsearching on validation set (can be optimized) with Dirichlet prior'
    )
    val_result = dict()
    for alpha in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for mu in [0, 0.1, 0.2, 0.3, 0.4, 0.5]:
            trans = MI * (1 - alpha) + np.identity(L) * alpha
            pref = np.dot(mem_scores, trans)  # consider each trans[i] as a base vector
            temp = pref + popularity_mult * mu * N / I
            val_result[(alpha, mu)] = self._compute_logp_point(val, temp)

    #####choose alpha and mu that achieves best avg. point logP
    alpha, mu = max(val_result, key=val_result.get)
    trans = MI * (1 - alpha) + np.identity(L) * alpha
    pref = np.dot(mem_scores, trans)
    stm_scores = pref + popularity_mult * mu * N / I

    log.info('Evaluating MI based translation model with Dirichlet prior')
    stm_result = self._compute_erank_logp(test, stm_scores)
    results['Translation_Dirichlet'] = stm_result
    log.info("self transition weight and prior strength: %f, %f" %
             (alpha, mu * N / I))

    #####record results and mixture parameters########
    logP_p['Translation_Dirichlet'], logP_indiv[
        'Translation_Dirichlet'] = logP(stm_scores, test)
    mix_alpha['Translation_Dirichlet'] = n_train * alpha / (n_train +
                                                            mu * N / I)

    log.info('#############learning EM global#################')
    pi_mem_pop = learn_mix_mult_global(1.1, mem_mult, popularity_mult, val)
    log.info('Global mixing weight is %f and %f' %
             (pi_mem_pop[0], pi_mem_pop[1]))

    log.info('Evaluating EM global')
    em_global_scores = pi_mem_pop[0] * mem_mult + pi_mem_pop[
        1] * popularity_mult
    EM_global_result = self._compute_erank_logp(test, em_global_scores)
    results['EM global'] = EM_global_result
    logP_p['EM global'], logP_indiv['EM global'] = logP(
        em_global_scores, test)
    mix_alpha['EM global'] = pi_mem_pop[0] + np.zeros(I)

    log.info('#############learning EM individual##############')
    pi_mem_pop = learn_mix_mult_on_individual(1.1, mem_mult,
                                              popularity_mult, val)

    log.info('Evaluating EM indiv')
    em_indiv_scores = col_vector(pi_mem_pop[:, 0]) * mem_mult + col_vector(
        pi_mem_pop[:, 1]) * popularity_mult
    EM_indiv_result = self._compute_erank_logp(test, mem_mult,
                                               popularity_mult, pi_mem_pop)
    results['EM indiv'] = EM_indiv_result
    logP_p['EM indiv'], logP_indiv['EM indiv'] = logP(
        em_indiv_scores, test)
    mix_alpha['EM indiv'] = pi_mem_pop[:, 0]

    log.info('#############learning S_memory###################')
    log.info('gridsearching on validation set')
    val_result = dict()
    for alpha in ALPHA:
        temp = mem_scores * alpha + popularity_scores * (1 - alpha)
        val_result[alpha] = self._compute_logp_point(val, temp)

    #####choose alpha that achieves best avg. point logP
    alpha = max(val_result, key=val_result.get)
    print('alpha:', alpha)
    s_mem_scores = mem_scores * alpha + popularity_scores * (1 - alpha)

    log.info('Evaluating smoothed memory')
    s_mem_result = self._compute_erank_logp(test, s_mem_scores)
    # NOTE(review): this key is spelled 'S_Mem' while the DataFrame column is 'S_mem'.
    results['S_Mem'] = s_mem_result

    # n_train is still the per-individual train count computed above.
    temp = n_train.mean()
    logP_p['S_mem'], logP_indiv['S_mem'] = logP(s_mem_scores, test)
    mix_alpha['S_mem'] = alpha * n_train / (alpha * n_train +
                                            (1 - alpha) * temp)

    log.info('############learning with Dirichlet prior#############')
    log.info('gridsearching on validation set')
    val_result = dict()
    for alpha in ALPHA:
        temp = mem_scores + popularity_mult * alpha * N / I
        val_result[alpha] = self._compute_logp_point(val, temp)

    #####choose alpha that achieves best avg. point logP
    alpha = max(val_result, key=val_result.get)
    print('alpha:', alpha)
    dirichlet_scores = mem_scores + popularity_mult * alpha * N / I

    log.info('Evaluating with Dirichlet prior')
    dirichlet_result = self._compute_erank_logp(test, dirichlet_scores)
    results['Dirichlet'] = dirichlet_result
    logP_p['Dirichlet'], logP_indiv['Dirichlet'] = logP(
        dirichlet_scores, test)
    mix_alpha['Dirichlet'] = n_train / (n_train + alpha * N / I)

    self.pretty_print(results)
    return logP_p, logP_indiv, mix_alpha
def evaluate(self, train, val, test, dim, area):
    """
    Grid searches the global mixing weight alpha between memory and popularity.

    Two passes are made over the same alpha grid: first with both components
    trained on train only, then with both trained on train + val. For every
    alpha the mixture alpha * memory + (1 - alpha) * popularity is ranked on
    val and on test and both result tables are printed with
    self.pretty_print.

    Nothing is returned.
    """
    ALPHA = [0, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999, 1]

    def search_alpha(fit_data, banner):
        # Trains both components on fit_data and prints erank on val and test for every alpha.
        mem_scores = self._train_mfs(['memory'], fit_data, dim, area)[0]
        popularity_scores = self._train_mfs(['popularity'], fit_data, dim, area)[0]
        mem_mult = normalize_mat_row(mem_scores)
        popularity_mult = normalize_mat_row(popularity_scores)

        log.info(banner)
        results_val = dict()
        results_test = dict()
        for alpha in ALPHA:
            # '%g' keeps every grid value distinct. With '%.2f' the values 0 / 0.001
            # and 0.999 / 1 collapsed to the same key and overwrote each other.
            key = '%g' % alpha
            log.info('Ranking when alpha is %s' % key)
            scores = alpha * mem_mult + (1 - alpha) * popularity_mult
            results_val[key] = self._compute_erank(val, scores)
            results_test[key] = self._compute_erank(test, scores)

        log.info('Erank on validation data')
        self.pretty_print(results_val)
        log.info('Erank on test data')
        self.pretty_print(results_test)

    search_alpha(train, 'Mem and popularity learnt from training data; searching alpha')

    # NOTE: in this pass the validation numbers are computed on data the
    # components were also trained on.
    search_alpha(train + val, 'Mem and popularity learnt from training and val data; searching alpha')
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Evaluating ranking with global gridsearched mixing weights'
    log.info(description)
def print_methods():
    """
    Logs the keys of _mfs_factory, i.e. the method names that are available.
    """
    available = list(_mfs_factory.keys())
    message = 'Available MF methods: %s' % available
    log.info(message)
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Evaluating logp with global learned mixing weights'
    log.info(description)
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Evaluating ranking on a train+val with smoothing'
    log.info(description)
def __init__(self):
    """Logs which evaluation this object runs."""
    description = 'Evaluating logp on a train+val without smoothing'
    log.info(description)