def learn_scoal_wrapper(**argv):
    """Learn a single SCOAL model at the initial (K, L) and save it to disk.

    All inputs arrive through the option dict parsed by ``read_options``:
    the data matrix ``Z``, its weight/mask matrix ``W``, the row/column/cross
    attribute matrices, the learner, the list of parameter settings, the
    train/test loss functions, and the output ``model_filename``.

    Side effect: writes the trained model to ``model_filename``.
    """
    Z, W, rowAttr, colAttr, crossAttr, learner, params, \
        train_loss, test_loss, \
        num_cv, init_K, init_L, model_filename = read_options(**argv)
    # CSR gives efficient row slicing / arithmetic during training.
    Z = Z.tocsr()
    W = W.tocsr()
    # Single (non-cross-validated) run: use the first parameter setting only.
    param = params[0]
    model = GeneralScoalModel()
    model.set_attributes(rowAttr, colAttr, crossAttr)
    # BUG FIX: the original had this call commented out (and referenced
    # undefined K, L), so the wrapper saved an UNTRAINED model.  Train at the
    # initial co-cluster sizes, mirroring the call shape used in
    # model_selector: learn_scoal(model, Z, W, K, L, learner, param, ...).
    K, L = init_K, init_L
    learn_scoal(model, Z, W, K, L, learner, param, train_loss, test_loss)
    model.save(model_filename)
def model_selector(**argv):
    '''
    select the best model from cv set and parameter options

    For each cross-validation split of the observed entries of W, and for
    each parameter setting in ``params``, trains a SCOAL model starting at
    (init_K, init_L) co-clusters and then greedily grows it: at every step
    it tries adding one row cluster and one column cluster, keeps whichever
    trial lowers the held-out validation loss most, and stops when neither
    helps (or after ``maxIterations`` steps).  Validation losses are pooled
    across folds per (param index, K, L) triple; the best triple is the one
    seen in the most folds with the lowest mean loss.

    Returns
    -------
    best_loss : list
        ``[(param_idx, K, L), mean_validation_loss, num_observations]``.
    save_validation_loss : dict
        Maps ``(param_idx, K, L)`` to the list of per-fold validation losses.
    '''
    # read data from input dict
    Z, W, rowAttr, colAttr, crossAttr, learner, params, \
        train_loss, test_loss, \
        num_cv, init_K, init_L, model_filename = read_options(**argv)

    # Pooled losses per (param index, K, L); one entry appended per CV fold.
    save_validation_loss = defaultdict(list)
    # Indices of the observed (weighted) cells of the data matrix.
    I,J= sp.find(W)[:2]
    num_data = len(I)
    for ix, (trainIdx, validationIdx) in enumerate(validSets(num_data, num_cv)):
        if __debug__:
            print '\nValidation set:',ix
        # for each cv split
        # NOTE(review): the model is created once per fold but then reused and
        # mutated across the parameter loop below — presumably learn_scoal
        # re-initializes it for each param; verify, otherwise state leaks
        # between parameter settings.
        model = GeneralScoalModel()
        model.set_attributes(rowAttr, colAttr, crossAttr)
        train_I = I[trainIdx]
        train_J = J[trainIdx]
        validation_I = I[validationIdx]
        validation_J = J[validationIdx]
        # Rebuild sparse train/validation matrices holding only the cells of
        # each split (same overall shape, disjoint supports).
        Z_training = sp.coo_matrix((np.ravel(Z[(train_I, train_J)]),
                                    (train_I, train_J)), shape=Z.shape).tocsr()
        W_training = sp.coo_matrix((np.ravel(W[(train_I, train_J)]),
                                    (train_I, train_J)), shape=W.shape).tocsr()
        Z_validation = sp.coo_matrix((np.ravel(Z[(validation_I, validation_J)]),
                                      (validation_I, validation_J)), shape=Z.shape).tocsr()
        W_validation = sp.coo_matrix((np.ravel(W[(validation_I, validation_J)]),
                                      (validation_I, validation_J)), shape=W.shape).tocsr()

        # Do model selection
        for jx, param in enumerate(params):
            # for each alpha parameter
            if __debug__:
                print 'parameter:', param
            # Start every parameter setting from the initial co-cluster sizes.
            K = init_K
            L = init_L
            # Train on the full Z but with W_training as the weight mask, so
            # validation cells carry zero weight during fitting.
            learn_scoal(model, Z, W_training, K, L, learner, param, train_loss, test_loss)
            test_Z = np.ravel(Z[(validation_I, validation_J)])
            validation_loss = model.test_loss(test_Z,
                                              model.predict(validation_I, validation_J) )
            save_validation_loss[(jx, K, L)].append(validation_loss)
            if __debug__:
                print "Starting validation loss: %f" % (validation_loss,)
            # Greedy structure search: grow K or L one at a time while the
            # held-out loss keeps improving (bounded by maxIterations,
            # defined at module level).
            for _ in range(maxIterations):
                # Trial split on copies so the current model survives a
                # rejected move.
                row_test_model = model.copy()
                row_split_validation_loss = test_row_split(row_test_model, Z_training, W_training, \
                                                           Z_validation, W_validation, K, \
                                                           L, learner, param, train_loss, test_loss)
                save_validation_loss[(jx, K+1, L)].append(row_split_validation_loss)
                col_test_model = model.copy()
                col_split_validation_loss = test_col_split(col_test_model,
                                                           Z_training, W_training, \
                                                           Z_validation, W_validation, K, \
                                                           L, learner, param, train_loss, test_loss)
                save_validation_loss[(jx, K, L+1)].append(col_split_validation_loss)
                if __debug__:
                    print "Row split loss: %f" % (row_split_validation_loss,)
                    print "Col split loss: %f" % (col_split_validation_loss,)
                # Accept the better of the two trial splits, but only if it
                # strictly improves on the current loss; ties favor the row
                # split (<= in the first condition).
                if row_split_validation_loss <= col_split_validation_loss and row_split_validation_loss < validation_loss:
                    K += 1
                    model = row_test_model
                    validation_loss = row_split_validation_loss
                elif col_split_validation_loss < row_split_validation_loss and col_split_validation_loss < validation_loss:
                    L += 1
                    model = col_test_model
                    validation_loss = col_split_validation_loss
                else:
                    # Neither split helped: stop growing for this parameter.
                    break
                if __debug__:
                    print "k: %d, l: %d" % (K, L)
                    print "Current validation loss: %f" % validation_loss
            if __debug__:
                print "Final k,l: %d,%d" % (K, L)
                print "Final validation loss: %f" % validation_loss

    save_validation_loss = dict(save_validation_loss)
    if __debug__:
        print "\nValidation losses"
        for key, vals in save_validation_loss.iteritems():
            print key, vals
        print "\n"
    '''
    Compute best parameter:
    Sort by length, then by -mean()
    i.e. longest then min(mean(error))
    '''
    # Sort ascending by (count, -mean); the last element is therefore the
    # (param, K, L) reached in the most folds with the smallest mean loss.
    sorted_loss = sorted( [ [ key, -np.array(val).mean(), len(val) ]
                            for key, val in save_validation_loss.iteritems()]
                          , key=itemgetter(2,1) )
    best_loss = sorted_loss[-1]
    # Undo the sign flip used for sorting so callers see the true mean loss.
    best_loss[1] = -best_loss[1]
    print "\nVALIDATION COMPLETE"
    print "Selected parameter", params[best_loss[0][0]]
    print "Selected K:%d, L:%d" % (best_loss[0][1], best_loss[0][2])
    print "Mean validation error %f, with %d observations" % (best_loss[1], best_loss[2])
    return best_loss, save_validation_loss