# Example #1 (score: 0)
def test_expstate():
    """Smoke-test the experiment-state (`es`) API end to end.

    NOTE(review): relies on module-level `es` and `rand` defined elsewhere;
    the print statement makes this Python 2 code.
    """
    # Open/create an experiment namespace to hold the states.
    es.exp("the_experiment")
    # Fill the "default" state; es.state() then returns that current state.
    es.state("default")["cheese"] = 2
    es.state()["blah"] = rand(100, 20)
    es.flush()  # persist the current state
    # Start a second named state and persist it separately.
    es.state("next")["cheese"] = 3
    es.state()["blah"] = rand(100, 20)
    es.flush()
    print es.load_states()
def test_expstate():
	"""Smoke-test the experiment-state (`es`) API (tab-indented duplicate).

	NOTE(review): this file contains a second, space-indented definition of
	test_expstate -- at import time the later definition wins.
	"""
	es.exp("the_experiment")
	# "default" becomes the current state; es.state() then refers to it.
	es.state("default")["cheese"] = 2
	es.state()["blah"] = rand(100,20)
	es.flush()
	# A second named state, flushed separately.
	es.state("next")["cheese"] = 3
	es.state()["blah"] = rand(100,20)
	es.flush()
	print es.load_states()
	def process(self,Y,X=None,Xt=None,tests=None):
		"""Alternately solve for W and U until `bivar_max_it` iterations.

		Args:
			Y: response data handed to self.setYX.
			X: optional design data.
			Xt: optional test design data.
			tests: optional dict of named evaluation splits, passed through
				to calculateW/calculateU.

		Returns:
			sumSSE -- NOTE(review): never updated in this body, so it is
			always 0 as written.

		Side effects: appends per-iteration diagnostics to the experiment
		state's "iterations" list and updates self.u/self.w and their biases.
		"""
		self.setYX(Y,X,Xt)
		bivariter = 0
		sumSSE = 0
		esiter = list()
		# Expose the iteration log through the experiment state so a later
		# flush persists it.
		es.state()["iterations"] = esiter
		# in the first iteration we calculate W by using ones on U
		U = ssp.csc_matrix(ones(self.u.shape))
		while True:
			esiterdict = dict()
			esiterdict["i"] = bivariter
			logger.debug("Starting iteration: %d"%bivariter)
			bivariter += 1
			# Solve W holding U fixed; err may contain a "test" entry with
			# SSE diagnostics for each split supplied in `tests`.
			W,w_bias,err = self.calculateW(U,tests=tests)
			esiterdict["w"] = W
			esiterdict["w_sparcity"] = (abs(W) > 0).sum()  # non-zero count
			esiterdict["w_bias"] = w_bias
			esiterdict["w_test_err"] = err
			if "test" in err: logger.debug("W sparcity=%d,test_total_err=%2.2f,test_err=%s"%(esiterdict["w_sparcity"],err['test']["totalsse"],str(err['test']["diffsse"])))
			W = ssp.csc_matrix(W)
			# Solve U holding the freshly computed W fixed.
			U,u_bias,err = self.calculateU(W,tests=tests)
			esiterdict["u"] = U
			esiterdict["u_sparcity"] = (abs(U) > 0).sum()
			esiterdict["u_bias"] = u_bias
			esiterdict["u_test_err"] = err
			if "test" in err: logger.debug("U sparcity=%d,test_total_err=%2.2f,test_err=%s"%(esiterdict["u_sparcity"],err['test']["totalsse"],str(err['test']["diffsse"])))
			U = ssp.csc_matrix(U)
			self.u = U
			self.w = W
			self.w_bias = w_bias
			self.u_bias = u_bias
			esiter += [esiterdict]
			# Fixed iteration budget; no convergence criterion is checked.
			if self.allParams['bivar_max_it'] <= bivariter:
				break
		return sumSSE
# Example #4 (score: 0)
	def optimise(self,spamsfunc, lambda_rng, x_parts, y_parts,name="opt"):
		"""Grid-search `spamsfunc`'s lambda1 over `lambda_rng`.

		Each candidate is fit on the train split and scored on the
		validation split via self.errorfunc; every candidate is recorded in
		the experiment state under state["lambdasearch"][name]. Finally
		spamsfunc.params['lambda1'] is pinned to the best value, which is
		also returned.
		"""
		best_err = None
		best_lambda = None
		# Fetch (or lazily create) the shared lambda-search log, then carve
		# out a fresh sub-log for this run under `name`.
		root_state = es.state().get("lambdasearch",dict())
		es.state()["lambdasearch"] = root_state
		run_log = dict()
		root_state[name] = run_log
		for key in ("thetas","biases","errors","lambdas","project","sparcity"):
			run_log[key] = list()
		n_candidates = len(lambda_rng)
		for lmbda_i, lmbda in enumerate(lambda_rng):
			logger.debug("Testing lambda %2.5f (%d/%d)"%(lmbda,lmbda_i,n_candidates))
			logger.debug("Calling SPAMS function")
			spamsfunc.params['lambda1'] = lmbda
			theta_new,bias = spamsfunc.call(
				x_parts.train,
				y_parts.train
			)
			# Count coefficients that are effectively zero.
			theta_sparse = (abs(theta_new) < 0.00001).sum()
			logger.debug("Calculating error")
			err = self.errorfunc.evaluate(
				x_parts.val_param,
				y_parts.val_param,
				theta_new,
				bias
			)
			run_log["thetas"].append(theta_new)
			run_log["biases"].append(bias)
			run_log["errors"].append(err)
			run_log["lambdas"].append(lmbda)
			run_log["sparcity"].append(theta_sparse)
			# Track the lambda with the lowest total validation SSE.
			if best_err is None or err["totalsse"] < best_err:
				logger.debug("New min error detected.")
				best_err = err["totalsse"]
				best_lambda = lmbda
			logger.debug("Lambda = %2.5f, Error = %2.5f, f=%d"%(lmbda,err["totalsse"],theta_sparse))
		logger.debug("Lambda = %2.5f, Error = %2.5f"%(best_lambda,best_err))
		spamsfunc.params['lambda1'] = best_lambda
		return best_lambda
 def optimise(self, spamsfunc, lambda_rng, x_parts, y_parts, name="opt"):
     """Grid-search `spamsfunc`'s lambda1 over `lambda_rng`.

     Fits each candidate on the train split, scores it on the validation
     split via self.errorfunc, and records every candidate in the
     experiment state under state["lambdasearch"][name]. Finally pins
     spamsfunc.params['lambda1'] to the best lambda and returns it.
     """
     min_err = None
     min_lambda = None
     # Fetch (or lazily create) the shared search log, then register a
     # fresh per-run sub-dict under `name`.
     search_state = es.state().get("lambdasearch", dict())
     es.state()["lambdasearch"] = search_state
     search_state[name] = dict()
     search_state = search_state[name]
     search_state["thetas"] = list()
     search_state["biases"] = list()
     search_state["errors"] = list()
     search_state["lambdas"] = list()
     search_state["project"] = list()  # NOTE(review): never appended below
     search_state["sparcity"] = list()
     for lmbda_i in range(len(lambda_rng)):
         lmbda = lambda_rng[lmbda_i]
         logger.debug("Testing lambda %2.5f (%d/%d)" %
                      (lmbda, lmbda_i, len(lambda_rng)))
         logger.debug("Calling SPAMS function")
         spamsfunc.params['lambda1'] = lmbda
         theta_new, bias = spamsfunc.call(x_parts.train, y_parts.train)
         # Count of coefficients that are effectively zero.
         theta_sparse = (abs(theta_new) < 0.00001).sum()
         logger.debug("Calculating error")
         err = self.errorfunc.evaluate(x_parts.val_param, y_parts.val_param,
                                       theta_new, bias)
         search_state["thetas"] += [theta_new]
         search_state["biases"] += [bias]
         search_state["errors"] += [err]
         search_state["lambdas"] += [lmbda]
         search_state["sparcity"] += [theta_sparse]
         # Track the lambda with the lowest total validation SSE.
         if min_err is None or err["totalsse"] < min_err:
             logger.debug("New min error detected.")
             min_err = err["totalsse"]
             min_lambda = lmbda
         logger.debug("Lambda = %2.5f, Error = %2.5f, f=%d" %
                      (lmbda, err["totalsse"], theta_sparse))
     logger.debug("Lambda = %2.5f, Error = %2.5f" % (min_lambda, min_err))
     spamsfunc.params['lambda1'] = min_lambda
     return min_lambda
 def process(self, Y, X=None, Xt=None, tests=None):
     """Alternately solve for W and U for `bivar_max_it` iterations.

     Diagnostics for every iteration are appended to the experiment
     state's "iterations" list; self.u/self.w and their biases hold the
     final factors. Returns sumSSE -- NOTE(review): never updated in this
     body, so it is always 0 as written.
     """
     self.setYX(Y, X, Xt)
     bivariter = 0
     sumSSE = 0
     esiter = list()
     # Expose the per-iteration log via the experiment state so a later
     # flush persists it.
     es.state()["iterations"] = esiter
     # in the first iteration we calculate W by using ones on U
     U = ssp.csc_matrix(ones(self.u.shape))
     while True:
         esiterdict = dict()
         esiterdict["i"] = bivariter
         logger.debug("Starting iteration: %d" % bivariter)
         bivariter += 1
         # Solve W with U fixed; err may carry a "test" entry with SSE
         # diagnostics for each split supplied in `tests`.
         W, w_bias, err = self.calculateW(U, tests=tests)
         esiterdict["w"] = W
         esiterdict["w_sparcity"] = (abs(W) > 0).sum()  # non-zero count
         esiterdict["w_bias"] = w_bias
         esiterdict["w_test_err"] = err
         if "test" in err:
             logger.debug(
                 "W sparcity=%d,test_total_err=%2.2f,test_err=%s" %
                 (esiterdict["w_sparcity"], err['test']["totalsse"],
                  str(err['test']["diffsse"])))
         W = ssp.csc_matrix(W)
         # Solve U with the freshly computed W fixed.
         U, u_bias, err = self.calculateU(W, tests=tests)
         esiterdict["u"] = U
         esiterdict["u_sparcity"] = (abs(U) > 0).sum()
         esiterdict["u_bias"] = u_bias
         esiterdict["u_test_err"] = err
         if "test" in err:
             logger.debug(
                 "U sparcity=%d,test_total_err=%2.2f,test_err=%s" %
                 (esiterdict["u_sparcity"], err['test']["totalsse"],
                  str(err['test']["diffsse"])))
         U = ssp.csc_matrix(U)
         self.u = U
         self.w = W
         self.w_bias = w_bias
         self.u_bias = u_bias
         esiter += [esiterdict]
         # Fixed iteration budget; no convergence criterion is checked.
         if self.allParams['bivar_max_it'] <= bivariter:
             break
     return sumSSE
	"it0":10,
	"max_it":1000,
	"lambda2":0.5,
	"lambda1":0.3
})
# Solver for U: elastic-net-regularised square loss (SPAMS FistaFlat).
u_spams = FistaFlat(**{
	"intercept": True,
	"loss":"square",
	"regul":"elastic-net",
	"max_it":1000,
	"lambda2":0.5,
	"lambda1":0.3
})

# Record into a real (non-fake) experiment under a state named "random".
es.exp("randomExp",fake=False)
es.state("random")
# Synthetic bilinear data generator -- NOTE(review): parameter semantics
# (brng/wrng/urng ranges, wu_sparcity) defined in randomgen; confirm there.
gen = randomgen.RandomBiGen(noise=0.01,brng=(100,1000),ntasks=1,wu_sparcity=0.6,wrng=(2,3),urng=(2,3),nusers=100,nwords=400)
x,y = gen.generate(n=1000)
# Stack the generated samples into one sparse column-major design matrix.
x = ssp.csc_matrix(vstack(x).T)
y = array(y)

# Take only the first time-series fold: 900 training / 100 test samples.
fold = [f for f in tscv.tsfi(y.shape[0],ntest=100,ntraining=900)][0]
Xparts,Yparts = BatchBivariateLearner.XYparts(fold,x,y)

# Train with at most 10 alternating W/U iterations.
learner = BatchBivariateLearner(w_spams,u_spams,bivar_max_it=10)
learner.process(Yparts.train_all,Xparts.train_all,tests={"test":(Xparts.test,Yparts.test)})

# Compare learned factors against the generator's ground truth (Python 2).
print learner.w.todense()
print gen._w
print learner.u.todense()
print gen._u
def experiment(o):
	"""Run one full bivariate-learning experiment from option dict `o`.

	Loads the user/day/word matrix (rebuilding and optionally caching a
	corrected copy), reads task values and the regularisation tree, builds
	the SPAMS solvers named by o["w_spams"]/o["u_spams"], then per
	time-series fold optionally optimises lambda1, trains the learner, and
	flushes per-fold results to the experiment state.

	NOTE(review): the full set of keys consumed from `o` (paths, fold
	sizes, subsampling ratios, iteration caps) should be confirmed against
	the caller that builds the option dict.
	"""
	logger.info("Reading initial data")
	start = o["start"];ndays = o["ndays"];end = start + ndays
	# Time-series folds over the selected window of days.
	folds = tscv.tsfi(ndays,ntest=o['f_ntest'],nvalidation=o['f_nval'],ntraining=o['f_ntrain'])

	tasks = billdata.taskvals(o["task_file"])
	ndays_total = tasks.yvalues.shape[0]
	# Rebuild the user matrix from source unless a corrected copy exists.
	if o["user_file_corrected"] is None or not os.path.exists(o["user_file_corrected"]):
		logger.info("...Loading and correcting from source")
		if "voc_file" in o and not o["word_subsample"] < 1:
			logger.info("...Reading vocabulary")
			voc = billdata.voc(o["voc_file"]).voc()
			# voc = None
		else:
			voc = None
		logger.info("...Reading user days")
		user_col, word_col = billdata.suserdayword(
			o["user_file"],ndays_total,
			nwords=billdata.count_cols_h5(o["word_file"])
		).mat(
			days=(start,end),
			voc=voc
		)
		if o["user_file_corrected"] is not None:
			logger.info("...Saving corrected user_mat")
			# Persist raw CSC components so the matrix can be rebuilt
			# exactly on the next run.
			sio.savemat(o["user_file_corrected"],{"data":user_col.data,"indices":user_col.indices,"indptr":user_col.indptr,"shape":user_col.shape})
	else:
		logger.info("...Loading corrected user_mat")
		# csc_matrix((data, indices, indptr), [shape=(M, N)])
		user_col_d = sio.loadmat(o["user_file_corrected"])
		user_col = ssp.csc_matrix((user_col_d["data"][:,0],user_col_d["indices"][:,0],user_col_d["indptr"][:,0]),shape=user_col_d["shape"])
	logger.info("...User Col read, dimensions: %s"%str(user_col.shape))
	logger.info("...Reading task data")
	tasks = tasks.mat(days=(start,end),cols=[3,4,5])
	logger.info("...Reading tree file")
	tree = billdata.tree(o["tree_file"]).spamsobj()

	if o["word_subsample"] < 1 or o["user_subsample"] < 1:
		user_col=billdata.subsample(user_col,word_subsample=o["word_subsample"],user_subsample=o["user_subsample"],ndays=ndays)
	# At this point we've just loaded all the data
	# Prepare the optimisation functions
	# Lambda ranges arrive as "start,stop,step" strings fed to np.arange.
	u_lambdas = [float(x) for x in o['u_lambdas_str'].split(",")]
	w_lambdas = [float(x) for x in o['w_lambdas_str'].split(",")]
	u_lambdas = np.arange(*u_lambdas)
	w_lambdas = np.arange(*w_lambdas)
	# Candidate SPAMS solvers; the *check variants cap max_it at 100.
	spams_avail = {
		"tree":FistaTree(tree,**{
			"intercept": True,
			"loss":"square",
			"regul":"multi-task-tree",
			"it0":10,
			"lambda2":1000,
			"max_it":1000,
			"verbose":True
		}),
		"treecheck":FistaTree(tree,**{
			"intercept": True,
			"loss":"square",
			"regul":"multi-task-tree",
			"it0":10,
			"max_it":100,
			"lambda2":1000,
			"verbose":True
		}),
		"flatcheck":FistaFlat(**{
			"intercept": True,
			"loss":"square",
			"regul":"l1l2",
			"it0":50,
			"max_it":100,
			"verbose":True
		}),
		"flat":FistaFlat(**{
			"intercept": True,
			"loss":"square",
			"regul":"l1l2",
			"it0":50,
			"max_it":1000,
			"verbose":True
		})
	}

	# Deep-copy so per-fold parameter tweaks don't mutate spams_avail.
	w_spams = copy.deepcopy(spams_avail[o["w_spams"]])
	u_spams = copy.deepcopy(spams_avail[o["u_spams"]])
	lambda_set = False
	# Restore previously optimised lambdas when a lambda file is present.
	if o["lambda_file"] is not None and os.path.exists(o["lambda_file"]):
		logger.info("... loading existing lambda")
		lambda_d = sio.loadmat(o["lambda_file"])
		w_spams.params["lambda1"] = lambda_d["w_lambda"][0][0]
		u_spams.params["lambda1"] = lambda_d["u_lambda"][0][0]
		lambda_set = True

	# Prepare the learner
	learner = BatchBivariateLearner(w_spams,u_spams,bivar_max_it=o["bivar_max_it"])
	fold_i = 0
	es.exp(os.sep.join([o['exp_out'],"ds:politics_word:l1_user:l1_task:multi"]),fake=False)
	# Go through the folds!
	for fold in folds:
		# One named state per fold keeps per-fold results separate.
		es.state("fold_%d"%fold_i)
		logger.info("Working on fold: %d"%fold_i)
		logger.info("... preparing fold parts")
		Xparts,Yparts = BatchBivariateLearner.XYparts(fold,user_col,tasks)
		# Re-optimise lambda unless told to do it once and it's already set.
		if not o["optimise_lambda_once"] or (o["optimise_lambda_once"] and not lambda_set):
			logger.debug("... Setting max it to optimisation mode: %d"%o["opt_maxit"])
			w_spams.params["max_it"] = o["opt_maxit"]
			u_spams.params["max_it"] = o["opt_maxit"]
			logger.info("... optimising fold lambda")
			ulambda,wlambda = learner.optimise_lambda(
				w_lambdas,u_lambdas,Yparts,Xparts,
				w_lambda=o["w_lambda"],u_lambda=o["u_lambda"]
			)
			lambda_set = True
			if o["lambda_file"] is not None:
				logger.info("... saving optimised lambdas")
				sio.savemat(o["lambda_file"],{"w_lambda":wlambda[1],"u_lambda":ulambda[1]})
		logger.info("... training fold")
		logger.debug("... Setting max it to training mode: %d"%o["train_maxit"])
		w_spams.params["max_it"] = o["train_maxit"]
		u_spams.params["max_it"] = o["train_maxit"]
		learner.process(
			Yparts.train_all,Xparts.train_all,
			tests={
				"test":(Xparts.test,Yparts.test),
				"val_it":(Xparts.val_it,Yparts.val_it)
			}
		)
		# Snapshot selected locals and solver params into the fold state.
		es.add(locals(),"fold_i","w_lambdas","u_lambdas","fold","Yparts","o")
		es.state()["w_spams_params"] = w_spams.params
		es.state()["u_spams_params"] = u_spams.params
		logger.info("... Saving output")
		es.flush()
		fold_i += 1
		if o["f_maxiter"] is not None and fold_i >= o["f_maxiter"]: break
    **{
        "intercept": True,
        "loss": "square",
        "regul": "elastic-net",
        "max_it": 1000,
        "lambda2": 0.5
    })

# Prepare the learner
learner = BatchBivariateLearner(w_spams, u_spams)
fold_i = 0
# Open the experiment output location for this dataset/regulariser combo.
es.exp("%s/Experiments/EMNLP2013/ds:politics_word:l1_user:l1_task:multi" %
       home)
# Go through the folds!
for fold in folds:
    # One named state per fold keeps per-fold results separate.
    es.state("fold_%d" % fold_i)
    logger.info("Working on fold: %d" % fold_i)
    logger.info("... preparing fold parts")
    Xparts, Yparts = BatchBivariateLearner.XYparts(fold, user_col, tasks)
    logger.info("... optimising fold lambda")
    learner.optimise_lambda(w_lambdas, u_lambdas, Yparts, Xparts)
    logger.info("... training fold")
    learner.process(Yparts.train_all,
                    Xparts.train_all,
                    tests={
                        "test": (Xparts.test, Yparts.test),
                        "val_it": (Xparts.val_it, Yparts.val_it)
                    })
    # Snapshot selected locals and solver params into the fold state.
    # NOTE(review): fold_i is never incremented in this excerpt, so every
    # iteration writes to "fold_0" -- likely truncated from a fuller loop.
    es.add(locals(), "fold_i", "w_lambdas", "u_lambdas", "fold", "Yparts")
    es.state()["w_spams_params"] = w_spams.params
    es.state()["u_spams_params"] = u_spams.params
        "max_it": 1000,
        "lambda2": 0.5,
        "lambda1": 0.3
    })
# Solver for U: elastic-net-regularised square loss (SPAMS FistaFlat).
u_spams = FistaFlat(
    **{
        "intercept": True,
        "loss": "square",
        "regul": "elastic-net",
        "max_it": 1000,
        "lambda2": 0.5,
        "lambda1": 0.3
    })

# Record into a real (non-fake) experiment under a state named "random".
es.exp("randomExp", fake=False)
es.state("random")
# Synthetic bilinear data generator -- NOTE(review): parameter semantics
# (brng/wrng/urng ranges, wu_sparcity) defined in randomgen; confirm there.
gen = randomgen.RandomBiGen(noise=0.01,
                            brng=(100, 1000),
                            ntasks=1,
                            wu_sparcity=0.6,
                            wrng=(2, 3),
                            urng=(2, 3),
                            nusers=100,
                            nwords=400)
x, y = gen.generate(n=1000)
# Stack the generated samples into one sparse column-major design matrix.
x = ssp.csc_matrix(vstack(x).T)
y = array(y)

# Take only the first time-series fold: 900 training / 100 test samples.
fold = [f for f in tscv.tsfi(y.shape[0], ntest=100, ntraining=900)][0]
Xparts, Yparts = BatchBivariateLearner.XYparts(fold, x, y)
})
# Solver for U: elastic-net-regularised square loss (SPAMS FistaFlat);
# lambda1 is set later by lambda optimisation.
u_spams = FistaFlat(**{
	"intercept": True,
	"loss":"square",
	"regul":"elastic-net",
	"max_it":1000,
	"lambda2":0.5
})

# Prepare the learner
learner = BatchBivariateLearner(w_spams,u_spams)
fold_i = 0
# Open the experiment output location for this dataset/regulariser combo.
es.exp("%s/Experiments/EMNLP2013/ds:politics_word:l1_user:l1_task:multi"%home)
# Go through the folds!
for fold in folds:
	# One named state per fold keeps per-fold results separate.
	es.state("fold_%d"%fold_i)
	logger.info("Working on fold: %d"%fold_i)
	logger.info("... preparing fold parts")
	Xparts,Yparts = BatchBivariateLearner.XYparts(fold,user_col,tasks)
	logger.info("... optimising fold lambda")
	learner.optimise_lambda(w_lambdas,u_lambdas,Yparts,Xparts)
	logger.info("... training fold")
	learner.process(Yparts.train_all,Xparts.train_all,tests={"test":(Xparts.test,Yparts.test),"val_it":(Xparts.val_it,Yparts.val_it)})
	# Snapshot selected locals and solver params into the fold state.
	es.add(locals(),"fold_i","w_lambdas","u_lambdas","fold","Yparts")
	es.state()["w_spams_params"] = w_spams.params 
	es.state()["u_spams_params"] = u_spams.params
	logger.info("Saving output")
	es.flush()
	fold_i += 1
	# NOTE(review): unconditional break -- only the first fold is run.
	break