import scipy.sparse

import billdata


def test_list_gather():
    nusers = 3
    ndays = 20
    nwords = 10
    userdayword = scipy.sparse.rand(nwords,
                                    nusers * ndays,
                                    density=0.1,
                                    format="csc")

    suwd = billdata.suserdayword(
        userdayword, scipy.sparse.csc_matrix(userdayword.transpose()), ndays)

    l = suwd.list()
    assert len(l) == ndays
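
# A minimal sketch (not from the original code) of the per-day split that
# suserdayword.list() presumably performs: the (nwords x nusers*ndays) user-day
# matrix is cut into one (nwords x nusers) column block per day, which is why the
# test expects ndays entries. Only scipy column slicing is assumed here, not any
# billdata internals.
def split_by_day(userdayword, nusers, ndays):
    # column block i holds the user columns for day i
    return [userdayword[:, i * nusers:(i + 1) * nusers] for i in range(ndays)]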
Example #3
import logging.config
import os

import scipy.io

import billdata as bd

logging.config.fileConfig("logconfig.ini")
logger = logging.getLogger(__name__)

# Where the data is coming from and going
user_home = os.environ['HOME']
data_home = "%s/Dropbox/Experiments/twitter_uk_users_MATLAB"%user_home

# load the polls (for the number of days)
p2 = scipy.io.loadmat("%s/Dropbox/TrendMiner/Collaboration/EMNLP_2013/MATLAB_v2/UK_data_for_experiment_PartII.mat"%user_home)
yvalues = p2["polls_vi_cum"][p2["polls_index"]-1,:]

# Load the big dataset
udw = bd.suserdayword(
	"%s/user_vsr_for_polls_t.mat"%data_home,
	"%s/user_vsr_for_polls.mat"%data_home,
	yvalues.shape[0]
)

# extract these days and save
start = 81
delta = 20
end = start + delta
ua,wa = udw.mat(days=(start,end))
matname = "user_vsr_for_polls_day_%d_%d"%(start,end)
outuser = "******"%(data_home,matname)
outword = "%s/%s.mat"%(data_home,matname)
logger.debug("Saving days %d to %d (User Col matrix)"%(start,end))
bd.savesparse(ua,outuser,"%s_t"%matname)
logger.debug("Saving days %d to %d (Word Col matrix)"%(start,end))
bd.savesparse(wa,outword,"%s"%matname)
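
# bd.savesparse itself is not shown in this listing; the helper below is a plausible
# sketch of it (an assumption, not the billdata implementation), mirroring the
# sio.savemat call that experiment() below uses for the corrected user matrix: the
# CSC components are stored under the given variable name so the matrix can later
# be rebuilt with csc_matrix((data, indices, indptr), shape=shape).
import scipy.io as sio

def savesparse_sketch(mat, path, name):
	# nested dicts are written by savemat as a MATLAB struct named `name`
	sio.savemat(path, {
		name: {
			"data": mat.data,
			"indices": mat.indices,
			"indptr": mat.indptr,
			"shape": mat.shape,
		}
	})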
def experiment(o):
	logger.info("Reading initial data")
	start = o["start"]
	ndays = o["ndays"]
	end = start + ndays
	folds = tscv.tsfi(ndays,ntest=o['f_ntest'],nvalidation=o['f_nval'],ntraining=o['f_ntrain'])
	
	tasks = billdata.taskvals(o["task_file"])
	ndays_total = tasks.yvalues.shape[0]
	if o["user_file_corrected"] is None or not os.path.exists(o["user_file_corrected"]):
		logger.info("...Loading and correcting from source")
		if "voc_file" in o and not o["word_subsample"] < 1:
			logger.info("...Reading vocabulary")
			voc = billdata.voc(o["voc_file"]).voc()
			# voc = None
		else:
			voc = None
		logger.info("...Reading user days")
		user_col, word_col = billdata.suserdayword(
			o["user_file"],ndays_total,
			nwords=billdata.count_cols_h5(o["word_file"])
		).mat(
			days=(start,end),
			voc=voc
		)
		if o["user_file_corrected"] is not None:
			logger.info("...Saving corrected user_mat")
			sio.savemat(o["user_file_corrected"], {
				"data": user_col.data,
				"indices": user_col.indices,
				"indptr": user_col.indptr,
				"shape": user_col.shape
			})
	else:
		logger.info("...Loading corrected user_mat")
		# csc_matrix((data, indices, indptr), [shape=(M, N)])
		user_col_d = sio.loadmat(o["user_file_corrected"])
		user_col = ssp.csc_matrix(
			(user_col_d["data"][:, 0], user_col_d["indices"][:, 0], user_col_d["indptr"][:, 0]),
			shape=user_col_d["shape"]
		)
	logger.info("...User Col read, dimensions: %s"%str(user_col.shape))
	logger.info("...Reading task data")
	tasks = tasks.mat(days=(start,end),cols=[3,4,5])
	logger.info("...Reading tree file")
	tree = billdata.tree(o["tree_file"]).spamsobj()

	if o["word_subsample"] < 1 or o["user_subsample"] < 1:
		user_col = billdata.subsample(
			user_col,
			word_subsample=o["word_subsample"],
			user_subsample=o["user_subsample"],
			ndays=ndays
		)
	# At this point we've just loaded all the data
	# Prepare the optimisation functions
	u_lambdas = [float(x) for x in o['u_lambdas_str'].split(",")]
	w_lambdas = [float(x) for x in o['w_lambdas_str'].split(",")]
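	# e.g. a u_lambdas_str of "0.1,1,0.1" expands via np.arange below to [0.1, 0.2, ..., 0.9]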
	u_lambdas = np.arange(*u_lambdas)
	w_lambdas = np.arange(*w_lambdas)
	spams_avail = {
		"tree":FistaTree(tree,**{
			"intercept": True,
			"loss":"square",
			"regul":"multi-task-tree",
			"it0":10,
			"lambda2":1000,
			"max_it":1000,
			"verbose":True
		}),
		"treecheck":FistaTree(tree,**{
			"intercept": True,
			"loss":"square",
			"regul":"multi-task-tree",
			"it0":10,
			"max_it":100,
			"lambda2":1000,
			"verbose":True
		}),
		"flatcheck":FistaFlat(**{
			"intercept": True,
			"loss":"square",
			"regul":"l1l2",
			"it0":50,
			"max_it":100,
			"verbose":True
		}),
		"flat":FistaFlat(**{
			"intercept": True,
			"loss":"square",
			"regul":"l1l2",
			"it0":50,
			"max_it":1000,
			"verbose":True
		})
	}
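	# note: the "*check" entries only differ from "tree"/"flat" in max_it (100 vs 1000),
	# presumably as cheaper configurations for quick sanity runs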

	w_spams = copy.deepcopy(spams_avail[o["w_spams"]])
	u_spams = copy.deepcopy(spams_avail[o["u_spams"]])
	lambda_set = False
	if o["lambda_file"] is not None and os.path.exists(o["lambda_file"]):
		logger.info("... loading existing lambda")
		lambda_d = sio.loadmat(o["lambda_file"])
		w_spams.params["lambda1"] = lambda_d["w_lambda"][0][0]
		u_spams.params["lambda1"] = lambda_d["u_lambda"][0][0]
		lambda_set = True

	# Prepare the learner
	learner = BatchBivariateLearner(w_spams,u_spams,bivar_max_it=o["bivar_max_it"])
	fold_i = 0
	es.exp(os.sep.join([o['exp_out'],"ds:politics_word:l1_user:l1_task:multi"]),fake=False)
	# Go through the folds!
	for fold in folds:
		es.state("fold_%d"%fold_i)
		logger.info("Working on fold: %d"%fold_i)
		logger.info("... preparing fold parts")
		Xparts,Yparts = BatchBivariateLearner.XYparts(fold,user_col,tasks)
		if not o["optimise_lambda_once"] or (o["optimise_lambda_once"] and not lambda_set):
			logger.debug("... Setting max it to optimisation mode: %d"%o["opt_maxit"])
			w_spams.params["max_it"] = o["opt_maxit"]
			u_spams.params["max_it"] = o["opt_maxit"]
			logger.info("... optimising fold lambda")
			ulambda,wlambda = learner.optimise_lambda(
				w_lambdas,u_lambdas,Yparts,Xparts,
				w_lambda=o["w_lambda"],u_lambda=o["u_lambda"]
			)
			lambda_set = True
			if o["lambda_file"] is not None:
				logger.info("... saving optimised lambdas")
				sio.savemat(o["lambda_file"],{"w_lambda":wlambda[1],"u_lambda":ulambda[1]})
		logger.info("... training fold")
		logger.debug("... Setting max it to training mode: %d"%o["train_maxit"])
		w_spams.params["max_it"] = o["train_maxit"]
		u_spams.params["max_it"] = o["train_maxit"]
		learner.process(
			Yparts.train_all,Xparts.train_all,
			tests={
				"test":(Xparts.test,Yparts.test),
				"val_it":(Xparts.val_it,Yparts.val_it)
			}
		)
		es.add(locals(),"fold_i","w_lambdas","u_lambdas","fold","Yparts","o")
		es.state()["w_spams_params"] = w_spams.params 
		es.state()["u_spams_params"] = u_spams.params
		logger.info("... Saving output")
		es.flush()
		fold_i += 1
		if o["f_maxiter"] is not None and fold_i >= o["f_maxiter"]: break
tree_file = "%s/Dropbox/TrendMiner/Collaboration/EMNLP_2013/MATLAB_v2/UK_data_for_experiment_PartI.mat"%home
voc_file = "%s/Dropbox/TrendMiner/Collaboration/EMNLP_2013/MATLAB_v2/voc_matching_v2.mat"%home

start = 81
ndays = 20
end = start + ndays

user_file = os.sep.join([data_home,user_mat_file%(start,end)])
word_file = os.sep.join([data_home,word_mat_file%(start,end)])

folds = tscv.tsfi(ndays,ntest=2)
logger.info("Reading task data")
tasks = billdata.taskvals(task_file).mat(days=(start,end))
# tree = billdata.tree(tree_file).spamsobj()
logger.info("Reading vocabulary")
voc = billdata.voc(voc_file).voc()
user_col, word_col = billdata.suserdayword(
	user_file,word_file,ndays
).mat(voc=voc)

user_col, word_col = billdata.subsample(
	user_col, word_subsample=0.001, user_subsample=0.001, ndays=ndays
)
# At this point we've just loaded all the data
# Prepare the optimisation functions
u_lambdas = np.arange(0.1,1,0.1)
w_lambdas = np.arange(0.1,2,0.1)
w_spams = FistaFlat(**{
	"intercept": True,
	"loss":"square",
	"regul":"l1",
	"it0":10,
	"max_it":1000
})
u_spams = FistaFlat(**{