from math import sqrt
from time import time
import numpy as np

# NOTE: CompetitorSetCollection is assumed to be imported from the project's
# data module; that import is not shown in this snippet.

# get data (Tobi)
testing = True
dataobject = CompetitorSetCollection(testing=testing, validation=False)
#print dataobject.get_nsamples()  # N
#print dataobject.get_sample(17)  # yields a competitorset
# TODO: Tobi - put your stuff here

# get featuremethod (Ron)
from features.user_features import FeatureGetter
fg = FeatureGetter(testing)
#print fg.get_features(907345, 907345, 1)
dimension = fg.get_dimension()

# create SGD object, sample different competitorsets, and do learning
from gradientdescent import SGDLearning
from gradientdescent_personalization import SGDLearningPersonalized
import random

# it is better to load all competitorsets at once and then do learning fast
traindataobject = CompetitorSetCollection(testing=True, validation=False)
Ntrain = traindataobject.get_nsamples()
competitorsets_train = [traindataobject.get_sample(i) for i in xrange(Ntrain)]
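# A minimal sketch of the learning loop that could follow, assuming the plain
# SGDLearning constructor takes (dimension, feature_function) like its
# personalized variants, and that update() accepts the eta/lambda keywords
# used in the training code elsewhere in this repo. The step size and
# regularizers below are illustrative placeholders, not tuned settings.
sgd = SGDLearning(dimension, fg.get_features)
for epoch in xrange(3):
    # reshuffle the training competitorsets each epoch
    random.shuffle(competitorsets_train)
    for cs in competitorsets_train:
        sgd.update(cs, eta=0.01, lambda_winner=0.1, lambda_reject=0.1)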
import time
import cPickle
import numpy as np
import MySQLdb

# NOTE: get_sqler, FeatureGetter, load_data_for_user, and the MPI rank/size
# of this process (comm_rank, comm_size) are assumed to come from the
# project's own modules; those imports are not shown in this snippet.


class OuterProductDumper():

    NUM_COUCHREQUESTS = 10928173
    START_OFFSET = 0

    def __init__(self):
        sqler = get_sqler()
        self.sq = sqler.db
        self.cursor = self.sq.cursor()
        self.fg = FeatureGetter()
        self.dump_table = 'outer_products'
        self.request = "INSERT INTO " + self.dump_table + \
            " (req_id, data) VALUES (%s, %s)"

        # Partition the couchrequests evenly across the MPI nodes.
        req_per_node = self.NUM_COUCHREQUESTS / comm_size
        lower = comm_rank * req_per_node + self.START_OFFSET
        upper = req_per_node
        print 'node %d computes %d to %d' % (comm_rank, lower, lower + upper)
        if comm_rank == comm_size - 1:
            # The last node just takes the rest.
            upper *= 2

        # Get the req_ids that are already dumped, so we can skip them.
        req = "SELECT req_id FROM " + self.dump_table
        self.cursor.execute(req)
        ids = self.cursor.fetchall()
        self.existent_req_ids = [int(x[0]) for x in ids]

        # Fetch this node's slice of (req_id, host_user_id, surf_user_id).
        request = "SELECT id, host_user_id, surf_user_id FROM couchrequest LIMIT " \
            + str(lower) + ", " + str(upper)
        print 'get the req_user_map database...'
        t = time.time()
        self.cursor.execute(request)
        t -= time.time()
        print 'took %f secs' % (-t)
        rows = self.cursor.fetchall()
        self.req_user_map = {int(row[0]): (int(row[1]), int(row[2]))
                             for row in rows}
        print 'len user_map: %d' % len(self.req_user_map)

        # Drop requests whose outer products are already in the table.
        for r in self.existent_req_ids:
            if r in self.req_user_map:
                self.req_user_map.pop(r)
        print 'reduced len user_map: %d' % len(self.req_user_map)
        #embed()

    def dump_outer_product(self, datas):
        try:
            self.cursor.executemany(self.request, datas)
        except MySQLdb.IntegrityError:
            # Rows for these req_ids already exist; skip them.
            pass

    def commit(self):
        self.sq.commit()

    def get_dicts(self, req_id):
        user1 = self.req_user_map[req_id][0]
        dict1 = load_data_for_user(self.cursor, user1)
        user2 = self.req_user_map[req_id][1]
        dict2 = load_data_for_user(self.cursor, user2)
        return (dict1, dict2)

    def get_features(self, req_id):
        (dict1, dict2) = self.get_dicts(req_id)
        data = self.fg.get_features_from_dct(dict1, dict2, req_id)
        # Store only the indices of the non-zero feature entries.
        return np.nonzero(data)[0].tolist()

    def execute(self):
        total_time = 0
        counter = 0
        all_keys = self.req_user_map.keys()
        datas = []
        for req_idx, req_id in enumerate(all_keys):
            t = time.time()
            data = self.get_features(req_id)
            #print 'took %f sec'%(-t)
            counter += 1
            #print '%d dumps 100 rows'%comm_rank
            thedata = cPickle.dumps(data)
            datas.append((req_id, thedata))
            t -= time.time()
            # Flush the accumulated rows in batches of 10000.
            if counter % 10000 == 0 or req_idx == len(all_keys) - 1:
                print '%s finished %s/%s' % (comm_rank, counter, len(all_keys))
                self.dump_outer_product(datas)
                datas = []
            total_time -= t
        print 'mean time: %f sec' % (total_time / float(counter))
        t = time.time()
        self.commit()
        t -= time.time()
        print 'commit took %f sec' % (-t)
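# A minimal driver sketch, assuming this module is launched once per MPI rank
# (e.g. "mpirun -n 8 python <this_module>.py"): each rank dumps its own slice
# of the couchrequests and commits the batched INSERTs at the end.
if __name__ == '__main__':
    dumper = OuterProductDumper()
    dumper.execute()
    dumper.commit()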
# NOTE: this function relies on module-level names defined elsewhere in this
# file: comm/comm_rank/comm_size (MPI), safebarrier, LOOK_AHEAD_LENGTH,
# RON_MODE, FeatureGetter, get_sqler, CompetitorSetCollection, the SGD
# classes, and the os/time/random/numpy/pickle imports.


def run(cfg):
    lambdas = cfg.lambdas
    memory_for_personalized_parameters = cfg.memory_for_personalized_parameters
    percentage = cfg.train_percentage
    outer_iterations = cfg.outer_iterations
    nepoches = cfg.nepoches
    alpha = cfg.alpha
    beta = cfg.beta
    verbose = cfg.verbose
    personalization = cfg.personalization
    rhostsize = cfg.rhostsize
    just_winning_sets = cfg.just_winning_sets
    testing = cfg.testing
    dirname = cfg.train_dirname

    if comm_rank == 0:
        print "using lambdas:", lambdas

    fg = FeatureGetter()
    if cfg.god_mode:
        featuredimension = 1
    else:
        featuredimension = fg.get_dimension()
    get_feature_function = fg.get_features

    sq = get_sqler()
    overallnum_sets = sq.get_num_compsets("train")
    num_sets = int(overallnum_sets * percentage)

    # Load the competitorsets in groups of three machines at a time, so that
    # not all nodes hit the database at once.
    for i in range(2, comm_size + 2, 3):
        if comm_rank == i or comm_rank == i - 1 or comm_rank == i - 2:
            print ("Machine %d/%d - Start loading %d competitorsets for TRAIN"
                   % (comm_rank + 1, comm_size, num_sets))
            t0 = time.time()
            cs_train = CompetitorSetCollection(num_sets=num_sets, mode="train")
            t1 = time.time()
            print ("Machine %d/%d - Finished loading the competitorsets for TRAIN."
                   % (comm_rank + 1, comm_size))
            print "Loading competitorsets took %s secs." % (t1 - t0)
        safebarrier(comm)

    # Stagger the nodes so that we don't kill the database.
    sec = comm_rank
    print "machine %d is sleeping for %d sec." % (comm_rank, sec)
    time.sleep(sec)

    trainerrors = np.zeros((len(lambdas), len(lambdas)))
    testerrors = np.zeros((len(lambdas), len(lambdas)))
    trainmeannrank = np.zeros((len(lambdas), len(lambdas)))
    testmeannrank = np.zeros((len(lambdas), len(lambdas)))

    filename = None  # set on rank 0 once parameters have been stored
    for lw in range(len(lambdas)):
        lambda_winner, lambda_reject = lambdas[lw]

        # Create the SGD object.
        if personalization:
            sgd = SGDLearningPersonalized(featuredimension, get_feature_function,
                                          memory_for_personalized_parameters)
        else:
            sgd = SGDLearningRHOSTHASH(featuredimension, get_feature_function,
                                       rhostsize=rhostsize)

        N = cs_train.get_nsamples()
        niter = int(N * nepoches)

        for outit in range(outer_iterations):
            # For each outer iteration we draw new samples iid per node.
            sampleindices = []
            for _ in range(int(nepoches) + 1):
                sampleindices += range(N)
            random.shuffle(sampleindices)

            # Prefetch the outer products for the first LOOK_AHEAD_LENGTH samples.
            update_lookahead_cnt = 0
            req_ids = cs_train.get_req_ids_for_samples(sampleindices[0:LOOK_AHEAD_LENGTH])
            fg.upt_out_prod_get(req_ids)

            for innerit in range(niter):
                i = outit * niter + innerit
                eta_t = 1 / sqrt(alpha + i * beta)
                if not i % (niter / 5):
                    print ("Machine %d/%d - Iterations out: %d/%d - in: %d/%d - eta %f - lambda %f"
                           % (comm_rank, comm_size, outit + 1, outer_iterations,
                              innerit + 1, niter, eta_t, lambda_winner))

                update_lookahead_cnt += 1
                if update_lookahead_cnt == LOOK_AHEAD_LENGTH:
                    req_ids = cs_train.get_req_ids_for_samples(
                        sampleindices[innerit:innerit + LOOK_AHEAD_LENGTH])
                    fg.upt_out_prod_get(req_ids)
                    update_lookahead_cnt = 0

                # Draw a random sample - UPDATE: now we first get a random
                # permutation, then walk through it.
                sampleindex = sampleindices[innerit]
                competitorset = cs_train.get_sample(sampleindex)
                for l in competitorset.get_surferlist():
                    assert l[1] in req_ids

                if verbose and not i % (niter / 5) and i > 1:
                    print ("Iterations \n\tout: %d/%d \n\tin: %d/%d - eta %f - lambda %f"
                           % (outit + 1, outer_iterations, innerit + 1, niter,
                              eta_t, lambda_winner))
                    print "\ttheta", min(sgd.theta), max(sgd.theta)
                    print "\tr", sgd.r
                    print "\tr_hosts", min(sgd.r_hosts), max(sgd.r_hosts)
                    print "\ttrue", competitorset.get_winner()
                    print "\tpredicted", sgd.predict(competitorset)
                    print "\tranking", sgd.rank(competitorset)

                sgd.update(competitorset, eta=eta_t,
                           lambda_winner=lambda_winner, lambda_reject=lambda_reject)

            # Now we aggregate theta(_hosts) and r(_hosts) across the nodes.
            print ("outer iteration %d/%d: node %d at safebarrier"
                   % (outit + 1, outer_iterations, comm_rank))
            safebarrier(comm)
            if comm_rank == 0:
                print "all nodes arrived and we start allreduce/broadcasting"
            theta = comm.allreduce(sgd.theta) / float(comm_size)
            if comm_rank == 0:
                print "allreduce done for theta"
            if personalization:
                theta_hosts = comm.allreduce(sgd.theta_hosts) / float(comm_size)
                if comm_rank == 0:
                    print "allreduce done for theta_hosts"
            r = comm.allreduce(sgd.r) / float(comm_size)
            if comm_rank == 0:
                print "allreduce done for r"
            r_hosts = comm.allreduce(sgd.r_hosts) / float(comm_size)
            if comm_rank == 0:
                print "allreduce done for r_hosts"
                print "spreading mean of parameters done!"
            # Every node continues from the averaged parameters.
            sgd.theta = theta
            if personalization:
                sgd.theta_hosts = theta_hosts
            sgd.r = r
            sgd.r_hosts = r_hosts

        print "done with training"

        # Store the parameters to /tscratch/tmp/csrec
        if comm_rank == 0:
            if os.path.exists("/tscratch"):
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
            filename = ("parameters_lwin_%f_lrej_%f_testing_%d_personalized_%d_"
                        "numsets_%d_outerit_%d_nepoches_%d.pkl"
                        % (lambda_winner, lambda_reject, testing, personalization,
                           num_sets, outer_iterations, nepoches))
            if not RON_MODE:
                os.system("chmod -R 777 " + dirname)
            if personalization:
                pickle.dump((sgd.theta, sgd.theta_hosts, sgd.r, sgd.r_hosts),
                            open(dirname + filename, "wb"))
            else:
                pickle.dump((sgd.theta, sgd.r, sgd.r_hosts),
                            open(dirname + filename, "wb"))
            print "Stored params at " + dirname + filename

    return filename
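# A minimal sketch of driving run() with an ad-hoc config object; the
# attribute names mirror the cfg fields read at the top of run(), but the
# values below are illustrative placeholders, not tuned settings.
if __name__ == '__main__':
    class Config(object):
        lambdas = [(0.1, 0.1)]
        memory_for_personalized_parameters = 1000
        train_percentage = 0.01
        outer_iterations = 1
        nepoches = 1.0
        alpha = 1.0
        beta = 0.001
        verbose = False
        personalization = False
        rhostsize = 100000
        just_winning_sets = False
        testing = True
        train_dirname = '/tscratch/tmp/csrec/'
        god_mode = False

    # On rank 0 this prints the filename of the stored parameters.
    print run(Config())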