def load_ins_feat(self, train_file, eval_file, feat_file):
    """Load train/eval instance RDDs and the feature dictionary, then
    zero-initialise one weight per known feature.

    Populates: self.train_ins, self.train_ins_count, self.eval_ins,
    self.eval_ins_count, self.feat_dict, self.feat_weight.
    """
    # utils.load_ins returns an (rdd, count) pair.
    self.train_ins, self.train_ins_count = utils.load_ins(self.sc, train_file)
    self.eval_ins, self.eval_ins_count = utils.load_ins(self.sc, eval_file)
    self.feat_dict = utils.load_feat(self.sc, feat_file)
    # One zeroed weight slot for every feature in the dictionary.
    self.feat_weight = [0.0] * len(self.feat_dict)
def train(sc,
          train_file="hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51",
          eval_file="hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*"):
    """Sampled gradient-descent training loop.

    Each iteration broadcasts the current weights, computes a sampled
    gradient over the training RDD, updates the weights on the driver,
    and reports AUC/MAE on the eval set.

    :param sc: SparkContext.
    :param train_file: HDFS glob of training instances (new parameter with
        the previously hard-coded path as default — backward compatible).
    :param eval_file: HDFS glob of evaluation instances.
    """
    feat_weight = {}
    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4              # regularisation strength passed to update_weight
    SAMPLING_RATE = 0.01   # fraction of training instances used per pass
    [train_ins, train_ins_count] = utils.load_ins(sc, train_file)
    [eval_ins, eval_ins_count] = utils.load_ins(sc, eval_file)
    cur_iter = 0
    while cur_iter < ITER_MAX:
        print("iteration %d" % cur_iter)
        broadcast_feat = sc.broadcast(feat_weight)
        # Counts how many samples the sampling step actually selected.
        accum = sc.accumulator(0)
        grad = train_ins.flatMap(
            lambda ins: calc_gradient(ins, broadcast_feat, accum, SAMPLING_RATE)
        ).reduceByKey(lambda a, b: a + b).collect()
        update_weight(grad, feat_weight, accum.value, learning_rate, THETA)
        eval_res = eval_ins.map(
            lambda ins: (ins.predict(feat_weight), ins.label)
        ).sortByKey().collect()
        [auc, mae, loss] = utils.get_eval_stat(eval_res)
        #utils.output(cur_iter, None, feat_weight,eval_res)
        print("selected %d samples: auc :%f, mae: %f" % (accum.value, auc, mae))
        # Fix: release this iteration's broadcast so the driver does not
        # retain one copy of the weight map per iteration.
        broadcast_feat.unpersist()
        cur_iter += 1
def train(sc):
    """Factorization-machine-style training loop (linear term + K latent
    factors per feature), with per-iteration weight dumps to local files.

    Fixes over the original: single-argument Python 2 ``print`` statements
    normalised to ``print()`` calls (identical output, Python 3 compatible),
    and the per-iteration weights file is written via a context manager so
    the handle cannot leak if a write raises.
    """
    feat_weight = {}
    learning_rate = 1
    ITER_MAX = 1000
    THETA = 4           # regularisation strength passed to update_weight
    K = 8               # latent-factor dimension
    SAMPLING_RATE = 0.1
    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")
    #[train_ins,train_ins_count] = utils.load_ins(sc,"hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train.test/train.test")
    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")
    #[eval_ins,eval_ins_count] = utils.load_ins(sc,"hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval.test/eval.test")
    #feat_dict = utils.load_feat(sc,"hdfs://hqz-ubuntu-master:9000/data/filtered_feat/*")
    [feat_dict, feat_freq] = utils.load_feat_2(
        sc, "hdfs://hqz-ubuntu-master:9000/data/feat_count/*", 10000)
    for f in feat_dict:
        # Weight layout per feature: [linear weight, [K latent factors]].
        feat_weight[f] = [0.0, []]
        # NOTE(review): latent-factor initialisation is deliberately
        # disabled (was gated on feat_freq[f] >= 0) — kept for reference.
        if False:
            for i in range(0, K):
                feat_weight[f][1].append(random.uniform(0, 0.001))
                #feat_weight[f][1].append(0)
    cur_iter = 0
    while cur_iter < ITER_MAX:
        print("=============================================================================")
        print("iteration %d" % cur_iter)
        print("broadcasting feat_weight")
        broadcast_feat = sc.broadcast(feat_weight)
        # Counts how many samples the sampling step actually selected.
        accum = sc.accumulator(0)
        print("calculating gradient")
        grad = train_ins.flatMap(
            lambda ins: calc_gradient(ins, broadcast_feat, accum, SAMPLING_RATE)
        ).reduceByKey(add_gradient).collect()
        print("updating feat_weight")
        feat_weight = update_weight(grad, feat_weight, train_ins,
                                    accum.value, learning_rate, THETA)
        #print "returned weight:"
        # Dump the full weight table for this iteration; context manager
        # guarantees the file is closed even if a write fails.
        with open("weights_%d" % cur_iter, "w") as fp:
            for f in feat_weight:
                fp.write("%d\t%f\t%s\n" % (
                    f, feat_weight[f][0],
                    "\t".join([str(i) for i in feat_weight[f][1]])))
        print("evaluating...")
        eval_res = eval_ins.map(
            lambda ins: (ins.predict(feat_weight), ins.label)
        ).sortByKey().collect()
        print("getting eval res")
        [auc, mae, loss] = utils.get_eval_stat(eval_res)
        #utils.output(cur_iter, None, feat_weight,eval_res)
        print("selected %d samples: auc :%f, mae: %f" % (accum.value, auc, mae))
        cur_iter += 1
def load_ins_feat(self, train_file, eval_file, feat_file):
    """Load instance RDDs, their counts, the feature dictionary, and a
    zero-filled weight vector onto this object."""
    train_pair = utils.load_ins(self.sc, train_file)
    eval_pair = utils.load_ins(self.sc, eval_file)
    self.train_ins = train_pair[0]
    self.train_ins_count = train_pair[1]
    self.eval_ins = eval_pair[0]
    self.eval_ins_count = eval_pair[1]
    self.feat_dict = utils.load_feat(self.sc, feat_file)
    # Start every feature weight at zero.
    self.feat_weight = [0.0 for _ in range(len(self.feat_dict))]
def train(sc):
    """Training loop where weights live in a custom accumulator and are
    re-broadcast after every pass; evaluation reports AUC/MAE.

    Fixes over the original: Python 2 ``print`` statements normalised to
    single-argument ``print()`` calls (identical output, Python 3
    compatible), and the superseded broadcast is unpersisted each
    iteration so the driver does not leak one weight copy per pass.
    """
    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4            # regularisation strength
    SAMPLING_RATE = 0.01 # fraction of training instances used per pass
    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*")
    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")
    cur_iter = 0
    # Per-sample scaling so the effective step size does not depend on
    # how many samples the sampling rate admits.
    single_sample_learning_rate = learning_rate / (train_ins_count * SAMPLING_RATE)
    single_sample_theta = THETA / (train_ins_count * SAMPLING_RATE)
    print("single sample learning rate: %f " % single_sample_learning_rate)
    print("single sample theta: %f" % single_sample_theta)
    feat_weight = sc.accumulator({}, WeightAccumulatorParam())
    broadcast_feat = sc.broadcast(feat_weight.value)
    while cur_iter < ITER_MAX:
        print("iteration %d" % cur_iter)
        # calc_gradient updates the weight accumulator as a side effect and
        # returns the number of samples it selected.
        selected_sample = train_ins.map(lambda ins: calc_gradient(
            ins, feat_weight, broadcast_feat, single_sample_learning_rate,
            SAMPLING_RATE, single_sample_theta)).reduce(lambda a, b: a + b)
        # Fix: drop the stale broadcast before rebinding to the fresh one.
        old_broadcast = broadcast_feat
        broadcast_feat = sc.broadcast(feat_weight.value)
        old_broadcast.unpersist()
        eval_res = eval_ins.map(lambda ins: utils.evalulate_map(
            ins, broadcast_feat)).sortByKey().collect()
        [auc, mae, loss] = utils.get_eval_stat(eval_res)
        #utils.output(cur_iter, None, broadcast_feat.value,eval_res)
        print("selected %d samples: auc :%f, mae: %f" % (selected_sample, auc, mae))
        cur_iter += 1
def train(sc):
    """Run sampled gradient-descent training for a fixed number of passes,
    reporting selected-sample count, AUC and MAE after each pass."""
    feat_weight = {}
    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01
    train_ins, train_ins_count = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*51")
    eval_ins, eval_ins_count = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")
    for cur_iter in range(ITER_MAX):
        print("iteration %d" % cur_iter)
        # Ship the current weights to the executors; count selected samples.
        broadcast_feat = sc.broadcast(feat_weight)
        accum = sc.accumulator(0)
        grad = (train_ins
                .flatMap(lambda ins: calc_gradient(
                    ins, broadcast_feat, accum, SAMPLING_RATE))
                .reduceByKey(lambda a, b: a + b)
                .collect())
        update_weight(grad, feat_weight, accum.value, learning_rate, THETA)
        eval_res = (eval_ins
                    .map(lambda ins: (ins.predict(feat_weight), ins.label))
                    .sortByKey()
                    .collect())
        auc, mae, loss = utils.get_eval_stat(eval_res)
        #utils.output(cur_iter, None, feat_weight,eval_res)
        print("selected %d samples: auc :%f, mae: %f" % (accum.value, auc, mae))
def train(sc):
    """Accumulator-based training loop: calc_gradient updates the weights
    held in a custom accumulator, which are then re-broadcast for the next
    pass and for evaluation.

    Fixes over the original: Python 2 ``print`` statements normalised to
    single-argument ``print()`` calls (identical output, Python 3
    compatible), and the previous broadcast is unpersisted each iteration
    instead of being silently dropped (driver-side memory leak).
    """
    learning_rate = 0.5
    ITER_MAX = 1000
    THETA = 4
    SAMPLING_RATE = 0.01
    [train_ins, train_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/train/*")
    [eval_ins, eval_ins_count] = utils.load_ins(
        sc, "hdfs://hqz-ubuntu-master:9000/data/filtered_ins/eval/*")
    cur_iter = 0
    # Scale the step size / regulariser per expected selected sample.
    single_sample_learning_rate = learning_rate / (train_ins_count * SAMPLING_RATE)
    single_sample_theta = THETA / (train_ins_count * SAMPLING_RATE)
    print("single sample learning rate: %f " % single_sample_learning_rate)
    print("single sample theta: %f" % single_sample_theta)
    feat_weight = sc.accumulator({}, WeightAccumulatorParam())
    broadcast_feat = sc.broadcast(feat_weight.value)
    while cur_iter < ITER_MAX:
        print("iteration %d" % cur_iter)
        selected_sample = train_ins.map(lambda ins: calc_gradient(
            ins, feat_weight, broadcast_feat, single_sample_learning_rate,
            SAMPLING_RATE, single_sample_theta)).reduce(lambda a, b: a + b)
        # Fix: release the stale broadcast before replacing it.
        stale = broadcast_feat
        broadcast_feat = sc.broadcast(feat_weight.value)
        stale.unpersist()
        eval_res = eval_ins.map(
            lambda ins: utils.evalulate_map(ins, broadcast_feat)
        ).sortByKey().collect()
        [auc, mae, loss] = utils.get_eval_stat(eval_res)
        #utils.output(cur_iter, None, broadcast_feat.value,eval_res)
        print("selected %d samples: auc :%f, mae: %f" % (selected_sample, auc, mae))
        cur_iter += 1