def train():
    """Train the sparse+dense model with SGD, logging loss every 10 steps.

    Reads batches from the module-level `reader`, builds two hash
    embeddings over the sparse slots, and runs until the session stops.
    """
    batch = reader.read()
    # 8-dim hash embeddings, truncated-normal init, 1024 buckets each.
    emb1 = xdl.embedding('emb1', batch['sparse0'],
                         xdl.TruncatedNormal(stddev=0.001), 8, 1024,
                         vtype='hash')
    emb2 = xdl.embedding('emb2', batch['sparse1'],
                         xdl.TruncatedNormal(stddev=0.001), 8, 1024,
                         vtype='hash')
    loss = model(batch['deep0'], [emb1, emb2], batch['label'])
    train_op = xdl.SGD(0.5).optimize()
    log_hook = xdl.LoggerHook(loss, "loss:{0}", 10)
    # BUG FIX: the original created a first, hook-less TrainSession that was
    # immediately shadowed by this one — dead object removed.
    sess = xdl.TrainSession(hooks=[log_hook])
    while not sess.should_stop():
        sess.run(train_op)
def train(train_file=train_file, test_file=test_file, uid_voc=uid_voc,
          mid_voc=mid_voc, cat_voc=cat_voc, item_info=item_info,
          reviews_info=reviews_info, batch_size=128, maxlen=100,
          test_iter=700):
    """Train the rocket-training DIEN model, evaluating every `test_iter` steps.

    Builds the heavy and light networks under separate model scopes
    ('train' / 'test') and delegates the train/eval loop to `model.run`.
    Only worker 0 installs the checkpoint hook.
    """
    model = Model_DIEN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
                       LIGHT_EMBEDDING_DIM, LIGHT_HIDDEN_SIZE,
                       LIGHT_ATTENTION_SIZE,
                       use_rocket_training=use_rocket_training())
    sample_io = SampleIO(train_file, test_file, uid_voc, mid_voc, cat_voc,
                         item_info, reviews_info, batch_size, maxlen,
                         embedding_dim=EMBEDDING_DIM,
                         light_embedding_dim=LIGHT_EMBEDDING_DIM)
    with xdl.model_scope('train'):
        train_ops = model.build_final_net(EMBEDDING_DIM,
                                          LIGHT_EMBEDDING_DIM, sample_io)
        lr = 0.001  # Adam learning rate (Adagrad was the alternative)
        train_ops.append(xdl.Adam(lr).optimize())
        log_format = "[%(time)s] lstep[%(lstep)s] gstep[%(gstep)s] lqps[%(lqps)s] gqps[%(gqps)s] loss[%(loss)s]"
        # BUG FIX: removed the dead `hooks = []` that was immediately
        # overwritten by this assignment.
        hooks = [QpsMetricsHook(), MetricsPrinterHook(log_format)]
        if xdl.get_task_index() == 0:
            # Only the chief worker writes checkpoints.
            hooks.append(
                xdl.CheckpointHook(
                    xdl.get_config('checkpoint', 'save_interval')))
        train_sess = xdl.TrainSession(hooks=hooks)
    with xdl.model_scope('test'):
        test_ops = model.build_final_net(EMBEDDING_DIM, LIGHT_EMBEDDING_DIM,
                                         sample_io, is_train=False)
        test_sess = xdl.TrainSession()
    model.run(train_ops, train_sess, test_ops, test_sess,
              test_iter=test_iter)
def test(train_file=train_file, test_file=test_file, uid_voc=uid_voc,
         mid_voc=mid_voc, cat_voc=cat_voc, batch_size=128, maxlen=100):
    """Restore a checkpoint and evaluate the configured DIN/DIEN model.

    Raises:
        Exception: if xdl config 'model' is neither 'din' nor 'dien'.
    """
    sample_io = SampleIO(train_file, test_file, uid_voc, mid_voc, cat_voc,
                         batch_size, maxlen, EMBEDDING_DIM)
    # Hoisted: the config value was read twice in the original branch chain.
    model_name = xdl.get_config('model')
    if model_name == 'din':
        model = Model_DIN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif model_name == 'dien':
        model = Model_DIEN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din and dien model')
    test_ops = model.build_final_net(EMBEDDING_DIM, sample_io,
                                     is_train=False)
    print('=' * 10 + 'start test' + '=' * 10)
    saver = xdl.Saver()
    # NOTE(review): hard-coded checkpoint id — presumably a placeholder to
    # be edited per run; confirm against the checkpoint naming scheme.
    checkpoint_version = "ckpt-...............12000"
    saver.restore(version=checkpoint_version)
    eval_sess = xdl.TrainSession()
    print(
        'test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
        % eval_model(eval_sess, test_ops))
def train():
    """Train the din_mogujie model with Adam under the 'train' model scope.

    Raises:
        Exception: if the module-level `model_type` is not 'din_mogujie'.
    """
    if model_type == 'din_mogujie':
        model = Model_DIN_MOGUJIE(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
                                  False, train_file, batch_size)
    else:
        # NOTE(review): the message also claims dien support, but no dien
        # branch exists here — confirm whether dien is planned.
        raise Exception('only support din_mogujie and dien')
    with xdl.model_scope('train'):
        train_ops = model.build_network()
        lr = 0.001  # Adam learning rate (Adagrad was the alternative)
        train_ops.append(xdl.Adam(lr).optimize())
        log_format = "[%(time)s] lstep[%(lstep)s] gstep[%(gstep)s] lqps[%(lqps)s] gqps[%(gqps)s] loss[%(loss)s]"
        # BUG FIX: removed the dead `hooks = []` that was immediately
        # overwritten; also dropped the disabled (string-quoted) test-scope
        # block that was dead code.
        hooks = [QpsMetricsHook(), MetricsPrinterHook(log_format)]
        if xdl.get_task_index() == 0:
            # Only the chief worker writes checkpoints.
            hooks.append(xdl.CheckpointHook(save_interval))
        train_sess = xdl.TrainSession(hooks=hooks)
    model.run(train_ops, train_sess)
def run(name1, name2, scope):
    """Run one SGD step under `scope` and return the sparse gradients.

    Returns the gradient tensor of the first embedding variable, or a
    (grad1, grad2) pair when `name1` and `name2` name distinct embeddings.
    """
    with xdl.model_scope(scope):
        labels = xdl.mock_dense_op(shape=[1, 1], value=1.0)
        mock_embs = mock_embedding(name1, name2)
        loss = model(mock_embs, labels)
        train_op = xdl.SGD(lr).optimize()
        sess = xdl.TrainSession([])
        run_ops = [train_op, loss]
        op_names = ['none', 'loss']
        embed_vars = [v for v in trainable_variables()
                      if is_embedding_var(v)]
        sparse_embed_grads = [xdl.get_sparse_grads(v.name)
                              for v in embed_vars]
        for v in embed_vars:
            op_names += [v.name + '.indices', v.name + '.grads']
        for grad_pair in sparse_embed_grads:
            run_ops += [grad_pair.indices, grad_pair.grad]
        var_list = sess.run(run_ops)
        # Layout: [train_op, loss, idx1, grad1, idx2, grad2, ...]
        if name1 != name2:
            return var_list[3], var_list[5]
        return var_list[3]
def test(train_file=train_file, test_file=test_file, uid_voc=uid_voc,
         mid_voc=mid_voc, cat_voc=cat_voc, batch_size=128, maxlen=100):
    """Evaluate the configured DIN/DIEN model on one test feed and print metrics.

    Raises:
        Exception: if xdl config 'model' is neither 'din' nor 'dien'.
    """
    io = SampleIO(train_file, test_file, uid_voc, mid_voc, cat_voc,
                  batch_size, maxlen, EMBEDDING_DIM)
    if xdl.get_config('model') == 'din':
        net = Model_DIN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif xdl.get_config('model') == 'dien':
        net = Model_DIEN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din and dien model')
    # Build the test graph from one batch of embedded test samples.
    batch_data = io.next_test()
    embedded = net.xdl_embedding(batch_data, EMBEDDING_DIM, *io.get_n())
    test_ops = tf_test_model(*embedded)
    session = xdl.TrainSession()
    fmt = ('test_auc: %.4f ----test_loss: %.4f ---- '
           'test_accuracy: %.4f ---- test_aux_loss: %.4f')
    print(fmt % eval_model(session, test_ops))
def test(train_file=train_file, test_file=test_file, uid_voc=uid_voc,
         mid_voc=mid_voc, cat_voc=cat_voc, batch_size=128, maxlen=100):
    """Evaluate the rocket-training DIEN model on one test feed and print metrics."""
    io = SampleIO(train_file, test_file, uid_voc, mid_voc, cat_voc,
                  batch_size, maxlen,
                  embedding_dim=EMBEDDING_DIM,
                  light_embedding_dim=LIGHT_EMBEDDING_DIM)
    net = Model_DIEN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
                     LIGHT_EMBEDDING_DIM, LIGHT_HIDDEN_SIZE,
                     LIGHT_ATTENTION_SIZE,
                     use_rocket_training=use_rocket_training())
    # Build the test graph from one batch of embedded test samples.
    batch_data = io.next_test()
    embedded = net.xdl_embedding(batch_data, EMBEDDING_DIM,
                                 LIGHT_EMBEDDING_DIM, *io.get_n())
    test_ops = tf_test_model(*embedded)
    session = xdl.TrainSession()
    fmt = ('test_auc: %.4f ----test_loss: %.4f ---- '
           'test_accuracy: %.4f ---- test_aux_loss: %.4f')
    print(fmt % eval_model(session, test_ops))
def gear():
    """Run the ams_gear forward and backward passes on mocked tensors.

    Returns:
        (prediction, fc_weight_grad): the forward prediction and the
        gradient of "fc_weight" from the backward scope.
    """
    forward = xdl.mock_dense_op(shape=[1, 16], value=0.01, name_="forward")
    backward = xdl.mock_dense_op(shape=[1, 16], value=0.02, name_="backward")
    labels = xdl.mock_dense_op(shape=[1, 1], value=1.0, name_="label1")
    init_grad = xdl.mock_dense_op(shape=[1, 1], value=0.3, name_="init_grad")
    # Pin static shapes on every mock before wiring the gear.
    for tensor, shape in ((forward, [1, 16]), (backward, [1, 16]),
                          (labels, [1, 1]), (init_grad, [1, 1])):
        tensor.set_shape(shape)
    predict = ams_gear([forward], [backward], init_grad)(gear_model)(None)
    with xdl.model_scope("ams_gear_forward"):
        prediction = xdl.TrainSession().run(predict)
    with xdl.model_scope("ams_gear_backward"):
        grads = xdl.get_gradient("fc_weight")
        fc_weight_grad = xdl.TrainSession().run(grads)
    return prediction, fc_weight_grad
def train():
    """Alternate Adagrad training and evaluation: 100 rounds of 1000 steps each."""
    images, labels = xdl.py_func(
        read_train, [], output_type=[np.float32, np.float32])
    images_test, labels_test = xdl.py_func(
        read_test, [], output_type=[np.float32, np.float32])
    with xdl.model_scope('train'):
        loss = model(images, labels)
        train_op = xdl.Adagrad(0.5).optimize()
        train_sess = xdl.TrainSession()
    with xdl.model_scope('test'):
        accuracy = eval_model(images_test, labels_test)
        eval_sess = xdl.TrainSession()
    for _round in range(100):
        for _step in range(1000):
            train_sess.run(train_op)
        # Report accuracy once per 1000-step round.
        print("accuracy %s" % eval_sess.run(accuracy))
def run():
    """Train the ams_main model with two model-served image sub-graphs.

    Starts one model server per image tower (user / ad), feeds their
    outputs into the main model as gear inputs, and runs Adam until the
    session stops, printing the loss each step.
    """
    # Model server for the user-image graph; forward/backward caches are
    # deduplicated across all tasks.
    user_ms = xdl.ModelServer(
        "user_graph", user_graph_train, xdl.DataType.float,
        xdl.ModelServer.Forward.UniqueCache(xdl.get_task_num()),
        xdl.ModelServer.Backward.UniqueCache(xdl.get_task_num()))
    xdl.current_env().start_model_server(user_ms)
    # Model server for the ad-image graph, configured identically.
    ad_ms = xdl.ModelServer(
        "ad_graph", ad_graph_train, xdl.DataType.float,
        xdl.ModelServer.Forward.UniqueCache(xdl.get_task_num()),
        xdl.ModelServer.Backward.UniqueCache(xdl.get_task_num()))
    xdl.current_env().start_model_server(ad_ms)
    batch = reader().read()
    # 16-dim hash embeddings over the two user sparse slots (2M buckets).
    user0 = xdl.embedding("user0", batch["user0"],
                          xdl.TruncatedNormal(stddev=0.001), 16,
                          2 * 1024 * 1024, "sum", vtype="hash")
    user1 = xdl.embedding("user1", batch["user1"],
                          xdl.TruncatedNormal(stddev=0.001), 16,
                          2 * 1024 * 1024, "sum", vtype="hash")
    ad0 = batch["ad0"]
    ad1 = batch["ad1"]
    # Image features come back from the model servers keyed by sparse ids;
    # segments are converted to TF-style segment ids via py_func.
    img0 = user_ms(batch["user_img"].ids)
    ids0 = xdl.py_func(to_tf_segment_id, [batch["user_img"].segments],
                       [np.int32])[0]
    img1 = ad_ms(batch["ad_img"].ids)
    ids1 = xdl.py_func(to_tf_segment_id, [batch["ad_img"].segments],
                       [np.int32])[0]
    label = batch['label']
    loss = ams_main(main_model)(user0, user1, ad0, ad1, label, ids0, ids1,
                                gear_inputs=[img0, img1])
    optimizer = xdl.Adam(0.0005).optimize()
    run_ops = [loss, optimizer]
    sess = xdl.TrainSession([])
    while not sess.should_stop():
        values = sess.run(run_ops)
        # sess.run may return None (e.g. at end of data); skip printing then.
        if values is not None:
            print 'loss: ', values[0]
def train(train_file=train_file, test_file=test_file, uid_voc=uid_voc,
          mid_voc=mid_voc, cat_voc=cat_voc, item_info=item_info,
          reviews_info=reviews_info, batch_size=128, maxlen=100,
          test_iter=700):
    """Train the configured DIN/DIEN model, evaluating every `test_iter` steps.

    Raises:
        Exception: if xdl config 'model' is neither 'din' nor 'dien'.
    """
    if xdl.get_config('model') == 'din':
        model = Model_DIN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif xdl.get_config('model') == 'dien':
        model = Model_DIEN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din and dien')
    sample_io = SampleIO(train_file, test_file, uid_voc, mid_voc, cat_voc,
                         item_info, reviews_info, batch_size, maxlen,
                         EMBEDDING_DIM)
    with xdl.model_scope('train'):
        train_ops = model.build_final_net(EMBEDDING_DIM, sample_io)
        lr = 0.001  # Adam learning rate (Adagrad was the alternative)
        train_ops.append(xdl.Adam(lr).optimize())
        log_format = "[%(time)s] lstep[%(lstep)s] gstep[%(gstep)s] lqps[%(lqps)s] gqps[%(gqps)s] loss[%(loss)s]"
        # BUG FIX: removed the dead `hooks = []` that was immediately
        # overwritten by this assignment.
        hooks = [QpsMetricsHook(), MetricsPrinterHook(log_format)]
        if xdl.get_task_index() == 0:
            # Only the chief worker writes checkpoints.
            hooks.append(
                xdl.CheckpointHook(
                    xdl.get_config('checkpoint', 'save_interval')))
        train_sess = xdl.TrainSession(hooks=hooks)
    with xdl.model_scope('test'):
        test_ops = model.build_final_net(EMBEDDING_DIM, sample_io,
                                         is_train=False)
        test_sess = xdl.TrainSession()
    print('=' * 10 + 'start train' + '=' * 10)
    model.run(train_ops, train_sess, test_ops, test_sess,
              test_iter=test_iter)
def main():
    """Build ams_main on mocked dense inputs and return the gear gradients."""
    dense = xdl.mock_dense_op(shape=[1, 16], value=0.01, name_="dense")
    gear = xdl.mock_dense_op(shape=[1, 1], value=0.01, name_="gear")
    labels = xdl.mock_dense_op(shape=[1, 1], value=1.0, name_="label")
    # Pin static shapes before building the graph.
    gear.set_shape([1, 1])
    dense.set_shape([1, 16])
    labels.set_shape([1, 1])
    with xdl.model_scope("ams_main"):
        loss = ams_main(main_model)(dense, labels, gear_inputs=[gear])
        session = xdl.TrainSession()
        return session.run([xdl.get_collection("gear_grad")])
def train():
    """Adagrad training with checkpoints on worker 0; evaluates every 1000 steps."""
    images, labels = xdl.py_func(
        read_train, [], output_type=[np.float32, np.float32])
    images_test, labels_test = xdl.py_func(
        read_test, [], output_type=[np.float32, np.float32])
    with xdl.model_scope('train'):
        loss = model(images, labels)
        train_op = xdl.Adagrad(0.5).optimize()
        # Only the chief worker writes checkpoints (every 1000 steps).
        if xdl.get_task_index() == 0:
            train_sess = xdl.TrainSession(
                hooks=[xdl.CheckpointHook(1000)])
        else:
            train_sess = xdl.TrainSession()
    with xdl.model_scope('test'):
        accuracy = eval_model(images_test, labels_test)
        eval_sess = xdl.TrainSession()
    for _round in range(100):
        for _step in range(1000):
            train_sess.run(train_op)
        # Report accuracy once per 1000-step round.
        print("accuracy %s" % eval_sess.run(accuracy))
def main():
    """Build ams_main with a mocked sparse embedding and return the gear gradients."""
    dense = xdl.mock_dense_op(shape=[1, 16], value=0.01, name_="dense")
    gear = xdl.mock_dense_op(shape=[1, 1], value=0.01, name_="gear")
    labels = xdl.mock_dense_op(shape=[1, 1], value=1.0, name_="label")
    ids, values, segments = xdl.mock_sparse_op(dense_shape=[1, 16],
                                               name_="wide")
    # Sum-combined 1-dim embedding over the mocked sparse slot.
    emb = xdl.embedding("sparse", xdl.SparseTensor(ids, values, segments),
                        xdl.Ones(), 1, 16, 'sum')
    # Batch dimension left dynamic on all dense inputs.
    gear.set_shape([None, 1])
    dense.set_shape([None, 16])
    labels.set_shape([None, 1])
    with xdl.model_scope("ams_main"):
        loss = ams_main(main_model)(dense, emb, labels, gear_inputs=[gear])
        session = xdl.TrainSession()
        return session.run(xdl.get_collection("gear_grad"))
def test_all(self):
    """Regression test: two SGD(0.5) steps over a 1-dim hash embedding.

    Checks the exact loss, sparse gradient, and post-update embedding
    weights after each of two training steps. The expected constants are
    pinned to this exact graph (Ones init, values [1,2,3], lr 0.5).
    """
    dense = xdl.mock_dense_op(shape=[1, 16], value=0.01, name_="dense")
    labels = xdl.mock_dense_op(shape=[1, 1], value=1.0, name_="label")
    # One sample containing three sparse ids with values 1, 2, 3.
    ids = xdl.convert_to_tensor(
        np.array([[0, 0], [0, 1], [0, 2]], dtype=np.int64))
    values = xdl.convert_to_tensor(
        np.array([1.0, 2.0, 3.0], dtype=np.float32))
    segments = xdl.convert_to_tensor(np.array([3], dtype=np.int32))
    sparse = xdl.SparseTensor(ids, values, segments)
    emb = xdl.embedding("sparse", sparse, xdl.Ones(), 1, 16, 'sum',
                        vtype='hash')
    loss = model(dense, emb, labels)
    train_op = xdl.SGD(0.5).optimize()
    sess = xdl.TrainSession()
    # --- step 1: check loss and sparse gradient ---
    _, l, g = sess.run(
        [train_op, loss, xdl.get_sparse_grads('sparse').grad])
    self.assertTrue((l == np.array(0.0024364376, dtype=np.float32)).all())
    self.assertTrue(
        (g == np.array([[-0.002433472], [-0.004866944], [-0.007300416]],
                       dtype=np.float32)).all())
    # Gather the three touched rows and check the SGD update was applied.
    sparse_var = xdl.get_variable_by_name('sparse')
    weights = sess.run(
        sparse_var.gather(
            np.array([[0, 0], [0, 1], [0, 2]], dtype=np.int64)))
    self.assertTrue(
        (weights == np.array([[1.0012168], [1.0024334], [1.0036502]],
                             dtype=np.float32)).all())
    # --- step 2: loss decreases and weights move again ---
    _, l, g = sess.run(
        [train_op, loss, xdl.get_sparse_grads('sparse').grad])
    self.assertTrue((l == np.array(0.002395329, dtype=np.float32)).all())
    self.assertTrue(
        (g == np.array([[-0.0023924622], [-0.0047849244], [-0.0071773864]],
                       dtype=np.float32)).all())
    weights = sess.run(
        sparse_var.gather(
            np.array([[0, 0], [0, 1], [0, 2]], dtype=np.int64)))
    self.assertTrue(
        (weights == np.array([[1.002413], [1.0048258], [1.0072389]],
                             dtype=np.float32)).all())
def main():
    """Run one forward pass and return (loss, sparse gradients).

    Note: the SGD op is built but not included in the session run, so no
    update is applied — this only reads the loss and gradient values.
    """
    dense = xdl.mock_dense_op(shape=[1, 16], value=0.01, name_="dense")
    labels = xdl.mock_dense_op(shape=[1, 1], value=1.0, name_="label")
    # One sample containing three sparse ids with values 1, 2, 3.
    ids = xdl.convert_to_tensor(
        np.array([[0, 0], [0, 1], [0, 2]], dtype=np.int64))
    values = xdl.convert_to_tensor(
        np.array([1.0, 2.0, 3.0], dtype=np.float32))
    segments = xdl.convert_to_tensor(np.array([3], dtype=np.int32))
    emb = xdl.embedding("sparse", xdl.SparseTensor(ids, values, segments),
                        xdl.Ones(), 1, 16, 'sum', vtype='hash')
    loss = model(dense, emb, labels)
    train_op = xdl.SGD(0.5).optimize()
    session = xdl.TrainSession()
    loss_value, grad_value = session.run(
        [loss, xdl.get_sparse_grads('sparse').grad])
    return loss_value, grad_value
def run(name1, name2, scope, optimizer):
    """Run one optimization step with the named optimizer and return sparse grads.

    `optimizer` selects the update rule ('sgd', 'momentum', 'ftrl',
    'adam', 'adagrad', 'rmsprop'); anything else falls back to SGD(0.5).
    Returns grad1 or (grad1, grad2) as in the single-optimizer variant.
    """
    # Lazy factories so only the selected optimizer is constructed.
    factories = {
        'sgd': lambda: xdl.SGD(0.5),
        'momentum': lambda: xdl.Momentum(0.005, 0.99),
        'ftrl': lambda: xdl.Ftrl(0.01),
        'adam': lambda: xdl.Adam(0.001),
        'adagrad': lambda: xdl.Adagrad(0.04, 0.1),
        'rmsprop': lambda: xdl.RMSProp(0.001),
    }
    with xdl.model_scope(scope):
        labels = xdl.mock_dense_op(shape=[1, 1], value=1.0)
        mock_embs = mock_embedding(name1, name2)
        loss = model(mock_embs, labels)
        make_optimizer = factories.get(optimizer, factories['sgd'])
        train_op = make_optimizer().optimize()
        sess = xdl.TrainSession([])
        run_ops = [train_op, loss]
        op_names = ['none', 'loss']
        embed_vars = [v for v in trainable_variables_with_scope(scope)
                      if is_embedding_var(v)]
        sparse_embed_grads = [xdl.get_sparse_grads(v.name)
                              for v in embed_vars]
        for v in embed_vars:
            op_names += [v.name + '.indices', v.name + '.grads']
        for grad_pair in sparse_embed_grads:
            run_ops += [grad_pair.indices, grad_pair.grad]
        var_list = sess.run(run_ops)
        # Layout: [train_op, loss, idx1, grad1, idx2, grad2, ...]
        if name1 != name2:
            return var_list[3], var_list[5]
        return var_list[3]
def train(is_training=True):
    """Train or evaluate the TDM attention-DNN model over HDFS samples.

    In training mode every worker reads a shard of the train samples and
    runs Adam; in predict mode a single sample file is read, a checkpoint
    is restored on worker 0, and per-layer retrieval metrics are fed back
    through the TDMPREDICTOP. After training, worker 0 exports the
    "item_emb" variable from the checkpoint and uploads it to HDFS.

    NOTE(review): this function was reconstructed from a collapsed single
    line; loop/`with` extents are the most natural reading — verify
    against the original formatting.
    """
    #np.set_printoptions(threshold='nan')
    # In predict mode only worker 0 participates; others return at once.
    if is_training or xdl.get_task_index() == 0:
        init()
    else:
        return
    file_type = xdl.parsers.txt
    if is_training:
        data_io = xdl.DataIO("tdm", file_type=file_type,
                             fs_type=xdl.fs.hdfs,
                             namenode="hdfs://your/namenode/hdfs/path:9000",
                             enable_state=False)
        # 69 sparse user-behavior features plus the expanded target node.
        feature_count = 69
        for i in xrange(1, feature_count + 1):
            data_io.feature(name=("item_%s" % i),
                            type=xdl.features.sparse, table=1)
        data_io.feature(name="unit_id_expand",
                        type=xdl.features.sparse, table=0)
        data_io.batch_size(intconf('train_batch_size'))
        data_io.epochs(intconf('train_epochs'))
        data_io.threads(intconf('train_threads'))
        data_io.label_count(2)
        base_path = '%s/%s/' % (conf('upload_url'), conf('data_dir'))
        # Train samples are sharded files matching <train_sample>_<digits>.
        data = base_path + conf('train_sample') + '_' + r'[\d]+'
        sharding = xdl.DataSharding(data_io.fs())
        sharding.add_path(data)
        paths = sharding.partition(rank=xdl.get_task_index(),
                                   size=xdl.get_task_num())
        print 'train: sharding.partition() =', paths
        data_io.add_path(paths)
        iop = xdl.GetIOP("TDMOP")
    else:
        data_io = xdl.DataIO("tdm", file_type=file_type,
                             fs_type=xdl.fs.hdfs,
                             namenode="hdfs://your/namenode/hdfs/path:9000",
                             enable_state=False)
        feature_count = 69
        for i in xrange(1, feature_count + 1):
            data_io.feature(name=("item_%s" % i),
                            type=xdl.features.sparse, table=1)
        data_io.feature(name="unit_id_expand",
                        type=xdl.features.sparse, table=0)
        data_io.batch_size(intconf('predict_batch_size'))
        data_io.epochs(intconf('predict_epochs'))
        data_io.threads(intconf('predict_threads'))
        data_io.label_count(2)
        base_path = '%s/%s/' % (conf('upload_url'), conf('data_dir'))
        data = base_path + conf('test_sample')
        data_io.add_path(data)
        print 'predict: add_path =', data
        iop = xdl.GetIOP("TDMPREDICTOP")
    #data_io.finish_delay(True)
    assert iop is not None
    # Key/value configuration passed to the TDM IO operator.
    key_value = {}
    key_value["key"] = "value"
    key_value["debug"] = conf('tdmop_debug')
    key_value["layer_counts"] = conf('tdmop_layer_counts')
    key_value["pr_test_each_layer_retrieve_num"] = "400"
    key_value["pr_test_final_layer_retrieve_num"] = "200"
    iop.init(key_value)
    data_io.add_op(iop)
    data_io.split_group(False)
    if not is_training:
        # Keep samples alive and pause IO so prediction can feed back props.
        data_io.keep_sample(True)
        data_io.pause(intconf('predict_io_pause_num'), True)
    data_io.startup()
    if not is_training:
        if xdl.get_task_index() == 0:
            saver = xdl.Saver()
            saver.restore(conf('saver_ckpt'))
    batch = data_io.read()
    emb_combiner = 'mean'  # mean | sum
    ind = batch["indicators"][0]
    ids = batch["_ids"][0]
    emb = []
    emb_dim = 24
    # New hash features may only be added to the table during training.
    if is_training:
        feature_add_probability = 1.
    else:
        feature_add_probability = 0.
    import xdl.python.sparse_engine.embedding as embedding
    # All behavior features share one embedding table ("item_emb").
    emb_name = "item_emb"
    for i in xrange(1, feature_count + 1):
        #emb_name = "item_%s_emb" % i
        eb = xdl.embedding(emb_name, batch["item_%s" % i],
                           xdl.Normal(stddev=0.001), emb_dim, 50000,
                           emb_combiner, vtype="hash",
                           feature_add_probability=feature_add_probability)
        with xdl.device('GPU'):
            eb_take = xdl.take_op(eb, batch["indicators"][0])
        eb_take.set_shape(eb.shape)
        emb.append(eb_take)
    #emb_name = "unit_id_expand_emb"
    # The expanded target node shares the same table as the user features.
    unit_id_expand_emb = xdl.embedding(
        emb_name, batch["unit_id_expand"], xdl.Normal(stddev=0.001),
        emb_dim, 50000, emb_combiner, vtype="hash",
        feature_add_probability=feature_add_probability)

    @xdl.mxnet_wrapper(is_training=is_training, device_type='gpu')
    def dnn_model_define(user_input, indicator, unit_id_emb, label, bs,
                         eb_dim, fea_groups, active_op='prelu',
                         use_batch_norm=True):
        # Split the user input into windows by fea_groups; avg-pool inside
        # each window.
        fea_groups = [int(s) for s in fea_groups.split(',')]
        total_group_length = np.sum(np.array(fea_groups))
        print "fea_groups", fea_groups, "total_group_length", total_group_length, "eb_dim", eb_dim
        user_input_before_reshape = mx.sym.concat(*user_input)
        user_input = mx.sym.reshape(user_input_before_reshape,
                                    shape=(-1, total_group_length, eb_dim))
        layer_data = []
        # start att: attention between each behavior slot and the target
        # node embedding, implemented as a small 3D-FC tower.
        att_user_input = mx.sym.reshape(user_input,
                                        (bs, total_group_length, eb_dim))
        att_node_input = mx.sym.reshape(unit_id_emb, (bs, 1, eb_dim))
        att_node_input = mx.sym.broadcast_to(
            data=att_node_input, shape=(0, total_group_length, 0))
        att_din = mx.sym.concat(att_user_input,
                                att_user_input * att_node_input,
                                att_node_input, dim=2)
        att_active_op = 'prelu'
        att_layer_arr = []
        att_layer1 = FullyConnected3D(3*eb_dim, 36,
                                      active_op=att_active_op, version=1,
                                      batch_size=bs)
        att_layer_arr.append(att_layer1)
        att_layer2 = FullyConnected3D(36, 1, active_op=att_active_op,
                                      version=2, batch_size=bs)
        att_layer_arr.append(att_layer2)
        layer_data.append(att_din)
        for layer in att_layer_arr:
            layer_data.append(layer.call(layer_data[-1]))
        att_dout = layer_data[-1]
        att_dout = mx.sym.broadcast_to(data=att_dout, shape=(0, 0, eb_dim))
        user_input = mx.sym.reshape(user_input, shape=(bs, -1, eb_dim))
        # Scale each behavior embedding by its attention weight.
        user_input = user_input * att_dout
        # end att
        # Average-pool each fea_group window and concatenate the results.
        idx = 0
        for group_length in fea_groups:
            block_before_sum = mx.sym.slice_axis(user_input, axis=1,
                                                 begin=idx,
                                                 end=idx+group_length)
            block = mx.sym.sum_axis(block_before_sum, axis=1) / group_length
            if idx == 0:
                grouped_user_input = block
            else:
                grouped_user_input = mx.sym.concat(grouped_user_input,
                                                   block, dim=1)
            idx += group_length
        indicator = mx.symbol.BlockGrad(indicator)
        label = mx.symbol.BlockGrad(label)
        # Expand user features by indicator, then run through the network.
        #grouped_user_input_after_take = mx.symbol.take(grouped_user_input, indicator)
        grouped_user_input_after_take = grouped_user_input
        din = mx.symbol.concat(*[grouped_user_input_after_take,
                                 unit_id_emb], dim=1)
        # 4-layer MLP head: (11*eb_dim) -> 128 -> 64 -> 32 -> 2 logits.
        net_version = "d"
        layer_arr = []
        layer1 = mx_dnn_layer(11 * eb_dim, 128, active_op=active_op,
                              use_batch_norm=use_batch_norm,
                              version="%d_%s" % (1, net_version))
        layer_arr.append(layer1)
        layer2 = mx_dnn_layer(128, 64, active_op=active_op,
                              use_batch_norm=use_batch_norm,
                              version="%d_%s" % (2, net_version))
        layer_arr.append(layer2)
        layer3 = mx_dnn_layer(64, 32, active_op=active_op,
                              use_batch_norm=use_batch_norm,
                              version="%d_%s" % (3, net_version))
        layer_arr.append(layer3)
        layer4 = mx_dnn_layer(32, 2, active_op='', use_batch_norm=False,
                              version="%d_%s" % (4, net_version))
        layer_arr.append(layer4)
        #layer_data = [din]
        layer_data.append(din)
        for layer in layer_arr:
            layer_data.append(layer.call(layer_data[-1]))
        dout = layer_data[-1]
        # The two label columns of a real sample sum to 1; padded samples
        # are all-zero, so subtracting 1 yields -1, used as the ignore
        # label for SoftmaxOutput.
        ph_label_sum = mx.sym.sum(label, axis=1)
        ph_label_ignore = ph_label_sum - 1
        ph_label_ignore = mx.sym.reshape(ph_label_ignore, shape=(-1, 1))
        ph_label_click = mx.sym.slice_axis(label, axis=1, begin=1, end=2)
        ph_label_click = ph_label_click + ph_label_ignore
        ph_label_click = mx.sym.reshape(ph_label_click, shape=(bs, ))
        prop = mx.symbol.SoftmaxOutput(data=dout, label=ph_label_click,
                                       grad_scale=1.0, use_ignore=True,
                                       normalization='valid')
        # Cross-entropy averaged over real (non-padded) samples only.
        origin_loss = mx.sym.log(prop) * label
        ph_label_sum = mx.sym.reshape(ph_label_sum, shape=(bs, 1))
        origin_loss = mx.sym.broadcast_mul(origin_loss, ph_label_sum)
        loss = - mx.symbol.sum(origin_loss) / mx.sym.sum(ph_label_sum)
        return prop, loss

    re = dnn_model_define(emb, batch["indicators"][0], unit_id_expand_emb,
                          batch["label"], data_io._batch_size, emb_dim,
                          '20,20,10,10,2,2,2,1,1,1')
    prop = re[0]
    loss = re[1]
    if is_training:
        train_op = xdl.Adam(learning_rate=intconf('learning_rate'),
                            lr_decay=False).optimize()
        #train_op = xdl.SGD(0.1).optimize()
        #fc_1_weight_grad = xdl.get_gradient("fc_w_1_d")
        #fc_1_bias_grad = xdl.get_gradient("fc_b_1_d")
    else:
        # Feed predicted probabilities back into the IO op for retrieval.
        fin = data_io.set_prop(prop=prop)
    hooks = []
    if is_training:
        if conf("train_mode") == "sync":
            hooks.append(xdl.SyncRunHook(xdl.get_task_index(),
                                         xdl.get_task_num()))
        if xdl.get_task_index() == 0:
            ckpt_hook = xdl.CheckpointHook(
                intconf('save_checkpoint_interval'))
            hooks.append(ckpt_hook)
        log_hook = xdl.LoggerHook([loss], "#### loss:{0}")
    else:
        log_hook = xdl.LoggerHook([loss], "#### loss:{0}")
    hooks.append(log_hook)
    from xdl.python.training.training_utils import get_global_step
    global_step = get_global_step()
    sess = xdl.TrainSession(hooks)
    # QPS statistics start after `statis_begin_loop` outer iterations.
    elapsed_time = 0.
    statis_begin_loop = 200
    loop_num = 0
    while not sess.should_stop():
        print ">>>>>>>>>>>> %d >>>>>>>>>>>" % loop_num
        begin_time = time.time()
        for itr in xrange(200):
            if is_training:
                result = sess.run([train_op,
                                   xdl.get_collection(xdl.UPDATE_OPS)])
                #result = sess.run([train_op, xdl.get_collection(xdl.UPDATE_OPS), unit_id_expand_emb])
            else:
                result = sess.run([loss, fin, global_step.value])
                #result = sess.run([loss, fin, ids, global_step.value])
            if result is None:
                print "result is None, finished success."
                break
            if not is_training:
                print "global_step =", result[-1]
                #print "batch['_ids'] =", result[-2]
            #else:
            #    print "unit_id_expand_emb = { mean =", result[-1].mean(), ", std =", result[-1].std(), "}"
        loop_num += 1
        if loop_num > statis_begin_loop:
            elapsed_time += time.time() - begin_time
        #print 'batch_size = %d, qps = %f batch/s' % (data_io._batch_size, (loop_num - statis_begin_loop) / elapsed_time)
    if is_training:
        # Wait for all workers before exporting embeddings.
        xdl.execute(xdl.ps_synchronize_leave_op(
            np.array(xdl.get_task_index(), dtype=np.int32)))
        if xdl.get_task_index() == 0:
            print 'start put item_emb'

            def _string_to_int8(src):
                # Encode a python string as an int8 array for the ps op.
                return np.array([ord(ch) for ch in src], dtype=np.int8)

            from xdl.python.utils.config import get_ckpt_dir
            output_dir = conf('model_url')
            op = xdl.ps_convert_ckpt_variable_op(
                checkpoint_dir=_string_to_int8(get_ckpt_dir()),
                output_dir=_string_to_int8(output_dir),
                variables=_string_to_int8("item_emb"))
            xdl.execute(op)
            # Round-trip through HDFS, stripping the 2-char prefix sed adds.
            shell_cmd("rm -f data/item_emb")
            shell_cmd("hadoop fs -get %s/item_emb data/item_emb" % output_dir)
            shell_cmd("sed -i 's/..//' data/item_emb")
            shell_cmd("hadoop fs -put -f data/item_emb %s" % output_dir)
            print 'finish put item_emb'
def train(): batch = data_io.read() print batch embs = list() for i in range(1, embs_len + 1): name = "item_%d" % i emb = xdl.embedding(name, batch[name], xdl.Ones(), 1, 1000, 'sum', vtype='hash') embs.append(emb) print "emb =", name, ", shape =", emb.shape print "origin batch[label].shape =", batch["label"].shape loss, prop, label, indicator, din, dout, fc1_weight, fc1_bias, fc2_weight, fc2_bias = model( embs, batch["label"], 4, 7) train_op = xdl.SGD(0.5).optimize() item1_grad = xdl.get_gradient('item_1') item2_grad = xdl.get_gradient('item_2') item3_grad = xdl.get_gradient('item_3') item4_grad = xdl.get_gradient('item_4') fc1_weight_grad = xdl.get_gradient('fc1_weight') fc1_bias_grad = xdl.get_gradient('fc1_bias') fc2_weight_grad = xdl.get_gradient('fc2_weight') fc2_bias_grad = xdl.get_gradient('fc2_bias') sess = xdl.TrainSession() loop_num = 0 while not sess.should_stop(): if loop_num == 5: break print "\n>>>>>>>>>>>> loop_num = %d" % loop_num result = sess.run([train_op, loss, prop, batch['label'], label, indicator, din, dout, \ batch['item_1'].ids, batch['item_1'].segments, batch['item_1'].values, \ batch['item_2'].ids, batch['item_2'].segments, batch['item_2'].values, \ batch['item_3'].ids, batch['item_3'].segments, batch['item_3'].values, \ batch['item_4'].ids, batch['item_4'].segments, batch['item_4'].values, \ item1_grad, item2_grad, item3_grad, item4_grad, \ fc1_weight, fc1_bias, fc1_weight_grad, fc1_bias_grad, \ fc2_weight, fc2_bias, fc2_weight_grad, fc2_bias_grad]) if result is None: break print "loss:", result[-31] print "prop:", result[-30] print "origin label:", result[-29] print "label:", result[-28] print "indicator:", result[-27] print "din:", result[-26] print "dout:", result[-25] print "item_1: ids=", result[-24], "\n segments=", result[ -23], "\n values=", result[-22] print "item_2: ids=", result[-21], "\n segments=", result[ -20], "\n values=", result[-19] print "item_3: ids=", result[-18], "\n segments=", result[ -17], "\n values=", 
result[-16] print "item_4: ids=", result[-15], "\n segments=", result[ -14], "\n values=", result[-13] print "item1_grad", result[-12] print "item2_grad", result[-11] print "item1_grad", result[-10] print "item2_grad", result[-9] print "fc1_weight", result[-8] print "fc1_bias", result[-7] print "fc1_weight_grad", result[-6] print "fc1_bias_grad", result[-5] print "fc2_weight", result[-4] print "fc2_bias", result[-3] print "fc2_weight_grad", result[-2] print "fc2_bias_grad", result[-1] loop_num += 1
def test(train_file=train_file, test_file=test_file, uid_voc=uid_voc,
         mid_voc=mid_voc, cat_voc=cat_voc, item_info=item_info,
         reviews_info=reviews_info, batch_size=99, maxlen=100):
    """Re-rank each test user's KNN candidates with a restored DIN/DIEN model.

    For every test sample, looks up the KNN neighbours of the last history
    item, scores each neighbour with the model, ranks candidates by score,
    and pickles (user, hist, last, target, rank) tuples to disk.
    Rank 100 is used as a sentinel when the target is not among the KNNs.

    Raises:
        Exception: if xdl config 'model' is neither 'din' nor 'dien'.
    """
    if xdl.get_config('model') == 'din':
        model = Model_DIN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif xdl.get_config('model') == 'dien':
        model = Model_DIEN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din and dien model')
    # create item -> category dict from the module-level `item_c` lines.
    i_c = {}
    for i in item_c:
        ii = i.strip().split('\t')
        i_c[ii[0]] = ii[1]
    saver = xdl.Saver()
    # NOTE(review): hard-coded checkpoint id — presumably a placeholder to
    # be edited per run; confirm against the checkpoint naming scheme.
    checkpoint_version = "ckpt-...............20000"
    saver.restore(version=checkpoint_version)
    last_hist = []
    target_list = []
    seq = []
    test_set = pkl.load(open(test_file, 'rb'))
    # KNN table is keyed by the last history item id.
    knn_table = pkl.load(
        open('../data/ali_knn_table/knn' + str(test_file[-5]) +
             '_no_pro2.pkl', 'rb'))
    print('length before deal with : ', len(test_set))
    # Expand every sample into one line per KNN candidate for scoring.
    test_knn = open('../data/test_knn', 'w')
    count22 = 0
    for i in test_set:
        # knn
        ss = i.strip().split('\t')
        last = ss[4].split('/')[-1]
        # append last, target, and seq
        last_hist.append(last)
        target_list.append(ss[2])
        seq.append((ss[1], ss[4]))  # uid and hist
        knn = knn_table[last]
        for k in knn:
            count22 += 1
            # Fall back to 'UNK' category for candidates missing from i_c.
            if k in i_c:
                tmp = '1\t' + ss[1] + '\t' + k + '\t' + i_c[k] + '\t' + ss[
                    4] + '\t' + ss[5]
            else:
                tmp = '1\t' + ss[1] + '\t' + k + '\t' + 'UNK' + '\t' + ss[
                    4] + '\t' + ss[5]
            print >> test_knn, tmp
    test_knn.close()
    print('after last_hist :', len(last_hist))
    print('all test_knn length :', count22)
    # sample_io over the expanded candidate file.
    test_knn_f = os.path.join(get_data_prefix(), 'test_knn')
    sample_io = SampleIO(train_file, test_knn_f, uid_voc, mid_voc, cat_voc,
                         item_info, reviews_info, batch_size, maxlen,
                         EMBEDDING_DIM)
    print('all length:', len(last_hist))
    test_ops = model.build_final_net(EMBEDDING_DIM, sample_io,
                                     is_train=False)
    print('=' * 10 + 'start test' + '=' * 10)
    eval_sess = xdl.TrainSession()
    pro_all, test_auc, loss_sum, accuracy_sum, aux_loss_sum = eval_model(
        eval_sess, test_ops)
    print(
        'test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
        % (test_auc, loss_sum, accuracy_sum, aux_loss_sum))
    print('after pro length :', len(pro_all))
    print('=' * 50)
    # sort the knn with prob (descending) and record the target's 1-based
    # rank; 100 when the target is not among the candidates.
    rank_all_knn = {}
    rank = []
    for i in range(len(last_hist)):
        knn = knn_table[last_hist[i]]
        pro = pro_all[i]
        c = list(zip(knn, pro))
        c = sorted(c, key=lambda t: t[1], reverse=True)
        rank_all = [sss[0] for sss in c]
        rank_all_knn[seq[i][0]] = rank_all
        if target_list[i] in rank_all:
            rank.append(rank_all.index(target_list[i]) + 1)
        else:
            rank.append(100)
    # print(rank_all_knn)
    # save the result of re-rank
    user = [i[0] for i in seq]
    hist = [i[1] for i in seq]
    assert len(last_hist) == len(user)
    results = list(zip(user, hist, last_hist, target_list, rank))
    # results = pd.DataFrame(results, columns = ['last','target','rank'])
    # esults.to_csv('ali_dien_rank.csv',index=False)
    with open('ali_dien_rank_4days' + test_file[-11:], 'wb') as d:
        pkl.dump(results, d)
def train(is_training=True):
    """TDM train/predict entry point.

    Training mode: shards HDFS sample files across workers, builds the item
    embedding tower plus an MXNet scoring net, runs the optimizer loop, then
    converts and uploads the "item_emb" variable via shell commands.

    Predict mode (is_training=False): only worker 0 runs; restores a
    checkpoint, pulls all leaf-layer item embeddings from the parameter
    server, retrieves top-K items per user by dot-product score, and reports
    precision/recall against ground truth.
    """
    # Only workers participate in predict via worker 0; everyone inits for training.
    if is_training or xdl.get_task_index() == 0:
        init()
    else:
        return
    file_type = xdl.parsers.txt
    if is_training:
        data_io = xdl.DataIO("tdm",
                             file_type=file_type,
                             fs_type=xdl.fs.hdfs,
                             namenode="hdfs://your/namenode/hdfs/path:9000",
                             enable_state=False)
        # 69 sparse user-behaviour feature slots (item_1..item_69), all in table 1.
        feature_count = 69
        for i in xrange(1, feature_count + 1):
            data_io.feature(name=("item_%s" % i),
                            type=xdl.features.sparse,
                            table=1)
        # Expanded tree-node id (candidate unit) lives in table 0.
        data_io.feature(name="unit_id_expand",
                        type=xdl.features.sparse,
                        table=0)
        data_io.batch_size(intconf('train_batch_size'))
        data_io.epochs(intconf('train_epochs'))
        data_io.threads(intconf('train_threads'))
        data_io.label_count(2)
        base_path = '%s/%s/' % (conf('upload_url'), conf('data_dir'))
        # Train samples are numbered shards matched by a regex suffix.
        data = base_path + conf('train_sample') + '_' + r'[\d]+'
        sharding = xdl.DataSharding(data_io.fs())
        sharding.add_path(data)
        paths = sharding.partition(rank=xdl.get_task_index(),
                                   size=xdl.get_task_num())
        print 'train: sharding.partition() =', paths
        data_io.add_path(paths)
        iop = xdl.GetIOP("TDMOP")
    else:
        data_io = xdl.DataIO("tdm",
                             file_type=file_type,
                             fs_type=xdl.fs.hdfs,
                             namenode="hdfs://your/namenode/hdfs/path:9000",
                             enable_state=False)
        feature_count = 69
        for i in xrange(1, feature_count + 1):
            data_io.feature(name=("item_%s" % i),
                            type=xdl.features.sparse,
                            table=1)
        data_io.feature(name="unit_id_expand",
                        type=xdl.features.sparse,
                        table=0)
        # Predict additionally carries the ground-truth item ids.
        data_io.feature(name="test_unit_id",
                        type=xdl.features.sparse,
                        table=1)
        data_io.batch_size(intconf('predict_batch_size'))
        data_io.epochs(intconf('predict_epochs'))
        data_io.threads(intconf('predict_threads'))
        data_io.label_count(2)
        base_path = '%s/%s/' % (conf('upload_url'), conf('data_dir'))
        data = base_path + conf('test_sample')
        data_io.add_path(data)
        print 'predict: add_path =', data
        iop = xdl.GetIOP("TDMPREDICTOP")
    #data_io.finish_delay(True)
    assert iop is not None
    # Configuration passed to the TDM IO operator (string key/value pairs).
    key_value = {}
    key_value["key"] = "value"
    key_value["debug"] = conf('tdmop_debug')
    key_value["layer_counts"] = conf('tdmop_layer_counts')
    key_value["start_sample_layer"] = "22"  # tree layer to sample/retrieve from
    key_value["pr_test_each_layer_retrieve_num"] = "400"
    key_value["pr_test_final_layer_retrieve_num"] = "200"  # final top-K size
    if not is_training:
        key_value["expand_mode"] = "vector"
    iop.init(key_value)
    data_io.add_op(iop)
    data_io.split_group(False)
    data_io.startup()
    # Predict restores model state before reading any batch (worker 0 only).
    if not is_training:
        if xdl.get_task_index() == 0:
            saver = xdl.Saver()
            saver.restore(conf('saver_ckpt'))
    batch = data_io.read()
    emb_combiner = 'mean'  # mean | sum
    if not is_training:
        # Ground-truth ids and per-user segment offsets emitted by the IOP.
        gt_ids = batch["_ids"][-1]
        gt_segments = batch["_segments"][-1]
    emb = []
    emb_dim = 24
    # New hash features are only admitted to the table during training.
    if is_training:
        feature_add_probability = 1.
    else:
        feature_add_probability = 0.
    # NOTE(review): this import appears unused below -- possibly kept for its
    # side effects in the sparse engine; confirm before removing.
    import xdl.python.sparse_engine.embedding as embedding
    # All item features share one embedding table ("item_emb").
    emb_name = "item_emb"
    for i in xrange(1, feature_count + 1):
        eb = xdl.embedding(emb_name,
                           batch["item_%s" % i],
                           xdl.Normal(stddev=0.001),
                           emb_dim,
                           50000,
                           emb_combiner,
                           vtype="hash",
                           feature_add_probability=feature_add_probability)
        with xdl.device('GPU'):
            # Expand per-user rows to per-sample rows via the indicator mapping.
            eb_take = xdl.take_op(eb, batch["indicators"][0])
            eb_take.set_shape(eb.shape)
        emb.append(eb_take)
    # Embedding of the candidate tree node, from the same shared table.
    unit_id_expand_emb = xdl.embedding(
        emb_name,
        batch["unit_id_expand"],
        xdl.Normal(stddev=0.001),
        emb_dim,
        50000,
        emb_combiner,
        vtype="hash",
        feature_add_probability=feature_add_probability)

    @xdl.mxnet_wrapper(is_training=is_training, device_type='gpu')
    def dnn_model_define(user_input,
                         indicator,
                         unit_id_emb,
                         label,
                         bs,
                         eb_dim,
                         sample_num,
                         fea_groups,
                         active_op='prelu',
                         use_batch_norm=True):
        """MXNet net: grouped avg-pooled user features -> 3-layer MLP -> dot
        product with the candidate embedding -> softmax over sample_num
        candidates per user. Returns (prop, loss, user_vector)."""
        # Split the user input into windows according to fea_groups and
        # average-pool within each window.
        fea_groups = [int(s) for s in fea_groups.split(',')]
        total_group_length = np.sum(np.array(fea_groups))
        print "fea_groups", fea_groups, "total_group_length", total_group_length, "eb_dim", eb_dim
        user_input_before_reshape = mx.sym.concat(*user_input)
        user_input = mx.sym.reshape(user_input_before_reshape,
                                    shape=(-1, total_group_length, eb_dim))
        idx = 0
        for group_length in fea_groups:
            block_before_sum = mx.sym.slice_axis(user_input,
                                                 axis=1,
                                                 begin=idx,
                                                 end=idx + group_length)
            # Average pooling over the window (sum / window length).
            block = mx.sym.sum_axis(block_before_sum, axis=1) / group_length
            if idx == 0:
                grouped_user_input = block
            else:
                grouped_user_input = mx.sym.concat(grouped_user_input,
                                                   block,
                                                   dim=1)
            idx += group_length
        # Stop gradients through the indicator and label inputs.
        indicator = mx.symbol.BlockGrad(indicator)
        label = mx.symbol.BlockGrad(label)
        grouped_user_input_after_take = grouped_user_input
        net_version = "e"
        # Three fully-connected layers: 10*eb_dim -> 128 -> 64 -> 24.
        # NOTE(review): the input width hard-codes 10 groups; it matches the
        # '20,20,10,10,2,2,2,1,1,1' fea_groups used below -- confirm if changed.
        layer_arr = []
        layer1 = mx_dnn_layer(10 * eb_dim,
                              128,
                              active_op=active_op,
                              use_batch_norm=use_batch_norm,
                              version="%d_%s" % (1, net_version))
        layer_arr.append(layer1)
        layer2 = mx_dnn_layer(128,
                              64,
                              active_op=active_op,
                              use_batch_norm=use_batch_norm,
                              version="%d_%s" % (2, net_version))
        layer_arr.append(layer2)
        # Output layer: no activation / batch norm, emits the user vector.
        layer3 = mx_dnn_layer(64,
                              24,
                              active_op='',
                              use_batch_norm=False,
                              version="%d_%s" % (3, net_version))
        layer_arr.append(layer3)
        layer_data = [grouped_user_input_after_take]
        for layer in layer_arr:
            layer_data.append(layer.call(layer_data[-1]))
        dout = layer_data[-1]
        # Score = dot(user vector, candidate embedding); grouped per user.
        inner_product = mx.sym.sum(dout * unit_id_emb, axis=1)
        softmax_input = mx.sym.Reshape(inner_product,
                                       shape=(bs / sample_num, sample_num))
        # Use the positive-sample label minus 1 as the softmax label
        # (so ignored samples become -1 for use_ignore).
        ph_label_click = mx.sym.slice_axis(label, axis=1, begin=1, end=2)
        ph_label_click = mx.sym.reshape(
            ph_label_click, shape=(bs / sample_num, sample_num)) - 1
        ph_label_click = mx.sym.slice_axis(ph_label_click,
                                           axis=1,
                                           begin=0,
                                           end=1)
        ph_label_click = mx.sym.reshape(ph_label_click,
                                        shape=(bs / sample_num, ))
        prop = mx.symbol.SoftmaxOutput(data=softmax_input,
                                       label=ph_label_click,
                                       normalization='valid',
                                       use_ignore=True)
        positive_prop = mx.sym.slice_axis(prop, axis=1, begin=0, end=1)
        positive_prop = mx.sym.reshape(positive_prop,
                                       shape=(bs / sample_num, ))
        # The actual number of valid samples is (bs/sample_num) minus the
        # number of labels that must be ignored.
        loss = -mx.sym.sum(mx.symbol.log(positive_prop)) / (
            bs / sample_num + mx.sym.sum(ph_label_click))
        # Keep only the first candidate's user vector per user (they repeat).
        user_vector = mx.sym.reshape(dout,
                                     shape=(bs / sample_num, sample_num,
                                            eb_dim))
        user_vector = mx.sym.slice_axis(user_vector, axis=1, begin=0, end=1)
        user_vector = mx.sym.reshape(user_vector,
                                     shape=(bs / sample_num, eb_dim))
        return prop, loss, mx.sym.BlockGrad(user_vector)

    # sample_num: 600 negatives+positive per user in training, 1 in predict.
    if is_training:
        re = dnn_model_define(emb, batch["indicators"][0], unit_id_expand_emb,
                              batch["label"], data_io._batch_size, emb_dim,
                              600, '20,20,10,10,2,2,2,1,1,1')
    else:
        re = dnn_model_define(emb, batch["indicators"][0], unit_id_expand_emb,
                              batch["label"], data_io._batch_size, emb_dim, 1,
                              '20,20,10,10,2,2,2,1,1,1')
    prop = re[0]
    loss = re[1]
    if is_training:
        train_op = xdl.Adam(learning_rate=intconf('learning_rate')).optimize()
    else:
        user_vector = re[2]
    hooks = []
    if is_training:
        if conf("train_mode") == "sync":
            hooks.append(
                xdl.SyncRunHook(xdl.get_task_index(), xdl.get_task_num()))
        if xdl.get_task_index() == 0:
            # Only worker 0 writes checkpoints.
            ckpt_hook = xdl.CheckpointHook(intconf('save_checkpoint_interval'))
            hooks.append(ckpt_hook)
        log_hook = xdl.LoggerHook([loss], "#### loss:{0}")
    else:
        log_hook = xdl.LoggerHook([loss], "#### loss:{0}")
    hooks.append(log_hook)
    from xdl.python.training.training_utils import get_global_step
    global_step = get_global_step()
    sess = xdl.TrainSession(hooks)
    elapsed_time = 0.
    statis_begin_loop = 200  # warm-up loops excluded from QPS accounting
    loop_num = 0
    if not is_training:
        # Pull every item id on the retrieval layer and fetch its embedding
        # from the parameter server so scores can be computed locally.
        urun_re = iop.urun({"get_level_ids": key_value["start_sample_layer"]})
        item_num = len(urun_re)
        item_ids = np.array([int(iid) for iid in urun_re.keys()],
                            dtype=np.int64).reshape((item_num, 1))
        print 'item_ids shape: '
        print item_ids.shape
        # Hash-table keys are (0, item_id) pairs.
        zeros = np.zeros((item_num, 1), dtype=np.int64)
        hash_ids = np.concatenate((zeros, item_ids), axis=1)
        item_embeddings = xdl.execute(
            xdl.ps_sparse_pull_op(hash_ids,
                                  var_name="item_emb",
                                  var_type="hash",
                                  save_ratio=1.0,
                                  otype=xdl.DataType.float))
        # Transpose to (emb_dim, item_num) for user_vector x item matmul.
        item_embeddings = item_embeddings.transpose()
        print 'item_embeddings shape: '
        print item_embeddings.shape
        hit_num_list = []
        precision_list = []
        recall_list = []
        gt_num_list = []
        user_idx = 1
    while not sess.should_stop():
        print ">>>>>>>>>>>> %d >>>>>>>>>>>" % loop_num
        begin_time = time.time()
        # Run 200 steps per outer loop iteration before updating counters.
        for itr in xrange(200):
            if is_training:
                result = sess.run(
                    [train_op, xdl.get_collection(xdl.UPDATE_OPS)])
            else:
                result = sess.run(
                    [user_vector, global_step.value, gt_ids, gt_segments])
            if result is None:
                # Session drained the input -- end of data.
                print "result is None, finished success."
                break
            if not is_training:
                print "global_step =", result[1]
                batch_uv = result[0]
                batch_gt = result[2]
                batch_seg = result[3]  # cumulative per-user gt offsets
                # Last batch may be padded; trim to the real user count.
                batch_uv = batch_uv[0:len(batch_seg)]
                # Score every item for every user, take top-K by score.
                batch_scores = np.matmul(batch_uv, item_embeddings)
                sorted_idx = np.argsort(-batch_scores, axis=1)
                sorted_idx = sorted_idx[:, :int(
                    key_value["pr_test_final_layer_retrieve_num"])]
                gt_id_start_idx = 0
                for i in xrange(len(batch_seg)):
                    pred_set = set(item_ids[sorted_idx[i, :], 0])
                    # Count ground-truth occurrences per item for this user.
                    gt_dict = {}
                    for gt in batch_gt[gt_id_start_idx:batch_seg[i], 1]:
                        if gt in gt_dict:
                            gt_dict[gt] += 1
                        else:
                            gt_dict[gt] = 1
                    # NOTE(review): test_gt_str/test_pred_str are computed but
                    # never used -- leftover debug output, candidates for removal.
                    test_gt_list = batch_gt[gt_id_start_idx:batch_seg[i],
                                            1].tolist()
                    test_gt_str = ','.join(
                        [str(gtid) for gtid in test_gt_list])
                    test_pred_list = item_ids[sorted_idx[i, :], 0].tolist()
                    test_pred_str = ','.join(
                        [str(gtid) for gtid in test_pred_list])
                    user_idx += 1
                    gt_set = set(batch_gt[gt_id_start_idx:batch_seg[i], 1])
                    comm_set = gt_set.intersection(pred_set)
                    # Hits are weighted by ground-truth multiplicity.
                    hit_num = sum([
                        float(gt_dict[item]) if item in gt_dict else 0.0
                        for item in comm_set
                    ])
                    hit_num_list.append(hit_num)
                    if len(pred_set) > 0:
                        precision = hit_num / len(pred_set)
                    else:
                        precision = 0.0
                    if len(gt_dict) > 0:
                        recall = hit_num / (batch_seg[i] - gt_id_start_idx)
                    else:
                        recall = 0.0
                    precision_list.append(precision)
                    recall_list.append(recall)
                    gt_num_list.append(float(batch_seg[i] - gt_id_start_idx))
                    gt_id_start_idx = batch_seg[i]
                # Running metrics over all users evaluated so far.
                print "=================================================="
                print 'predicted user num is: %d' % len(hit_num_list)
                print 'gt num is: %f' % sum(gt_num_list)
                print 'precision: %f' % (sum(precision_list) /
                                         len(hit_num_list))
                print 'recall: %f' % (sum(recall_list) / len(hit_num_list))
                print 'global recall: %f' % (sum(hit_num_list) /
                                             sum(gt_num_list))
                print "=================================================="
        loop_num += 1
        if loop_num > statis_begin_loop:
            elapsed_time += time.time() - begin_time
            #print 'batch_size = %d, qps = %f batch/s' % (data_io._batch_size, (loop_num - statis_begin_loop) / elapsed_time)
    if not is_training:
        # Final metrics after the session is exhausted.
        print "=================================================="
        print 'predicted user num is: %d' % len(hit_num_list)
        print 'gt num is: %f' % sum(gt_num_list)
        print 'precision: %f' % (sum(precision_list) / len(hit_num_list))
        print 'recall: %f' % (sum(recall_list) / len(hit_num_list))
        print 'global recall: %f' % (sum(hit_num_list) / sum(gt_num_list))
        print "=================================================="
    if is_training:
        # Synchronize workers leaving the parameter server, then have worker 0
        # convert the item_emb variable out of the checkpoint and publish it.
        xdl.execute(
            xdl.ps_synchronize_leave_op(
                np.array(xdl.get_task_index(), dtype=np.int32)))
        if xdl.get_task_index() == 0:
            print 'start put item_emb'

            def _string_to_int8(src):
                # The convert op takes strings as int8 numpy arrays.
                return np.array([ord(ch) for ch in src], dtype=np.int8)

            from xdl.python.utils.config import get_ckpt_dir
            output_dir = conf('model_url')
            op = xdl.ps_convert_ckpt_variable_op(
                checkpoint_dir=_string_to_int8(get_ckpt_dir()),
                output_dir=_string_to_int8(output_dir),
                variables=_string_to_int8("item_emb"))
            xdl.execute(op)
            # Round-trip through HDFS: strip the 2-char prefix from each line
            # (sed 's/..//') then re-upload the cleaned embedding file.
            shell_cmd("rm -f data/item_emb")
            shell_cmd("hadoop fs -get %s/item_emb data/item_emb" % output_dir)
            shell_cmd("sed -i 's/..//' data/item_emb")
            shell_cmd("hadoop fs -put -f data/item_emb %s" % output_dir)
            print 'finish put item_emb'
def run(is_training, files):
    """Build and run the ESMM graph over `files`.

    Training mode loops until the session stops, optimizing with Adam and
    accumulating CTR/CTCVR/CVR AUC; eval mode runs `test_batch_num` batches
    and reports the same AUC metrics. CVR AUC is computed only on samples
    whose ctr_label == 1 (the ESMM conditional-probability trick).
    """
    data_io = reader("esmm", files, 2, batch_size, 2, user_fn, ad_fn)
    batch = data_io.read()

    # Hash embeddings for every user-side and ad-side sparse feature,
    # sum-combined, all initialized from a truncated normal.
    def _embed(prefix, fn):
        return xdl.embedding(prefix + fn,
                             batch[fn],
                             xdl.TruncatedNormal(stddev=0.001),
                             embed_size,
                             1000,
                             'sum',
                             vtype='hash')

    user_embs = [_embed('u_', fn) for fn in user_fn]
    ad_embs = [_embed('a_', fn) for fn in ad_fn]

    outputs = model(is_training)(ad_embs, user_embs, batch["indicators"][0],
                                 batch["label"])
    metric_names = [
        'loss', 'ctr_prop', 'ctcvr_prop', 'cvr_prop', 'ctr_label',
        'ctcvr_label', 'cvr_label'
    ]
    run_vars = dict(zip(metric_names, list(outputs)))

    hooks = []
    if is_training:
        train_op = xdl.Adam(lr).optimize()
        hooks = get_collection(READER_HOOKS)
        if hooks is None:
            hooks = []
        if xdl.get_task_index() == 0:
            # Worker 0 checkpoints every 1000 steps.
            hooks.append(xdl.CheckpointHook(1000))
        # Keyed on None so the train op runs without polluting the metrics.
        run_vars.update({None: train_op})
        if is_debug > 1:
            print("=========gradients")
            grads = xdl.get_gradients()
            for key in sorted(grads[''].keys()):
                run_vars.update({"grads {}".format(key): grads[''][key]})

    hooks.append(QpsMetricsHook())
    log_format = ("lstep[%(lstep)s] gstep[%(gstep)s] "
                  "lqps[%(lqps)s] gqps[%(gqps)s]")
    hooks.append(MetricsPrinterHook(log_format, 100))

    # Optional restore: worker 0 loads the checkpoint, others wait it out.
    ckpt = xdl.get_config("checkpoint", "ckpt")
    if ckpt is not None and len(ckpt) > 0:
        if int(xdl.get_task_index()) == 0:
            from xdl.python.training.saver import Saver
            saver = Saver()
            print("restore from %s" % ckpt)
            saver.restore(ckpt)
        else:
            time.sleep(120)

    sess = xdl.TrainSession(hooks)
    if is_training:
        step = 1
        ctr_auc = Auc('ctr')
        ctcvr_auc = Auc('ctcvr')
        cvr_auc = Auc('cvr')
        while not sess.should_stop():
            print('iter=', step)
            fetched = sess.run(run_vars.values())
            if not fetched:
                continue
            vals = dict(zip(run_vars.keys(), fetched))
            print('loss=', vals['loss'])
            ctr_auc.add(vals['ctr_prop'], vals['ctr_label'])
            ctcvr_auc.add(vals['ctcvr_prop'], vals['ctcvr_label'])
            cvr_auc.add_with_filter(vals['cvr_prop'], vals['cvr_label'],
                                    np.where(vals['ctr_label'] == 1))
            step += 1
        ctr_auc.show()
        ctcvr_auc.show()
        cvr_auc.show()
    else:
        ctr_test_auc = Auc('ctr')
        ctcvr_test_auc = Auc('ctcvr')
        cvr_test_auc = Auc('cvr')
        for i in xrange(test_batch_num):
            print('iter=', i + 1)
            fetched = sess.run(run_vars.values())
            vals = dict(zip(run_vars.keys(), fetched))
            print('test_loss=', vals['loss'])
            ctr_test_auc.add(vals['ctr_prop'], vals['ctr_label'])
            ctcvr_test_auc.add(vals['ctcvr_prop'], vals['ctcvr_label'])
            cvr_test_auc.add_with_filter(vals['cvr_prop'], vals['cvr_label'],
                                         np.where(vals['ctr_label'] == 1))
        ctr_test_auc.show()
        ctcvr_test_auc.show()
        cvr_test_auc.show()