def __init__(self,
             dnn_dims=[],
             vocab_sizes=[],
             model_type=ModelType.create_classification(),
             model_arch=ModelArch.create_rnn(),
             share_semantic_generator=False,
             class_num=2,
             share_embed=False,
             is_infer=False):
    """
    Init the DSSM network.

    :param dnn_dims: list of int (dimensions of each layer in the semantic
                     vector generator.)
    :param vocab_sizes: 2-d tuple (sizes of both the left and right items.)
    :param model_type: the type of task (this variant is fixed to
                       classification.)
    :param model_arch: model architecture
    :param share_semantic_generator: bool (whether to share the semantic
                     vector generator between the left and right items.)
    :param class_num: number of categories.
    :param share_embed: bool (whether to share the embeddings between the
                     left and right items.)
    :param is_infer: whether to build an inference network
    """
    assert len(vocab_sizes) == 2, (
        "vocab_sizes specifies the sizes of the left and right inputs, "
        "so its dimension should be 2.")
    assert len(dnn_dims) > 1, "more than two layers are needed."

    self.dnn_dims = dnn_dims
    self.vocab_sizes = vocab_sizes
    self.share_semantic_generator = share_semantic_generator
    self.share_embed = share_embed
    self.model_type = ModelType(model_type)
    self.model_arch = ModelArch(model_arch)
    self.class_num = class_num
    self.is_infer = is_infer
    logger.warning("build DSSM model with config of %s, %s" %
                   (self.model_type, self.model_arch))
    logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))

    # bind model architecture
    _model_arch = {
        "rnn": self.create_rnn,
        "cnn": self.create_cnn,
        "fc": self.create_fc,
    }

    def _model_arch_creater(emb, prefix=""):
        sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
        dnn = self.create_dnn(sent_vec, prefix)
        return dnn

    self.model_arch_creater = _model_arch_creater
    # this variant is hard-wired to the classification task.
    self.model_type_creater = self._build_classification_model
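# The ModelType and ModelArch helpers used throughout this section are never
# defined in it. The following is a minimal, hedged sketch reconstructed only
# from how they are called above (ModelType(mode), create_classification(),
# is_classification(), .mode, str(model_type) as a dict key). The concrete
# mode values follow the "rank: 0, regression: 1, classification: 2"
# convention quoted in the DSSM docstrings and are otherwise an assumption.
class ModelType(object):
    RANK_MODE = 0  # assumed numbering, per the docstring convention
    REGRESSION_MODE = 1
    CLASSIFICATION_MODE = 2
    _NAMES = {
        RANK_MODE: "rank",
        REGRESSION_MODE: "regression",
        CLASSIFICATION_MODE: "classification",
    }

    def __init__(self, mode):
        # accept either a raw int or an existing ModelType instance,
        # since the code above calls ModelType(model_type) on both.
        self.mode = mode.mode if isinstance(mode, ModelType) else mode

    @classmethod
    def create_classification(cls):
        return cls(cls.CLASSIFICATION_MODE)

    @classmethod
    def create_regression(cls):
        return cls(cls.REGRESSION_MODE)

    @classmethod
    def create_rank(cls):
        return cls(cls.RANK_MODE)

    def is_classification(self):
        return self.mode == self.CLASSIFICATION_MODE

    def is_regression(self):
        return self.mode == self.REGRESSION_MODE

    def is_rank(self):
        return self.mode == self.RANK_MODE

    def __str__(self):
        return self._NAMES[self.mode]


class ModelArch(object):
    # same pattern for the architecture switch ("rnn" / "cnn" / "fc").
    RNN_MODE, CNN_MODE, FC_MODE = 0, 1, 2
    _NAMES = {RNN_MODE: "rnn", CNN_MODE: "cnn", FC_MODE: "fc"}

    def __init__(self, mode):
        self.mode = mode.mode if isinstance(mode, ModelArch) else mode

    @classmethod
    def create_rnn(cls):
        return cls(cls.RNN_MODE)

    @classmethod
    def create_cnn(cls):
        return cls(cls.CNN_MODE)

    @classmethod
    def create_fc(cls):
        return cls(cls.FC_MODE)

    def __str__(self):
        return self._NAMES[self.mode]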
def infer(self,
          data_path,
          output_path,
          model_type=ModelType(ModelType.CLASSIFICATION_MODE),
          feature_dim=800,
          batch_size=100):
    logger.info("infer data...")
    infer_batch = paddle.batch(
        reader.test(data_path, feature_dim + 1,
                    model_type.is_classification()),
        batch_size=batch_size)
    logger.warning("write predictions to %s" % output_path)
    output_f = open(output_path, 'w')
    for batch in infer_batch():
        res = self.inferer.infer(input=batch)
        predictions = [' '.join(map(str, x)) for x in res]
        assert len(batch) == len(predictions), (
            "predict error, %d inputs, but %d predictions" %
            (len(batch), len(predictions)))
        output_f.write('\n'.join(predictions) + '\n')
def __init__(self,
             param_path,
             model_type=ModelType(ModelType.CLASSIFICATION_MODE),
             class_num=2,
             feature_dim=800,
             dnn_dims='256,128,64,32'):
    logger.info("create DNN model")

    paddle.init(use_gpu=False, trainer_count=1)

    # network config
    input_layer = paddle.layer.data(
        name='input_layer',
        type=paddle.data_type.dense_vector(feature_dim))
    layer_dims = [int(i) for i in dnn_dims.split(',')]
    dnn = create_dnn(input_layer, layer_dims)
    prediction = None
    label = None
    cost = None
    if model_type.is_classification():
        prediction = paddle.layer.fc(
            input=dnn, size=class_num, act=paddle.activation.Softmax())
        label = paddle.layer.data(
            name='label', type=paddle.data_type.integer_value(class_num))
        cost = paddle.layer.classification_cost(input=prediction, label=label)
    elif model_type.is_regression():
        prediction = paddle.layer.fc(
            input=dnn, size=1, act=paddle.activation.Linear())
        label = paddle.layer.data(
            name='label', type=paddle.data_type.dense_vector(1))
        cost = paddle.layer.mse_cost(input=prediction, label=label)

    # load parameters (binary mode, since the file is a tar archive)
    logger.info("load model parameters from %s" % param_path)
    self.parameters = paddle.parameters.Parameters.from_tar(
        open(param_path, 'rb'))
    self.inferer = paddle.inference.Inference(
        output_layer=prediction, parameters=self.parameters)
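# Hypothetical usage of the inferer defined above. The enclosing class name
# is not shown in this section, so `DNNInferer` is a placeholder, and the
# file paths are illustrative only.
if __name__ == '__main__':
    inferer = DNNInferer(
        param_path='dnn_classification_pass_00049.tar',
        model_type=ModelType(ModelType.CLASSIFICATION_MODE),
        class_num=2,
        feature_dim=800)
    # writes one line of space-separated class probabilities per input row.
    inferer.infer(
        data_path='./data/test.txt',
        output_path='./predictions.txt',
        feature_dim=800,
        batch_size=100)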
def __init__(self,
             dnn_layer_dims,
             dnn_input_dim,
             lr_input_dim,
             model_type=ModelType.create_classification(),
             is_infer=False):
    '''
    @dnn_layer_dims: list of integer
        dims of each layer in the dnn
    @dnn_input_dim: int
        size of the dnn's input layer
    @lr_input_dim: int
        size of the lr's input layer
    @is_infer: bool
        whether to build an infer model
    '''
    self.dnn_layer_dims = dnn_layer_dims
    self.dnn_input_dim = dnn_input_dim
    self.lr_input_dim = lr_input_dim
    self.model_type = model_type
    self.is_infer = is_infer

    self._declare_input_layers()

    self.dnn = self._build_dnn_submodel_(self.dnn_layer_dims)
    self.lr = self._build_lr_submodel_()

    # model's prediction
    # TODO(superjom) rename it to prediction
    if self.model_type.is_classification():
        self.model = self._build_classification_model(self.dnn, self.lr)
    if self.model_type.is_regression():
        self.model = self._build_regression_model(self.dnn, self.lr)
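# _build_classification_model is referenced above but not shown in this
# section. A minimal sketch of the usual wide-and-deep combination follows:
# the deep (dnn) and wide (lr) submodels are concatenated and projected
# through a single sigmoid output. The exact head and the `self.label`
# input layer (presumably declared in _declare_input_layers) are
# assumptions, not the author's confirmed code.
def _build_classification_model(self, dnn, lr):
    merge_layer = paddle.layer.concat(input=[dnn, lr])
    # one sigmoid unit gives the click/class probability.
    self.output = paddle.layer.fc(
        input=merge_layer, size=1, act=paddle.activation.Sigmoid())
    if not self.is_infer:
        # binary cross-entropy against the (assumed) label input layer.
        self.train_cost = paddle.layer.multi_binary_label_cross_entropy_cost(
            input=self.output, label=self.label)
    return self.output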
def __init__(self, train_path, test_path, source_dic_path, target_dic_path,
             model_type):
    self.train_path = train_path
    self.test_path = test_path
    self.source_dic_path = source_dic_path
    self.target_dic_path = target_dic_path
    self.model_type = ModelType(model_type)

    self.source_dic = load_dic(self.source_dic_path)
    self.target_dic = load_dic(self.target_dic_path)

    _record_reader = {
        ModelType.CLASSIFICATION_MODE: self._read_classification_record,
        ModelType.REGRESSION_MODE: self._read_regression_record,
        ModelType.RANK_MODE: self._read_rank_record,
    }

    assert isinstance(model_type, ModelType)
    self.record_reader = _record_reader[model_type.mode]
    self.is_infer = False

    self.train_data_csv = "/home/kesci/input/qichedashi/train_set.csv"
    self.dev_data_csv = "/home/kesci/input/qichedashi/final_round_dev_set.csv"
    self.test_data_csv = "/home/kesci/input/qichedashi/final_round_test_set.csv"
    self.NEG = 3
    self.train_samples = 200000
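# The _read_*_record methods bound above are not shown in this section.
# Below is a minimal sketch of one of them, assuming the common
# tab-separated "source words \t target words \t label" line format used by
# DSSM-style readers; the exact format of this dataset, and the UNK id,
# are assumptions.
UNK = 0  # assumed id for out-of-vocabulary words


def _read_classification_record(self, line):
    '''
    Parse one line of classification data:
        <source words> [TAB] <target words> [TAB] <label>
    '''
    fs = line.strip().split('\t')
    assert len(fs) == 3, "wrong format for classification: " + line
    source = [self.source_dic.get(w, UNK) for w in fs[0].split()]
    target = [self.target_dic.get(w, UNK) for w in fs[1].split()]
    if not self.is_infer:
        label = int(fs[2])
        return (source, target, label)
    return (source, target)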
def train():
    args = parse_args()
    # --model_type=0,1 selects classification or regression.
    args.model_type = ModelType(args.model_type)
    # use the CPU only, with a single trainer thread.
    paddle.init(use_gpu=False, trainer_count=1)

    # e.g. dnn_input_dim: 61, lr_input_dim: 10040001
    dnn_input_dim, lr_input_dim = reader.load_data_meta(args.data_meta_file)

    # create the ctr model.
    model = CTRmodel(
        dnn_layer_dims,
        dnn_input_dim,
        lr_input_dim,
        model_type=args.model_type,
        is_infer=False)

    params = paddle.parameters.create(model.train_cost)
    optimizer = paddle.optimizer.AdaGrad()  # learning-rate optimizer

    trainer = paddle.trainer.SGD(
        cost=model.train_cost, parameters=params, update_equation=optimizer)

    dataset = reader.Dataset()

    def __event_handler__(event):
        if isinstance(event, paddle.event.EndIteration):
            num_samples = event.batch_id * args.batch_size
            if event.batch_id % 100 == 0:
                logger.warning(
                    "Pass %d, Samples %d, Cost %f, %s" %
                    (event.pass_id, num_samples, event.cost, event.metrics))

            if event.batch_id % 1000 == 0:
                if args.test_data_path:
                    result = trainer.test(
                        reader=paddle.batch(
                            dataset.test(args.test_data_path),
                            batch_size=args.batch_size),
                        feeding=reader.feeding_index)
                    logger.warning("Test %d-%d, Cost %f, %s" %
                                   (event.pass_id, event.batch_id,
                                    result.cost, result.metrics))

                path = "{}-pass-{}-batch-{}-test-{}.tar.gz".format(
                    args.model_output_prefix, event.pass_id, event.batch_id,
                    result.cost)
                with gzip.open(path, 'w') as f:
                    trainer.save_parameter_to_tar(f)

    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                dataset.train(args.train_data_path), buf_size=500),
            batch_size=args.batch_size),
        feeding=reader.feeding_index,
        event_handler=__event_handler__,
        num_passes=args.num_passes)
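# `dnn_layer_dims` is a module-level constant that train() closes over but
# this section never defines. A plausible value, with a single final unit
# feeding the sigmoid output head (an assumption, not the author's setting):
dnn_layer_dims = [128, 64, 32, 1]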
def __init__(self, param_path):
    logger.info("create CTR model")
    dnn_input_dim, lr_input_dim = reader.load_data_meta(args.data_meta_path)
    # create the model
    self.ctr_model = network_conf.CTRmodel(
        dnn_layer_dims,
        dnn_input_dim,
        lr_input_dim,
        model_type=ModelType(args.model_type),
        is_infer=True)
    # load parameters
    logger.info("load model parameters from %s" % param_path)
    self.parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(param_path, 'r'))
    self.inferer = paddle.inference.Inference(
        output_layer=self.ctr_model.model,
        parameters=self.parameters, )
def train(data_path=None,
          model_type=ModelType.create_classification(),
          batch_size=100,
          num_passes=50,
          class_num=None,
          num_workers=1,
          use_gpu=False):
    '''
    Train the DNN.
    '''
    paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

    # network config
    input_layer = paddle.layer.data(
        name='input_layer', type=paddle.data_type.dense_vector(feature_dim))
    dnn = create_dnn(input_layer)
    prediction = None
    label = None
    cost = None
    if args.model_type.is_classification():
        prediction = paddle.layer.fc(
            input=dnn, size=class_num, act=paddle.activation.Softmax())
        label = paddle.layer.data(
            name='label', type=paddle.data_type.integer_value(class_num))
        cost = paddle.layer.classification_cost(input=prediction, label=label)
    elif args.model_type.is_regression():
        prediction = paddle.layer.fc(
            input=dnn, size=1, act=paddle.activation.Linear())
        label = paddle.layer.data(
            name='label', type=paddle.data_type.dense_vector(1))
        cost = paddle.layer.mse_cost(input=prediction, label=label)

    # create parameters
    parameters = paddle.parameters.create(cost)
    # create optimizer
    optimizer = paddle.optimizer.Momentum(momentum=0)

    trainer = paddle.trainer.SGD(
        cost=cost,
        extra_layers=paddle.evaluator.auc(input=prediction, label=label),
        parameters=parameters,
        update_equation=optimizer)

    feeding = {'input_layer': 0, 'label': 1}

    # event_handler to print training and testing info
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print "Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)

        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(
                reader=paddle.batch(
                    reader.test(data_path, feature_dim + 1,
                                args.model_type.is_classification()),
                    batch_size=batch_size),
                feeding=feeding)
            print "Test %d, Cost %f, %s" % (event.pass_id, result.cost,
                                            result.metrics)

            model_desc = "{type}".format(type=str(args.model_type))
            with open("%sdnn_%s_pass_%05d.tar" %
                      (args.model_output_prefix, model_desc,
                       event.pass_id), "w") as f:
                parameters.to_tar(f)

    # training
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                reader.train(data_path, feature_dim + 1,
                             args.model_type.is_classification()),
                buf_size=batch_size * 10),
            batch_size=batch_size),
        feeding=feeding,
        event_handler=event_handler,
        num_passes=num_passes)
parser.add_argument(
    '-c',
    '--class_num',
    type=int,
    default=0,
    help="number of categories for classification task.")
parser.add_argument(
    '--num_workers', type=int, default=1, help="num worker threads, default 1")
parser.add_argument(
    '--use_gpu',
    type=bool,
    default=False,
    help="whether to use GPU devices (default: False)")

# arguments check.
args = parser.parse_args()
args.model_type = ModelType(args.model_type)
if args.model_type.is_classification():
    assert args.class_num > 1, ("--class_num should be set in the "
                                "classification task.")

feature_dim = args.feature_dim
layer_dims = [int(i) for i in args.dnn_dims.split(',')]


def create_dnn(sent_vec):
    # if there is more than one layer, a stack of fc layers will be added.
    if len(layer_dims) > 1:
        _input_layer = sent_vec
        for id, dim in enumerate(layer_dims):
            name = "fc_%d_%d" % (id, dim)
            logger.info("create fc layer [%s] whose dimension is %d" %
                        (name, dim))
            fc = paddle.layer.fc(
                input=_input_layer, size=dim, act=paddle.activation.Tanh())
            _input_layer = fc
    return _input_layer
class DSSM(object):
    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
                 model_type=ModelType.create_classification(),
                 model_arch=ModelArch.create_cnn(),
                 share_semantic_generator=False,
                 class_num=None,
                 share_embed=False,
                 is_infer=False):
        """
        :param dnn_dims: The dimension of each layer in the semantic vector
                         generator.
        :type dnn_dims: list of int
        :param vocab_sizes: The sizes of the left and right items.
        :type vocab_sizes: A list having 2 elements.
        :param model_type: The type of task to train the DSSM model. The
                           value should be "rank: 0", "regression: 1" or
                           "classification: 2".
        :type model_type: int
        :param model_arch: A value indicating the model architecture to use.
        :type model_arch: int
        :param share_semantic_generator: A flag indicating whether to share
                                         the semantic vector generator
                                         between the left and the right item.
        :type share_semantic_generator: bool
        :param share_embed: A flag indicating whether to share the embeddings
                            between the left and the right item.
        :type share_embed: bool
        :param class_num: The number of categories.
        :type class_num: int
        """
        assert len(vocab_sizes) == 2, (
            "The vocab_sizes specifies the sizes of the left and right "
            "inputs. Its dimension should be 2.")
        assert len(dnn_dims) > 1, ("In the DNN model, more than two layers "
                                   "are needed.")

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
        self.model_type = ModelType(model_type)
        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
        self.is_infer = is_infer
        logger.warning("Build DSSM model with config of %s, %s" %
                       (self.model_type, self.model_arch))
        logger.info("The vocabulary sizes are: %s" % str(self.vocab_sizes))

        # bind model architecture
        _model_arch = {
            "cnn": self.create_cnn,
            "fc": self.create_fc,
            "rnn": self.create_rnn,
        }

        def _model_arch_creater(emb, prefix=""):
            sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
            dnn = self.create_dnn(sent_vec, prefix)
            return dnn

        self.model_arch_creater = _model_arch_creater

        # bind model type
        _model_type = {
            "classification": self._build_classification_model,
            "rank": self._build_rank_model,
            "regression": self._build_regression_model,
        }
        print("model type: ", str(self.model_type))
        self.model_type_creater = _model_type[str(self.model_type)]

    def __call__(self):
        return self.model_type_creater()

    def create_embedding(self, input, prefix=""):
        """
        Create word embedding. The `prefix` is added in front of the name of
        the embedding's learnable parameter.
        """
        logger.info("Create embedding table [%s] whose dimension is %d. " %
                    (prefix, self.dnn_dims[0]))
        emb = paddle.layer.embedding(
            input=input,
            size=self.dnn_dims[0],
            param_attr=ParamAttr(name="%s_emb.w" % prefix))
        return emb

    def create_fc(self, emb, prefix=""):
        """
        A multi-layer fully connected neural network.

        :param emb: The output of the embedding layer
        :type emb: paddle.layer
        :param prefix: A prefix will be added to the layers' names.
        :type prefix: str
        """
        _input_layer = paddle.layer.pooling(
            input=emb, pooling_type=paddle.pooling.Max())
        fc = paddle.layer.fc(
            input=_input_layer,
            size=self.dnn_dims[1],
            param_attr=ParamAttr(name="%s_fc.w" % prefix),
            bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.))
        return fc

    def create_rnn(self, emb, prefix=""):
        """
        A GRU sentence vector learner.
""" gru = paddle.networks.simple_gru( input=emb, size=self.dnn_dims[1], mixed_param_attr=ParamAttr(name="%s_gru_mixed.w" % prefix), mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix), gru_param_attr=ParamAttr(name="%s_gru.w" % prefix), gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix)) sent_vec = paddle.layer.last_seq(gru) return sent_vec def create_cnn(self, emb, prefix=""): """ A multi-layer CNN. :param emb: The word embedding. :type emb: paddle.layer :param prefix: The prefix will be added to of layers' names. :type prefix: str """ def create_conv(context_len, hidden_size, prefix): key = "%s_%d_%d" % (prefix, context_len, hidden_size) conv = paddle.networks.sequence_conv_pool( input=emb, context_len=context_len, hidden_size=hidden_size, # set parameter attr for parameter sharing context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"), fc_param_attr=ParamAttr(name=key + "_fc.w"), fc_bias_attr=ParamAttr(name=key + "_fc.b"), pool_bias_attr=ParamAttr(name=key + "_pool.b")) return conv logger.info("create a sequence_conv_pool whose context width is 3.") conv_3 = create_conv(3, self.dnn_dims[1], "cnn") logger.info("create a sequence_conv_pool whose context width is 4.") conv_4 = create_conv(4, self.dnn_dims[1], "cnn") return paddle.layer.concat(input=[conv_3, conv_4]) def create_dnn(self, sent_vec, prefix): # if more than three layers, than a fc layer will be added. if len(self.dnn_dims) > 1: _input_layer = sent_vec for id, dim in enumerate(self.dnn_dims[1:]): name = "%s_fc_%d_%d" % (prefix, id, dim) logger.info("create fc layer [%s] which dimention is %d" % (name, dim)) fc = paddle.layer.fc(input=_input_layer, size=dim, act=paddle.activation.Tanh(), param_attr=ParamAttr(name="%s.w" % name), bias_attr=ParamAttr(name="%s.b" % name, initial_std=0.)) _input_layer = fc return _input_layer def _build_classification_model(self): logger.info("build classification model") assert self.model_type.is_classification() return self._build_classification_or_regression_model( is_classification=True) def _build_regression_model(self): logger.info("build regression model") assert self.model_type.is_regression() return self._build_classification_or_regression_model( is_classification=False) def _build_rank_model(self): """ Build a pairwise rank model, and the cost is returned. A pairwise rank model has 3 inputs: - source sentence - left_target sentence - right_target sentence - label, 1 if left_target should be sorted in front of right_target, otherwise 0. 
""" logger.info("build rank model") assert self.model_type.is_rank() source = paddle.layer.data( name="source_input", type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0])) left_target = paddle.layer.data( name="left_target_input", type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) right_target = paddle.layer.data( name="right_target_input", type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) if not self.is_infer: label = paddle.layer.data(name="label_input", type=paddle.data_type.integer_value(1)) prefixs = "_ _ _".split( ) if self.share_semantic_generator else "source target target".split() embed_prefixs = "_ _ _".split( ) if self.share_embed else "source target target".split() word_vecs = [] for id, input in enumerate([source, left_target, right_target]): x = self.create_embedding(input, prefix=embed_prefixs[id]) word_vecs.append(x) semantics = [] for id, input in enumerate(word_vecs): x = self.model_arch_creater(input, prefix=prefixs[id]) semantics.append(x) # The cosine similarity score of source and left_target. left_score = paddle.layer.cos_sim(semantics[0], semantics[1]) # The cosine similarity score of source and right target. right_score = paddle.layer.cos_sim(semantics[0], semantics[2]) if not self.is_infer: # rank cost cost = paddle.layer.rank_cost(left_score, right_score, label=label) # prediction = left_score - right_score # but this operator is not supported currently. # so AUC will not used. return cost, None, label return right_score def _build_classification_or_regression_model(self, is_classification): """ Build a classification/regression model, and the cost is returned. The classification/regression task expects 3 inputs: - source sentence - target sentence - classification label """ if is_classification: assert self.class_num source = paddle.layer.data( name="source_input", type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0])) target = paddle.layer.data( name="target_input", type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) label = paddle.layer.data( name="label_input", type=paddle.data_type.integer_value(self.class_num) if is_classification else paddle.data_type.dense_vector(1)) prefixs = "_ _".split( ) if self.share_semantic_generator else "source target".split() embed_prefixs = "_ _".split( ) if self.share_embed else "source target".split() word_vecs = [] for id, input in enumerate([source, target]): x = self.create_embedding(input, prefix=embed_prefixs[id]) word_vecs.append(x) semantics = [] for id, input in enumerate(word_vecs): x = self.model_arch_creater(input, prefix=prefixs[id]) semantics.append(x) if is_classification: concated_vector = paddle.layer.concat(semantics) prediction = paddle.layer.fc(input=concated_vector, size=self.class_num, act=paddle.activation.Softmax()) cost = paddle.layer.classification_cost(input=prediction, label=label) else: prediction = paddle.layer.cos_sim(*semantics) cost = paddle.layer.square_error_cost(prediction, label) if not self.is_infer: return cost, prediction, label return prediction
def train(train_data_path=None,
          test_data_path=None,
          source_dic_path=None,
          target_dic_path=None,
          model_type=ModelType.create_classification(),
          model_arch=ModelArch.create_cnn(),
          batch_size=10,
          num_passes=10,
          share_semantic_generator=False,
          share_embed=False,
          class_num=None,
          num_workers=1,
          use_gpu=False):
    '''
    Train the DSSM.
    '''
    default_train_path = './data/rank/train.txt'
    default_test_path = './data/rank/test.txt'
    default_dic_path = './data/vocab.txt'
    if not model_type.is_rank():
        default_train_path = './data/classification/train.txt'
        default_test_path = './data/classification/test.txt'

    use_default_data = not train_data_path

    if use_default_data:
        train_data_path = default_train_path
        test_data_path = default_test_path
        source_dic_path = default_dic_path
        target_dic_path = default_dic_path

    dataset = reader.Dataset(
        train_path=train_data_path,
        test_path=test_data_path,
        source_dic_path=source_dic_path,
        target_dic_path=target_dic_path,
        model_type=model_type, )

    train_reader = paddle.batch(
        paddle.reader.shuffle(dataset.train, buf_size=1000),
        batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.reader.shuffle(dataset.test, buf_size=1000),
        batch_size=batch_size)

    paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

    cost, prediction, label = DSSM(
        dnn_dims=layer_dims,
        vocab_sizes=[
            len(load_dic(path))
            for path in [source_dic_path, target_dic_path]
        ],
        model_type=model_type,
        model_arch=model_arch,
        share_semantic_generator=share_semantic_generator,
        class_num=class_num,
        share_embed=share_embed)()

    parameters = paddle.parameters.create(cost)

    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    trainer = paddle.trainer.SGD(
        cost=cost,
        extra_layers=paddle.evaluator.auc(input=prediction, label=label)
        if not model_type.is_rank() else None,
        parameters=parameters,
        update_equation=adam_optimizer)

    feeding = {}
    if model_type.is_classification() or model_type.is_regression():
        feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2}
    else:
        feeding = {
            'source_input': 0,
            'left_target_input': 1,
            'right_target_input': 2,
            'label_input': 3
        }

    def _event_handler(event):
        '''
        Define the end-of-batch handler.
        '''
        if isinstance(event, paddle.event.EndIteration):
            # output train log
            if event.batch_id % args.num_batches_to_log == 0:
                logger.info("Pass %d, Batch %d, Cost %f, %s" %
                            (event.pass_id, event.batch_id, event.cost,
                             event.metrics))

            # test model
            if event.batch_id > 0 and \
                    event.batch_id % args.num_batches_to_test == 0:
                if test_reader is not None:
                    if model_type.is_classification():
                        result = trainer.test(
                            reader=test_reader, feeding=feeding)
                        logger.info("Test at Pass %d, %s" %
                                    (event.pass_id, result.metrics))
                    else:
                        result = None

            # save model
            if event.batch_id > 0 and \
                    event.batch_id % args.num_batches_to_save_model == 0:
                model_desc = "{type}_{arch}".format(
                    type=str(args.model_type), arch=str(args.model_arch))
                with open("%sdssm_%s_pass_%05d.tar" %
                          (args.model_output_prefix, model_desc,
                           event.pass_id), "w") as f:
                    parameters.to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=_event_handler,
        feeding=feeding,
        num_passes=num_passes)

    logger.info("Training has finished.")
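# Hypothetical entry point showing how train() above might be invoked for
# the classification task with the bundled default data. The args-derived
# globals (layer_dims, args.num_batches_to_log, etc.) must already be set
# up by the argument parser, which is not shown here.
if __name__ == '__main__':
    train(
        model_type=ModelType.create_classification(),
        model_arch=ModelArch.create_cnn(),
        batch_size=10,
        num_passes=10,
        class_num=2)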
class DSSM(object):
    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
                 model_type=ModelType.create_classification(),
                 model_arch=ModelArch.create_cnn(),
                 share_semantic_generator=False,
                 class_num=None,
                 share_embed=False,
                 is_infer=False):
        '''
        @dnn_dims: list of int
            dimensions of each layer in the semantic vector generator.
        @vocab_sizes: 2-d tuple
            sizes of both the left and right items.
        @model_type: int
            type of task, should be 'rank: 0', 'regression: 1' or
            'classification: 2'
        @model_arch: int
            model architecture
        @share_semantic_generator: bool
            whether to share the semantic vector generator for both the
            left and right items.
        @share_embed: bool
            whether to share the embeddings between left and right.
        @class_num: int
            number of categories.
        '''
        assert len(vocab_sizes) == 2, (
            "vocab_sizes specifies the sizes of the left and right inputs, "
            "so its dimension should be 2.")
        assert len(dnn_dims) > 1, "more than two layers are needed."

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
        self.model_type = ModelType(model_type)
        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
        self.is_infer = is_infer
        logger.warning("build DSSM model with config of %s, %s" %
                       (self.model_type, self.model_arch))
        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))

        # bind model architecture
        _model_arch = {
            'cnn': self.create_cnn,
            'fc': self.create_fc,
            'rnn': self.create_rnn,
        }

        def _model_arch_creater(emb, prefix=''):
            sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
            dnn = self.create_dnn(sent_vec, prefix)
            return dnn

        self.model_arch_creater = _model_arch_creater

        # bind model type
        _model_type = {
            'classification': self._build_classification_model,
            'rank': self._build_rank_model,
            'regression': self._build_regression_model,
        }
        print 'model type: ', str(self.model_type)
        self.model_type_creater = _model_type[str(self.model_type)]

    def __call__(self):
        return self.model_type_creater()

    def create_embedding(self, input, prefix=''):
        '''
        Create an embedding table whose name has a `prefix`.
        '''
        logger.info("create embedding table [%s] whose dimension is %d" %
                    (prefix, self.dnn_dims[0]))
        emb = paddle.layer.embedding(
            input=input,
            size=self.dnn_dims[0],
            param_attr=ParamAttr(name='%s_emb.w' % prefix))
        return emb

    def create_fc(self, emb, prefix=''):
        '''
        A multi-layer fully connected neural network.

        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            prefix of layers' names, used to share parameters between
            more than one `fc` part.
        '''
        _input_layer = paddle.layer.pooling(
            input=emb, pooling_type=paddle.pooling.Max())
        fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
        return fc

    def create_rnn(self, emb, prefix=''):
        '''
        A GRU sentence vector learner.
        '''
        gru = paddle.networks.simple_gru(input=emb, size=256)
        sent_vec = paddle.layer.last_seq(gru)
        return sent_vec

    def create_cnn(self, emb, prefix=''):
        '''
        A multi-layer CNN.

        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            prefix of layers' names, used to share parameters between
            more than one `cnn` part.
        '''

        def create_conv(context_len, hidden_size, prefix):
            key = "%s_%d_%d" % (prefix, context_len, hidden_size)
            conv = paddle.networks.sequence_conv_pool(
                input=emb,
                context_len=context_len,
                hidden_size=hidden_size,
                # set parameter attr for parameter sharing
                context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
                fc_param_attr=ParamAttr(name=key + '_fc.w'),
                fc_bias_attr=ParamAttr(name=key + '_fc.b'),
                pool_bias_attr=ParamAttr(name=key + '_pool.b'))
            return conv

        logger.info('create a sequence_conv_pool whose context width is 3')
        conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
        logger.info('create a sequence_conv_pool whose context width is 4')
        conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
        # concatenate the two conv outputs into one sentence vector, so
        # create_dnn receives a single layer.
        return paddle.layer.concat(input=[conv_3, conv_4])

    def create_dnn(self, sent_vec, prefix):
        # if there is more than one layer, a stack of fc layers will be
        # added on top of the sentence vector.
        if len(self.dnn_dims) > 1:
            _input_layer = sent_vec
            for id, dim in enumerate(self.dnn_dims[1:]):
                name = "%s_fc_%d_%d" % (prefix, id, dim)
                logger.info("create fc layer [%s] whose dimension is %d" %
                            (name, dim))
                fc = paddle.layer.fc(
                    name=name,
                    input=_input_layer,
                    size=dim,
                    act=paddle.activation.Tanh(),
                    param_attr=ParamAttr(name='%s.w' % name),
                    bias_attr=ParamAttr(name='%s.b' % name))
                _input_layer = fc
        return _input_layer

    def _build_classification_model(self):
        logger.info("build classification model")
        assert self.model_type.is_classification()
        return self._build_classification_or_regression_model(
            is_classification=True)

    def _build_regression_model(self):
        logger.info("build regression model")
        assert self.model_type.is_regression()
        return self._build_classification_or_regression_model(
            is_classification=False)

    def _build_rank_model(self):
        '''
        Build a pairwise rank model, and the cost is returned.

        A pairwise rank model has 4 inputs:
          - source sentence
          - left_target sentence
          - right_target sentence
          - label, 1 if left_target should be sorted in front of
            right_target, otherwise 0.
        '''
        logger.info("build rank model")
        assert self.model_type.is_rank()
        source = paddle.layer.data(
            name='source_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
        left_target = paddle.layer.data(
            name='left_target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        right_target = paddle.layer.data(
            name='right_target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        if not self.is_infer:
            label = paddle.layer.data(
                name='label_input', type=paddle.data_type.integer_value(1))

        prefixs = '_ _ _'.split(
        ) if self.share_semantic_generator else 'source left right'.split()
        embed_prefixs = '_ _ _'.split(
        ) if self.share_embed else 'source target target'.split()

        word_vecs = []
        for id, input in enumerate([source, left_target, right_target]):
            x = self.create_embedding(input, prefix=embed_prefixs[id])
            word_vecs.append(x)

        semantics = []
        for id, input in enumerate(word_vecs):
            x = self.model_arch_creater(input, prefix=prefixs[id])
            semantics.append(x)

        # cosine similarity score of source and left_target
        left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
        # cosine similarity score of source and right_target
        right_score = paddle.layer.cos_sim(semantics[0], semantics[2])

        if not self.is_infer:
            # rank cost
            cost = paddle.layer.rank_cost(left_score, right_score, label=label)
            # prediction = left_score - right_score
            # but this operator is not supported currently.
            # so AUC is not used.
            return cost, None, label
        return right_score

    def _build_classification_or_regression_model(self, is_classification):
        '''
        Build a classification/regression model, and the cost is returned.
        A classification model has 3 inputs:
          - source sentence
          - target sentence
          - classification label
        '''
        if is_classification:
            # prepare inputs.
            assert self.class_num

        source = paddle.layer.data(
            name='source_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
        target = paddle.layer.data(
            name='target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        label = paddle.layer.data(
            name='label_input',
            type=paddle.data_type.integer_value(self.class_num)
            if is_classification else paddle.data_type.dense_vector(1))

        prefixs = '_ _'.split(
        ) if self.share_semantic_generator else 'left right'.split()
        embed_prefixs = '_ _'.split(
        ) if self.share_embed else 'left right'.split()

        word_vecs = []
        for id, input in enumerate([source, target]):
            x = self.create_embedding(input, prefix=embed_prefixs[id])
            word_vecs.append(x)

        semantics = []
        for id, input in enumerate(word_vecs):
            x = self.model_arch_creater(input, prefix=prefixs[id])
            semantics.append(x)

        if is_classification:
            concated_vector = paddle.layer.concat(semantics)
            prediction = paddle.layer.fc(
                input=concated_vector,
                size=self.class_num,
                act=paddle.activation.Softmax())
            cost = paddle.layer.classification_cost(
                input=prediction, label=label)
        else:
            prediction = paddle.layer.cos_sim(*semantics)
            cost = paddle.layer.square_error_cost(prediction, label)

        if not self.is_infer:
            return cost, prediction, label
        return prediction
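# For reference, one training record for the rank model above, matching the
# feeding order {'source_input': 0, 'left_target_input': 1,
# 'right_target_input': 2, 'label_input': 3} used by train(); the word ids
# here are illustrative only.
rank_sample = (
    [2, 15, 8],  # source sentence, as word ids in the source vocab
    [4, 9],      # left target sentence
    [4, 31, 7],  # right target sentence
    1,           # 1: left target should rank above the right target
)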