def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')
    # Use fake train data
    args = get_args()
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    dropout_prob = args.hidden_dropout

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    tot_time = 0
    nproc = torch.distributed.get_world_size()

    for epoch in range(num_epochs):
        overall_name = f'emb_np-{nproc}_vs-{vocab_size}'
        profiler.start(overall_name)

        # Forward pass
        profiler.start(f'emb_forward_np-{nproc}_vs-{vocab_size}')
        embedding_output = embedding.forward(input_indices, position_indices)
        train_loss = torch.mean(embedding_output)
        torch.cuda.synchronize()
        profiler.stop(f'emb_forward_np-{nproc}_vs-{vocab_size}')

        # Backward pass
        profiler.start(f'emb_backward_np-{nproc}_vs-{vocab_size}')
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(f'emb_backward_np-{nproc}_vs-{vocab_size}')

        profiler.stop(overall_name)
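These examples rely on an `init_distributed()` helper from the surrounding AutoMP project, which is not shown here. A minimal sketch of what such a helper typically does, assuming a torchrun-style environment-variable rendezvous and the NCCL backend (both assumptions, not the project's actual code):

import os
import torch

def init_distributed():
    # Assumed: RANK and WORLD_SIZE are set by the launcher (e.g. torchrun);
    # NCCL is the usual backend for GPU collectives.
    rank = int(os.environ.get('RANK', '0'))
    world_size = int(os.environ.get('WORLD_SIZE', '1'))
    torch.distributed.init_process_group(backend='nccl',
                                         rank=rank,
                                         world_size=world_size)
    torch.cuda.set_device(rank % torch.cuda.device_count())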
Example #2
    def __init__(self, dataset, config):
        super(Classifier, self).__init__()
        self.config = config
        self.token_embedding = \
            Embedding(dataset.token_map, config.embedding.dimension,
                      cDataset.DOC_TOKEN, config, dataset.VOCAB_PADDING,
                      pretrained_embedding_file=
                      config.feature.token_pretrained_file,
                      mode=EmbeddingProcessType.FLAT,
                      dropout=self.config.embedding.dropout,
                      init_type=self.config.embedding.initializer,
                      low=-self.config.embedding.uniform_bound,
                      high=self.config.embedding.uniform_bound,
                      std=self.config.embedding.random_stddev,
                      fan_mode=self.config.embedding.fan_mode,
                      activation_type=ActivationType.NONE,
                      model_mode=dataset.model_mode)
        self.char_embedding = \
            Embedding(dataset.char_map, config.embedding.dimension,
                      cDataset.DOC_CHAR, config, dataset.VOCAB_PADDING,
                      mode=EmbeddingProcessType.FLAT,
                      dropout=self.config.embedding.dropout,
                      init_type=self.config.embedding.initializer,
                      low=-self.config.embedding.uniform_bound,
                      high=self.config.embedding.uniform_bound,
                      std=self.config.embedding.random_stddev,
                      fan_mode=self.config.embedding.fan_mode,
                      activation_type=ActivationType.NONE,
                      model_mode=dataset.model_mode)
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)
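The `init_type`, `low`, `high`, and `std` arguments suggest this Embedding wrapper dispatches to the standard torch initializers. A minimal sketch of such a dispatch, assuming uniform and normal modes (the wrapper's real implementation is not shown here):

import torch

def init_weight(tensor, init_type, low=-0.1, high=0.1, std=0.02):
    # Hypothetical dispatch mirroring the init_type/low/high/std arguments.
    if init_type == 'uniform':
        torch.nn.init.uniform_(tensor, a=low, b=high)
    elif init_type == 'normal':
        torch.nn.init.normal_(tensor, mean=0.0, std=std)
    else:
        raise ValueError(f'unknown init_type: {init_type}')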
Example #3
def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')
    # Use fake train data
    args = get_args()
    sequence_length = 1024
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(args.batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (args.batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=args.hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    embedding_output = embedding.forward(input_indices, position_indices)

    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    transformer = ParallelTransformer(
        attention_mask_func=gpt2_attention_mask_func,
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        layernorm_epsilon=args.layernorm_epsilon,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
        hidden_dropout=0.1)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    transformer_output = transformer.forward(hidden_states=embedding_output,
                                             attention_mask=attention_mask)
    print_rank_0(f'AutoMP: transformer_output = {transformer_output}')
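`get_ltor_masks_and_position_ids` is not reproduced here, but the left-to-right (causal) mask it feeds into `gpt2_attention_mask_func` can be sketched as follows; the shape convention is an assumption inferred from the `masked_fill_` call above:

import torch

def make_ltor_mask(batch_size, seq_length, device=None):
    # True above the diagonal marks positions a token must NOT attend to,
    # so masked_fill_(mask, -10000.0) suppresses them in the scores.
    mask = torch.triu(torch.ones(seq_length, seq_length, dtype=torch.bool,
                                 device=device), diagonal=1)
    # Broadcastable to attention scores of shape [batch, heads, seq, seq].
    return mask.unsqueeze(0).unsqueeze(0).expand(batch_size, 1,
                                                 seq_length, seq_length)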
Example #4
def train():
    
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training self attention layer...')
    # Use fake train data
    args = get_args()
    batch_size = 32
    sequence_length = 1024
    hidden_size = args.hidden_size
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)
    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    embedding_output = embedding.forward(input_indices, position_indices)
    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):

        print(f'ALBERT_DEBUG: attention_scores.size() = {attention_scores.size()}')
        print(f'ALBERT_DEBUG: ltor_mask.size() = {ltor_mask.size()}')

        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    self_attention = ParallelSelfAttention(
        attention_mask_func=gpt2_attention_mask_func, 
        hidden_size=args.hidden_size, 
        num_attention_heads=args.num_attention_heads, 
        attention_dropout=0.1
    )

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)

    print(f'ALBERT_DEBUG: embedding_output.size() = {embedding_output.size()}')

    self_att_output = self_attention.forward(hidden_states=embedding_output, attention_mask=attention_mask)
    print_rank_0(f'AutoMP: self_att_output = {self_att_output}')
Example #5
    def __init__(self, dataset, config):
        super(FastText, self).__init__()
        self.config = config
        assert "token" in self.config.feature.feature_names
        self.token_embedding = \
            Embedding(dataset.token_map,
                      config.embedding.dimension,
                      cDataset.DOC_TOKEN, config,
                      padding_idx=dataset.VOCAB_PADDING,
                      pretrained_embedding_file=
                      config.feature.token_pretrained_file,
                      mode=EmbeddingProcessType.SUM, dropout=0,
                      init_type=config.embedding.initializer,
                      low=-config.embedding.uniform_bound,
                      high=config.embedding.uniform_bound,
                      std=config.embedding.random_stddev,
                      activation_type=ActivationType.NONE)
        if self.config.feature.token_ngram > 1:
            self.token_ngram_embedding = \
                Embedding(dataset.token_ngram_map,
                          config.embedding.dimension,
                          cDataset.DOC_TOKEN_NGRAM, config,
                          padding_idx=dataset.VOCAB_PADDING,
                          mode=EmbeddingProcessType.SUM, dropout=0,
                          init_type=config.embedding.initializer,
                          low=-config.embedding.uniform_bound,
                          high=config.embedding.uniform_bound,
                          std=config.embedding.random_stddev,
                          activation_type=ActivationType.NONE)
        if "keyword" in self.config.feature.feature_names:
            self.keyword_embedding = \
                Embedding(dataset.keyword_map,
                          config.embedding.dimension,
                          cDataset.DOC_KEYWORD, config,
                          padding_idx=dataset.VOCAB_PADDING,
                          pretrained_embedding_file=
                          config.feature.keyword_pretrained_file,
                          mode=EmbeddingProcessType.SUM, dropout=0,
                          init_type=config.embedding.initializer,
                          low=-config.embedding.uniform_bound,
                          high=config.embedding.uniform_bound,
                          std=config.embedding.random_stddev,
                          activation_type=ActivationType.NONE)
        if "topic" in self.config.feature.feature_names:
            self.topic_embedding = \
                Embedding(dataset.topic_map,
                          config.embedding.dimension,
                          cDataset.DOC_TOPIC, config,
                          padding_idx=dataset.VOCAB_PADDING,
                          mode=EmbeddingProcessType.SUM, dropout=0,
                          init_type=config.embedding.initializer,
                          low=-config.embedding.uniform_bound,
                          high=config.embedding.uniform_bound,
                          std=config.embedding.random_stddev,
                          activation_type=ActivationType.NONE)
        self.linear = torch.nn.Linear(config.embedding.dimension,
                                      len(dataset.label_map))
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)
Example #6
    def __init__(self, embedding_info: Dict, encoder_info: Dict,
                 decoder_info: Dict, hidden_states: Dict, token_to_id: Dict,
                 type_to_id: Dict, label_to_id: Dict):
        super().__init__()

        self.embedding_info = embedding_info
        self.encoder_info = encoder_info
        self.decoder_info = decoder_info

        self.hidden_states = hidden_states

        self.token_to_id = token_to_id
        self.type_to_id = type_to_id
        self.label_to_id = label_to_id

        self.embedding = Embedding(h_emb=self.hidden_states['embedding'],
                                   token_to_id=self.token_to_id,
                                   type_to_id=self.type_to_id,
                                   **self.embedding_info)
        self.encoder = Encoder(h_emb=self.hidden_states['embedding'],
                               h_enc=self.hidden_states['encoder'],
                               **self.encoder_info)
        self.decoder = Decoder(h_enc=self.hidden_states['encoder'],
                               h_dec=self.hidden_states['decoder'],
                               label_to_id=self.label_to_id,
                               **self.decoder_info)
Example #7
    def __init__(self, hidden_size, vocab_size, sequence_length,
                 hidden_dropout, attention_mask_func, num_layers,
                 layernorm_epsilon, num_attention_heads, attention_dropout,
                 init_method):
        super(TransformerLanguageModel, self).__init__()

        # self.hidden_size = hidden_size
        # self.vocab_size = vocab_size
        # self.sequence_length = sequence_length
        # self.hidden_dropout = hidden_dropout
        # self.init_method = init_method

        # self.num_layers = num_layers
        # self.layernorm_epsilon = layernorm_epsilon
        # self.num_attention_heads = num_attention_heads
        # self.attention_dropout = attention_dropout

        # Embeddings
        self.embedding = Embedding(hidden_size, vocab_size, sequence_length,
                                   hidden_dropout, init_method)
        self._embedding_key = 'embedding'

        # Transformer
        self.transformer = ParallelTransformer(attention_mask_func, num_layers,
                                               hidden_size, layernorm_epsilon,
                                               num_attention_heads,
                                               attention_dropout,
                                               hidden_dropout)
        self._transformer_key = 'transformer'
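The snippet stops at the constructor; a plausible forward pass wired from the two submodules would look like this (a sketch inferred from the constructor, not the class's actual method):

    def forward(self, input_ids, position_ids, attention_mask):
        # Token + position embeddings feed the parallel transformer stack.
        hidden_states = self.embedding(input_ids, position_ids)
        return self.transformer(hidden_states, attention_mask)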
Example #8
	def get_embedding(self, embed_id):
		""" Return the actual word embedding associated with a given ID """
		if embed_id not in self.embedding_meta:
			return None
		if embed_id in self.embedding_cache:
			log.info("Using cached embedding for %s" % embed_id)
			return self.embedding_cache[embed_id]
		# load the associated word embedding
		em = self.embedding_meta[embed_id]
		in_path = em.dir_base / em["file"]
		log.info("Loading word embedding from %s" % in_path)
		try:
			self.embedding_cache[embed_id] = Embedding(in_path)
		except Exception as e:
			log.warning("Failed to load word embedding: %s" % in_path)
			log.warning(e)
			return None
		return self.embedding_cache[embed_id]
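The method above is a lazy load-through cache keyed by embedding ID. The same pattern in a minimal, self-contained form (names here are illustrative, not from the source):

import logging

log = logging.getLogger(__name__)

class LazyCache:
    def __init__(self, loader):
        self.loader = loader  # function: key -> resource (may raise)
        self.cache = {}

    def get(self, key):
        if key in self.cache:
            return self.cache[key]
        try:
            self.cache[key] = self.loader(key)
        except Exception as e:
            log.warning("Failed to load %s: %s", key, e)
            return None
        return self.cache[key]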
Example #9
    def create_model(self, fetch_data=None):
        with tf.variable_scope('variables'):
            sentence_simple_input_placeholder = []
            sentence_complex_input_placeholder = []
            if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                sentence_simple_segment_input_placeholder = []
                sentence_complex_segment_input_placeholder = []

            obj = {}
            if fetch_data is not None and self.model_config.fetch_mode == 'tf_example_dataset':
                for t in tf.unstack(fetch_data['line_comp_ids'], axis=1):
                    sentence_complex_input_placeholder.append(t)
                for t in tf.unstack(fetch_data['line_simp_ids'], axis=1):
                    sentence_simple_input_placeholder.append(t)

                if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                    for t in tf.unstack(fetch_data['line_comp_segids'],
                                        axis=1):
                        sentence_complex_segment_input_placeholder.append(t)
                    for t in tf.unstack(fetch_data['line_simp_segids'],
                                        axis=1):
                        sentence_simple_segment_input_placeholder.append(t)
                    obj['line_comp_segids'] = tf.stack(
                        sentence_complex_segment_input_placeholder, axis=1)
                    obj['line_simp_segids'] = tf.stack(
                        sentence_simple_segment_input_placeholder, axis=1)

                score = None
                if self.model_config.tune_style:
                    if self.is_train:
                        # In training, scores come from the fetched data
                        scores = []
                        if self.model_config.tune_style[0]:
                            ppdb_score = fetch_data['ppdb_score']
                            scores.append(ppdb_score)
                            print('Tune ppdb score!')
                            if 'plus' in self.model_config.tune_mode:
                                # avoid most ppdb scores being 0
                                ppdb_score += 0.1
                        if self.model_config.tune_style[1]:
                            add_score = fetch_data['dsim_score']
                            scores.append(add_score)
                            print('Tune dsim_score score!')
                        if self.model_config.tune_style[2]:
                            add_score = fetch_data['add_score']
                            scores.append(add_score)
                            print('Tune add score!')
                        if self.model_config.tune_style[3]:
                            len_score = fetch_data['len_score']
                            scores.append(len_score)
                            print('Tune length score!')

                    else:
                        # In evaluation/prediction, scores may be a factor to
                        #   multiply in pred mode, or an actual user-provided score.
                        # TODO(sanqiang): not used for now because there is no fetch_data in eval
                        raise NotImplementedError('No tune style for training')
                        # ppdb_score = tf.constant(
                        #     self.model_config.tune_style[0], shape=[self.model_config.batch_size], dtype=tf.float32)
                        # add_score = tf.constant(
                        #     self.model_config.tune_style[1], shape=[self.model_config.batch_size], dtype=tf.float32)
                        # len_score = tf.constant(
                        #     self.model_config.tune_style[2], shape=[self.model_config.batch_size], dtype=tf.float32)

                    # Assemble scores
                    dimension_unit = int(self.model_config.dimension /
                                         len(scores))
                    dimension_runit = self.model_config.dimension - (
                        len(scores) - 1) * dimension_unit
                    for s_i, score in enumerate(scores):
                        if s_i < len(scores) - 1:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_unit]),
                                                         axis=1)
                        else:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_runit]),
                                                         axis=1)
                    score = tf.concat(scores, axis=-1)
            else:
                for step in range(self.model_config.max_simple_sentence):
                    sentence_simple_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size,
                                 tf.int32,
                                 name='simple_input'))

                for step in range(self.model_config.max_complex_sentence):
                    sentence_complex_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size,
                                 tf.int32,
                                 name='complex_input'))

                if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                    for step in range(self.model_config.max_simple_sentence):
                        sentence_simple_segment_input_placeholder.append(
                            tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='simple_seg_input'))

                    for step in range(self.model_config.max_complex_sentence):
                        sentence_complex_segment_input_placeholder.append(
                            tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='complex_seg_input'))

                    obj['line_comp_segids'] = tf.stack(
                        sentence_complex_segment_input_placeholder, axis=1)
                    obj['line_simp_segids'] = tf.stack(
                        sentence_simple_segment_input_placeholder, axis=1)

                score = None
                if self.model_config.tune_style:
                    if self.is_train:
                        raise NotImplementedError('No tune style for training')
                        #
                        # ppdb_score = tf.constant(
                        #     self.model_config.tune_style, shape=[self.model_config.batch_size], dtype=tf.float32)
                        # ppdb_score = tf.expand_dims(tf.tile(
                        #     tf.expand_dims(ppdb_score, axis=-1),
                        #     [1, self.model_config.dimension]), axis=1)
                    else:
                        scores = []
                        if self.model_config.tune_style:
                            if self.model_config.tune_style[0]:
                                ppdb_score = tf.constant(
                                    self.model_config.tune_style[0],
                                    shape=[self.model_config.batch_size],
                                    dtype=tf.float32)
                                scores.append(ppdb_score)
                                print('tune ppdb score')
                            if self.model_config.tune_style[1]:
                                dsim_score = tf.constant(
                                    self.model_config.tune_style[1],
                                    shape=[self.model_config.batch_size],
                                    dtype=tf.float32)
                                scores.append(dsim_score)
                                print('tune dsim score')
                            if self.model_config.tune_style[2]:
                                add_score = tf.constant(
                                    self.model_config.tune_style[2],
                                    shape=[self.model_config.batch_size],
                                    dtype=tf.float32)
                                scores.append(add_score)
                                print('tune add score')
                            if self.model_config.tune_style[3]:
                                len_score = tf.constant(
                                    self.model_config.tune_style[3],
                                    shape=[self.model_config.batch_size],
                                    dtype=tf.float32)
                                scores.append(len_score)
                                print('tune len score')
                    # Assemble scores
                    dimension_unit = int(self.model_config.dimension /
                                         len(scores))
                    dimension_runit = self.model_config.dimension - (
                        len(scores) - 1) * dimension_unit
                    for s_i, score in enumerate(scores):
                        if s_i < len(scores) - 1:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_unit]),
                                                         axis=1)
                        else:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_runit]),
                                                         axis=1)
                    score = tf.concat(scores, axis=-1)

            # For self.model_config.tune_style:
            comp_features = {}
            comp_add_score = tf.zeros(self.model_config.batch_size,
                                      tf.float32,
                                      name='comp_add_score_input')
            comp_length = tf.zeros(self.model_config.batch_size,
                                   tf.float32,
                                   name='comp_length_input')
            comp_features['comp_add_score'] = comp_add_score
            comp_features['comp_length'] = comp_length

            sentence_idxs = tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='sent_idx')

            self.embedding = Embedding(self.data.vocab_complex,
                                       self.data.vocab_simple,
                                       self.model_config)
            if self.model_config.bert_mode:
                emb_complex = None
            else:
                emb_complex = self.embedding.get_complex_embedding()
            if self.model_config.bert_mode and (
                    self.model_config.tie_embedding == 'all'
                    or self.model_config.tie_embedding == 'enc_dec'):
                emb_simple = None
            else:
                emb_simple = self.embedding.get_simple_embedding()

            if (self.is_train and self.model_config.pretrained_embedding):
                self.embed_complex_placeholder = tf.placeholder(
                    tf.float32, (self.data.vocab_complex.vocab_size(),
                                 self.model_config.dimension), 'complex_emb')
                self.replace_emb_complex = emb_complex.assign(
                    self.embed_complex_placeholder)

                self.embed_simple_placeholder = tf.placeholder(
                    tf.float32, (self.data.vocab_simple.vocab_size(),
                                 self.model_config.dimension), 'simple_emb')
                self.replace_emb_simple = emb_simple.assign(
                    self.embed_simple_placeholder)

            if self.model_config.bert_mode and (
                    self.model_config.tie_embedding == 'all'
                    or self.model_config.tie_embedding == 'dec_out'):
                w = None
            else:
                w = self.embedding.get_w()
            b = self.embedding.get_b()

            mem_contexts, mem_outputs, mem_counter = None, None, None
            rule_id_input_placeholder, rule_target_input_placeholder = [], []
            if 'rule' in self.model_config.memory:
                with tf.device('/cpu:0'):
                    context_size = 0
                    if self.model_config.framework == 'transformer':
                        context_size = 1
                    elif self.model_config.framework == 'seq2seq':
                        context_size = 2
                    mem_contexts = tf.get_variable(
                        'mem_contexts',
                        initializer=tf.constant(
                            0,
                            dtype=tf.float32,
                            shape=(self.data.vocab_rule.get_rule_size(),
                                   self.model_config.max_target_rule_sublen,
                                   self.model_config.dimension *
                                   context_size)),
                        trainable=False,
                        dtype=tf.float32)
                    mem_outputs = tf.get_variable(
                        'mem_outputs',
                        initializer=tf.constant(
                            0,
                            dtype=tf.float32,
                            shape=(self.data.vocab_rule.get_rule_size(),
                                   self.model_config.max_target_rule_sublen,
                                   self.model_config.dimension)),
                        trainable=False,
                        dtype=tf.float32)
                    mem_counter = tf.get_variable(
                        'mem_counter',
                        initializer=tf.constant(
                            0,
                            dtype=tf.int32,
                            shape=(self.data.vocab_rule.get_rule_size(), 1)),
                        trainable=False,
                        dtype=tf.int32)

            if 'direct' in self.model_config.memory or 'rule' in self.model_config.memory:
                if fetch_data is not None and self.model_config.fetch_mode == 'tf_example_dataset':
                    for t in tf.unstack(fetch_data['rule_id'], axis=1):
                        rule_id_input_placeholder.append(t)
                    for t in tf.unstack(fetch_data['rule_target'], axis=1):
                        rule_target_input_placeholder.append(t)
                else:
                    for step in range(self.model_config.max_cand_rules):
                        rule_id_input_placeholder.append(
                            tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='rule_id_input'))

                    for step in range(self.model_config.max_cand_rules):
                        if 'direct' in self.model_config.memory:
                            rule_target_input_placeholder.append(
                                tf.zeros(self.model_config.batch_size,
                                         tf.int32,
                                         name='rule_target_input'))
                        elif 'rule' in self.model_config.memory:
                            rule_target_input_placeholder.append(
                                tf.zeros(self.model_config.batch_size,
                                         tf.string,
                                         name='rule_target_input'))

        with tf.variable_scope('model'):
            output = self.model_fn(sentence_complex_input_placeholder,
                                   emb_complex,
                                   sentence_simple_input_placeholder,
                                   emb_simple, w, b, rule_id_input_placeholder,
                                   rule_target_input_placeholder, mem_contexts,
                                   mem_outputs, self.global_step, score,
                                   comp_features, obj)

            encoder_embs, final_outputs = None, None
            if self.model_config.replace_unk_by_emb:
                encoder_embs = tf.stack(output.encoder_embed_inputs_list,
                                        axis=1)

            if output.decoder_outputs_list is not None:
                if type(output.decoder_outputs_list) == list:
                    decoder_outputs_list = output.decoder_outputs_list
                    decoder_outputs = tf.stack(decoder_outputs_list, axis=1)
                else:
                    decoder_outputs = output.decoder_outputs_list

            if output.final_outputs_list is not None:
                if type(output.final_outputs_list) == list:
                    final_outputs_list = output.final_outputs_list
                    final_outputs = tf.stack(final_outputs_list, axis=1)
                else:
                    final_outputs = output.final_outputs_list

            attn_distr = None
            if self.model_config.replace_unk_by_attn:
                attn_distr = output.attn_distr_list

            if not self.is_train:
                # in beam search, the decoder target list is provided directly
                decoder_target = tf.stack(output.decoder_target_list, axis=1)
                loss = tf.reduce_mean(output.decoder_score)
                obj = {
                    'sentence_idxs': sentence_idxs,
                    'sentence_simple_input_placeholder':
                    sentence_simple_input_placeholder,
                    'sentence_complex_input_placeholder':
                    sentence_complex_input_placeholder,
                    'decoder_target_list': decoder_target,
                    'final_outputs': final_outputs,
                    'encoder_embs': encoder_embs,
                    'attn_distr': attn_distr
                }
                if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                    obj['sentence_complex_segment_input_placeholder'] = sentence_complex_segment_input_placeholder
                    obj['sentence_simple_segment_input_placeholder'] = sentence_simple_segment_input_placeholder
                if 'rule' in self.model_config.memory or 'direct' in self.model_config.memory:
                    obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                    obj['rule_target_input_placeholder'] = rule_target_input_placeholder
                if self.model_config.tune_style:
                    obj['comp_features'] = comp_features
                return loss, obj
            else:
                # Memory Populate
                if 'rule' in self.model_config.memory:
                    # Update memory via a Python function injected with tf.py_func
                    def update_memory(mem_contexts_tmp, mem_outputs_tmp,
                                      mem_counter_tmp, decoder_targets,
                                      decoder_outputs, contexts,
                                      rule_target_input_placeholder,
                                      rule_id_input_placeholder, global_step,
                                      encoder_outputs):
                        def _seq_contain(arr, tar):
                            j = 0
                            for i in range(len(arr)):
                                if arr[i] == tar[j]:
                                    j += 1
                                    if j == len(tar):
                                        return i - len(tar) + 1
                                else:
                                    j = 0
                            return -1

                        # if 'stopgrad' in self.model_config.rl_configs and global_step % 2 != 0:
                        #     return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp
                        # if global_step <= self.model_config.memory_prepare_step:
                        #     return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

                        batch_size = np.shape(rule_target_input_placeholder)[0]
                        max_rules = np.shape(rule_target_input_placeholder)[1]
                        decoder_targets_str = [
                            ' '.join(sent) for sent in truncate_sents(
                                decode(
                                    decoder_targets, self.data.vocab_simple,
                                    self.model_config.subword_vocab_size > 0
                                    or 'bert_token' in
                                    self.model_config.bert_mode))
                        ]
                        for batch_id in range(batch_size):
                            cur_decoder_targets = decoder_targets[batch_id, :]
                            cur_decoder_targets_str = decoder_targets_str[
                                batch_id]

                            cur_decoder_outputs = decoder_outputs[batch_id, :]
                            cur_contexts = contexts[batch_id, :]

                            cur_rule_target_input_placeholder = rule_target_input_placeholder[
                                batch_id, :]
                            cur_rule_target_input_placeholder = [
                                tmp.decode("utf-8").strip('\x00')
                                for tmp in cur_rule_target_input_placeholder
                                if not tmp.decode("utf-8").strip().startswith(
                                    constant.SYMBOL_PAD)
                            ]
                            cur_rule_id_input_placeholder = rule_id_input_placeholder[
                                batch_id, :]

                            # Build the valid mapping from rule id => target word ids
                            rule_mapper = {}
                            for step in range(
                                    len(cur_rule_target_input_placeholder)):
                                rule_target_str = cur_rule_target_input_placeholder[
                                    step]
                                if rule_target_str == constant.SYMBOL_PAD:
                                    continue
                                rule_id = cur_rule_id_input_placeholder[step]
                                if rule_id != 0 and re.search(
                                        r'\b%s\b' % rule_target_str,
                                        cur_decoder_targets_str
                                ):  # decoder_target_str in cur_decoder_targets_str:
                                    decoder_target_wids = self.data.vocab_simple.encode(
                                        rule_target_str)
                                    dec_s_idx = _seq_contain(
                                        cur_decoder_targets,
                                        decoder_target_wids)
                                    if dec_s_idx != -1:
                                        print('rule_target_str:%s' %
                                              rule_target_str)
                                        print('cur_decoder_targets_str:%s' %
                                              cur_decoder_targets_str)
                                        print('cur_decoder_targets:%s' %
                                              cur_decoder_targets)
                                        print('decoder_target_wids:%s' %
                                              decoder_target_wids)
                                    rule_mapper[rule_id] = list(
                                        range(
                                            dec_s_idx, dec_s_idx +
                                            len(decoder_target_wids)))

                            for rule_id in rule_mapper:
                                dec_idxs = rule_mapper[rule_id]

                                for idx, dec_idx in enumerate(dec_idxs):
                                    if mem_counter_tmp[rule_id, 0] == 0:
                                        mem_contexts_tmp[
                                            rule_id,
                                            idx, :] = cur_contexts[dec_idx, :]
                                        mem_outputs_tmp[
                                            rule_id,
                                            idx, :] = cur_decoder_outputs[
                                                dec_idx, :]
                                    else:
                                        mem_contexts_tmp[rule_id, idx, :] = (
                                            cur_contexts[dec_idx, :] +
                                            mem_contexts_tmp[rule_id,
                                                             idx, :]) / 2
                                        mem_outputs_tmp[rule_id, idx, :] = (
                                            cur_decoder_outputs[dec_idx, :] +
                                            mem_outputs_tmp[rule_id,
                                                            idx, :]) / 2

                                mem_counter_tmp[rule_id, 0] += 1

                        return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

                    mem_output_input = None
                    if 'mofinal' in self.model_config.memory_config:
                        mem_output_input = final_outputs
                    # elif 'modecode' in self.model_config.memory_config:
                    #     mem_output_input = decoder_outputs
                    # elif 'moemb' in self.model_config.memory_config:
                    #     mem_output_input = tf.stack(
                    #         self.embedding_fn(sentence_simple_input_placeholder, emb_simple),
                    #         axis=1)

                    mem_contexts, mem_outputs, mem_counter = tf.py_func(
                        update_memory, [
                            mem_contexts, mem_outputs, mem_counter,
                            tf.stack(output.decoder_target_list, axis=1),
                            mem_output_input, output.contexts,
                            tf.stack(rule_target_input_placeholder, axis=1),
                            tf.stack(rule_id_input_placeholder, axis=1),
                            self.global_step, output.encoder_outputs
                        ], [tf.float32, tf.float32, tf.int32],
                        stateful=False,
                        name='update_memory')

                # Loss and corresponding prior/mask
                decode_word_weight_list = [
                    tf.to_float(
                        tf.not_equal(
                            d,
                            self.data.vocab_simple.encode(
                                constant.SYMBOL_PAD)))
                    for d in output.gt_target_list
                ]
                decode_word_weight = tf.stack(decode_word_weight_list, axis=1)

                gt_target = tf.stack(output.gt_target_list, axis=1)

                def self_critical_loss():
                    # Minimize the negative log of the probabilities
                    rewards = tf.py_func(
                        self.metric.self_crititcal_reward,
                        [
                            sentence_idxs,
                            tf.stack(output.sample_target_list, axis=-1),
                            tf.stack(output.decoder_target_list, axis=-1),
                            tf.stack(sentence_simple_input_placeholder,
                                     axis=-1),
                            tf.stack(sentence_complex_input_placeholder,
                                     axis=-1),
                            tf.ones((1, 1)),
                            # tf.stack(rule_target_input_placeholder, axis=1)
                        ],
                        tf.float32,
                        stateful=False,
                        name='reward')
                    rewards.set_shape((self.model_config.batch_size,
                                       self.model_config.max_simple_sentence))
                    rewards = tf.unstack(rewards, axis=1)

                    weighted_probs_list = [
                        rewards[i] * decode_word_weight_list[i] *
                        -output.sample_logit_list[i]
                        for i in range(len(decode_word_weight_list))
                    ]
                    total_size = tf.reduce_sum(decode_word_weight_list)
                    total_size += 1e-12
                    weighted_probs = tf.reduce_sum(
                        weighted_probs_list) / total_size
                    loss = weighted_probs
                    return loss

                def teacherforce_critical_loss():
                    losses = []
                    for step in range(self.model_config.max_simple_sentence):
                        logit = output.decoder_logit_list[step]
                        greedy_target_unit = tf.stop_gradient(
                            tf.argmax(logit, axis=1))
                        if self.model_config.train_mode == 'teachercriticalv2':
                            sampled_target_unit, reward = tf.py_func(
                                self.metric.self_crititcal_reward_unitv2, [
                                    sentence_idxs, step, greedy_target_unit,
                                    tf.stack(sentence_simple_input_placeholder,
                                             axis=-1),
                                    tf.stack(
                                        sentence_complex_input_placeholder,
                                        axis=-1), self.global_step
                                ], [tf.int32, tf.float32],
                                stateful=False,
                                name='reward')
                            reward.set_shape((self.model_config.batch_size, ))
                            sampled_target_unit.set_shape(
                                (self.model_config.batch_size, ))
                        elif self.model_config.train_mode == 'teachercritical':
                            sampled_target_unit = tf.cast(
                                tf.squeeze(tf.multinomial(logit, 1), axis=1),
                                tf.int32)
                            sampled_target_unit, reward = tf.py_func(
                                self.metric.self_crititcal_reward_unit, [
                                    sentence_idxs,
                                    step,
                                    sampled_target_unit,
                                    greedy_target_unit,
                                    tf.stack(sentence_simple_input_placeholder,
                                             axis=-1),
                                    tf.stack(
                                        sentence_complex_input_placeholder,
                                        axis=-1),
                                    self.global_step,
                                ], [tf.int32, tf.float32],
                                stateful=False,
                                name='reward')
                            reward.set_shape((self.model_config.batch_size, ))
                            sampled_target_unit.set_shape(
                                (self.model_config.batch_size, ))
                        indices = tf.stack([
                            tf.range(0,
                                     self.model_config.batch_size,
                                     dtype=tf.int32),
                            tf.squeeze(sampled_target_unit)
                        ],
                                           axis=-1)
                        logit_unit = tf.gather_nd(tf.nn.softmax(logit, axis=1),
                                                  indices)
                        decode_word_weight = decode_word_weight_list[step]
                        losses.append(-logit_unit * reward *
                                      decode_word_weight)
                    loss = tf.add_n(losses)
                    return loss

                def teacherforce_loss():
                    if self.model_config.number_samples > 0:
                        loss_fn = tf.nn.sampled_softmax_loss
                    else:
                        loss_fn = None
                    loss = sequence_loss(
                        logits=tf.stack(output.decoder_logit_list, axis=1),
                        targets=gt_target,
                        weights=decode_word_weight,
                        # softmax_loss_function=loss_fn,
                        # w=w,
                        # b=b,
                        # decoder_outputs=decoder_outputs,
                        # number_samples=self.model_config.number_samples
                    )
                    return loss

                if self.model_config.train_mode == 'dynamic_self-critical':
                    loss = self_critical_loss()
                    # loss = tf.cond(
                    #     tf.greater(self.global_step, 50000),
                    #     # tf.logical_and(tf.greater(self.global_step, 100000), tf.equal(tf.mod(self.global_step, 2), 0)),
                    #     lambda : self_critical_loss(),
                    #     lambda : teacherforce_loss())
                elif self.model_config.train_mode == 'teachercritical' or self.model_config.train_mode == 'teachercriticalv2':
                    loss = tf.cond(tf.equal(tf.mod(self.global_step, 2),
                                            0), lambda: teacherforce_loss(),
                                   lambda: teacherforce_critical_loss())

                    # loss = teacherforce_critical_loss()
                else:
                    loss = teacherforce_loss()

                if self.model_config.architecture == 'ut2t':
                    assert 'extra_encoder_loss' in output.obj_tensors and 'extra_decoder_loss' in output.obj_tensors
                    loss += output.obj_tensors['extra_encoder_loss']
                    loss += output.obj_tensors['extra_decoder_loss']
                    print('Use U T2T with ACT')

                self.loss_style = tf.constant(0.0, dtype=tf.float32)
                if output.pred_score_tuple is not None and 'pred' in self.model_config.tune_mode:
                    print('Create loss for predicting style')
                    ppdb_pred_score, add_pred_score, len_pred_score = output.pred_score_tuple
                    # ppdb_pred_score = tf.Print(ppdb_pred_score, [ppdb_pred_score, fetch_data['ppdb_score']],
                    #                            message='ppdb_pred_score:', first_n=-1, summarize=100)
                    # add_pred_score = tf.Print(add_pred_score, [add_pred_score, fetch_data['add_score']],
                    #                            message='add_pred_score:', first_n=-1, summarize=100)
                    # len_pred_score = tf.Print(len_pred_score, [len_pred_score, fetch_data['len_score']],
                    #                            message='len_pred_score:', first_n=-1, summarize=100)
                    # loss = tf.Print(loss, [loss], message='loss before:', summarize=100)
                    self.loss_style += tf.losses.absolute_difference(
                        ppdb_pred_score, fetch_data['ppdb_score'])
                    self.loss_style += tf.losses.absolute_difference(
                        add_pred_score, fetch_data['add_score'])
                    self.loss_style += tf.losses.absolute_difference(
                        len_pred_score, fetch_data['len_score'])
                    loss += self.loss_style
                    # loss = tf.Print(loss, [loss], message='loss after:', summarize=100)

                obj = {
                    'decoder_target_list':
                    output.decoder_target_list,
                    'sentence_idxs':
                    sentence_idxs,
                    'sentence_simple_input_placeholder':
                    sentence_simple_input_placeholder,
                    'sentence_complex_input_placeholder':
                    sentence_complex_input_placeholder,
                }
                self.logits = output.decoder_logit_list
                if 'rule' in self.model_config.memory:
                    obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                    obj['rule_target_input_placeholder'] = rule_target_input_placeholder
                    # obj['rule_pair_input_placeholder'] = rule_pair_input_placeholder
                    obj['mem_contexts'] = mem_contexts
                    obj['mem_outputs'] = mem_outputs
                    obj['mem_counter'] = mem_counter
                return loss, obj
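The score-assembly step above (expand_dims + tile, duplicated in both branches) broadcasts each per-example scalar into a slice of the model dimension, with the last score absorbing the remainder. Condensed into one helper, assuming TF1-style graph code as in the source:

import tensorflow as tf

def assemble_scores(scores, dimension):
    # scores: list of [batch] float tensors; result: [batch, 1, dimension].
    unit = dimension // len(scores)
    runit = dimension - (len(scores) - 1) * unit
    pieces = []
    for i, s in enumerate(scores):
        width = unit if i < len(scores) - 1 else runit
        pieces.append(tf.expand_dims(
            tf.tile(tf.expand_dims(s, axis=-1), [1, width]), axis=1))
    return tf.concat(pieces, axis=-1)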
Example #10
import numpy as np
import jieba
import os
import sys
# import thulac

# thulac_seg = thulac.thulac(seg_only=True)

base_path = os.path.dirname(os.path.realpath(__file__))
model = Network(None, os.path.join(base_path, 'log.nosync/network_demo/run1'))
model.load_model(
    os.path.join(base_path,
                 'model_checkpoint/lstm_early_stopping_without_conc'))

path = os.path.join(base_path, 'word2vec_model/wiki.zh.nosync/wiki.zh.vec')
embedding = Embedding()
embedding.load_w2v_model(path, False)

max_steps = model.n_input_steps
n_embedding = model.n_embedding

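# Intent categories (Chinese labels matched by the classifier); e.g.
# '打电话' = "make a phone call", '查天气' = "check the weather".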
categories = [
    '开关语音播报', '打电话', '发短信', '发邮件', '导航', '离职倾向', 'KPI', '访问网站', '会议室预定',
    '设置提醒', '查日程安排', '查会议安排', '查会议室安排情况', '查月度工作任务', '查工作任务完成情况', '查月度预算执行情况',
    '查当月费用报销情况', '查借款情况', '查应收款', '查应付款', '查考勤', '查出差情况', '查天气', '查股票', '讲笑话',
    '讲故事', '讲新闻', '订机票', '订火车票'
]


def run(s, verbose=False):
    s = s.lower()
Example #11
class FastText(torch.nn.Module):
    """Implement fasttext classification method
    Reference: "Bag of Tricks for Efficient Text Classification"
    """
    def __init__(self, dataset, config):
        super(FastText, self).__init__()
        self.config = config
        assert "token" in self.config.feature.feature_names
        self.token_embedding = \
            Embedding(dataset.token_map,
                      config.embedding.dimension,
                      cDataset.DOC_TOKEN, config,
                      padding_idx=dataset.VOCAB_PADDING,
                      pretrained_embedding_file=
                      config.feature.token_pretrained_file,
                      mode=EmbeddingProcessType.SUM, dropout=0,
                      init_type=config.embedding.initializer,
                      low=-config.embedding.uniform_bound,
                      high=config.embedding.uniform_bound,
                      std=config.embedding.random_stddev,
                      activation_type=ActivationType.NONE)
        if self.config.feature.token_ngram > 1:
            self.token_ngram_embedding = \
                Embedding(dataset.token_ngram_map,
                          config.embedding.dimension,
                          cDataset.DOC_TOKEN_NGRAM, config,
                          padding_idx=dataset.VOCAB_PADDING,
                          mode=EmbeddingProcessType.SUM, dropout=0,
                          init_type=config.embedding.initializer,
                          low=-config.embedding.uniform_bound,
                          high=config.embedding.uniform_bound,
                          std=config.embedding.random_stddev,
                          activation_type=ActivationType.NONE)
        if "keyword" in self.config.feature.feature_names:
            self.keyword_embedding = \
                Embedding(dataset.keyword_map,
                          config.embedding.dimension,
                          cDataset.DOC_KEYWORD, config,
                          padding_idx=dataset.VOCAB_PADDING,
                          pretrained_embedding_file=
                          config.feature.keyword_pretrained_file,
                          mode=EmbeddingProcessType.SUM, dropout=0,
                          init_type=config.embedding.initializer,
                          low=-config.embedding.uniform_bound,
                          high=config.embedding.uniform_bound,
                          std=config.embedding.random_stddev,
                          activation_type=ActivationType.NONE)
        if "topic" in self.config.feature.feature_names:
            self.topic_embedding = \
                Embedding(dataset.topic_map,
                          config.embedding.dimension,
                          cDataset.DOC_TOPIC, config,
                          padding_idx=dataset.VOCAB_PADDING,
                          mode=EmbeddingProcessType.SUM, dropout=0,
                          init_type=config.embedding.initializer,
                          low=-config.embedding.uniform_bound,
                          high=config.embedding.uniform_bound,
                          std=config.embedding.random_stddev,
                          activation_type=ActivationType.NONE)
        self.linear = torch.nn.Linear(config.embedding.dimension,
                                      len(dataset.label_map))
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)

    def get_parameter_optimizer_dict(self):
        params = list()
        params.append({'params': self.token_embedding.parameters()})
        if self.config.feature.token_ngram > 1:
            params.append({'params': self.token_ngram_embedding.parameters()})
        if "keyword" in self.config.feature.feature_names:
            params.append({'params': self.keyword_embedding.parameters()})
        if "topic" in self.config.feature.feature_names:
            params.append({'params': self.topic_embedding.parameters()})
        params.append({'params': self.linear.parameters()})
        return params

    def update_lr(self, optimizer, epoch):
        """Update lr
        """
        if epoch > self.config.train.num_epochs_static_embedding:
            for param_group in optimizer.param_groups:
                param_group["lr"] = self.config.optimizer.learning_rate
        else:
            for param_group in optimizer.param_groups:
                param_group["lr"] = 0

    def forward(self, batch):
        doc_embedding = self.token_embedding(
            batch[cDataset.DOC_TOKEN].to(self.config.device),
            batch[cDataset.DOC_TOKEN_OFFSET].to(self.config.device))
        length = batch[cDataset.DOC_TOKEN_LEN].to(self.config.device)
        if self.config.feature.token_ngram > 1:
            doc_embedding += self.token_ngram_embedding(
                batch[cDataset.DOC_TOKEN_NGRAM].to(self.config.device),
                batch[cDataset.DOC_TOKEN_NGRAM_OFFSET].to(self.config.device))
            length += batch[cDataset.DOC_TOKEN_NGRAM_LEN].to(
                self.config.device)
        if "keyword" in self.config.feature.feature_names:
            doc_embedding += self.keyword_embedding(
                batch[cDataset.DOC_KEYWORD].to(self.config.device),
                batch[cDataset.DOC_KEYWORD_OFFSET].to(self.config.device))
            length += batch[cDataset.DOC_KEYWORD_LEN].to(self.config.device)
        if "topic" in self.config.feature.feature_names:
            doc_embedding += self.topic_embedding(
                batch[cDataset.DOC_TOPIC].to(self.config.device),
                batch[cDataset.DOC_TOPIC_OFFSET].to(self.config.device))
            length += batch[cDataset.DOC_TOPIC_LEN].to(self.config.device)

        doc_embedding /= length.unsqueeze(1)  # broadcast per-doc lengths; avoids deprecated in-place resize_()
        doc_embedding = self.dropout(doc_embedding)
        return self.linear(doc_embedding)
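
For reference, the token branch of the forward pass above is an offset-based bag-of-embeddings average; a minimal standalone sketch of the same pooling with stock torch.nn.EmbeddingBag (not the project's Embedding wrapper; shapes and sizes assumed) would be:

import torch

# Two documents, [3, 17, 42] and [5, 9], packed flat with start offsets.
bag = torch.nn.EmbeddingBag(num_embeddings=10000, embedding_dim=64, mode='sum')
tokens = torch.tensor([3, 17, 42, 5, 9])
offsets = torch.tensor([0, 3])
lengths = torch.tensor([[3.0], [2.0]])
doc_embedding = bag(tokens, offsets) / lengths  # per-document mean, as in forward()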
Example #12
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt

import time

if __name__ == "__main__":
    dataset = SkipGram('data/rawdata.txt', 3)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=1024,
                                             shuffle=True)

    net = Embedding(len(dataset.idx_to_token), 2)
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-1, momentum=0.9)

    MAX_EPOCH = 2500
    print('MAX_EPOCH', MAX_EPOCH)
    for epoch in range(MAX_EPOCH):
        if (epoch + 1) % 100 == 1:
            start, l_sum, n = time.time(), 0.0, 0

        for center_word, context_word, negative_word in dataloader:
            optimizer.zero_grad()
            l = net(center_word.view(-1, 1), context_word.view(-1, 1),
                    negative_word.view(-1, 1))
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1  # count batches so the running loss can be averaged

        if (epoch + 1) % 100 == 0:
            print('epoch %d, avg loss %.4f, %.1f s'
                  % (epoch + 1, l_sum / n, time.time() - start))
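
The Embedding net driven by this loop is not shown in the snippet; a minimal sketch of a skip-gram model with negative sampling matching the call signature above (interface assumed, not the original class) is:

import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipGramNet(nn.Module):
    """Sketch: skip-gram with negative sampling, forward(center, context, negative)."""
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_dim)
        self.out_embed = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center, context, negative):
        v = self.in_embed(center)            # (B, 1, D)
        u_pos = self.out_embed(context)      # (B, 1, D)
        u_neg = self.out_embed(negative)     # (B, K, D)
        pos = F.logsigmoid((v * u_pos).sum(-1))
        neg = F.logsigmoid(-(v * u_neg).sum(-1))
        return -(pos.sum() + neg.sum()) / center.size(0)

Example #13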
def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training ParallelTransformerLayer...')

    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    num_layers = args.num_layers
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    labels = torch.randint(low=0,
                           high=vocab_size,
                           size=(batch_size, sequence_length))
    labels = labels.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
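        # Fill masked positions with a large negative value so softmax gives them ~0 weight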
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=hidden_dropout,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)

    transformer_layer = ParallelTransformerLayer(
        attention_mask_func=gpt2_attention_mask_func,
        layer_number=0,
        hidden_size=hidden_size,
        layernorm_epsilon=layernorm_epsilon,
        num_attention_heads=num_attention_heads,
        attention_dropout=attention_dropout,
        hidden_dropout=hidden_dropout)

    # attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)
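    # Note: randint(low=0, high=2) yields values in {0, 1}, so `< 0` is always
    # False and nothing is masked; the tensor only fixes the mask's shape here.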
    attention_mask = (torch.randint(
        low=0,
        high=2,
        size=(sequence_length,
              divide(num_attention_heads, torch.distributed.get_world_size()),
              batch_size, batch_size)) < 0).cuda()

    optimizer = torch.optim.SGD(transformer_layer.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    tot_time = 0
    nproc = torch.distributed.get_world_size()
    for epoch in range(num_epochs):
        input_ = torch.rand(size=embedding_output.size()).cuda()

        overall_name = f'transformer_layer_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(overall_name)

        fname = f'transformer_layer_forward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        # Forward pass
        profiler.start(fname)
        loss = transformer_layer.forward(input_, attention_mask)
        train_loss = torch.mean(loss)
        # print(train_loss)
        torch.cuda.synchronize()
        profiler.stop(fname)
        # Backward pass
        bname = f'transformer_layer_backward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)
Example #14
    def __init__(self, dataset, config):
        super(Classifier, self).__init__()
        self.config = config
        assert len(self.config.feature.feature_names) == 1
        assert self.config.feature.feature_names[0] == "token" or \
               self.config.feature.feature_names[0] == "char"
        if config.embedding.type == EmbeddingType.EMBEDDING:
            self.token_embedding = \
                Embedding(dataset.token_map, config.embedding.dimension,
                          cDataset.DOC_TOKEN, config, dataset.VOCAB_PADDING,
                          pretrained_embedding_file=
                          config.feature.token_pretrained_file,
                          mode=EmbeddingProcessType.FLAT,
                          dropout=self.config.embedding.dropout,
                          init_type=self.config.embedding.initializer,
                          low=-self.config.embedding.uniform_bound,
                          high=self.config.embedding.uniform_bound,
                          std=self.config.embedding.random_stddev,
                          fan_mode=self.config.embedding.fan_mode,
                          activation_type=ActivationType.NONE)
            self.char_embedding = \
                Embedding(dataset.char_map, config.embedding.dimension,
                          cDataset.DOC_CHAR, config, dataset.VOCAB_PADDING,
                          mode=EmbeddingProcessType.FLAT,
                          dropout=self.config.embedding.dropout,
                          init_type=self.config.embedding.initializer,
                          low=-self.config.embedding.uniform_bound,
                          high=self.config.embedding.uniform_bound,
                          std=self.config.embedding.random_stddev,
                          fan_mode=self.config.embedding.fan_mode,
                          activation_type=ActivationType.NONE)
        elif config.embedding.type == EmbeddingType.REGION_EMBEDDING:
            self.token_embedding = RegionEmbeddingLayer(
                dataset.token_map, config.embedding.dimension,
                config.embedding.region_size, cDataset.DOC_TOKEN, config,
                padding=dataset.VOCAB_PADDING,
                pretrained_embedding_file=
                config.feature.token_pretrained_file,
                dropout=self.config.embedding.dropout,
                init_type=self.config.embedding.initializer,
                low=-self.config.embedding.uniform_bound,
                high=self.config.embedding.uniform_bound,
                std=self.config.embedding.random_stddev,
                fan_mode=self.config.embedding.fan_mode,
                region_embedding_type=config.embedding.region_embedding_type)

            self.char_embedding = RegionEmbeddingLayer(
                dataset.char_map, config.embedding.dimension,
                config.embedding.region_size, cDataset.DOC_CHAR, config,
                padding=dataset.VOCAB_PADDING,
                dropout=self.config.embedding.dropout,
                init_type=self.config.embedding.initializer,
                low=-self.config.embedding.uniform_bound,
                high=self.config.embedding.uniform_bound,
                std=self.config.embedding.random_stddev,
                fan_mode=self.config.embedding.fan_mode,
                region_embedding_type=config.embedding.region_embedding_type)
        else:
            raise TypeError(
                "Unsupported embedding type: %s. " % config.embedding.type)
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)
Example #15
class Graph():
    def __init__(self, data, is_train, model_config):
        self.model_config = model_config
        self.data = data
        self.is_train = is_train
        self.model_fn = None
        self.rand_unif_init = tf.random_uniform_initializer(-0.08, 0.08)
        self.metric = Metric(self.model_config, self.data)

    def embedding_fn(self, inputs, embedding):
        if type(inputs) == list:
            if not inputs:
                return []
            else:
                return [
                    tf.nn.embedding_lookup(embedding, inp) for inp in inputs
                ]
        else:
            return tf.nn.embedding_lookup(embedding, inputs)

    def output_to_logit(self, prev_out, w, b):
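        # prev_out: (batch, dim) @ w^T: (dim, vocab) + b -> logits: (batch, vocab);
        # the output projection is tied to the embedding matrix w.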
        prev_logit = tf.add(tf.matmul(prev_out, tf.transpose(w)), b)
        return prev_logit

    def create_model_multigpu(self):
        losses = []
        grads = []
        ops = [tf.constant(0)]
        self.objs = []
        self.global_step = tf.train.get_or_create_global_step()
        optim = self.get_optim()

        fetch_data = None
        if self.model_config.fetch_mode == 'tf_example_dataset':
            fetch_data = self.data.get_data_sample()

        with tf.variable_scope(tf.get_variable_scope()) as scope:
            for gpu_id in range(self.model_config.num_gpus):
                with tf.device('/device:GPU:%d' % gpu_id):
                    with tf.name_scope('%s_%d' % ('gpu_scope', gpu_id)):
                        loss, obj = self.create_model(fetch_data=fetch_data)
                        if self.model_config.npad_mode == 'v1':
                            vars = tf.get_collection(
                                tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope=
                                'model/transformer_decoder/decoder/layer_5/npad/'
                            )
                            grad = optim.compute_gradients(
                                loss,
                                colocate_gradients_with_ops=True,
                                var_list=vars)
                        elif self.model_config.npad_mode == 'static_seq':
                            vars = tf.get_collection(
                                tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope='model/transformer_decoder/npad/')
                            grad = optim.compute_gradients(
                                loss,
                                colocate_gradients_with_ops=True,
                                var_list=vars)
                        else:
                            grad = optim.compute_gradients(
                                loss, colocate_gradients_with_ops=True)
                        tf.get_variable_scope().reuse_variables()
                        losses.append(loss)
                        grads.append(grad)
                        if 'rule' in self.model_config.memory and self.is_train:
                            ops.append(obj['mem_contexts'])
                            ops.append(obj['mem_outputs'])
                            ops.append(obj['mem_counter'])
                        self.objs.append(obj)

        with tf.variable_scope('optimization'):
            self.loss = tf.divide(tf.add_n(losses), self.model_config.num_gpus)
            self.perplexity = tf.exp(tf.reduce_mean(self.loss))

            if self.is_train:
                avg_grad = self.average_gradients(grads)
                grads = [g for (g, v) in avg_grad]
                clipped_grads, _ = tf.clip_by_global_norm(
                    grads, self.model_config.max_grad_norm)
                if self.model_config.npad_mode == 'v1':
                    vars = tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES,
                        scope='model/transformer_decoder/decoder/layer_5/npad/'
                    )
                elif self.model_config.npad_mode == 'static_seq':
                    vars = tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES,
                        scope='model/transformer_decoder/npad/')
                else:
                    vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                self.train_op = optim.apply_gradients(
                    zip(clipped_grads, vars), global_step=self.global_step)
                self.increment_global_step = tf.assign_add(self.global_step, 1)

            self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
            self.ops = tf.tuple(ops)

    def create_model(self, fetch_data=None):
        with tf.variable_scope('variables'):
            sentence_simple_input_placeholder = []
            sentence_complex_input_placeholder = []
            if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                sentence_simple_segment_input_placeholder = []
                sentence_complex_segment_input_placeholder = []

            obj = {}
            if fetch_data is not None and self.model_config.fetch_mode == 'tf_example_dataset':
                for t in tf.unstack(fetch_data['line_comp_ids'], axis=1):
                    sentence_complex_input_placeholder.append(t)
                for t in tf.unstack(fetch_data['line_simp_ids'], axis=1):
                    sentence_simple_input_placeholder.append(t)

                if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                    for t in tf.unstack(fetch_data['line_comp_segids'],
                                        axis=1):
                        sentence_complex_segment_input_placeholder.append(t)
                    for t in tf.unstack(fetch_data['line_simp_segids'],
                                        axis=1):
                        sentence_simple_segment_input_placeholder.append(t)
                    obj['line_comp_segids'] = tf.stack(
                        sentence_complex_segment_input_placeholder, axis=1)
                    obj['line_simp_segids'] = tf.stack(
                        sentence_simple_segment_input_placeholder, axis=1)

                score = None
                if self.model_config.tune_style:
                    if self.is_train:
                        # In training, scores come from fetch_data
                        scores = []
                        if self.model_config.tune_style[0]:
                            ppdb_score = fetch_data['ppdb_score']
                            scores.append(ppdb_score)
                            print('Tune ppdb score!')
                            if 'plus' in self.model_config.tune_mode:
                                # to avoid most ppdb scores are 0
                                ppdb_score += 0.1
                        if self.model_config.tune_style[1]:
                            add_score = fetch_data['dsim_score']
                            scores.append(add_score)
                            print('Tune dsim score!')
                        if self.model_config.tune_style[2]:
                            add_score = fetch_data['add_score']
                            scores.append(add_score)
                            print('Tune add score!')
                        if self.model_config.tune_style[3]:
                            len_score = fetch_data['len_score']
                            scores.append(len_score)
                            print('Tune length score!')

                    else:
                        # In evaluation/prediction, scores may be a factor to
                        #   multiply (pred mode) or an actual user-provided score
                        # TODO(sanqiang): not used for now because there is no fetch_data in eval
                        raise NotImplementedError('No tune style for evaluation')
                        # ppdb_score = tf.constant(
                        #     self.model_config.tune_style[0], shape=[self.model_config.batch_size], dtype=tf.float32)
                        # add_score = tf.constant(
                        #     self.model_config.tune_style[1], shape=[self.model_config.batch_size], dtype=tf.float32)
                        # len_score = tf.constant(
                        #     self.model_config.tune_style[2], shape=[self.model_config.batch_size], dtype=tf.float32)

                    # Assemble scores
                    dimension_unit = int(self.model_config.dimension /
                                         len(scores))
                    dimension_runit = self.model_config.dimension - (
                        len(scores) - 1) * dimension_unit
                    for s_i, score in enumerate(scores):
                        if s_i < len(scores) - 1:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_unit]),
                                                         axis=1)
                        else:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_runit]),
                                                         axis=1)
                    score = tf.concat(scores, axis=-1)
            else:
                for step in range(self.model_config.max_simple_sentence):
                    sentence_simple_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size,
                                 tf.int32,
                                 name='simple_input'))

                for step in range(self.model_config.max_complex_sentence):
                    sentence_complex_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size,
                                 tf.int32,
                                 name='complex_input'))

                if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                    for step in range(self.model_config.max_simple_sentence):
                        sentence_simple_segment_input_placeholder.append(
                            tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='simple_seg_input'))

                    for step in range(self.model_config.max_complex_sentence):
                        sentence_complex_segment_input_placeholder.append(
                            tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='complex_seg_input'))

                    obj['line_comp_segids'] = tf.stack(
                        sentence_complex_segment_input_placeholder, axis=1)
                    obj['line_simp_segids'] = tf.stack(
                        sentence_simple_segment_input_placeholder, axis=1)

                score = None
                if self.model_config.tune_style:
                    if self.is_train:
                        raise NotImplementedError('No tune style for training')
                        #
                        # ppdb_score = tf.constant(
                        #     self.model_config.tune_style, shape=[self.model_config.batch_size], dtype=tf.float32)
                        # ppdb_score = tf.expand_dims(tf.tile(
                        #     tf.expand_dims(ppdb_score, axis=-1),
                        #     [1, self.model_config.dimension]), axis=1)
                    else:
                        scores = []
                        if self.model_config.tune_style:
                            if self.model_config.tune_style[0]:
                                ppdb_score = tf.constant(
                                    self.model_config.tune_style[0],
                                    shape=[self.model_config.batch_size],
                                    dtype=tf.float32)
                                scores.append(ppdb_score)
                                print('tune ppdb score')
                            if self.model_config.tune_style[1]:
                                dsim_score = tf.constant(
                                    self.model_config.tune_style[1],
                                    shape=[self.model_config.batch_size],
                                    dtype=tf.float32)
                                scores.append(dsim_score)
                                print('tune dsim score')
                            if self.model_config.tune_style[2]:
                                add_score = tf.constant(
                                    self.model_config.tune_style[2],
                                    shape=[self.model_config.batch_size],
                                    dtype=tf.float32)
                                scores.append(add_score)
                                print('tune add score')
                            if self.model_config.tune_style[3]:
                                len_score = tf.constant(
                                    self.model_config.tune_style[3],
                                    shape=[self.model_config.batch_size],
                                    dtype=tf.float32)
                                scores.append(len_score)
                                print('tune len score')
                    # Assemble scores
                    dimension_unit = int(self.model_config.dimension /
                                         len(scores))
                    dimension_runit = self.model_config.dimension - (
                        len(scores) - 1) * dimension_unit
                    for s_i, score in enumerate(scores):
                        if s_i < len(scores) - 1:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_unit]),
                                                         axis=1)
                        else:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_runit]),
                                                         axis=1)
                    score = tf.concat(scores, axis=-1)

            # For self.model_config.tune_style:
            comp_features = {}
            comp_add_score = tf.zeros(self.model_config.batch_size,
                                      tf.float32,
                                      name='comp_add_score_input')
            comp_length = tf.zeros(self.model_config.batch_size,
                                   tf.float32,
                                   name='comp_length_input')
            comp_features['comp_add_score'] = comp_add_score
            comp_features['comp_length'] = comp_length

            sentence_idxs = tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='sent_idx')

            self.embedding = Embedding(self.data.vocab_complex,
                                       self.data.vocab_simple,
                                       self.model_config)
            if self.model_config.bert_mode:
                emb_complex = None
            else:
                emb_complex = self.embedding.get_complex_embedding()
            if self.model_config.bert_mode and (
                    self.model_config.tie_embedding == 'all'
                    or self.model_config.tie_embedding == 'enc_dec'):
                emb_simple = None
            else:
                emb_simple = self.embedding.get_simple_embedding()

            if (self.is_train and self.model_config.pretrained_embedding):
                self.embed_complex_placeholder = tf.placeholder(
                    tf.float32, (self.data.vocab_complex.vocab_size(),
                                 self.model_config.dimension), 'complex_emb')
                self.replace_emb_complex = emb_complex.assign(
                    self.embed_complex_placeholder)

                self.embed_simple_placeholder = tf.placeholder(
                    tf.float32, (self.data.vocab_simple.vocab_size(),
                                 self.model_config.dimension), 'simple_emb')
                self.replace_emb_simple = emb_simple.assign(
                    self.embed_simple_placeholder)

            if self.model_config.bert_mode and (
                    self.model_config.tie_embedding == 'all'
                    or self.model_config.tie_embedding == 'dec_out'):
                w = None
            else:
                w = self.embedding.get_w()
            b = self.embedding.get_b()

            mem_contexts, mem_outputs, mem_counter = None, None, None
            rule_id_input_placeholder, rule_target_input_placeholder = [], []
            if 'rule' in self.model_config.memory:
                with tf.device('/cpu:0'):
                    context_size = 0
                    if self.model_config.framework == 'transformer':
                        context_size = 1
                    elif self.model_config.framework == 'seq2seq':
                        context_size = 2
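                    # Rule-memory caches: mem_contexts/mem_outputs hold running
                    # averages of decoder context/output vectors per rule;
                    # mem_counter tracks how many times each rule has fired.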
                    mem_contexts = tf.get_variable(
                        'mem_contexts',
                        initializer=tf.constant(
                            0,
                            dtype=tf.float32,
                            shape=(self.data.vocab_rule.get_rule_size(),
                                   self.model_config.max_target_rule_sublen,
                                   self.model_config.dimension *
                                   context_size)),
                        trainable=False,
                        dtype=tf.float32)
                    mem_outputs = tf.get_variable(
                        'mem_outputs',
                        initializer=tf.constant(
                            0,
                            dtype=tf.float32,
                            shape=(self.data.vocab_rule.get_rule_size(),
                                   self.model_config.max_target_rule_sublen,
                                   self.model_config.dimension)),
                        trainable=False,
                        dtype=tf.float32)
                    mem_counter = tf.get_variable(
                        'mem_counter',
                        initializer=tf.constant(
                            0,
                            dtype=tf.int32,
                            shape=(self.data.vocab_rule.get_rule_size(), 1)),
                        trainable=False,
                        dtype=tf.int32)

            if 'direct' in self.model_config.memory or 'rule' in self.model_config.memory:
                if fetch_data is not None and self.model_config.fetch_mode == 'tf_example_dataset':
                    for t in tf.unstack(fetch_data['rule_id'], axis=1):
                        rule_id_input_placeholder.append(t)
                    for t in tf.unstack(fetch_data['rule_target'], axis=1):
                        rule_target_input_placeholder.append(t)
                else:
                    for step in range(self.model_config.max_cand_rules):
                        rule_id_input_placeholder.append(
                            tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='rule_id_input'))

                    for step in range(self.model_config.max_cand_rules):
                        if 'direct' in self.model_config.memory:
                            rule_target_input_placeholder.append(
                                tf.zeros(self.model_config.batch_size,
                                         tf.int32,
                                         name='rule_target_input'))
                        elif 'rule' in self.model_config.memory:
                            rule_target_input_placeholder.append(
                                tf.zeros(self.model_config.batch_size,
                                         tf.string,
                                         name='rule_target_input'))

        with tf.variable_scope('model'):
            output = self.model_fn(sentence_complex_input_placeholder,
                                   emb_complex,
                                   sentence_simple_input_placeholder,
                                   emb_simple, w, b, rule_id_input_placeholder,
                                   rule_target_input_placeholder, mem_contexts,
                                   mem_outputs, self.global_step, score,
                                   comp_features, obj)

            encoder_embs, final_outputs = None, None
            if self.model_config.replace_unk_by_emb:
                encoder_embs = tf.stack(output.encoder_embed_inputs_list,
                                        axis=1)

            if output.decoder_outputs_list is not None:
                if type(output.decoder_outputs_list) == list:
                    decoder_outputs_list = output.decoder_outputs_list
                    decoder_outputs = tf.stack(decoder_outputs_list, axis=1)
                else:
                    decoder_outputs = output.decoder_outputs_list

            if output.final_outputs_list is not None:
                if type(output.final_outputs_list) == list:
                    final_outputs_list = output.final_outputs_list
                    final_outputs = tf.stack(final_outputs_list, axis=1)
                else:
                    final_outputs = output.final_outputs_list

            attn_distr = None
            if self.model_config.replace_unk_by_attn:
                attn_distr = output.attn_distr_list

            if not self.is_train:
                # In beam search, the decoder target list is provided directly
                decoder_target = tf.stack(output.decoder_target_list, axis=1)
                loss = tf.reduce_mean(output.decoder_score)
                obj = {
                    'sentence_idxs': sentence_idxs,
                    'sentence_simple_input_placeholder':
                    sentence_simple_input_placeholder,
                    'sentence_complex_input_placeholder':
                    sentence_complex_input_placeholder,
                    'decoder_target_list': decoder_target,
                    'final_outputs': final_outputs,
                    'encoder_embs': encoder_embs,
                    'attn_distr': attn_distr
                }
                if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                    obj['sentence_complex_segment_input_placeholder'] = sentence_complex_segment_input_placeholder
                    obj['sentence_simple_segment_input_placeholder'] = sentence_simple_segment_input_placeholder
                if 'rule' in self.model_config.memory or 'direct' in self.model_config.memory:
                    obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                    obj['rule_target_input_placeholder'] = rule_target_input_placeholder
                if self.model_config.tune_style:
                    obj['comp_features'] = comp_features
                return loss, obj
            else:
                # Memory Populate
                if 'rule' in self.model_config.memory:
                    # Update Memory through python injection
                    def update_memory(mem_contexts_tmp, mem_outputs_tmp,
                                      mem_counter_tmp, decoder_targets,
                                      decoder_outputs, contexts,
                                      rule_target_input_placeholder,
                                      rule_id_input_placeholder, global_step,
                                      encoder_outputs):
                        def _seq_contain(arr, tar):
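                            # Return the start index of subsequence `tar` in
                            # `arr`, or -1 if it does not occur.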
                            j = 0
                            for i in range(len(arr)):
                                if arr[i] == tar[j]:
                                    j += 1
                                    if j == len(tar):
                                        return i - len(tar) + 1
                                else:
                                    j = 0
                            return -1

                        # if 'stopgrad' in self.model_config.rl_configs and global_step % 2 != 0:
                        #     return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp
                        # if global_step <= self.model_config.memory_prepare_step:
                        #     return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

                        batch_size = np.shape(rule_target_input_placeholder)[0]
                        max_rules = np.shape(rule_target_input_placeholder)[1]
                        decoder_targets_str = [
                            ' '.join(sent) for sent in truncate_sents(
                                decode(
                                    decoder_targets, self.data.vocab_simple,
                                    self.model_config.subword_vocab_size > 0
                                    or 'bert_token' in
                                    self.model_config.bert_mode))
                        ]
                        for batch_id in range(batch_size):
                            cur_decoder_targets = decoder_targets[batch_id, :]
                            cur_decoder_targets_str = decoder_targets_str[
                                batch_id]

                            cur_decoder_outputs = decoder_outputs[batch_id, :]
                            cur_contexts = contexts[batch_id, :]

                            cur_rule_target_input_placeholder = rule_target_input_placeholder[
                                batch_id, :]
                            cur_rule_target_input_placeholder = [
                                tmp.decode("utf-8").strip('\x00')
                                for tmp in cur_rule_target_input_placeholder
                                if not tmp.decode("utf-8").strip().startswith(
                                    constant.SYMBOL_PAD)
                            ]
                            cur_rule_id_input_placeholder = rule_id_input_placeholder[
                                batch_id, :]

                            # Build the valid mapper from rule id => target words ids
                            rule_mapper = {}
                            for step in range(
                                    len(cur_rule_target_input_placeholder)):
                                rule_target_str = cur_rule_target_input_placeholder[
                                    step]
                                if rule_target_str == constant.SYMBOL_PAD:
                                    continue
                                rule_id = cur_rule_id_input_placeholder[step]
                                if rule_id != 0 and re.search(
                                        r'\b%s\b' % rule_target_str,
                                        cur_decoder_targets_str
                                ):  # decoder_target_str in cur_decoder_targets_str:
                                    decoder_target_wids = self.data.vocab_simple.encode(
                                        rule_target_str)
                                    dec_s_idx = _seq_contain(
                                        cur_decoder_targets,
                                        decoder_target_wids)
                                    if dec_s_idx != -1:
                                        print('rule_target_str:%s' %
                                              rule_target_str)
                                        print('cur_decoder_targets_str:%s' %
                                              cur_decoder_targets_str)
                                        print('cur_decoder_targets:%s' %
                                              cur_decoder_targets)
                                        print('decoder_target_wids:%s' %
                                              decoder_target_wids)
                                        # Only record the mapping when the rule's
                                        # target ids actually occur in the decode;
                                        # dec_s_idx == -1 would corrupt the memory.
                                        rule_mapper[rule_id] = list(
                                            range(dec_s_idx,
                                                  dec_s_idx +
                                                  len(decoder_target_wids)))

                            for rule_id in rule_mapper:
                                dec_idxs = rule_mapper[rule_id]

                                for idx, dec_idx in enumerate(dec_idxs):
                                    if mem_counter_tmp[rule_id, 0] == 0:
                                        mem_contexts_tmp[
                                            rule_id,
                                            idx, :] = cur_contexts[dec_idx, :]
                                        mem_outputs_tmp[
                                            rule_id,
                                            idx, :] = cur_decoder_outputs[
                                                dec_idx, :]
                                    else:
                                        mem_contexts_tmp[rule_id, idx, :] = (
                                            cur_contexts[dec_idx, :] +
                                            mem_contexts_tmp[rule_id,
                                                             idx, :]) / 2
                                        mem_outputs_tmp[rule_id, idx, :] = (
                                            cur_decoder_outputs[dec_idx, :] +
                                            mem_outputs_tmp[rule_id,
                                                            idx, :]) / 2

                                mem_counter_tmp[rule_id, 0] += 1

                        return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

                    mem_output_input = None
                    if 'mofinal' in self.model_config.memory_config:
                        mem_output_input = final_outputs
                    # elif 'modecode' in self.model_config.memory_config:
                    #     mem_output_input = decoder_outputs
                    # elif 'moemb' in self.model_config.memory_config:
                    #     mem_output_input = tf.stack(
                    #         self.embedding_fn(sentence_simple_input_placeholder, emb_simple),
                    #         axis=1)

                    mem_contexts, mem_outputs, mem_counter = tf.py_func(
                        update_memory, [
                            mem_contexts, mem_outputs, mem_counter,
                            tf.stack(output.decoder_target_list, axis=1),
                            mem_output_input, output.contexts,
                            tf.stack(rule_target_input_placeholder, axis=1),
                            tf.stack(rule_id_input_placeholder, axis=1),
                            self.global_step, output.encoder_outputs
                        ], [tf.float32, tf.float32, tf.int32],
                        stateful=False,
                        name='update_memory')

                # Loss and corresponding prior/mask
                decode_word_weight_list = [
                    tf.to_float(
                        tf.not_equal(
                            d,
                            self.data.vocab_simple.encode(
                                constant.SYMBOL_PAD)))
                    for d in output.gt_target_list
                ]
                decode_word_weight = tf.stack(decode_word_weight_list, axis=1)

                gt_target = tf.stack(output.gt_target_list, axis=1)

                def self_critical_loss():
                    # Minimize the reward-weighted negative log-probabilities
                    rewards = tf.py_func(
                        self.metric.self_crititcal_reward,
                        [
                            sentence_idxs,
                            tf.stack(output.sample_target_list, axis=-1),
                            tf.stack(output.decoder_target_list, axis=-1),
                            tf.stack(sentence_simple_input_placeholder,
                                     axis=-1),
                            tf.stack(sentence_complex_input_placeholder,
                                     axis=-1),
                            tf.ones((1, 1)),
                            # tf.stack(rule_target_input_placeholder, axis=1)
                        ],
                        tf.float32,
                        stateful=False,
                        name='reward')
                    rewards.set_shape((self.model_config.batch_size,
                                       self.model_config.max_simple_sentence))
                    rewards = tf.unstack(rewards, axis=1)

                    weighted_probs_list = [
                        rewards[i] * decode_word_weight_list[i] *
                        -output.sample_logit_list[i]
                        for i in range(len(decode_word_weight_list))
                    ]
                    total_size = tf.reduce_sum(decode_word_weight_list)
                    total_size += 1e-12
                    weighted_probs = tf.reduce_sum(
                        weighted_probs_list) / total_size
                    loss = weighted_probs
                    return loss

                def teacherforce_critical_loss():
                    losses = []
                    for step in range(self.model_config.max_simple_sentence):
                        logit = output.decoder_logit_list[step]
                        greedy_target_unit = tf.stop_gradient(
                            tf.argmax(logit, axis=1))
                        if self.model_config.train_mode == 'teachercriticalv2':
                            sampled_target_unit, reward = tf.py_func(
                                self.metric.self_crititcal_reward_unitv2, [
                                    sentence_idxs, step, greedy_target_unit,
                                    tf.stack(sentence_simple_input_placeholder,
                                             axis=-1),
                                    tf.stack(
                                        sentence_complex_input_placeholder,
                                        axis=-1), self.global_step
                                ], [tf.int32, tf.float32],
                                stateful=False,
                                name='reward')
                            reward.set_shape((self.model_config.batch_size, ))
                            sampled_target_unit.set_shape(
                                (self.model_config.batch_size, ))
                        elif self.model_config.train_mode == 'teachercritical':
                            sampled_target_unit = tf.cast(
                                tf.squeeze(tf.multinomial(logit, 1), axis=1),
                                tf.int32)
                            sampled_target_unit, reward = tf.py_func(
                                self.metric.self_crititcal_reward_unit, [
                                    sentence_idxs,
                                    step,
                                    sampled_target_unit,
                                    greedy_target_unit,
                                    tf.stack(sentence_simple_input_placeholder,
                                             axis=-1),
                                    tf.stack(
                                        sentence_complex_input_placeholder,
                                        axis=-1),
                                    self.global_step,
                                ], [tf.int32, tf.float32],
                                stateful=False,
                                name='reward')
                            reward.set_shape((self.model_config.batch_size, ))
                            sampled_target_unit.set_shape(
                                (self.model_config.batch_size, ))
                        indices = tf.stack([
                            tf.range(0,
                                     self.model_config.batch_size,
                                     dtype=tf.int32),
                            tf.squeeze(sampled_target_unit)
                        ],
                                           axis=-1)
                        logit_unit = tf.gather_nd(tf.nn.softmax(logit, axis=1),
                                                  indices)
                        decode_word_weight = decode_word_weight_list[step]
                        losses.append(-logit_unit * reward *
                                      decode_word_weight)
                    loss = tf.add_n(losses)
                    return loss

                def teacherforce_loss():
                    if self.model_config.number_samples > 0:
                        loss_fn = tf.nn.sampled_softmax_loss
                    else:
                        loss_fn = None
                    loss = sequence_loss(
                        logits=tf.stack(output.decoder_logit_list, axis=1),
                        targets=gt_target,
                        weights=decode_word_weight,
                        # softmax_loss_function=loss_fn,
                        # w=w,
                        # b=b,
                        # decoder_outputs=decoder_outputs,
                        # number_samples=self.model_config.number_samples
                    )
                    return loss

                if self.model_config.train_mode == 'dynamic_self-critical':
                    loss = self_critical_loss()
                    # loss = tf.cond(
                    #     tf.greater(self.global_step, 50000),
                    #     # tf.logical_and(tf.greater(self.global_step, 100000), tf.equal(tf.mod(self.global_step, 2), 0)),
                    #     lambda : self_critical_loss(),
                    #     lambda : teacherforce_loss())
                elif self.model_config.train_mode == 'teachercritical' or self.model_config.train_mode == 'teachercriticalv2':
                    loss = tf.cond(tf.equal(tf.mod(self.global_step, 2),
                                            0), lambda: teacherforce_loss(),
                                   lambda: teacherforce_critical_loss())

                    # loss = teacherforce_critical_loss()
                else:
                    loss = teacherforce_loss()

                if self.model_config.architecture == 'ut2t':
                    assert 'extra_encoder_loss' in output.obj_tensors and 'extra_decoder_loss' in output.obj_tensors
                    loss += output.obj_tensors['extra_encoder_loss']
                    loss += output.obj_tensors['extra_decoder_loss']
                    print('Use U T2T with ACT')

                self.loss_style = tf.constant(0.0, dtype=tf.float32)
                if output.pred_score_tuple is not None and 'pred' in self.model_config.tune_mode:
                    print('Create loss for predicting style')
                    ppdb_pred_score, add_pred_score, len_pred_score = output.pred_score_tuple
                    # ppdb_pred_score = tf.Print(ppdb_pred_score, [ppdb_pred_score, fetch_data['ppdb_score']],
                    #                            message='ppdb_pred_score:', first_n=-1, summarize=100)
                    # add_pred_score = tf.Print(add_pred_score, [add_pred_score, fetch_data['add_score']],
                    #                            message='add_pred_score:', first_n=-1, summarize=100)
                    # len_pred_score = tf.Print(len_pred_score, [len_pred_score, fetch_data['len_score']],
                    #                            message='len_pred_score:', first_n=-1, summarize=100)
                    # loss = tf.Print(loss, [loss], message='loss before:', summarize=100)
                    self.loss_style += tf.losses.absolute_difference(
                        ppdb_pred_score, fetch_data['ppdb_score'])
                    self.loss_style += tf.losses.absolute_difference(
                        add_pred_score, fetch_data['add_score'])
                    self.loss_style += tf.losses.absolute_difference(
                        len_pred_score, fetch_data['len_score'])
                    loss += self.loss_style
                    # loss = tf.Print(loss, [loss], message='loss after:', summarize=100)

                obj = {
                    'decoder_target_list':
                    output.decoder_target_list,
                    'sentence_idxs':
                    sentence_idxs,
                    'sentence_simple_input_placeholder':
                    sentence_simple_input_placeholder,
                    'sentence_complex_input_placeholder':
                    sentence_complex_input_placeholder,
                }
                self.logits = output.decoder_logit_list
                if 'rule' in self.model_config.memory:
                    obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                    obj['rule_target_input_placeholder'] = rule_target_input_placeholder
                    # obj['rule_pair_input_placeholder'] = rule_pair_input_placeholder
                    obj['mem_contexts'] = mem_contexts
                    obj['mem_outputs'] = mem_outputs
                    obj['mem_counter'] = mem_counter
                return loss, obj

    def get_optim(self):
        learning_rate = tf.constant(self.model_config.learning_rate)

        if self.model_config.optimizer == 'adagrad':
            opt = tf.train.AdagradOptimizer(learning_rate)
        # Adam needs a lower learning rate
        elif self.model_config.optimizer == 'adam':
            opt = tf.train.AdamOptimizer(learning_rate)
        elif self.model_config.optimizer == 'lazy_adam':
            if not hasattr(self, 'hparams'):
                # In case not using Transformer model
                from tensor2tensor.models import transformer
                self.hparams = transformer.transformer_base()
            opt = tf.contrib.opt.LazyAdamOptimizer(
                self.hparams.learning_rate / 100.0,
                beta1=self.hparams.optimizer_adam_beta1,
                beta2=self.hparams.optimizer_adam_beta2,
                epsilon=self.hparams.optimizer_adam_epsilon)
        elif self.model_config.optimizer == 'adadelta':
            opt = tf.train.AdadeltaOptimizer(learning_rate)
        elif self.model_config.optimizer == 'sgd':
            opt = tf.train.GradientDescentOptimizer(learning_rate)
        else:
            raise NotImplementedError(
                'Unsupported optimizer: %s' % self.model_config.optimizer)

        # if self.model_config.max_grad_staleness > 0:
        #     opt = tf.contrib.opt.DropStaleGradientOptimizer(opt, self.model_config.max_grad_staleness)

        return opt

    # Adapted from https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py#L101
    def average_gradients(self, tower_grads):
        """Calculate the average gradient for each shared variable across all towers.
        Note that this function provides a synchronization point across all towers.
        Args:
          tower_grads: List of lists of (gradient, variable) tuples. The outer list
            is over individual gradients. The inner list is over the gradient
            calculation for each tower.
        Returns:
           List of pairs of (gradient, variable) where the gradient has been averaged
           across all towers.
        """
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            # Note that each grad_and_vars looks like the following:
            #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
            grads = []
            for g, _ in grad_and_vars:
                # Add 0 dimension to the gradients to represent the tower.
                if g is None:
                    # tf.expand_dims(None, 0) would raise; skip variables with
                    # no gradient on this tower.
                    print('Useless tensors: %s' % (grad_and_vars,))
                    continue
                expanded_g = tf.expand_dims(g, 0)

                # Append on a 'tower' dimension which we will average over below.
                grads.append(expanded_g)

            # Average over the 'tower' dimension.
            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_mean(grad, 0)

            # Keep in mind that the Variables are redundant because they are shared
            # across towers. So .. we will just return the first tower's pointer to
            # the Variable.
            v = grad_and_vars[0][1]
            grad_and_var = (grad, v)
            average_grads.append(grad_and_var)
        return average_grads
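
A toy check of the averaging above (a sketch, assuming TF1 graph mode; the two towers and toy_var are fabricated for illustration):

import tensorflow as tf

v = tf.get_variable('toy_var', shape=[], initializer=tf.zeros_initializer())
tower_grads = [[(tf.constant(1.0), v)], [(tf.constant(3.0), v)]]
avg = Graph.average_gradients(None, tower_grads)  # self is unused by the method
with tf.Session() as sess:
    print(sess.run(avg[0][0]))  # 2.0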
Example #16
class Classifier(torch.nn.Module):
    def __init__(self, dataset, config):
        super(Classifier, self).__init__()
        self.config = config
        self.token_embedding = \
            Embedding(dataset.token_map, config.embedding.dimension,
                      cDataset.DOC_TOKEN, config, dataset.VOCAB_PADDING,
                      pretrained_embedding_file=
                      config.feature.token_pretrained_file,
                      mode=EmbeddingProcessType.FLAT,
                      dropout=self.config.embedding.dropout,
                      init_type=self.config.embedding.initializer,
                      low=-self.config.embedding.uniform_bound,
                      high=self.config.embedding.uniform_bound,
                      std=self.config.embedding.random_stddev,
                      fan_mode=self.config.embedding.fan_mode,
                      activation_type=ActivationType.NONE,
                      model_mode=dataset.model_mode)
        self.char_embedding = \
            Embedding(dataset.char_map, config.embedding.dimension,
                      cDataset.DOC_CHAR, config, dataset.VOCAB_PADDING,
                      mode=EmbeddingProcessType.FLAT,
                      dropout=self.config.embedding.dropout,
                      init_type=self.config.embedding.initializer,
                      low=-self.config.embedding.uniform_bound,
                      high=self.config.embedding.uniform_bound,
                      std=self.config.embedding.random_stddev,
                      fan_mode=self.config.embedding.fan_mode,
                      activation_type=ActivationType.NONE,
                      model_mode=dataset.model_mode)
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)

    def get_embedding(self, batch, pad_shape=None, pad_value=0):

        token_id = batch[cDataset.DOC_TOKEN].to(self.config.device)
        if pad_shape is not None:
            token_id = torch.nn.functional.pad(token_id,
                                               pad_shape,
                                               mode='constant',
                                               value=pad_value)
        embedding = self.token_embedding(token_id)
        length = batch[cDataset.DOC_TOKEN_LEN].to(self.config.device)
        mask = batch[cDataset.DOC_TOKEN_MASK].to(self.config.device)

        return embedding, length, mask

    def get_parameter_optimizer_dict(self):
        params = list()
        params.append({
            'params': self.token_embedding.parameters(),
            'is_embedding': True
        })
        return params

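    # Note: update_lr below assumes the first two optimizer param groups hold
    # embedding parameters (token and char); subclasses presumably extend the
    # list returned above with their own parameter groups.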
    def update_lr(self, optimizer, epoch):
        """Keep the embedding learning rate at 0 for the first
        num_epochs_static_embedding epochs, then restore the configured rate.
        """
        if epoch > self.config.train.num_epochs_static_embedding:
            for param_group in optimizer.param_groups[:2]:
                param_group["lr"] = self.config.optimizer.learning_rate
        else:
            for param_group in optimizer.param_groups[:2]:
                param_group["lr"] = 0

    def forward(self, batch):
        raise NotImplementedError
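A hedged sketch of how a subclass might implement forward() on top of
get_embedding(); it assumes the mask is 1 for real tokens and 0 for padding,
and AvgPoolClassifier / num_labels are illustrative names, not from the source:

class AvgPoolClassifier(Classifier):
    def __init__(self, dataset, config, num_labels):
        super(AvgPoolClassifier, self).__init__(dataset, config)
        self.linear = torch.nn.Linear(config.embedding.dimension, num_labels)

    def forward(self, batch):
        embedding, length, mask = self.get_embedding(batch)
        # Zero out padding positions, then mean-pool over the sequence.
        embedding = embedding * mask.unsqueeze(-1)
        pooled = embedding.sum(dim=1) / length.unsqueeze(-1).float()
        return self.linear(self.dropout(pooled))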
Example #17
def parse_emb_str(embs_str: str):
    emb_strs = embs_str.split()
    emb = Embedding()
    for emb_str in emb_strs:
        emb.add_dim(float(emb_str))
    return emb
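An illustrative call, assuming Embedding.add_dim appends one float component
as the loop above suggests:

emb = parse_emb_str('0.1 -0.2 0.33')  # embedding with three components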
import numpy as np
import os
import random
import jieba
# import thulac

# thulac_seg = thulac.thulac(seg_only=True)

max_sample = 1000
max_steps = 15
n_embedding = 300

base_path = os.path.dirname(os.path.realpath(__file__))
path = os.path.join(base_path, 'word2vec_model/wiki.zh.nosync/wiki.zh.vec')
model_path = os.path.join(base_path, Config.CURRENT_MODEL_BASE_PATH)
embedding = Embedding()
embedding.load_w2v_model(path, False)

data = dict()
data_dir_path = os.path.join(model_path, 'data.nosync')
for fname in os.listdir(data_dir_path):
    if '.txt' not in fname:
        continue
    train_file_path = os.path.join(data_dir_path, fname)
    with open(train_file_path, 'r') as f:
        for line in f:
            # Each line is expected to hold a sample id and an integer label.
            comps = line.strip().split()
            label = int(comps[1])
            if label not in data:
                data[label] = []
            data[label].append((comps[0], label))
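A small follow-on sketch using only the names defined above (data, max_sample,
random), showing one way the per-label lists might be capped before training:

samples = []
for label, items in data.items():
    random.shuffle(items)               # in-place shuffle per label
    samples.extend(items[:max_sample])  # respect the max_sample cap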
Example #19
    def create_model(self):
        with tf.variable_scope('variables'):
            sentence_simple_input_placeholder = []
            for step in range(self.model_config.max_simple_sentence):
                sentence_simple_input_placeholder.append(
                    tf.zeros(self.model_config.batch_size,
                             tf.int32,
                             name='simple_input'))

            sentence_complex_input_placeholder = []
            for step in range(self.model_config.max_complex_sentence):
                sentence_complex_input_placeholder.append(
                    tf.zeros(self.model_config.batch_size,
                             tf.int32,
                             name='complex_input'))

            sentence_idxs = tf.zeros(self.model_config.batch_size,
                                     tf.int32,
                                     name='sent_idx')

            embedding = Embedding(self.data.vocab_complex,
                                  self.data.vocab_simple, self.model_config)
            emb_complex = embedding.get_complex_embedding()
            emb_simple = embedding.get_simple_embedding()

            w = embedding.get_w()
            b = embedding.get_b()

            mem_contexts, mem_outputs, mem_counter = None, None, None
            rule_id_input_placeholder, rule_target_input_placeholder = [], []
            rule_pair_input_placeholder = []
            if 'rule' in self.model_config.memory:
                with tf.device('/cpu:0'):
                    context_size = 0
                    if self.model_config.framework == 'transformer':
                        context_size = 1
                    elif self.model_config.framework == 'seq2seq':
                        context_size = 2
                    mem_contexts = tf.get_variable(
                        'mem_contexts',
                        initializer=tf.constant(
                            0,
                            dtype=tf.float32,
                            shape=(self.data.vocab_rule.get_rule_size(),
                                   self.model_config.dimension *
                                   context_size)),
                        trainable=False,
                        dtype=tf.float32)
                    mem_outputs = tf.get_variable(
                        'mem_outputs',
                        initializer=tf.constant(
                            0,
                            dtype=tf.float32,
                            shape=(self.data.vocab_rule.get_rule_size(),
                                   self.model_config.dimension)),
                        trainable=False,
                        dtype=tf.float32)
                    mem_counter = tf.get_variable(
                        'mem_counter',
                        initializer=tf.constant(
                            0,
                            dtype=tf.int32,
                            shape=(self.data.vocab_rule.get_rule_size(), 1)),
                        trainable=False,
                        dtype=tf.int32)
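                    # These variables form a rule-indexed memory:
                    # mem_contexts and mem_outputs hold running averages of
                    # decoder context / output vectors per rule id, while
                    # mem_counter records how often each rule has fired.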

                for step in range(self.model_config.max_cand_rules):
                    rule_id_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size,
                                 tf.int32,
                                 name='rule_id_input'))

                for step in range(self.model_config.max_cand_rules):
                    rule_target_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size,
                                 tf.int32,
                                 name='rule_target_input'))

                for step in range(self.model_config.max_cand_rules):
                    rule_pair_input_placeholder.append(
                        tf.zeros([self.model_config.batch_size, 2],
                                 tf.int32,
                                 name='rule_pair_input'))

        with tf.variable_scope('model'):
            output = self.model_fn(sentence_complex_input_placeholder,
                                   emb_complex,
                                   sentence_simple_input_placeholder,
                                   emb_simple, w, b, rule_id_input_placeholder,
                                   mem_contexts, mem_outputs, self.global_step)

            encoder_embs, final_outputs = None, None
            if self.model_config.replace_unk_by_emb:
                encoder_embs = tf.stack(output.encoder_embed_inputs_list,
                                        axis=1)

            if output.decoder_outputs_list is not None:
                if isinstance(output.decoder_outputs_list, list):
                    decoder_outputs = tf.stack(output.decoder_outputs_list,
                                               axis=1)
                else:
                    decoder_outputs = output.decoder_outputs_list

            if output.final_outputs_list is not None:
                if isinstance(output.final_outputs_list, list):
                    final_outputs = tf.stack(output.final_outputs_list,
                                             axis=1)
                else:
                    final_outputs = output.final_outputs_list

            attn_distr = None
            if self.model_config.replace_unk_by_attn:
                attn_distr = output.attn_distr_list

            if not self.is_train:
                # In beam search, the model directly provides the decoder
                # target list.
                decoder_target = tf.stack(output.decoder_target_list, axis=1)
                loss = tf.reduce_mean(output.decoder_score)
                obj = {
                    'sentence_idxs': sentence_idxs,
                    'sentence_simple_input_placeholder':
                    sentence_simple_input_placeholder,
                    'sentence_complex_input_placeholder':
                    sentence_complex_input_placeholder,
                    'decoder_target_list': decoder_target,
                    'final_outputs': final_outputs,
                    'encoder_embs': encoder_embs,
                    'attn_distr': attn_distr
                }
                if 'rule' in self.model_config.memory:
                    obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                    obj['rule_target_input_placeholder'] = rule_target_input_placeholder
                return loss, obj
            else:
                # Memory Populate
                if 'rule' in self.model_config.memory:
                    # Update Memory through python injection
                    def update_memory(mem_contexts_tmp, mem_outputs_tmp,
                                      mem_counter_tmp, decoder_targets,
                                      decoder_outputs, contexts,
                                      rule_target_input_placeholder,
                                      rule_id_input_placeholder, global_step,
                                      emb_simple, encoder_outputs):
                        if global_step <= self.model_config.memory_prepare_step:
                            return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

                        batch_size = np.shape(rule_target_input_placeholder)[0]
                        max_rules = np.shape(rule_target_input_placeholder)[1]
                        for batch_id in range(batch_size):
                            cur_decoder_targets = decoder_targets[batch_id, :]
                            cur_decoder_outputs = decoder_outputs[batch_id, :]
                            cur_contexts = contexts[batch_id, :]
                            cur_rule_target_input_placeholder = rule_target_input_placeholder[
                                batch_id, :]
                            cur_rule_id_input_placeholder = rule_id_input_placeholder[
                                batch_id, :]

                            rule_mapper = {}
                            for step in range(max_rules):
                                rule_id = cur_rule_id_input_placeholder[step]
                                if rule_id != 0:
                                    decoder_target = cur_rule_target_input_placeholder[
                                        step]
                                    if rule_id not in rule_mapper:
                                        rule_mapper[rule_id] = []
                                    rule_mapper[rule_id].append(decoder_target)

                            for rule_id in rule_mapper:
                                rule_targets = rule_mapper[rule_id]
                                decoder_target_orders = np.where(
                                    cur_decoder_targets == rule_targets[0])[0]
                                for decoder_target_order in decoder_target_orders:
                                    if len(rule_targets) > 1:
                                        if decoder_target_order + 1 >= len(
                                                cur_decoder_targets
                                        ) or rule_targets[
                                                1] != cur_decoder_targets[
                                                    decoder_target_order + 1]:
                                            continue
                                    if len(rule_targets) > 2:
                                        if decoder_target_order + 2 >= len(
                                                cur_decoder_targets
                                        ) or rule_targets[
                                                2] != cur_decoder_targets[
                                                    decoder_target_order + 2]:
                                            continue
                                    cur_context, cur_outputs = None, None
                                    for step, _ in enumerate(rule_targets):
                                        if step == 0:
                                            cur_context = cur_contexts[
                                                decoder_target_order, :]
                                            cur_outputs = cur_decoder_outputs[
                                                decoder_target_order, :]
                                        else:
                                            cur_context += cur_contexts[
                                                step + decoder_target_order, :]
                                            cur_outputs += cur_decoder_outputs[
                                                step + decoder_target_order, :]
                                    cur_context /= len(rule_targets)
                                    cur_outputs /= len(rule_targets)
                                    if mem_counter_tmp[rule_id, 0] == 0:
                                        mem_contexts_tmp[
                                            rule_id, :] = cur_context
                                        mem_outputs_tmp[
                                            rule_id, :] = cur_outputs
                                    else:
                                        mem_contexts_tmp[rule_id, :] = (
                                            cur_context +
                                            mem_contexts_tmp[rule_id, :]) / 2
                                        mem_outputs_tmp[rule_id, :] = (
                                            cur_outputs +
                                            mem_outputs_tmp[rule_id, :]) / 2
                                    mem_counter_tmp[rule_id, 0] += 1

                        return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

                    mem_output_input = None
                    if 'mofinal' in self.model_config.memory_config:
                        mem_output_input = final_outputs
                    # elif 'modecode' in self.model_config.memory_config:
                    #     mem_output_input = decoder_outputs
                    # elif 'moemb' in self.model_config.memory_config:
                    #     mem_output_input = tf.stack(
                    #         self.embedding_fn(sentence_simple_input_placeholder, emb_simple),
                    #         axis=1)

                    mem_contexts, mem_outputs, mem_counter = tf.py_func(
                        update_memory, [
                            mem_contexts, mem_outputs, mem_counter,
                            tf.stack(output.decoder_target_list, axis=1),
                            mem_output_input, output.contexts,
                            tf.stack(rule_target_input_placeholder, axis=1),
                            tf.stack(rule_id_input_placeholder,
                                     axis=1), self.global_step, emb_simple,
                            output.encoder_outputs
                        ], [tf.float32, tf.float32, tf.int32],
                        stateful=False,
                        name='update_memory')
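                # Note: update_memory mutates its numpy arguments in place,
                # yet the op is declared stateful=False; downstream code
                # should read the reassigned mem_* tensors rather than rely
                # on those side effects.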

                # Loss and corresponding prior/mask
                decode_word_weight_list = [
                    tf.to_float(
                        tf.not_equal(
                            d,
                            self.data.vocab_simple.encode(
                                constant.SYMBOL_PAD)))
                    for d in output.gt_target_list
                ]
                decode_word_weight = tf.stack(decode_word_weight_list, axis=1)

                gt_target = tf.stack(output.gt_target_list, axis=1)

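                # Self-critical (REINFORCE-style) objective: each sampled
                # token's negative log-probability is scaled by its reward
                # and the pad mask, then normalized by the total token count.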
                def self_critical_loss():
                    # Minimize the reward-weighted negative log-probabilities.
                    rewards = tf.py_func(
                        self.metric.self_crititcal_reward,
                        [
                            sentence_idxs,
                            tf.stack(output.sample_target_list, axis=-1),
                            tf.stack(output.decoder_target_list, axis=-1),
                            tf.stack(sentence_simple_input_placeholder,
                                     axis=-1),
                            tf.stack(sentence_complex_input_placeholder,
                                     axis=-1),
                            tf.ones((1, 1)),
                            # tf.stack(rule_target_input_placeholder, axis=1)
                        ],
                        tf.float32,
                        stateful=False,
                        name='reward')
                    rewards.set_shape((self.model_config.batch_size,
                                       self.model_config.max_simple_sentence))
                    rewards = tf.unstack(rewards, axis=1)

                    weighted_probs_list = [
                        rewards[i] * decode_word_weight_list[i] *
                        -output.sample_logit_list[i]
                        for i in range(len(decode_word_weight_list))
                    ]
                    total_size = tf.reduce_sum(decode_word_weight_list)
                    total_size += 1e-12
                    weighted_probs = tf.reduce_sum(
                        weighted_probs_list) / total_size
                    loss = weighted_probs
                    return loss

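                # Per-step variant: a sampled token is rewarded against the
                # greedy argmax token, and its softmax probability is
                # weighted by that reward and the pad mask.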
                def teacherforce_critical_loss():
                    losses = []
                    for step in range(self.model_config.max_simple_sentence):
                        logit = output.decoder_logit_list[step]
                        greedy_target_unit = tf.stop_gradient(
                            tf.argmax(logit, axis=1))
                        if self.model_config.train_mode == 'teachercriticalv2':
                            sampled_target_unit, reward = tf.py_func(
                                self.metric.self_crititcal_reward_unitv2, [
                                    sentence_idxs,
                                    step,
                                    greedy_target_unit,
                                    tf.stack(sentence_simple_input_placeholder,
                                             axis=-1),
                                    tf.stack(
                                        sentence_complex_input_placeholder,
                                        axis=-1),
                                ], [tf.int32, tf.float32],
                                stateful=False,
                                name='reward')
                            reward.set_shape((self.model_config.batch_size, ))
                            sampled_target_unit.set_shape(
                                (self.model_config.batch_size, ))
                        elif self.model_config.train_mode == 'teachercritical':
                            sampled_target_unit = tf.cast(
                                tf.squeeze(tf.multinomial(logit, 1), axis=1),
                                tf.int32)
                            reward = tf.py_func(
                                self.metric.self_crititcal_reward_unit, [
                                    sentence_idxs,
                                    step,
                                    sampled_target_unit,
                                    greedy_target_unit,
                                    tf.stack(sentence_simple_input_placeholder,
                                             axis=-1),
                                    tf.stack(
                                        sentence_complex_input_placeholder,
                                        axis=-1),
                                    tf.ones((1, 1)),
                                ],
                                tf.float32,
                                stateful=False,
                                name='reward')
                            reward.set_shape((self.model_config.batch_size, ))
                        indices = tf.stack([
                            tf.range(0,
                                     self.model_config.batch_size,
                                     dtype=tf.int32),
                            tf.squeeze(sampled_target_unit)
                        ],
                                           axis=-1)
                        logit_unit = tf.gather_nd(tf.nn.softmax(logit, axis=1),
                                                  indices)
                        decode_word_weight = decode_word_weight_list[step]
                        losses.append(-logit_unit * reward *
                                      decode_word_weight)
                    loss = tf.add_n(losses)
                    return loss

                def teacherforce_loss():
                    # Sampled-softmax support is currently disabled: loss_fn
                    # is selected here but the softmax_loss_function argument
                    # below remains commented out.
                    if self.model_config.number_samples > 0:
                        loss_fn = tf.nn.sampled_softmax_loss
                    else:
                        loss_fn = None
                    loss = sequence_loss(
                        logits=tf.stack(output.decoder_logit_list, axis=1),
                        targets=gt_target,
                        weights=decode_word_weight,
                        # softmax_loss_function=loss_fn,
                        # w=w,
                        # b=b,
                        # decoder_outputs=decoder_outputs,
                        # number_samples=self.model_config.number_samples
                    )
                    return loss

                if self.model_config.train_mode == 'dynamic_self-critical':
                    loss = self_critical_loss()
                    # loss = tf.cond(
                    #     tf.greater(self.global_step, 50000),
                    #     # tf.logical_and(tf.greater(self.global_step, 100000), tf.equal(tf.mod(self.global_step, 2), 0)),
                    #     lambda : self_critical_loss(),
                    #     lambda : teacherforce_loss())
                elif self.model_config.train_mode in ('teachercritical',
                                                      'teachercriticalv2'):
                    loss = tf.cond(tf.equal(tf.mod(self.global_step, 3),
                                            0), lambda: teacherforce_loss(),
                                   lambda: teacherforce_critical_loss())

                    # loss = teacherforce_critical_loss()
                else:
                    loss = teacherforce_loss()

                # if 'ruleattn' in self.model_config.external_loss:
                #     batch_pos = tf.range(
                #         self.model_config.batch_size * self.model_config.max_cand_rules) // self.model_config.max_cand_rules
                #     batch_pos = tf.reshape(
                #         batch_pos, [self.model_config.batch_size, self.model_config.max_cand_rules])
                #     batch_pos = tf.expand_dims(batch_pos, axis=2)
                #     ids = tf.stack(rule_pair_input_placeholder, axis=1)
                #     bias = 1.0 - tf.to_float(
                #         tf.logical_and(tf.equal(ids[:, :, 0], 0), tf.equal(ids[:, :, 1], 0)))
                #     ids = tf.concat([batch_pos, ids], axis=2)
                #     distrs = tf.stack(output.attn_distr_list, axis=1)
                #     ruleattn_loss = -tf.gather_nd(distrs, ids)*bias
                #     loss += ruleattn_loss
                #     self.pairs = tf.stack(rule_pair_input_placeholder, axis=1)

                obj = {
                    'sentence_idxs':
                    sentence_idxs,
                    'sentence_simple_input_placeholder':
                    sentence_simple_input_placeholder,
                    'sentence_complex_input_placeholder':
                    sentence_complex_input_placeholder,
                }
                self.logits = output.decoder_logit_list
                if 'rule' in self.model_config.memory:
                    obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                    obj['rule_target_input_placeholder'] = rule_target_input_placeholder
                    obj['rule_pair_input_placeholder'] = rule_pair_input_placeholder
                    obj['mem_contexts'] = mem_contexts
                    obj['mem_outputs'] = mem_outputs
                    obj['mem_counter'] = mem_counter
                return loss, obj
    for set_random_seed in [
            random.seed, torch.manual_seed, torch.cuda.manual_seed_all
    ]:
        set_random_seed(opts.seed)

    base_model = opts.backbone(pretrained=True)

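    # An L2-normalized head is a common choice for metric learning with
    # cosine or Euclidean retrieval; the plain head leaves features unscaled.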
    if opts.l2norm:
        model = L2NormEmbedding(
            base_model,
            feature_size=base_model.output_size,
            embedding_size=opts.embedding_size,
        ).cuda()
    else:
        model = Embedding(
            base_model,
            feature_size=base_model.output_size,
            embedding_size=opts.embedding_size,
        ).cuda()

    if opts.load is not None:
        model.load_state_dict(torch.load(opts.load))
        print("Loaded Model from %s" % opts.load)

    train_transform, test_transform = build_transform(base_model)

    dataset_train = dataset.FashionInshop(opts.data,
                                          split="train",
                                          transform=train_transform)
    dataset_query = dataset.FashionInshop(opts.data,
                                          split="query",
                                          transform=test_transform)