Example #1
	def PositionalEmb(max_seq, d_model) :
	    half_d = (d_model + 1) // 2

	    # [0, 2, 4, 6 ...] => 2i array
	    arr = np.multiply(2.0, np.arange(0, half_d, dtype=float))
	    
	    # 1 / 10000 ^ (2i / d_model)
	    even = np.power(10000.0, -np.divide(arr, d_model))
	    emb = np.repeat(even, 2)[0:d_model]
	    
	    # value in odd dimension would be converted to cosine
	    phase = [0.0, np.pi/2] * half_d
	    phase = phase[0:d_model]

	    positional_embedding = []
	    for pos in range(max_seq) :
	        # pos / 10000 ^ (2i / d_model) 
	        emb_pos = np.multiply(emb, pos)
	        
	        # odd dim => (pos/...) => (pos/...) + pi / 2
	        emb_pos = np.add(emb_pos, phase)
	        
	        positional_embedding.append(np.sin(emb_pos))
	    
	    # return shape of [1, max_seq, d_model]
	    return tf.expand_dims(tf.constant(positional_embedding), 0)
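The phase trick above uses the identity sin(x + pi/2) = cos(x): adding pi/2 to the odd dimensions turns their sines into cosines, which is how the usual sin/cos interleaving is obtained from a single sin call. A minimal standalone NumPy check of that identity (not part of the example):

import numpy as np

x = np.linspace(0.0, 3.0, 5)
# sin(x + pi/2) equals cos(x), so a phase shift on the odd dimensions is enough
assert np.allclose(np.sin(x + np.pi / 2), np.cos(x))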
Example #2
    def build_model(self):
        """
        Build the model
        """
        print('Building model for the %s mode operation' % (self.mode))
        if self.mode == 'train':
            mean, stddev = self._encoder_network(self.input_image)
            sample = tf.random_normal(
                [self.config.batch_size, self.config.latent_vector_length],
                0,
                1,
                dtype=tf.float32)
            guess_sample = stddev * sample + mean
            self.reconstructed_image = self._decoder_network(guess_sample)
            self.total_loss = self._setup_loss(self.reconstructed_image, mean,
                                               stddev)

            for variable in tf.trainable_variables():
                tf.summary.histogram(variable.name, variable)

            self.merged_summary = tf.summary.merge_all()
        else:
            self.sample_prior = tf.placeholder(
                name="sample_prior",
                dtype=tf.float32,
                shape=[self.latent_vector_length])
            self.sample_prior = tf.expand_dims(self.sample_prior, 0)
            self.reconstructed_image = self._decoder_network(self.sample_prior)
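The training branch above draws the latent code with the reparameterization trick, z = mean + stddev * eps with eps ~ N(0, I), which keeps the sampling step differentiable with respect to mean and stddev. A standalone NumPy illustration of that single step (values chosen arbitrarily, not the model's tensors):

import numpy as np

rng = np.random.default_rng(0)
mean = np.array([0.5, -1.0])
stddev = np.array([0.1, 2.0])
eps = rng.standard_normal(2)   # eps ~ N(0, I)
z = stddev * eps + mean        # same form as guess_sample = stddev * sample + mean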
Example #3
def multihead_attention(queries, keys, embedding_dim=512, num_head=8, dropout_rate=0, is_training=True, future_blind=False):
	Q = tf.layers.dense(queries, embedding_dim, activation=tf.nn.relu)
	K = tf.layers.dense(keys, embedding_dim, activation=tf.nn.relu)
	V = tf.layers.dense(keys, embedding_dim, activation=tf.nn.relu)

	Q_ = tf.concat(tf.split(Q, num_head, axis=2), axis=0)
	K_ = tf.concat(tf.split(K, num_head, axis=2), axis=0)
	V_ = tf.concat(tf.split(V, num_head, axis=2), axis=0)

	output = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
	output = output / tf.sqrt(tf.cast(K_.get_shape().as_list()[-1], tf.float32))

	# key masking (taken from an online implementation; the original author notes they did not fully understand it)
	key_mask = tf.sign(tf.reduce_sum(tf.abs(keys), axis=2))
	key_mask = tf.tile(key_mask, [num_head,1])
	key_mask = tf.tile(tf.expand_dims(key_mask, 1), [1, tf.shape(queries)[1], 1])

	pad = tf.ones_like(output)*(-2**32+1)
	output = tf.where(tf.equal(key_mask, 0), pad, output)

	if future_blind:
		diag_vals = tf.ones_like(output[0, :, :])
		tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
		mask = tf.tile(tf.expand_dims(tril, 0), [tf.shape(output)[0], 1, 1])
		padding = tf.ones_like(mask)*(-2**32+1)
		output = tf.where(tf.equal(mask, 0), padding, output)
	output = tf.nn.softmax(output)

	# query masking
	query_mask = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))
	query_mask = tf.tile(query_mask, [num_head, 1])
	query_mask = tf.tile(tf.expand_dims(query_mask, -1), [1, 1, tf.shape(keys)[1]])
	output = output*query_mask

	output = tf.layers.dropout(output, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
	output = tf.matmul(output, V_)
	output = tf.concat(tf.split(output, num_head, axis=0), axis=2)
	# residual connection
	output = output + queries
	# layer normalization
	output = LN(output)
	return output
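Both masks above work by writing a very large negative number (-2**32 + 1) into the masked positions before the softmax, which drives their attention weights to effectively zero. A standalone NumPy illustration with toy scores (not the example's tensors):

import numpy as np

scores = np.array([1.0, 2.0, 3.0])
key_mask = np.array([1, 1, 0])                          # 0 marks a padded key
masked = np.where(key_mask == 0, -2.0**32 + 1, scores)
probs = np.exp(masked) / np.exp(masked).sum()
print(probs)                                            # last entry is ~0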
Example #4
    def call(self, input, mask=None):
        b_s = input.shape[0]

        logits = tf.transpose(self.logits(input), [0, 2, 1])

        if mask is not None:
            mask = tf.tile(tf.expand_dims(mask, 1), [1, self.n_lockups, 1])
            paddings = tf.fill(mask.shape, -float("inf"))
            logits = tf.where(mask, paddings, logits)

        attention = tf.nn.softmax(logits, axis=-1)

        return tf.matmul(attention, input)
Example #5
def position_embedding(inputs, vocab_size, embedding_dim):
	"""
	Build positional embeddings.
	"""
	batchsize, maxlen = inputs.get_shape().as_list()
	pos_ind = tf.tile(tf.expand_dims(tf.range(maxlen), 0), [batchsize, 1])
	pos_enc = np.array([[pos/np.power(10000, 2.*i/embedding_dim) for i in range(embedding_dim)] for pos in range(maxlen)])
	pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2])
	pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2])
	lookup_table = tf.convert_to_tensor(pos_enc, dtype=tf.float32)
	lookup_table = tf.concat((tf.zeros(shape=[1, embedding_dim]), lookup_table[1:, :]), 0)
	output = tf.nn.embedding_lookup(lookup_table, pos_ind)
	output = output / tf.sqrt(tf.cast(embedding_dim, tf.float32))
	return output
Example #6
		def unit(Feature_input,dropout_keep_prob):
			with tf.variable_scope('FeatureExtractor') as scope:
				with tf.device('/cpu:0'),tf.name_scope("embedding") as scope:
					W_fe = tf.get_variable(
						name = 'W_fe',
						initializer=tf.random_uniform([self.vocab_size+1,self.dis_emb_dim],-1.0,1.0),)

					embedded_chars = tf.nn.embedding_lookup(W_fe, Feature_input + 1)
					embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)

				# Create a convolution + maxpool layer for each filter size
				pooled_outputs = []
				for filter_size,num_filter in zip(self.filter_sizes,self.num_filters):
					with tf.name_scope("conv-maxpool-%s" % filter_size) as scope:
						# Convolution Layer
						filter_shape = [filter_size, self.dis_emb_dim, 1, num_filter]
						W = tf.get_variable(name="W-%s" % filter_size,
											initializer=tf.truncated_normal(filter_shape, stddev=0.1))
						b = tf.get_variable(name='b-%s' % filter_size,
											initializer=tf.constant(0.1, shape=[num_filter]))
						conv = tf.nn.conv2d(
							embedded_chars_expanded,
							W,
							strides=[1, 1, 1, 1],
							padding="VALID",
							name='conv-%s' % filter_size)

						h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu-%s' % filter_size)
						# Maxpooling over the outputs
						pooled = tf.nn.max_pool(
							h,
							ksize=[1, self.sequence_length - filter_size + 1, 1, 1],
							strides=[1, 1, 1, 1],
							padding='VALID',
							name='pool-%s' % filter_size)
						pooled_outputs.append(pooled)
				h_pool = tf.concat(pooled_outputs,3)
				h_pool_flat = tf.reshape(h_pool,[-1,self.num_filters_total])

				# Add highway (assumes a `highway` helper like the one sketched after this example)
				with tf.name_scope('highway'):
					h_highway = highway(h_pool_flat, h_pool_flat.get_shape()[1], 1, 0)

				# Add dropout
				with tf.name_scope('dropout'):
					h_drop = tf.nn.dropout(h_highway, dropout_keep_prob)
			return h_drop

		return unit
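The `# Add highway` block in this example assumes a `highway` helper that is not shown. Below is a hedged sketch of a typical TF1-style highway layer matching the call `highway(h_pool_flat, h_pool_flat.get_shape()[1], 1, 0)` used above, i.e. y = t * g(Wx + b) + (1 - t) * x; the original project may define it differently:

import tensorflow as tf

def highway(input_, size, num_layers=1, bias=-2.0, scope='Highway'):
    # t is the transform gate, g the nonlinear transform; (1 - t) carries the input through unchanged
    with tf.variable_scope(scope):
        output = input_
        for idx in range(num_layers):
            g = tf.nn.relu(tf.layers.dense(output, size, name='highway_lin_%d' % idx))
            t = tf.sigmoid(tf.layers.dense(output, size, name='highway_gate_%d' % idx) + bias)
            output = t * g + (1. - t) * output
        return output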
Example #7
def capsule_discriminator(x_image):
    """ capsule network as discriminator
    """
    x = tf.reshape(x_image, [-1, 28, 28, 1])
    conv1 = tf.layers.conv2d(
        inputs=x,
        filters=256,
        kernel_size=[9, 9],
        padding="valid",
        activation=tf.nn.relu,
        name="d_ReLU_Conv1")
    conv1 = tf.expand_dims(conv1, axis=-2)
    # Convolutional capsules
    primary_caps = capsule.conv2d(conv1, 32, 8, [9, 9], strides=(2, 2), name="d_PrimaryCaps")
    primary_caps = tf.reshape(primary_caps, [-1, primary_caps.shape[1].value * primary_caps.shape[2].value * 32, 8])
    # Fully Connected capsules with routing by agreement. Binary classifier.
    digit_caps = capsule.dense(primary_caps, 1, 16, iter_routing=3, learn_coupling=False, mapfn_parallel_iterations=16, name="d_DigitCaps")
    # The length of the capsule activation vectors.
    length = tf.sqrt(tf.reduce_sum(tf.square(digit_caps), axis=1), name="Length")
    return length
Example #8
    def _mix(self, generate_scores, copy_scores):
        # TODO is this correct? should verify the following code.
        """
        B is batch_size, V is vocab_size, L is length of every input_id
        print generate_scores.shape     --> (B, V)
        print copy_scores.shape         --> (B, L)
        print self._helper.inputs_ids   --> (B, L)
        """
        # mask is (B, L, V)
        mask = tf.one_hot(self._helper.encoder_inputs_ids,
                          self.config.vocab_size)

        # choice one, move generate_scores to copy_scores
        expanded_generate_scores = tf.expand_dims(generate_scores, 1)  # (B,1,V)
        actual_copy_scores = copy_scores + tf.reduce_sum(
            mask * expanded_generate_scores, 2)
        actual_generate_scores = generate_scores - tf.reduce_sum(
            mask * expanded_generate_scores, 1)

        # choice two, move copy_scores to generate_scores
        '''
        expanded_copy_scores = tf.expand_dims(copy_scores, 2)
        actual_generate_scores = generate_scores + tf.reduce_sum(
                                    mask * expanded_copy_scores, 1)
        actual_copy_scores = copy_scores - tf.reduce_sum(
                                    mask * expanded_copy_scores, 2)
        '''

        mix_scores = tf.concat([actual_generate_scores, actual_copy_scores], 1)
        mix_scores = tf.nn.softmax(mix_scores, -1)  # mix_scores is (B, V+L)

        # make sure mix_scores.shape is (B, V + encoder_max_seq_len)
        padding_size = self.config.encoder_max_seq_len - tf.shape(
            copy_scores)[1]
        mix_scores = tf.pad(mix_scores, [[0, 0], [0, padding_size]])

        return mix_scores
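To see what the one-hot mask in `_mix` does: multiplying the (B, L, V) one-hot mask by the expanded (B, 1, V) generate scores and summing over the vocabulary axis gathers, at every encoder position, the generate score of that position's token id. A standalone NumPy check with toy sizes (not the model's tensors):

import numpy as np

V = 5
ids = np.array([[2, 4, 2]])                   # (B=1, L=3) encoder input ids
gen = np.array([[0.1, 0.2, 0.3, 0.4, 0.5]])   # (B=1, V) generate scores
mask = np.eye(V)[ids]                         # (B, L, V) one-hot of the input ids
moved = (mask * gen[:, None, :]).sum(axis=2)  # (B, L): generate score gathered at each id
print(moved)                                  # [[0.3 0.5 0.3]]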
Example #9
    def _build_model(self):
        filters = [128, 512]
        filter_size = [5, 3]
        filter_strides = [1, 1]
        pool1_size = [2, 4]
        pool2_size = [1, 2]
        p = 5
        with tf.variable_scope('cnn'):
            with tf.variable_scope('unit-1'):
                x = self._conv2d(self.inputs, 'cnn-1', filter_size,
                                 FLAGS.image_channel, filters[0],
                                 filter_strides)
                #x = self._batch_norm('bn1', x)
                x = tf.layers.batch_normalization(x,
                                                  axis=-1,
                                                  momentum=0.9,
                                                  training=self.train,
                                                  name='bn1')
                x = self._leaky_relu(x, 0.01)
                x = tf.layers.dropout(
                    x, FLAGS.dropout_conv if self.mode == 'train' else 0.0)
                x = self._max_pool(x, pool1_size, pool1_size)

            with tf.variable_scope('unit-2'):
                x = self._conv2d(x, 'cnn-2', filter_size, filters[0],
                                 filters[1], filter_strides)
                #x = self._batch_norm('bn2', x)
                x = tf.layers.batch_normalization(x,
                                                  axis=-1,
                                                  momentum=0.9,
                                                  training=self.train,
                                                  name='bn2')
                x = self._leaky_relu(x, 0.01)
                x = tf.layers.dropout(
                    x, FLAGS.dropout_conv if self.mode == 'train' else 0.0)
                x = self._max_pool(x, pool2_size, pool2_size)

        with tf.variable_scope('linear'):
            # linear layer for dim reduction
            #x = tf.reshape(x, [-1, p*filters[1]])
            #x = self._linear(x, 'linear1', [p*filters[1], FLAGS.linear_num])
            times, feat, filters = x.shape.as_list()[1:]
            x = tf.reshape(x, [-1, times, feat * filters])
            x = tf.layers.dense(x, FLAGS.linear_num, name='linear1')
            x = tf.layers.dropout(
                x, FLAGS.dropout_linear if self.mode == 'train' else 0.0)

        with tf.variable_scope('lstm'):
            #x = tf.reshape(x,[-1, FLAGS.seq_len, FLAGS.linear_num])
            #x.eval().shape

            cell_fw = tf.contrib.rnn.BasicLSTMCell(FLAGS.cell_num,
                                                   forget_bias=1.0)
            cell_fw = tf.contrib.rnn.DropoutWrapper(
                cell=cell_fw,
                output_keep_prob=1 -
                FLAGS.dropout_lstm if self.mode == 'train' else 1.0)

            cell_bw = tf.contrib.rnn.BasicLSTMCell(FLAGS.cell_num,
                                                   forget_bias=1.0)
            cell_bw = tf.contrib.rnn.DropoutWrapper(
                cell=cell_bw,
                output_keep_prob=1 -
                FLAGS.dropout_lstm if self.mode == 'train' else 1.0)

            # Now we feed `linear` into the LSTM BRNN cell and obtain the LSTM BRNN output.
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=x,
                dtype=tf.float32,
                time_major=False,
                scope='LSTM1')
        with tf.variable_scope('time_pooling'):
            if self.attention:
                outputs, alphas = self._attention(outputs,
                                                  FLAGS.attention_size,
                                                  return_alphas=True)
            else:
                outputs = tf.concat(outputs, 2)
                outputs = tf.expand_dims(outputs, axis=-1)
                seq_len = tf.shape(outputs)[1]
                outputs = self._max_pool(outputs,
                                         ksize=[seq_len, 1],
                                         strides=[seq_len, 1])
                outputs = tf.reshape(outputs, [-1, 2 * FLAGS.cell_num])

        with tf.variable_scope('dense'):
            y = self._linear(outputs, 'dense-matmul',
                             [2 * FLAGS.cell_num, FLAGS.hidden1])
            #y = self._batch_norm_wrapper('dense-bn', y)
            y = tf.layers.batch_normalization(y,
                                              axis=-1,
                                              momentum=0.99,
                                              training=self.train,
                                              name='dense-bn')
            y = self._leaky_relu(y, 0.01)
            y = tf.layers.dropout(
                y, FLAGS.dropout_fully1 if self.mode == 'train' else 0.0)

        self.logits = self._linear(y, 'softmax',
                                   [FLAGS.hidden1, FLAGS.num_class])
        self.softmax = tf.nn.softmax(self.logits)
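When `self.attention` is false, the time-pooling block above max-pools the concatenated BiLSTM outputs over the entire time axis. A minimal standalone NumPy equivalent with made-up shapes (batch of 2, 7 time steps, 2 * cell_num = 256):

import numpy as np

outputs = np.random.rand(2, 7, 256)   # [batch, time, 2 * cell_num]
pooled = outputs.max(axis=1)          # max over the time axis -> [batch, 2 * cell_num]
print(pooled.shape)                   # (2, 256)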
Example #10
class NERModel(BaseModel):
    """Specialized class of Model for NER"""

    def __init__(self, config):
        super(NERModel, self).__init__(config)
        self.idx_to_tag = {idx: tag for tag, idx in
                           self.config.vocab_tags.items()}


    def add_placeholders(self):
        """Define placeholders = entries to computational graph"""
        # shape = (batch size, max length of sentence in batch)
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None],
                        name="word_ids")

        # shape = (batch size)
        self.sequence_lengths = tf.placeholder(tf.int32, shape=[None],
                        name="sequence_lengths")

        # shape = (batch size, max length of sentence, max length of word)
        self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None],
                        name="char_ids")

        # shape = (batch_size, max_length of sentence)
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None],
                        name="word_lengths")
        # to be used for seq2seq decoder
        if(self.config.use_seq2seq):
            self.decoder_targets = tf.placeholder(tf.int32, shape = [None, None], name="decoder_targets")
        # shape = (batch size, max length of sentence in batch)
        self.labels = tf.placeholder(tf.int32, shape=[None, None],
                        name="labels")

        # hyper parameters
        self.dropout = tf.placeholder(dtype=tf.float32, shape=[],
                        name="dropout")
        self.lr = tf.placeholder(dtype=tf.float32, shape=[],
                        name="lr")

        #self.pad_token = '<PAD>'
        #self.eos_token = '<END>'
        #self.PAD = self.config.vocab_words[self.pad_token]
        #self.EOS = self.config.vocab_words[self.eos_token]

    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
        """Given some data, pad it and build a feed dictionary

        Args:
            words: list of sentences. A sentence is a list of ids of a list of
                words. A word is a list of ids
            labels: list of ids
            lr: (float) learning rate
            dropout: (float) keep prob

        Returns:
            dict {placeholder: value}

        """
        # perform padding of the given data:
        if self.config.use_chars:
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = pad_sequences(word_ids, 0)
            char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
                nlevels=2)
        else:
            word_ids, sequence_lengths = pad_sequences(words, 0)

        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if self.config.use_chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths

        if labels is not None:
            labels, _ = pad_sequences(labels, self.config.vocab_tags['O'])
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout
        if (self.config.use_seq2seq):
            feed[self.decoder_targets] = word_ids
        return feed, sequence_lengths

    
    
    def add_word_embeddings_op(self):
        """Defines self.word_embeddings

        If self.config.embeddings is not None and is a np array initialized
        with pre-trained word vectors, the word embeddings is just a look-up
        and we don't train the vectors. Otherwise, a random matrix with
        the correct shape is initialized.
        """
        with tf.variable_scope("words"):
            if self.config.embeddings is None:
                self.logger.info("WARNING: randomly initializing word vectors")
                self._word_embeddings = tf.get_variable(
                        name="_word_embeddings",
                        dtype=tf.float32,
                        shape=[self.config.nwords, self.config.dim_word])
            else:
                self._word_embeddings = tf.Variable(
                        self.config.embeddings,
                        name="_word_embeddings",
                        dtype=tf.float32,
                        trainable=self.config.train_embeddings)

            word_embeddings = tf.nn.embedding_lookup(self._word_embeddings,
                    self.word_ids, name="word_embeddings")

        with tf.variable_scope("chars"):
            if self.config.use_chars:
                # get char embeddings matrix
                _char_embeddings = tf.get_variable(
                        name="_char_embeddings",
                        dtype=tf.float32,
                        shape=[self.config.nchars, self.config.dim_char])
                char_embeddings = tf.nn.embedding_lookup(_char_embeddings,
                        self.char_ids, name="char_embeddings")

                # put the time dimension on axis=1
                s = tf.shape(char_embeddings)
                char_embeddings = tf.reshape(char_embeddings,
                        shape=[s[0]*s[1], s[-2], self.config.dim_char])
                word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]])

                # bi lstm on chars
                cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                        state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                        state_is_tuple=True)
                _output = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw, cell_bw, char_embeddings,
                        sequence_length=word_lengths, dtype=tf.float32)

                # read and concat output
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis=-1)

                # shape = (batch size, max sentence length, char hidden size)
                output = tf.reshape(output,
                        shape=[s[0], s[1], 2*self.config.hidden_size_char])
                word_embeddings = tf.concat([word_embeddings, output], axis=-1)

        self.word_embeddings =  tf.nn.dropout(word_embeddings, self.dropout)

    
    def word_drop_pre_bridge(self, input_word_seq_tensor, sequence_lengths, max_seq_length=None):
        #NOTE: There are no variables in this function, so it shouldn't matter for the tf graph construction (hopefully)
        """This function is only used during the ABSA task. It takes the input word sequence tensor and the
        sequence lengths, and outputs a modified batch where, for each sentence, one row holds the normal
        sentence and n corresponding rows each have one word missing. The same applies to the sequence lengths.

        In essence:
            For word_ids: Input 2d (n_batch, max_sequence_length)
                --> Intermediate 3d (n_batch, max_sequence_length, max_sequence_length)
                --> Output 2d (n_batch*max_sequence_length, max_sequence_length)
                --> Output 2d time major (max_sequence_length, n_batch*max_sequence_length)
            For seq_lens: Input 1d (n_batch,)
                --> Intermediate 2d (n_batch, max_sequence_length)
                --> Output 1d (n_batch*max_sequence_length)
        """
        if self.config.use_seq2seq:
            if max_seq_length is None:
                max_seq_length = input_word_seq_tensor.shape[-1]
            # 1) Create a mask to select word indices (one word dropped every time)
            np_mask_matrix = np.ones((max_seq_length, max_seq_length))
            a = np.array(range(max_seq_length))
            np_mask_matrix[np.arange(len(a)), a] = 0  # for each row, set that column to 0 (the opposite of a diagonal matrix)
            tf_mask_matrix = tf.convert_to_tensor(np_mask_matrix, dtype="bool")
            padding = tf.constant([[0, 0], [0, 1]], dtype="int32")

            # 2) Add a leading dimension to both input tensors
            resultant_tensor = tf.expand_dims(input_word_seq_tensor, 0)
            tensor_seq_lengths = tf.expand_dims(sequence_lengths, 0)

            # 3) Build the sequence lengths for the dropped-word rows (they are always 1 less)
            seq_lengths_for_dropped = tf.expand_dims(sequence_lengths - tf.ones(shape=sequence_lengths.shape[0], dtype="int32"), 0)

            # 4) Apply the mask matrix in a loop to obtain the dropped word ids; each result is appended as a row of resultant_tensor
            for drop_index in range(max_seq_length):
                f = lambda word_seq: tf.boolean_mask(word_seq, tf_mask_matrix[:, drop_index])
                resultant_tensor = tf.concat([resultant_tensor, tf.expand_dims(tf.pad(tf.map_fn(f, input_word_seq_tensor), padding, "CONSTANT"), 0)], 0)
                tensor_seq_lengths = tf.concat([tensor_seq_lengths, seq_lengths_for_dropped], 0)

            # 5) Reshape the tensors.
            # The tensor is shaped such that the first n rows correspond to the first sentence (n is the sequence length)
            resultant_tensor = tf.reshape(resultant_tensor, [resultant_tensor.shape[0] * resultant_tensor.shape[1], resultant_tensor.shape[2]])
            tensor_seq_lengths = tf.reshape(tensor_seq_lengths, [tensor_seq_lengths.shape[0] * tensor_seq_lengths.shape[1]])
            #NOTE: Converting the tensor from batch*time -> time*batch BECAUSE OUR SPECIFIC ENCODER expects it in that manner
            resultant_tensor = tf.transpose(resultant_tensor, [1, 0])
            return resultant_tensor, tensor_seq_lengths

    def bridge_seq2seq_embeddings(self):
        # Converts the seq2seq encoder outputs of shape 2d Time*Batch (comparison of the missing word with all
        # words) into 3d shape (Batch*Time*Embeds), and then concatenates them with word_embeddings as Batch*Time*Embeds.
        seq2seq_encoder_out = tf.reshape(self.encoder_concat_rep, [self.word_ids.shape[-1] + 1, self.word_ids.shape[0], self.config.seq2seq_enc_hidden_size * 4])  # Batch_size*Dims output
        self.seq2seq_encoder_embeds = tf.transpose(seq2seq_encoder_out, perm=[1, 0, 2])  #NOTE: have to replace with a generic comparison function - subtract, KL, MMD
        #assert self.seq2seq_encoder_embeds.shape[0] == self.word_embeddings.shape[0]
        self.word_embeddings = tf.concat([self.word_embeddings, self.seq2seq_encoder_embeds], axis=-1)

    def convert_tensors(self):
        #NOTE: Word ids during training of the seq2seq are in time*batch format, whereas in the ABSA task they are fed normally as batch*time
        if self.config.train_seq2seq:
            self.seq2seq_input_sequences, self.seq2seq_input_sequence_lengths = self.word_ids, self.sequence_lengths
        else:
            self.seq2seq_input_sequences, self.seq2seq_input_sequence_lengths = self.word_drop_pre_bridge(self.word_ids, self.sequence_lengths)

    def add_seq2seq(self):
        """This stores the seq2seq model, which is imported as part of the training graph, since other options of creating a separate training graph/session and importing it seemed lengthy.

        1) It is first trained for autoencoding separately.
        2) Once that training is complete, the user has to update the config in model_config.py.
        3) For usage in ABSA, the variable tf_encoded_concat_rep is used. It is made non-trainable by blocking the gradient flow.
        """
        #NOTE: 1) There might be a more efficient manner to load and train the seq2seq separately, and then just use the final weights.
        #      2) Blocking gradients should not impact elements linked to this
        if(self.config.use_seq2seq):
            with tf.variable_scope('seq2seq_encoder'):
                self.seq2seq_input_sequences_embeds = tf.nn.embedding_lookup(self._word_embeddings, self.seq2seq_input_sequences, name="word_embeddings")

                encoder_cell = LSTMCell(self.config.seq2seq_enc_hidden_size)
                ((encoder_fw_outputs, encoder_bw_outputs), (encoder_fw_final_state, encoder_bw_final_state)) = (tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell, cell_bw=encoder_cell, inputs = self.seq2seq_input_sequences_embeds, sequence_length = self.seq2seq_input_sequence_lengths, dtype = tf.float32, time_major=True))
               #encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs),2)
                encoder_final_state_c = tf.concat((encoder_fw_final_state.c, encoder_bw_final_state.c),1)
                encoder_final_state_h = tf.concat((encoder_fw_final_state.h, encoder_bw_final_state.h),1)

                self.encoder_final_state = LSTMStateTuple(c= encoder_final_state_c, h=encoder_final_state_h)

                self.encoder_concat_rep = tf.concat([encoder_final_state_c, encoder_final_state_h], 1)
                #NOTE: Very important to stop gradient flow once trained
                if(self.config.seq2seq_trained):
                    self.encoder_concat_rep = tf.stop_gradient(self.encoder_concat_rep) 
    
        
            with tf.variable_scope('seq2seq_decoder'):
                encoder_max_time, batch_size = tf.unstack(tf.shape(self.seq2seq_input_sequences))
                #self.encoder_max = encoder_max_time
                #self.batch_max = batch_size
                decoder_cell = LSTMCell(self.config.seq2seq_dec_hidden_size)
                decoder_lengths = self.sequence_lengths + 3  # 3 additional steps (EOS + padding)
                W_dec = tf.Variable(tf.random_uniform([self.config.seq2seq_dec_hidden_size, self.config.nwords],-1,1), dtype = tf.float32)
                b_dec = tf.Variable(tf.zeros([self.config.nwords]), dtype = tf.float32)
   
                eos_time_slice = self.config.EOS*tf.ones([batch_size], dtype=tf.int32, name = "EOS")
                pad_time_slice = self.config.PAD*tf.ones([batch_size], dtype = tf.int32, name="PAD")

                eos_step_embedded = tf.nn.embedding_lookup(self._word_embeddings, eos_time_slice)
                pad_step_embedded = tf.nn.embedding_lookup(self._word_embeddings, pad_time_slice)
            
            def loop_fn_initial():
                initial_elements_finished = (0 >= decoder_lengths)  # all False at the initial step
                initial_input = eos_step_embedded
                initial_cell_state = self.encoder_final_state
                initial_cell_output = None
                initial_loop_state = None  # we don't need to pass any additional information
                return (initial_elements_finished,initial_input,initial_cell_state,initial_cell_output,initial_loop_state)

            def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
                def get_next_input():
                    output_logits = tf.add(tf.matmul(previous_output, W_dec), b_dec)
                    prediction = tf.argmax(output_logits, axis=1)
                    next_input = tf.nn.embedding_lookup(self._word_embeddings, prediction)
                    return next_input

                elements_finished = (time >= decoder_lengths)
                
                finished = tf.reduce_all(elements_finished) # -> boolean scalar
                input = tf.cond(finished, lambda: pad_step_embedded, get_next_input)
                state = previous_state
                output = previous_output
                loop_state = None
                
                return (elements_finished, input, state, output, loop_state)

            def loop_fn(time, previous_output, previous_state, previous_loop_state):
                if previous_state is None:    # time == 0
                    assert previous_output is None and previous_state is None
                    return loop_fn_initial()
                else:
                    return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)
            with tf.variable_scope("seq2seq_decoding"):
                decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
                decoder_outputs = decoder_outputs_ta.stack()
            
                decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
                decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
                decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, W_dec), b_dec)
                self.decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, self.config.nwords))
                self.decoder_prediction = tf.argmax(self.decoder_logits, 2)
      
    #If training of seq2seq is to be done, then we need to add loss and cost function to the graph
       # if(self.config.train_seq2seq and self.config.use_seq2seq):
        #    with tf.variable_scope('seq2seq_training'):
         #       stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels = tf.one_hot(decoder_targets, depth = self.config.nwords, dtype = tf.float32), logits = self.decoder_logits,)
          #      self.seq2seq_loss = tf.reduce_mean(stepwise_cross_entropy)
            
    def add_logits_op(self):
        """Defines self.logits

        For each word in each sentence of the batch, it corresponds to a vector
        of scores, of dimension equal to the number of tags.
        """
        with tf.variable_scope("bi-lstm"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, self.word_embeddings,
                    sequence_length=self.sequence_lengths, dtype=tf.float32)
            output = tf.concat([output_fw, output_bw], axis=-1)
            output = tf.nn.dropout(output, self.dropout)

        with tf.variable_scope("proj"):
            W = tf.get_variable("W", dtype=tf.float32,
                    shape=[2*self.config.hidden_size_lstm, self.config.ntags])

            b = tf.get_variable("b", shape=[self.config.ntags],
                    dtype=tf.float32, initializer=tf.zeros_initializer())

            nsteps = tf.shape(output)[1]
            output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm])
            pred = tf.matmul(output, W) + b
            self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags])


    def add_pred_op(self):
        """Defines self.labels_pred

        This op is defined only in the case where we don't use a CRF since in
        that case we can make the prediction "in the graph" (thanks to tf
        functions in other words). With the CRF, as the inference is coded
        in python and not in pure tensorflow, we have to make the prediction
        outside the graph.
        """
        if not self.config.use_crf:
            self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1),
                    tf.int32)


    def add_loss_op(self):
        """Defines the loss"""
        if self.config.use_crf:
        #if self.config.use_crf and not self.config.train_seq2seq:
            
            log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
                    self.logits, self.labels, self.sequence_lengths)
            self.trans_params = trans_params # need to evaluate it for decoding
            self.loss = tf.reduce_mean(-log_likelihood)
        else: #Use when no crf and se
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.labels)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        if(self.config.use_seq2seq):
            
            self.stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels = tf.one_hot(self.decoder_targets, depth = self.config.nwords, dtype = tf.float32), logits = self.decoder_logits,)
            self.seq2seq_loss = tf.reduce_mean(self.stepwise_cross_entropy)
        
        # for tensorboard
        if(self.config.train_seq2seq):
            tf.summary.scalar("seq2seq_loss",self.seq2seq_loss)
        else:
            tf.summary.scalar("loss", self.loss)
        

    def build(self):
        # NER specific functions
        self.add_placeholders()
        self.add_word_embeddings_op()
        if(not self.config.use_seq2seq):
            self.add_logits_op()
            self.add_pred_op()
            self.add_loss_op()
        else:
            self.convert_tensors()
            self.add_seq2seq()
            self.bridge_seq2seq_embeddings()
            self.add_logits_op()
            self.add_pred_op()
            self.add_loss_op()
        
        if(self.config.use_seq2seq):#This is also a node in the graph and hence needs to be stored
        # Generic functions that add training op and initialize session
            self.add_train_op(self.config.lr_method, self.lr, self.seq2seq_loss,
                self.config.clip, True)
        
        self.add_train_op(self.config.lr_method, self.lr, self.loss,
                self.config.clip)
        self.initialize_session() # now self.sess is defined and vars are init


    def predict_batch(self, words):
        """
        Args:
            words: list of sentences

        Returns:
            labels_pred: list of labels for each sentence
            sequence_length

        """
        fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0)

        if self.config.use_crf:
            # get tag scores and transition params of CRF
            viterbi_sequences = []
            logits, trans_params = self.sess.run(
                    [self.logits, self.trans_params], feed_dict=fd)

            # iterate over the sentences because there is no batching in viterbi_decode
            for logit, sequence_length in zip(logits, sequence_lengths):
                logit = logit[:sequence_length] # keep only the valid steps
                viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
                        logit, trans_params)
                viterbi_sequences += [viterbi_seq]

            return viterbi_sequences, sequence_lengths

        else:
            labels_pred = self.sess.run(self.labels_pred, feed_dict=fd)

            return labels_pred, sequence_lengths


    def run_epoch_seq2seq(self, train, dev, epoch):
        """for seq2seq training"""
        """train is a list of sequences"""
        """dev is also a list of sequences"""
        
        batch_size = self.config.seq2seq_batch_size
        nbatches = (len(train) + batch_size - 1) // batch_size
        prog = Progbar(target=nbatches)
        #train_op = tf.train.AdamOptimizer(learning_rate= self.config.lr).minimize(self.seq2seq_loss)
        #train_batch_generator = self.gen_batch_seq2seq(train,batch_size)
        for i, (words, labels) in enumerate(minibatches(train, batch_size)):
            #print("TR",len(words),len(words[0]), len(labels), len(labels[0]))
            df = self.next_feed(words, lr=self.config.lr, dropout = self.config.dropout_seq2seq)
          
          #  cross_entropy, decoder_logits, encoder_useful_state = self.sess.run([self.stepwise_cross_entropy, self.decoder_logits,self.encoder_concat_rep], feed_dict =df)
            _, train_loss, summary = self.sess.run([self.seq2seq_train_op, self.seq2seq_loss, self.merged], feed_dict = df)
            
            #print("ENC_TIME",enc_time)
            #print("BATCH_SIZE",b_size)
            #print(cross_entropy)
            #print(decoder_logits[0])
            #print(encoder_useful_state[0])
            prog.update(i + 1, [("train loss", train_loss)])
            #if(i%70)
            #    print("ACTUAL,PREDICTED",words[0],predictions[0])
        for words, labels in minibatches(dev, batch_size):
            dev_batch = self.feed_enc(words)
            te_loss = 5
            predictions, encoder_useful_state = self.sess.run([self.decoder_prediction, self.encoder_concat_rep], dev_batch)
            #print("TE",len(words),len(words[0]),len(predictions), len(predictions[0]))
        words = np.transpose(np.array(words))
        predictions = np.transpose(np.array(predictions))
        print("ACTUAL,PREDICTED", words[2], predictions[2])
        print("Encoder state 0: {}".format(encoder_useful_state[0][0]))
        msg = "Autoencoding testing loss: {:.2f}".format(te_loss)
        self.logger.info(msg)
        return te_loss

    def run_epoch(self, train, dev, epoch):
        """Performs one complete pass over the train set and evaluate on dev

        Args:
            train: dataset that yields tuple of sentences, tags
            dev: dataset
            epoch: (int) index of the current epoch

        Returns:
            f1: (python float), score to select model on, higher is better

        """
        # progbar stuff for logging
        batch_size = self.config.batch_size
        nbatches = (len(train) + batch_size - 1) // batch_size
        prog = Progbar(target=nbatches)

        # iterate over dataset
        for i, (words, labels) in enumerate(minibatches(train, batch_size)):
            fd, _ = self.get_feed_dict(words, labels, self.config.lr,
                    self.config.dropout)
            _, train_loss, summary = self.sess.run([self.train_op, self.loss, self.merged], feed_dict=fd)
            #enc_rep, _, train_loss, summary = self.sess.run(
           #         [self.encoder_concat_rep,self.train_op, self.loss, self.merged], feed_dict=fd)

            prog.update(i + 1, [("train loss", train_loss)])

            # tensorboard
            if i % 10 == 0:
                self.file_writer.add_summary(summary, epoch*nbatches + i)
	
        for words, labels in minibatches(dev, self.config.batch_size):
            dev_batch = self.feed_enc(words)
            te_loss = 5
            encoder_useful_state = self.sess.run([self.encoder_concat_rep], dev_batch)

        print("Encoder state 0: {}".format(encoder_useful_state[0][0]))
        print(len(encoder_useful_state[0][0]))

        metrics = self.run_evaluate(dev)
        msg = " - ".join(["{} {:04.2f}".format(k, v)
                for k, v in metrics.items()])
        self.logger.info(msg)

        return metrics["f1"]

    def calculate_f1(self, tp,fp,tn,fn):
        if(tp+fn==0):
            recall = 0
        else:
            recall = float(tp)/(tp+fn)
        if(tp+fp ==0):
            precision = 0
        else:
            precision = float(tp)/(tp+fp)
        if(precision+recall==0):
            f1 = 0
        else:
            f1 = 2*(precision*recall)/(precision+recall)
        return f1, recall, precision

    def run_evaluate(self, test):
        """Evaluates performance on test set

        Args:
            test: dataset that yields tuple of (sentences, tags)

        Returns:
            metrics: (dict) metrics["acc"] = 98.4, ...

        """
        asp_tp = 0.
        asp_fp = 0.
        asp_tn = 0.
        asp_fn = 0.
    
        op_tp = 0.
        op_fp = 0.
        op_tn = 0.
        op_fn = 0.
    
        ot_tp = 0.
        ot_fp = 0.
        ot_tn = 0.
        ot_fn = 0.
        
        tag2id = self.config.vocab_tags 
        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for words, labels in minibatches(test, self.config.batch_size):
            labels_pred, sequence_lengths = self.predict_batch(words)

            for lab, lab_pred, length in zip(labels, labels_pred,
                                             sequence_lengths):
                lab      = lab[:length]
                lab_pred = lab_pred[:length]
                
                
                for actual,pred in zip(lab, lab_pred):
                    actual = actual
                    pred = pred
                    #print(type(tag2id['B-A']), type(actual), type(pred))
                    #print(actual, actual ==4)
                    #print(pred, pred ==4)
                    if(actual == tag2id['B-A'] or actual == tag2id['I-A']): #BA or IA-> Replace by tag2id later --> 0 and 2 for i-a and B-a; 1 and 3; 4
                        if(pred == tag2id['B-A'] or pred == tag2id['I-A']):
                            asp_tp +=1
                            op_tn +=1
                            ot_tn +=1
                        else:
                            if(pred==tag2id['B-O'] or pred==tag2id['I-O']): 
                                asp_fn+=1
                                op_fp+=1
                                ot_tn+=1
                            elif(pred==tag2id['O']):
                                asp_fn+=1
                                ot_fp+=1
                                op_tn+=1
                            else:
                                print("Something's wrong in prediction")
                            
                    elif(actual==tag2id['B-O'] or actual==tag2id['I-O']): #BO or IO
                        if(pred==tag2id['B-O'] or pred==tag2id['I-O']):
                            op_tp +=1
                            asp_tn +=1
                            ot_tn +=1
                        else:
                            if(pred == tag2id['B-A'] or pred==tag2id['I-A']): 
                                op_fn+=1
                                asp_fp+=1
                                ot_tn+=1
                            elif(pred==tag2id['O']):
                                op_fn+=1
                                ot_fp+=1
                                asp_tn+=1
                            else:
                                print("Something's wrong in prediction")
                                
                                
                    elif(actual == tag2id['O']):
                        if(pred==tag2id['O']):
                            ot_tp +=1
                            asp_tn +=1
                            op_tn +=1
                        else:
                            if(pred == tag2id['B-A'] or pred==tag2id['I-A']): 
                                ot_fn+=1
                                asp_fp+=1
                                op_tn+=1
                            elif(pred==tag2id['B-O'] or pred==tag2id['I-O']):
                                ot_fn+=1
                                op_fp+=1
                                asp_tn+=1
                            else:
                                print("Something's wrong in prediction")
                    else:
                        print("Something's wrong")
                   
                                
                                
                                
                
                accs    += [a==b for (a, b) in zip(lab, lab_pred)]

                lab_chunks      = set(get_chunks(lab, self.config.vocab_tags))
                lab_pred_chunks = set(get_chunks(lab_pred,
                                                 self.config.vocab_tags))

                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds   += len(lab_pred_chunks)
                total_correct += len(lab_chunks)
        
        assert(asp_tp+asp_fp+asp_tn+asp_fn == op_tp+op_fp+op_tn+op_fn == ot_tp+ot_fp+ot_tn+ot_fn)
        #print(tag2id)
        #print(1)
        #print(asp_tp, asp_fp, asp_tn, asp_fn)
        asp_scores = self.calculate_f1(asp_tp,asp_fp,asp_tn,asp_fn)
        #print(2)
        op_scores = self.calculate_f1(op_tp,op_fp,op_tn,op_fn)
        #print(3)
        ot_scores = self.calculate_f1(ot_tp,ot_fp,ot_tn,ot_fn)
        
                
                
                
        p   = correct_preds / total_preds if correct_preds > 0 else 0
        r   = correct_preds / total_correct if correct_preds > 0 else 0
        f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)

        return {"acc": 100*acc, "f1": 100*f1, "asp_f1":100*asp_scores[0], "op_f1":100*op_scores[0], "ot_f1":100*ot_scores[0]}


    def predict(self, words_raw):
        """Returns list of tags

        Args:
            words_raw: list of words (string), just one sentence (no batch)

        Returns:
            preds: list of tags (string), one for each word in the sentence

        """
        words = [self.config.processing_word(w) for w in words_raw]
        if type(words[0]) == tuple:
            words = zip(*words)
        pred_ids, _ = self.predict_batch([words])
        preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]

        return preds


    def gen_batch_seq2seq(self, idd_data, batch_size):
        #np.random.shuffle(idd_data)
        #batch_size = self.config.seq2seq_batch_size
        print("Batch size", batch_size)
        rem = len(idd_data)%(self.config.seq2seq_batch_size)
        num_batches = len(idd_data)//self.config.seq2seq_batch_size
        if(rem>0):
            num_batches = num_batches+1
        print(idd_data)
        for i in range(num_batches):
            if(i==num_batches -1 and (not rem==0)):
                yield(idd_data[i*batch_size:])
            else:
                yield(idd_data[i*batch_size:(i+1)*batch_size])
    
                             
    def batch_modify(self, inputs, max_sequence_length=None):
        """
    Args:
        inputs:
            list of sentences (integer lists)
        max_sequence_length:
            integer specifying how large should `max_time` dimension be.
            If None, maximum sequence length would be used
    
    Outputs:
        inputs_time_major:
            input sentences transformed into time-major matrix 
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers specifying amount of active 
            time steps in each input sequence
        """
    
        sequence_lengths = [len(seq) for seq in inputs]
        batch_size = len(inputs)
    
        if max_sequence_length is None:
            max_sequence_length = max(sequence_lengths)
    
        inputs_batch_major = self.config.PAD*np.ones(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD
    
        for i, seq in enumerate(inputs):
            for j, element in enumerate(seq):
                inputs_batch_major[i, j] = element

        # [batch_size, max_time] -> [max_time, batch_size]
        inputs_time_major = inputs_batch_major.swapaxes(0, 1)

        return inputs_time_major, sequence_lengths
                             
    def next_feed(self, batch, lr = 0.02, labels = None, dropout= 1.0):
        encoder_inputs_, encoder_input_lengths_ = self.batch_modify(batch)
        #print(self.config.EOS, self.config.PAD)
        decoder_targets_, _ = self.batch_modify(
            [(sequence) + [self.config.EOS] + [self.config.PAD] * 2 for sequence in batch] #additional 3 spaces
        )
        feed = {
            self.word_ids: encoder_inputs_,
            self.sequence_lengths: encoder_input_lengths_,
            self.decoder_targets: decoder_targets_,
        }
        #print("HELLO")
        #print(encoder_inputs_[0])
        #print(decoder_targets_[0])
        if self.config.use_chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths
	
        feed[self.labels] = decoder_targets_
 
        if labels is not None:
            labels, _ = pad_sequences(labels, self.config.vocab_tags['O'])
            feed[self.labels] = labels
	
        if lr is not None:
            feed[self.lr] = self.config.lr

        if dropout is not None:
            feed[self.dropout] = dropout
        return feed

    def feed_enc(self, enc_batch, lr = 0.02, labels = None, dropout= 1.0):
    
        
        encoder_inputs_, encoder_input_lengths_ = self.batch_modify(enc_batch)
        
        feed = {
            self.word_ids: encoder_inputs_, 
            self.sequence_lengths: encoder_input_lengths_,
            self.decoder_targets: encoder_inputs_}
        
        if self.config.use_chars:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths
        
        feed[self.labels] = encoder_inputs_
        if labels is not None:
            labels, _ = pad_sequences(labels, self.config.vocab_tags['O'])
            feed[self.labels] = labels
	
        if lr is not None:
            feed[self.lr] = self.config.lr

        if dropout is not None:
            feed[self.dropout] = dropout
            
            
        
        return feed
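The `batch_modify` docstring above describes padding a batch and converting it from batch-major to time-major layout. A minimal standalone NumPy sketch of the same idea (hypothetical input ids, PAD assumed to be 0):

import numpy as np

PAD = 0
inputs = [[5, 6, 7], [8, 9]]                     # two sentences of word ids
lengths = [len(seq) for seq in inputs]
batch_major = np.full((len(inputs), max(lengths)), PAD, dtype=np.int32)
for i, seq in enumerate(inputs):
    batch_major[i, :len(seq)] = seq
time_major = batch_major.swapaxes(0, 1)          # shape [max_time, batch_size]
print(time_major)                                # [[5 8] [6 9] [7 0]]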