Example #1
 def package(self, word_tokens, char_tokens, ws, cs):
     return adict({
         self.words: word_tokens,
         self.chars: char_tokens,
         self.ws: ws,
         self.cs: cs
     })
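All of the examples on this page hand their results back through adict, an attribute-access dictionary defined in the project's utility code. Its actual implementation is not shown here; a minimal sketch with the behavior the snippets rely on (string keys readable as attributes, plain dict indexing otherwise, as with the placeholder keys above) could look like this:

class adict(dict):
    ''' Minimal attribute-dict sketch: entries are also readable/writable as attributes. '''
    def __init__(self, *args, **kwargs):
        super(adict, self).__init__(*args, **kwargs)
        # point the attribute namespace at the dict itself
        self.__dict__ = self

# with string keys (as in Example #3 below):
# b = adict({'w': word_tensor, 'c': char_tensor})
# b.w is b['w']  -> True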
Example #2
    def batch2(self, labels=None, words=None, trim_words=None, shuf=False):
        n = len(labels)
        b = {'n': n}

        # shuffle
        if shuf:
            p = np.random.permutation(n)
            q = [i + n for i in range(self.batch_size - n)]
            p = np.hstack([p, q]).astype(np.int32)

        # if the batch is not full, copy the first item to fill it out
        for i in range(self.batch_size - n):
            labels.append(labels[0])
            words.append(words[0])

        ############################################################
        y = np.array(labels, dtype=np.float32)
        if shuf: y = y[p]
        y = y[..., None]  #y = np.expand_dims(y, 1)
        b['y'] = y

        ############################################################
        word_tensor = np.array(words, dtype=np.float32)
        if shuf: word_tensor = word_tensor[p]

        ####################
        ## trim Keras-style pre-padded sequences to the longest sequence in the batch
        nz = first_nonzero(word_tensor)
        mz = min(nz)
        #mz = max(0, mz-100)

        ##1
        word_tensor = word_tensor[:, mz:]

        #         ##2
        #         m = word_tensor.shape[1]
        #         seq_lengths = [m-z for z in nz]
        #         max_text_length = m-mz
        #         x = np.zeros((self.batch_size, max_text_length)).astype(np.float32)
        #         for i in range(self.batch_size):
        #             s = word_tensor[i,nz[i]:]
        #             x[i,:len(s)] = s
        #         word_tensor = x

        #         ##3
        #         word_tensor = np.fliplr(word_tensor); seq_lengths = [m-z for z in nz]

        ####################
        b['w'] = word_tensor
        b['x'] = b['w']

        ############################################################
        max_seq_length = word_tensor.shape[1]
        seq_lengths = [max_seq_length for x in labels]
        b['s'] = seq_lengths

        ####################
        return adict(b)
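batch2 calls a helper first_nonzero that is not part of this snippet. Assuming word_tensor rows are zero-padded on the left (Keras-style 'pre' padding), a plausible stand-in returns, for every row, the column index of its first non-zero entry:

import numpy as np

def first_nonzero(x):
    ''' Hypothetical helper: index of the first non-zero column per row of a 2-D array
        (rows that are entirely zero map to 0). The project's own version may differ. '''
    mask = x != 0
    return np.where(mask.any(axis=1), mask.argmax(axis=1), 0)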
Example #3
    def batch_stream(self, stop=False):
        tok_stream = self.reader.chunk_stream(stop=stop)

        while True:
            batches = self.make_batches(tok_stream)
            if batches is None:
                break
            for c, w in zip(batches[0], batches[1]):
                if self.trim_chars:
                    c = self.trim_batch(c)
                yield adict({'w': w, 'c': c})
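A typical way to consume this generator is to iterate over it and read the batch fields as attributes of the yielded adict; a sketch, where loader stands for a hypothetical instance of the class above:

# 'loader' is a hypothetical instance of the class above
for batch in loader.batch_stream(stop=True):
    word_batch = batch.w   # same object as batch['w']
    char_batch = batch.c   # trimmed when self.trim_chars is set
    # ...feed word_batch / char_batch to the model...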
Example #4
 def get_ystats(self):
     y = self.get_all_fields(key='y')
     y = np.array(y, dtype=np.float32)
     v, c = np.unique(y, return_counts=True)
     d = {}
     d['mean'] = np.mean(y)
     d['std'] = np.std(y)
     d['min'] = np.min(y)
     d['max'] = np.max(y)
     d['n'] = len(y)
     d['v'] = v
     d['c'] = c
     return adict(d)
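Because the statistics come back as an adict, callers can read them as attributes. A hypothetical report over a dataset object ds that exposes get_ystats() might be:

stats = ds.get_ystats()   # 'ds' is hypothetical
print('y: mean={:.3f} std={:.3f} min={} max={} n={}'.format(
    stats.mean, stats.std, stats.min, stats.max, stats.n))
for value, count in zip(stats.v, stats.c):
    print('  y={}: {} occurrences'.format(value, count))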
Example #5
def loss_graph(logits, batch_size, num_unroll_steps):

    with tf.variable_scope('Loss'):
        targets = tf.placeholder(tf.int64, [batch_size, num_unroll_steps],
                                 name='targets')
        target_list = [
            tf.squeeze(x, [1]) for x in tf.split(targets, num_unroll_steps, 1)
        ]

        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                           labels=target_list),
            name='loss')

        tf.summary.scalar("model/loss", loss)

    return adict(targets=targets, loss=loss)
Example #6
 def parse_line(self, line, t=None):
     rec = line.strip().split(self.sep)
     d = {}
     for k, v in self.fields.items():
         if isinstance(v, basestring):  # Python 2's basestring; use str under Python 3
             d[v] = rec[k].strip()
         else:
             d.update(v.parse_line(rec[k].strip()))
         if t and v == 'y':
             y = float(d[v])
             p = t[y]
             if self.rng.rand() > p:
                 #print('sample NO\t[{},{}]'.format(y,p))
                 return None
             #print('sample YES\t[{},{}]'.format(y,p))
     return adict(d)
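parse_line expects self.fields to map column indices either to an output key (a string) or to a nested parser whose own parse_line returns a dict. A hypothetical configuration and call, with no sampling table t, might look like:

# hypothetical field layout: column 0 -> 'id', column 3 -> 'y' (label), column 4 -> 'text'
parser.fields = {0: 'id', 3: 'y', 4: 'text'}
parser.sep = '\t'

rec = parser.parse_line('123\tA\tB\t4.0\tsome essay text')
# rec.id == '123', rec.y == '4.0', rec.text == 'some essay text'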
Example #7
def training_graph(loss, learning_rate=1.0, max_grad_norm=5.0):
    ''' Builds training graph. '''
    global_step = tf.Variable(0, name='global_step', trainable=False)

    with tf.variable_scope('SGD_Training'):
        # SGD learning parameter
        learning_rate = tf.Variable(learning_rate,
                                    trainable=False,
                                    name='learning_rate')

        # collect all trainable variables
        tvars = tf.trainable_variables()
        grads, global_norm = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                                    max_grad_norm)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars),
                                             global_step=global_step)

    return adict(learning_rate=learning_rate,
                 global_step=global_step,
                 global_norm=global_norm,
                 train_op=train_op)
Example #8
from nlp.tf_tools.attention import attention
import nlp.tf_tools.sonnet_modules as sm
''' get config '''
parser = options.get_parser()
config_file = 'config/ats.conf'
argv = []  # override config file here

FLAGS = config.get_config(parser=parser, config_file=config_file, argv=argv)
FLAGS.chkpt_dir = make_abs(FLAGS.chkpt_dir)
FLAGS.rand_seed = U.seed_random(FLAGS.rand_seed)
pprint.pprint(FLAGS)

embed = U.adict({
    'type': FLAGS.embed_type,
    'char': FLAGS.embed_type == 'char',
    'word': FLAGS.embed_type == 'word'
})
FLAGS.kernel_widths = eval(eval(FLAGS.kernel_widths))
FLAGS.kernel_features = eval(eval(FLAGS.kernel_features))
''' setup checkpoint directory '''
if not os.path.exists(FLAGS.chkpt_dir):
    U.mkdirs(FLAGS.chkpt_dir)
    print('Created checkpoint directory', FLAGS.chkpt_dir)
config.save_local_config(FLAGS)

#mode = FLAGS.run_mode
batch_size = FLAGS.batch_size
pid = FLAGS.item_id
essay_file = os.path.join(FLAGS.data_dir, '{0}',
                          '{0}.txt.clean.tok').format(pid)
Example #9
def inference_graph(char_vocab_size,
                    word_vocab_size,
                    char_embed_size=15,
                    batch_size=20,
                    num_highway_layers=2,
                    num_rnn_layers=2,
                    rnn_size=650,
                    max_word_length=65,
                    kernels=[1, 2, 3, 4, 5, 6, 7],
                    kernel_features=[50, 100, 150, 200, 200, 200, 200],
                    num_unroll_steps=35,
                    dropout=0.0):

    assert len(kernels) == len(
        kernel_features), 'Kernel and Features must have the same size'

    input_ = tf.placeholder(
        tf.int32,
        shape=[batch_size, num_unroll_steps, max_word_length],
        name="input")
    ''' First, embed characters '''
    with tf.variable_scope('Embedding'):
        char_embedding = tf.get_variable('char_embedding',
                                         [char_vocab_size, char_embed_size])
        ''' This op clears the embedding vector of the first symbol (the symbol at position 0,
        which by convention is the padding symbol). It can be used to mimic the Torch7
        embedding operator, which keeps padding mapped to a zero embedding vector and ignores
        its gradient updates.
        To do that in TF:
        1. after parameter initialization, apply this op to zero out the padding embedding vector
        2. after each gradient update, apply this op again to keep padding at zero '''
        clear_char_embedding_padding = tf.scatter_update(
            char_embedding, [0], tf.constant(0.0, shape=[1, char_embed_size]))

        # shape after lookup: [batch_size, num_unroll_steps, max_word_length, char_embed_size]
        input_embedded = tf.nn.embedding_lookup(char_embedding, input_)

        # reshape to [batch_size x num_unroll_steps, max_word_length, char_embed_size]
        input_embedded = tf.reshape(input_embedded,
                                    [-1, max_word_length, char_embed_size])
    ''' Second, apply convolutions '''
    # [batch_size x num_unroll_steps, cnn_size]  # where cnn_size=sum(kernel_features)
    input_cnn = tdnn(input_embedded, kernels, kernel_features)
    ''' Maybe apply Highway '''
    if num_highway_layers > 0:
        input_cnn = highway(input_cnn,
                            input_cnn.get_shape()[-1],
                            num_layers=num_highway_layers)
    ''' Finally, do LSTM '''
    with tf.variable_scope('LSTM'):

        def create_rnn_cell():
            cell = tf.contrib.rnn.BasicLSTMCell(rnn_size,
                                                state_is_tuple=True,
                                                forget_bias=0.0,
                                                reuse=False)
            if dropout > 0.0:
                cell = tf.contrib.rnn.DropoutWrapper(cell,
                                                     output_keep_prob=1. -
                                                     dropout)
            return cell

        if num_rnn_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell(
                [create_rnn_cell() for _ in range(num_rnn_layers)],
                state_is_tuple=True)
        else:
            cell = create_rnn_cell()

        initial_rnn_state = cell.zero_state(batch_size, dtype=tf.float32)

        #         print('inputs')
        #         print(input_cnn.get_shape().as_list())

        input_cnn = tf.reshape(input_cnn, [batch_size, num_unroll_steps, -1])
        #         print('input_cnn')
        #         print(input_cnn.get_shape().as_list())

        input_cnn2 = [
            tf.squeeze(x, [1])
            for x in tf.split(input_cnn, num_unroll_steps, 1)
        ]
        #         print('input_cnn2')
        #         print([cnn.get_shape().as_list() for cnn in input_cnn2])

        outputs, final_rnn_state = tf.contrib.rnn.static_rnn(
            cell,
            input_cnn2,
            initial_state=initial_rnn_state,
            dtype=tf.float32)

        # linear projection onto output (word) vocab
        logits = []
        with tf.variable_scope('WordEmbedding') as scope:
            for idx, output in enumerate(outputs):
                if idx > 0:
                    scope.reuse_variables()
                logits.append(linear(output, word_vocab_size))

    return adict(input=input_,
                 clear_char_embedding_padding=clear_char_embedding_padding,
                 input_embedded=input_embedded,
                 input_cnn=input_cnn,
                 initial_rnn_state=initial_rnn_state,
                 final_rnn_state=final_rnn_state,
                 rnn_outputs=outputs,
                 logits=logits)
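Examples #5, #7, and #9 are meant to be composed: the logits from inference_graph feed loss_graph, whose loss feeds training_graph, and because each builder returns an adict the pieces can simply be merged with update(). A sketch of that wiring, with made-up vocabulary sizes and the default hyperparameters:

# illustrative wiring only; the vocabulary sizes here are placeholders
batch_size, num_unroll_steps = 20, 35

model = inference_graph(char_vocab_size=51, word_vocab_size=10000,
                        batch_size=batch_size, num_unroll_steps=num_unroll_steps)
model.update(loss_graph(model.logits, batch_size, num_unroll_steps))
model.update(training_graph(model.loss, learning_rate=1.0, max_grad_norm=5.0))

# a single training step (TF 1.x session API) would then feed model.input and model.targets:
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     loss, _ = sess.run([model.loss, model.train_op],
#                        {model.input: char_batch, model.targets: target_batch})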
Example #10
    def batch(
        self,
        ids=None,
        labels=None,
        words=None,
        chars=None,
        w=None,
        c=None,
        trim_words=None,
        trim_chars=None,
        spad='pre',
        wpad='pre',
        cpad='post',
        split_sentences=False,
    ):
        if ids:
            self.last = (ids, labels, words, chars, w, c)
        else:
            (ids, labels, words, chars, w, c) = self.last

        if trim_words is None:
            trim_words = self.trim_words
        if not trim_words and self.max_text_length is None:
            self.max_text_length = self.reader.get_maxlen()
        if trim_chars is None:
            trim_chars = self.trim_chars

        n = len(ids)
        b = {'n': n}

        # if the batch is not full, copy the first item to fill it out
        for i in range(self.batch_size - n):
            ids.append(ids[0])
            labels.append(labels[0])
            words.append(words[0])
            chars.append(chars[0])

        b['id'] = ids  # <-- THIS key ('id') SHOULD COME FROM FIELD_PARSER.fields

        y = np.array(labels, dtype=np.float32)
        if self.normy:
            y = self.normalize(y)
        y = y[..., None]  #y = np.expand_dims(y, 1)
        b['y'] = y  # <-- THIS key ('y') SHOULD COME FROM FIELD_PARSER.fields

        if w and not isListEmpty(words):
            m = (self.max_text_length, )
            if trim_words: m = (None, )
            if split_sentences: m = (None, ) + m
            m = (None, ) + m

            p = (wpad, )
            if split_sentences: p = (spad, ) + p
            p = (None, ) + p

            #word_tensor, seq_lengths= self.pad_sequences(words, m=m, p=p)
            word_tensor, seq_lengths = U.pad_sequences(words, m=m, p=p)

            b['w'] = word_tensor
            b['x'] = b['w']

        if c and not isListEmpty(chars):
            m = (self.max_word_length, )
            if trim_chars: m = (None, )
            if trim_words: m = (None, ) + m
            else: m = (self.max_text_length, ) + m
            if split_sentences: m = (None, ) + m
            m = (None, ) + m

            p = (wpad, cpad)
            if split_sentences: p = (spad, ) + p
            p = (None, ) + p

            #char_tensor, seq_lengths = self.pad_sequences(chars, m=m, p=p)
            char_tensor, seq_lengths = U.pad_sequences(chars, m=m, p=p)

            b['c'] = char_tensor
            b['x'] = b['c']

        b['s'] = seq_lengths

        return adict(b)
Example #11
def dict2ns(d):
    return U.adict(d)
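Assuming adict behaves like the attribute-dict sketched under Example #1, dict2ns just turns a plain dict into a lightweight namespace:

ns = dict2ns({'lr': 0.01, 'epochs': 10})
print(ns.lr, ns.epochs)   # 0.01 10
print(ns['lr'])           # plain dict access still works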