def package(self, word_tokens, char_tokens, ws, cs):
    return adict({self.words: word_tokens,
                  self.chars: char_tokens,
                  self.ws: ws,
                  self.cs: cs})
def batch2(self, labels=None, words=None, trim_words=None, shuf=False):
    n = len(labels)
    b = {'n': n}

    # shuffle (the extra indices q cover the copied fill items appended below)
    if shuf:
        p = np.random.permutation(n)
        q = [i + n for i in range(self.batch_size - n)]
        p = np.hstack([p, q]).astype(np.int32)

    # if not a full batch, copy the first item to fill
    for i in range(self.batch_size - n):
        labels.append(labels[0])
        words.append(words[0])

    # labels -> column vector
    y = np.array(labels, dtype=np.float32)
    if shuf:
        y = y[p]
    y = y[..., None]  # y = np.expand_dims(y, 1)
    b['y'] = y

    # word ids
    word_tensor = np.array(words, dtype=np.float32)
    if shuf:
        word_tensor = word_tensor[p]

    # trim pre-padded (Keras-style) sequences to the longest sequence in the batch
    nz = first_nonzero(word_tensor)
    mz = min(nz)
    #mz = max(0, mz-100)

    ## option 1 (active): drop only the padding shared by every row
    word_tensor = word_tensor[:, mz:]

    ## option 2 (disabled): left-align each sequence and keep per-row lengths
    # m = word_tensor.shape[1]
    # seq_lengths = [m - z for z in nz]
    # max_text_length = m - mz
    # x = np.zeros((self.batch_size, max_text_length)).astype(np.float32)
    # for i in range(self.batch_size):
    #     s = word_tensor[i, nz[i]:]
    #     x[i, :len(s)] = s
    # word_tensor = x

    ## option 3 (disabled): reverse the sequences
    # word_tensor = np.fliplr(word_tensor)
    # seq_lengths = [m - z for z in nz]

    b['w'] = word_tensor
    b['x'] = b['w']

    # every row reports the (common) padded length
    max_seq_length = word_tensor.shape[1]
    seq_lengths = [max_seq_length for x in labels]
    b['s'] = seq_lengths

    return adict(b)
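# Hedged sketch (an assumption, not part of the original source): `first_nonzero`,
# used by batch2 above, is not defined in this file. From its usage it appears to
# return, for each pre-padded row, the index of that row's first non-zero entry.
# A minimal NumPy version consistent with that usage:
def first_nonzero_sketch(x):
    """For each row of a 2-D array, return the index of its first non-zero value."""
    mask = (x != 0)
    idx = np.argmax(mask, axis=1)          # argmax picks the first True per row
    idx[~mask.any(axis=1)] = x.shape[1]    # all-zero rows map to the row length
    return idx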
def batch_stream(self, stop=False):
    tok_stream = self.reader.chunk_stream(stop=stop)
    while True:
        batches = self.make_batches(tok_stream)
        if batches is None:
            break
        for c, w in zip(batches[0], batches[1]):
            if self.trim_chars:
                c = self.trim_batch(c)
            yield adict({'w': w, 'c': c})
def get_ystats(self):
    y = self.get_all_fields(key='y')
    y = np.array(y, dtype=np.float32)
    v, c = np.unique(y, return_counts=True)
    d = {}
    d['mean'] = np.mean(y)
    d['std'] = np.std(y)
    d['min'] = np.min(y)
    d['max'] = np.max(y)
    d['n'] = len(y)
    d['v'] = v
    d['c'] = c
    return adict(d)
def loss_graph(logits, batch_size, num_unroll_steps):
    with tf.variable_scope('Loss'):
        targets = tf.placeholder(tf.int64, [batch_size, num_unroll_steps], name='targets')
        target_list = [tf.squeeze(x, [1]) for x in tf.split(targets, num_unroll_steps, 1)]

        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=target_list),
            name='loss')

    tf.summary.scalar("model/loss", loss)

    return adict(targets=targets, loss=loss)
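# Hedged, self-contained usage sketch (illustrative only; the toy sizes and the
# all-zero dummy logits below are assumptions, not the original training script).
# It shows the shapes loss_graph expects: a list of per-step [batch, vocab] logit
# tensors (as returned by inference_graph) and int64 targets of [batch, steps].
def _loss_graph_smoke_test(toy_batch=4, toy_steps=5, toy_vocab=10):
    with tf.Graph().as_default():
        dummy_logits = [tf.zeros([toy_batch, toy_vocab]) for _ in range(toy_steps)]
        loss_model = loss_graph(dummy_logits, toy_batch, toy_steps)
        with tf.Session() as sess:
            targets = np.zeros([toy_batch, toy_steps], dtype=np.int64)
            loss_val = sess.run(loss_model.loss, {loss_model.targets: targets})
            print(loss_val)  # ~log(10) for uniform logits over 10 classes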
def parse_line(self, line, t=None):
    rec = line.strip().split(self.sep)
    d = {}
    for k, v in self.fields.items():
        if isinstance(v, basestring):
            d[v] = rec[k].strip()
        else:
            # nested field parser
            d.update(v.parse_line(rec[k].strip()))
        # optional score-based subsampling: t maps a label value to a keep-probability
        if t and v == 'y':
            y = float(d[v])
            p = t[y]
            if self.rng.rand() > p:
                #print('sample NO\t[{},{}]'.format(y, p))
                return None
            #print('sample YES\t[{},{}]'.format(y, p))
    return adict(d)
def training_graph(loss, learning_rate=1.0, max_grad_norm=5.0):
    ''' Builds training graph. '''
    global_step = tf.Variable(0, name='global_step', trainable=False)

    with tf.variable_scope('SGD_Training'):
        # SGD learning parameter
        learning_rate = tf.Variable(learning_rate, trainable=False, name='learning_rate')

        # collect all trainable variables
        tvars = tf.trainable_variables()
        grads, global_norm = tf.clip_by_global_norm(tf.gradients(loss, tvars), max_grad_norm)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    return adict(
        learning_rate=learning_rate,
        global_step=global_step,
        global_norm=global_norm,
        train_op=train_op)
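# Hedged, self-contained sketch (illustrative, not the original training loop):
# drive training_graph with a toy quadratic loss to show what the returned handles
# do, and how the learning-rate variable can be annealed with a plain assign.
def _training_graph_smoke_test():
    with tf.Graph().as_default():
        x = tf.get_variable('toy_x', initializer=5.0)
        toy_loss = tf.square(x)  # minimize x^2
        train_model = training_graph(toy_loss, learning_rate=0.1, max_grad_norm=5.0)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for _ in range(3):
                _, gnorm, step = sess.run([train_model.train_op,
                                           train_model.global_norm,
                                           train_model.global_step])
            print('step', step, 'grad_norm', gnorm, 'x', sess.run(x))
            sess.run(tf.assign(train_model.learning_rate, 0.05))  # anneal LR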
from nlp.tf_tools.attention import attention
import nlp.tf_tools.sonnet_modules as sm

''' get config '''
parser = options.get_parser()
config_file = 'config/ats.conf'
argv = []  # override config file here
FLAGS = config.get_config(parser=parser, config_file=config_file, argv=argv)
FLAGS.chkpt_dir = make_abs(FLAGS.chkpt_dir)
FLAGS.rand_seed = U.seed_random(FLAGS.rand_seed)
pprint.pprint(FLAGS)

embed = U.adict({'type': FLAGS.embed_type,
                 'char': FLAGS.embed_type == 'char',
                 'word': FLAGS.embed_type == 'word'})

# double eval: the config presumably stores these as quoted list literals
FLAGS.kernel_widths = eval(eval(FLAGS.kernel_widths))
FLAGS.kernel_features = eval(eval(FLAGS.kernel_features))

''' setup checkpoint directory '''
if not os.path.exists(FLAGS.chkpt_dir):
    U.mkdirs(FLAGS.chkpt_dir)
    print('Created checkpoint directory', FLAGS.chkpt_dir)
config.save_local_config(FLAGS)

#mode = FLAGS.run_mode
batch_size = FLAGS.batch_size
pid = FLAGS.item_id
essay_file = os.path.join(FLAGS.data_dir, '{0}', '{0}.txt.clean.tok').format(pid)
def inference_graph(char_vocab_size, word_vocab_size,
                    char_embed_size=15,
                    batch_size=20,
                    num_highway_layers=2,
                    num_rnn_layers=2,
                    rnn_size=650,
                    max_word_length=65,
                    kernels=[1, 2, 3, 4, 5, 6, 7],
                    kernel_features=[50, 100, 150, 200, 200, 200, 200],
                    num_unroll_steps=35,
                    dropout=0.0):

    assert len(kernels) == len(kernel_features), 'Kernel and Features must have the same size'

    input_ = tf.placeholder(tf.int32,
                            shape=[batch_size, num_unroll_steps, max_word_length],
                            name="input")

    ''' First, embed characters '''
    with tf.variable_scope('Embedding'):
        char_embedding = tf.get_variable('char_embedding', [char_vocab_size, char_embed_size])

        ''' this op clears embedding vector of first symbol (symbol at position 0, which is by
        convention the position of the padding symbol). It can be used to mimic Torch7 embedding
        operator that keeps padding mapped to zero embedding vector and ignores gradient updates.
        For that do the following in TF:
            1. after parameter initialization, apply this op to zero out padding embedding vector
            2. after each gradient update, apply this op to keep padding at zero '''
        clear_char_embedding_padding = tf.scatter_update(
            char_embedding, [0], tf.constant(0.0, shape=[1, char_embed_size]))

        # lookup: [batch_size, num_unroll_steps, max_word_length, char_embed_size]
        input_embedded = tf.nn.embedding_lookup(char_embedding, input_)
        # reshape: [batch_size x num_unroll_steps, max_word_length, char_embed_size]
        input_embedded = tf.reshape(input_embedded, [-1, max_word_length, char_embed_size])

    ''' Second, apply convolutions '''
    # [batch_size x num_unroll_steps, cnn_size]  where cnn_size = sum(kernel_features)
    input_cnn = tdnn(input_embedded, kernels, kernel_features)

    ''' Maybe apply Highway '''
    if num_highway_layers > 0:
        input_cnn = highway(input_cnn, input_cnn.get_shape()[-1], num_layers=num_highway_layers)

    ''' Finally, do LSTM '''
    with tf.variable_scope('LSTM'):

        def create_rnn_cell():
            cell = tf.contrib.rnn.BasicLSTMCell(rnn_size, state_is_tuple=True,
                                                forget_bias=0.0, reuse=False)
            if dropout > 0.0:
                cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=1. - dropout)
            return cell

        if num_rnn_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell(
                [create_rnn_cell() for _ in range(num_rnn_layers)], state_is_tuple=True)
        else:
            cell = create_rnn_cell()

        initial_rnn_state = cell.zero_state(batch_size, dtype=tf.float32)

        input_cnn = tf.reshape(input_cnn, [batch_size, num_unroll_steps, -1])
        # split into a length-num_unroll_steps list of [batch_size, cnn_size] tensors
        input_cnn2 = [tf.squeeze(x, [1]) for x in tf.split(input_cnn, num_unroll_steps, 1)]

        outputs, final_rnn_state = tf.contrib.rnn.static_rnn(
            cell, input_cnn2, initial_state=initial_rnn_state, dtype=tf.float32)

        # linear projection onto output (word) vocab
        logits = []
        with tf.variable_scope('WordEmbedding') as scope:
            for idx, output in enumerate(outputs):
                if idx > 0:
                    scope.reuse_variables()
                logits.append(linear(output, word_vocab_size))

    return adict(
        input=input_,
        clear_char_embedding_padding=clear_char_embedding_padding,
        input_embedded=input_embedded,
        input_cnn=input_cnn,
        initial_rnn_state=initial_rnn_state,
        final_rnn_state=final_rnn_state,
        rnn_outputs=outputs,
        logits=logits)
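# Hedged composition sketch: the usual build order for this graph family is
# inference_graph -> loss_graph -> training_graph, merging the returned adicts.
# The vocab sizes below are placeholders, `adict.update` is assumed to behave like
# dict.update, and tdnn/highway/linear must be importable as in the code above.
def build_training_model_sketch(char_vocab_size=51, word_vocab_size=10000,
                                batch_size=20, num_unroll_steps=35):
    model = inference_graph(char_vocab_size, word_vocab_size,
                            batch_size=batch_size,
                            num_unroll_steps=num_unroll_steps,
                            dropout=0.5)
    model.update(loss_graph(model.logits, batch_size, num_unroll_steps))
    # scale the mean per-step loss back up, one common convention for this setup
    model.update(training_graph(model.loss * num_unroll_steps,
                                learning_rate=1.0, max_grad_norm=5.0))
    return model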
def batch(self, ids=None, labels=None, words=None, chars=None,
          w=None, c=None,
          trim_words=None, trim_chars=None,
          spad='pre', wpad='pre', cpad='post',
          split_sentences=False,
          ):
    # remember / restore the last raw batch so a call with ids=None repeats it
    if ids:
        self.last = (ids, labels, words, chars, w, c)
    else:
        (ids, labels, words, chars, w, c) = self.last

    if trim_words is None:
        trim_words = self.trim_words
    if not trim_words and self.max_text_length is None:
        self.max_text_length = self.reader.get_maxlen()
    if trim_chars is None:
        trim_chars = self.trim_chars

    n = len(ids)
    b = {'n': n}

    # if not a full batch, copy the first item to fill
    for i in range(self.batch_size - n):
        ids.append(ids[0])
        labels.append(labels[0])
        words.append(words[0])
        chars.append(chars[0])

    b['id'] = ids  # <-- THIS key ('id') SHOULD COME FROM FIELD_PARSER.fields

    y = np.array(labels, dtype=np.float32)
    if self.normy:
        y = self.normalize(y)
    y = y[..., None]  # y = np.expand_dims(y, 1)
    b['y'] = y  # <-- THIS key ('y') SHOULD COME FROM FIELD_PARSER.fields

    if w and not isListEmpty(words):
        # max lengths (m) and padding sides (p), one entry per nesting level
        m = (self.max_text_length, )
        if trim_words:
            m = (None, )
        if split_sentences:
            m = (None, ) + m
        m = (None, ) + m

        p = (wpad, )
        if split_sentences:
            p = (spad, ) + p
        p = (None, ) + p

        #word_tensor, seq_lengths = self.pad_sequences(words, m=m, p=p)
        word_tensor, seq_lengths = U.pad_sequences(words, m=m, p=p)
        b['w'] = word_tensor
        b['x'] = b['w']

    if c and not isListEmpty(chars):
        m = (self.max_word_length, )
        if trim_chars:
            m = (None, )
        if trim_words:
            m = (None, ) + m
        else:
            m = (self.max_text_length, ) + m
        if split_sentences:
            m = (None, ) + m
        m = (None, ) + m

        p = (wpad, cpad)
        if split_sentences:
            p = (spad, ) + p
        p = (None, ) + p

        #char_tensor, seq_lengths = self.pad_sequences(chars, m=m, p=p)
        char_tensor, seq_lengths = U.pad_sequences(chars, m=m, p=p)
        b['c'] = char_tensor
        b['x'] = b['c']

    b['s'] = seq_lengths

    return adict(b)
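# Hedged illustration (an assumption about how U.pad_sequences consumes m and p):
# each tuple position appears to fix one nesting level, outermost (batch) first,
# with None meaning "trim to the longest element in the batch". Reproducing the
# char branch above for trim_words=True, trim_chars=True, split_sentences=False:
def _char_pad_spec_sketch(wpad='pre', cpad='post'):
    m = (None, )             # chars per word: longest word in the batch
    m = (None, ) + m         # words per text: longest text in the batch
    m = (None, ) + m         # batch dimension
    p = (None, wpad, cpad)   # left-pad each text's word sequence, right-pad each word's chars
    return m, p              # as handed to U.pad_sequences(chars, m=m, p=p)
# _char_pad_spec_sketch() == ((None, None, None), (None, 'pre', 'post'))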
def dict2ns(dict):
    # wrap a plain dict in an attribute-accessible adict namespace
    return U.adict(dict)