def package(self, word_tokens, char_tokens, ws, cs):
    return adict({self.words: word_tokens,
                  self.chars: char_tokens,
                  self.ws: ws,
                  self.cs: cs})
def batch2(self, labels=None, words=None, trim_words=None, shuf=False):
    n = len(labels)
    b = {'n': n}

    # shuffle (the extra indices q cover the copied fill items appended below)
    if shuf:
        p = np.random.permutation(n)
        q = [i + n for i in range(self.batch_size - n)]
        p = np.hstack([p, q]).astype(np.int32)

    # if not a full batch, copy the first item to fill
    for i in range(self.batch_size - n):
        labels.append(labels[0])
        words.append(words[0])

    # labels -> column vector
    y = np.array(labels, dtype=np.float32)
    if shuf:
        y = y[p]
    y = y[..., None]  # y = np.expand_dims(y, 1)
    b['y'] = y

    # word ids
    word_tensor = np.array(words, dtype=np.float32)
    if shuf:
        word_tensor = word_tensor[p]

    # trim pre-padded (Keras-style) sequences to the longest sequence in the batch
    nz = first_nonzero(word_tensor)
    mz = min(nz)
    #mz = max(0, mz-100)

    ## option 1 (active): drop only the padding shared by every row
    word_tensor = word_tensor[:, mz:]

    ## option 2 (disabled): left-align each sequence and keep per-row lengths
    # m = word_tensor.shape[1]
    # seq_lengths = [m - z for z in nz]
    # max_text_length = m - mz
    # x = np.zeros((self.batch_size, max_text_length)).astype(np.float32)
    # for i in range(self.batch_size):
    #     s = word_tensor[i, nz[i]:]
    #     x[i, :len(s)] = s
    # word_tensor = x

    ## option 3 (disabled): reverse the sequences
    # word_tensor = np.fliplr(word_tensor)
    # seq_lengths = [m - z for z in nz]

    b['w'] = word_tensor
    b['x'] = b['w']

    # every row reports the (common) padded length
    max_seq_length = word_tensor.shape[1]
    seq_lengths = [max_seq_length for x in labels]
    b['s'] = seq_lengths

    return adict(b)
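# Hedged sketch (an assumption, not part of the original source): `first_nonzero`,
# used by batch2 above, is not defined in this file. From its usage it appears to
# return, for each pre-padded row, the index of that row's first non-zero entry.
# A minimal NumPy version consistent with that usage:
def first_nonzero_sketch(x):
    """For each row of a 2-D array, return the index of its first non-zero value."""
    mask = (x != 0)
    idx = np.argmax(mask, axis=1)          # argmax picks the first True per row
    idx[~mask.any(axis=1)] = x.shape[1]    # all-zero rows map to the row length
    return idx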
def batch_stream(self, stop=False):
    tok_stream = self.reader.chunk_stream(stop=stop)
    while True:
        batches = self.make_batches(tok_stream)
        if batches is None:
            break
        for c, w in zip(batches[0], batches[1]):
            if self.trim_chars:
                c = self.trim_batch(c)
            yield adict({'w': w, 'c': c})
def get_ystats(self):
    y = self.get_all_fields(key='y')
    y = np.array(y, dtype=np.float32)
    v, c = np.unique(y, return_counts=True)
    d = {}
    d['mean'] = np.mean(y)
    d['std'] = np.std(y)
    d['min'] = np.min(y)
    d['max'] = np.max(y)
    d['n'] = len(y)
    d['v'] = v
    d['c'] = c
    return adict(d)
def loss_graph(logits, batch_size, num_unroll_steps):
    with tf.variable_scope('Loss'):
        targets = tf.placeholder(tf.int64, [batch_size, num_unroll_steps], name='targets')
        target_list = [tf.squeeze(x, [1]) for x in tf.split(targets, num_unroll_steps, 1)]

        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=target_list),
            name='loss')

    tf.summary.scalar("model/loss", loss)

    return adict(targets=targets, loss=loss)
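# Hedged, self-contained usage sketch (illustrative only; the toy sizes and the
# all-zero dummy logits below are assumptions, not the original training script).
# It shows the shapes loss_graph expects: a list of per-step [batch, vocab] logit
# tensors (as returned by inference_graph) and int64 targets of [batch, steps].
def _loss_graph_smoke_test(toy_batch=4, toy_steps=5, toy_vocab=10):
    with tf.Graph().as_default():
        dummy_logits = [tf.zeros([toy_batch, toy_vocab]) for _ in range(toy_steps)]
        loss_model = loss_graph(dummy_logits, toy_batch, toy_steps)
        with tf.Session() as sess:
            targets = np.zeros([toy_batch, toy_steps], dtype=np.int64)
            loss_val = sess.run(loss_model.loss, {loss_model.targets: targets})
            print(loss_val)  # ~log(10) for uniform logits over 10 classes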
def parse_line(self, line, t=None):
    rec = line.strip().split(self.sep)
    d = {}
    for k, v in self.fields.items():
        if isinstance(v, basestring):
            d[v] = rec[k].strip()
        else:
            # nested field parser
            d.update(v.parse_line(rec[k].strip()))
        # optional score-based subsampling: t maps a label value to a keep-probability
        if t and v == 'y':
            y = float(d[v])
            p = t[y]
            if self.rng.rand() > p:
                #print('sample NO\t[{},{}]'.format(y, p))
                return None
            #print('sample YES\t[{},{}]'.format(y, p))
    return adict(d)
def training_graph(loss, learning_rate=1.0, max_grad_norm=5.0):
    ''' Builds training graph. '''
    global_step = tf.Variable(0, name='global_step', trainable=False)

    with tf.variable_scope('SGD_Training'):
        # SGD learning parameter
        learning_rate = tf.Variable(learning_rate, trainable=False, name='learning_rate')

        # collect all trainable variables
        tvars = tf.trainable_variables()
        grads, global_norm = tf.clip_by_global_norm(tf.gradients(loss, tvars), max_grad_norm)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    return adict(
        learning_rate=learning_rate,
        global_step=global_step,
        global_norm=global_norm,
        train_op=train_op)
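# Hedged, self-contained sketch (illustrative, not the original training loop):
# drive training_graph with a toy quadratic loss to show what the returned handles
# do, and how the learning-rate variable can be annealed with a plain assign.
def _training_graph_smoke_test():
    with tf.Graph().as_default():
        x = tf.get_variable('toy_x', initializer=5.0)
        toy_loss = tf.square(x)  # minimize x^2
        train_model = training_graph(toy_loss, learning_rate=0.1, max_grad_norm=5.0)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for _ in range(3):
                _, gnorm, step = sess.run([train_model.train_op,
                                           train_model.global_norm,
                                           train_model.global_step])
            print('step', step, 'grad_norm', gnorm, 'x', sess.run(x))
            sess.run(tf.assign(train_model.learning_rate, 0.05))  # anneal LR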
from nlp.tf_tools.attention import attention
import nlp.tf_tools.sonnet_modules as sm

''' get config '''
parser = options.get_parser()
config_file = 'config/ats.conf'
argv = []  # override config file here
FLAGS = config.get_config(parser=parser, config_file=config_file, argv=argv)
FLAGS.chkpt_dir = make_abs(FLAGS.chkpt_dir)
FLAGS.rand_seed = U.seed_random(FLAGS.rand_seed)
pprint.pprint(FLAGS)

embed = U.adict({'type': FLAGS.embed_type,
                 'char': FLAGS.embed_type == 'char',
                 'word': FLAGS.embed_type == 'word'})

# double eval: the config presumably stores these as quoted list literals
FLAGS.kernel_widths = eval(eval(FLAGS.kernel_widths))
FLAGS.kernel_features = eval(eval(FLAGS.kernel_features))

''' setup checkpoint directory '''
if not os.path.exists(FLAGS.chkpt_dir):
    U.mkdirs(FLAGS.chkpt_dir)
    print('Created checkpoint directory', FLAGS.chkpt_dir)
config.save_local_config(FLAGS)

#mode = FLAGS.run_mode
batch_size = FLAGS.batch_size
pid = FLAGS.item_id
essay_file = os.path.join(FLAGS.data_dir, '{0}', '{0}.txt.clean.tok').format(pid)
def inference_graph(char_vocab_size, word_vocab_size,
                    char_embed_size=15,
                    batch_size=20,
                    num_highway_layers=2,
                    num_rnn_layers=2,
                    rnn_size=650,
                    max_word_length=65,
                    kernels=[1, 2, 3, 4, 5, 6, 7],
                    kernel_features=[50, 100, 150, 200, 200, 200, 200],
                    num_unroll_steps=35,
                    dropout=0.0):

    assert len(kernels) == len(kernel_features), 'Kernel and Features must have the same size'

    input_ = tf.placeholder(tf.int32,
                            shape=[batch_size, num_unroll_steps, max_word_length],
                            name="input")

    ''' First, embed characters '''
    with tf.variable_scope('Embedding'):
        char_embedding = tf.get_variable('char_embedding', [char_vocab_size, char_embed_size])

        ''' this op clears embedding vector of first symbol (symbol at position 0, which is by
        convention the position of the padding symbol). It can be used to mimic Torch7 embedding
        operator that keeps padding mapped to zero embedding vector and ignores gradient updates.
        For that do the following in TF:
            1. after parameter initialization, apply this op to zero out padding embedding vector
            2. after each gradient update, apply this op to keep padding at zero '''
        clear_char_embedding_padding = tf.scatter_update(
            char_embedding, [0], tf.constant(0.0, shape=[1, char_embed_size]))

        # lookup: [batch_size, num_unroll_steps, max_word_length, char_embed_size]
        input_embedded = tf.nn.embedding_lookup(char_embedding, input_)
        # reshape: [batch_size x num_unroll_steps, max_word_length, char_embed_size]
        input_embedded = tf.reshape(input_embedded, [-1, max_word_length, char_embed_size])

    ''' Second, apply convolutions '''
    # [batch_size x num_unroll_steps, cnn_size]  where cnn_size = sum(kernel_features)
    input_cnn = tdnn(input_embedded, kernels, kernel_features)

    ''' Maybe apply Highway '''
    if num_highway_layers > 0:
        input_cnn = highway(input_cnn, input_cnn.get_shape()[-1], num_layers=num_highway_layers)

    ''' Finally, do LSTM '''
    with tf.variable_scope('LSTM'):

        def create_rnn_cell():
            cell = tf.contrib.rnn.BasicLSTMCell(rnn_size, state_is_tuple=True,
                                                forget_bias=0.0, reuse=False)
            if dropout > 0.0:
                cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=1. - dropout)
            return cell

        if num_rnn_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell(
                [create_rnn_cell() for _ in range(num_rnn_layers)], state_is_tuple=True)
        else:
            cell = create_rnn_cell()

        initial_rnn_state = cell.zero_state(batch_size, dtype=tf.float32)

        input_cnn = tf.reshape(input_cnn, [batch_size, num_unroll_steps, -1])
        # split into a length-num_unroll_steps list of [batch_size, cnn_size] tensors
        input_cnn2 = [tf.squeeze(x, [1]) for x in tf.split(input_cnn, num_unroll_steps, 1)]

        outputs, final_rnn_state = tf.contrib.rnn.static_rnn(
            cell, input_cnn2, initial_state=initial_rnn_state, dtype=tf.float32)

        # linear projection onto output (word) vocab
        logits = []
        with tf.variable_scope('WordEmbedding') as scope:
            for idx, output in enumerate(outputs):
                if idx > 0:
                    scope.reuse_variables()
                logits.append(linear(output, word_vocab_size))

    return adict(
        input=input_,
        clear_char_embedding_padding=clear_char_embedding_padding,
        input_embedded=input_embedded,
        input_cnn=input_cnn,
        initial_rnn_state=initial_rnn_state,
        final_rnn_state=final_rnn_state,
        rnn_outputs=outputs,
        logits=logits)
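# Hedged composition sketch: the usual build order for this graph family is
# inference_graph -> loss_graph -> training_graph, merging the returned adicts.
# The vocab sizes below are placeholders, `adict.update` is assumed to behave like
# dict.update, and tdnn/highway/linear must be importable as in the code above.
def build_training_model_sketch(char_vocab_size=51, word_vocab_size=10000,
                                batch_size=20, num_unroll_steps=35):
    model = inference_graph(char_vocab_size, word_vocab_size,
                            batch_size=batch_size,
                            num_unroll_steps=num_unroll_steps,
                            dropout=0.5)
    model.update(loss_graph(model.logits, batch_size, num_unroll_steps))
    # scale the mean per-step loss back up, one common convention for this setup
    model.update(training_graph(model.loss * num_unroll_steps,
                                learning_rate=1.0, max_grad_norm=5.0))
    return model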
def batch(self, ids=None, labels=None, words=None, chars=None,
          w=None, c=None,
          trim_words=None, trim_chars=None,
          spad='pre', wpad='pre', cpad='post',
          split_sentences=False,
          ):
    # remember / restore the last raw batch so a call with ids=None repeats it
    if ids:
        self.last = (ids, labels, words, chars, w, c)
    else:
        (ids, labels, words, chars, w, c) = self.last

    if trim_words is None:
        trim_words = self.trim_words
    if not trim_words and self.max_text_length is None:
        self.max_text_length = self.reader.get_maxlen()
    if trim_chars is None:
        trim_chars = self.trim_chars

    n = len(ids)
    b = {'n': n}

    # if not a full batch, copy the first item to fill
    for i in range(self.batch_size - n):
        ids.append(ids[0])
        labels.append(labels[0])
        words.append(words[0])
        chars.append(chars[0])

    b['id'] = ids  # <-- THIS key ('id') SHOULD COME FROM FIELD_PARSER.fields

    y = np.array(labels, dtype=np.float32)
    if self.normy:
        y = self.normalize(y)
    y = y[..., None]  # y = np.expand_dims(y, 1)
    b['y'] = y  # <-- THIS key ('y') SHOULD COME FROM FIELD_PARSER.fields

    if w and not isListEmpty(words):
        # max lengths (m) and padding sides (p), one entry per nesting level
        m = (self.max_text_length, )
        if trim_words:
            m = (None, )
        if split_sentences:
            m = (None, ) + m
        m = (None, ) + m

        p = (wpad, )
        if split_sentences:
            p = (spad, ) + p
        p = (None, ) + p

        #word_tensor, seq_lengths = self.pad_sequences(words, m=m, p=p)
        word_tensor, seq_lengths = U.pad_sequences(words, m=m, p=p)
        b['w'] = word_tensor
        b['x'] = b['w']

    if c and not isListEmpty(chars):
        m = (self.max_word_length, )
        if trim_chars:
            m = (None, )
        if trim_words:
            m = (None, ) + m
        else:
            m = (self.max_text_length, ) + m
        if split_sentences:
            m = (None, ) + m
        m = (None, ) + m

        p = (wpad, cpad)
        if split_sentences:
            p = (spad, ) + p
        p = (None, ) + p

        #char_tensor, seq_lengths = self.pad_sequences(chars, m=m, p=p)
        char_tensor, seq_lengths = U.pad_sequences(chars, m=m, p=p)
        b['c'] = char_tensor
        b['x'] = b['c']

    b['s'] = seq_lengths

    return adict(b)
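# Hedged illustration (an assumption about how U.pad_sequences consumes m and p):
# each tuple position appears to fix one nesting level, outermost (batch) first,
# with None meaning "trim to the longest element in the batch". Reproducing the
# char branch above for trim_words=True, trim_chars=True, split_sentences=False:
def _char_pad_spec_sketch(wpad='pre', cpad='post'):
    m = (None, )             # chars per word: longest word in the batch
    m = (None, ) + m         # words per text: longest text in the batch
    m = (None, ) + m         # batch dimension
    p = (None, wpad, cpad)   # left-pad each text's word sequence, right-pad each word's chars
    return m, p              # as handed to U.pad_sequences(chars, m=m, p=p)
# _char_pad_spec_sketch() == ((None, None, None), (None, 'pre', 'post'))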
def dict2ns(dict):
    # wrap a plain dict in an attribute-accessible adict namespace
    return U.adict(dict)