def MakeCharVocabMat(self, word_vocab, char_vocab):
    """Build the word-to-character lookup tensors for a word vocabulary.

    Every word returned by ``word_vocab.GetWords()`` is split with
    ``Vocab.Graphemes`` and wrapped in a leading '{' and a trailing '}'.
    Each symbol is then mapped through ``char_vocab[...]``, and rows that
    are shorter than the longest one are right-padded with the id of '}'.

    Sets on ``self``:
      max_len: length of the longest wrapped symbol sequence.
      word_lens: non-trainable ``tf.Variable`` with each word's unpadded
        length (the two boundary symbols are counted).
      words_as_chars: non-trainable ``tf.Variable`` with one padded id row
        of length ``max_len`` per word.

    Args:
      word_vocab: object exposing ``GetWords()``.
      char_vocab: object indexable by symbol (presumably returning integer
        ids -- confirm against the Vocab class).

    Raises:
      ValueError: raised by ``max`` when ``GetWords()`` yields no words.
    """
    wrapped_words = [['{'] + Vocab.Graphemes(word) + ['}']
                     for word in word_vocab.GetWords()]
    self.max_len = max(len(symbols) for symbols in wrapped_words)

    char_rows = []
    true_lengths = []
    for symbols in wrapped_words:
        row = [char_vocab[symbol] for symbol in symbols]
        # Record the length before padding so it reflects the real word.
        true_lengths.append(len(row))
        shortfall = self.max_len - len(row)
        if shortfall > 0:
            # The end-of-word marker doubles as the padding symbol.
            row.extend([char_vocab['}']] * shortfall)
        char_rows.append(row)

    self.word_lens = tf.Variable(initial_value=true_lengths,
                                 trainable=False,
                                 name='word_lens')
    self.words_as_chars = tf.Variable(initial_value=char_rows,
                                      trainable=False,
                                      name='words_as_chars')
dataset.ReadData(args.data, params.context_vars + ['text'], splitter=params.splitter, valdata=args.valdata, types=params.context_var_types) if args.mode == 'train': # do the word vocab if args.vocab is not None: vocab = Vocab.Load(args.vocab) else: vocab = Vocab.MakeFromData(dataset.GetColumn('text'), min_count=params.min_vocab_count) if params.splitter == 'word': # do the character vocab graphemes = [['{'] + Vocab.Graphemes(x) + ['}'] for x in vocab.GetWords()] char_vocab = Vocab.MakeFromData(graphemes, min_count=1) char_vocab.Save(os.path.join(args.expdir, 'char_vocab.pickle')) else: char_vocab = None context_vocabs = {} # do the context vocabs for i, context_var in enumerate(params.context_vars): # skip numerical vocabularies if hasattr(params, 'context_var_types' ) and params.context_var_types[i] == 'numerical': context_vocabs[context_var] = None continue v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)],
preshuffle=args.mode=='train', batch_size=params.batch_size) print 'reading data' dataset.ReadData(args.data, params.context_vars + ['text'], splitter=params.splitter, valdata=args.valdata, types=params.context_var_types) if args.mode == 'train': # do the word vocab if args.vocab is not None: vocab = Vocab.Load(args.vocab) else: vocab = Vocab.MakeFromData(dataset.GetColumn('text'), min_count=params.min_vocab_count) if params.splitter == 'word': # do the character vocab graphemes = [['{'] + Vocab.Graphemes(x) + ['}'] for x in vocab.GetWords()] char_vocab = Vocab.MakeFromData(graphemes, min_count=1) char_vocab.Save(os.path.join(args.expdir, 'char_vocab.pickle')) else: char_vocab = None context_vocabs = {} # do the context vocabs for i, context_var in enumerate(params.context_vars): # skip numerical vocabularies if hasattr(params, 'context_var_types') and params.context_var_types[i] == 'numerical': context_vocabs[context_var] = None continue v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)], min_count=50, no_special_syms=True) context_vocabs[context_var] = v