Example #1
import os
import pickle

# This fragment assumes `args` (parsed flags), `params` (hyperparameters),
# and the project's `Dataset` and `Vocab` helpers from the surrounding script.
# Build the dataset reader; shuffle the examples only in training mode.
dataset = Dataset(max_len=params.max_len + 1,
                  preshuffle=args.mode == 'train',
                  batch_size=params.batch_size)
print('reading data')
dataset.ReadData(args.data, params.context_vars + ['text'],
                 mode=args.mode, splitter=params.splitter)

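# In training mode, build the vocabularies from the data and save them.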
if args.mode == 'train':
  if args.vocab is not None:
    vocab = Vocab.Load(args.vocab)
  else:
    # Default minimum token count; params.min_vocab_count overrides it.
    min_count = getattr(params, 'min_vocab_count', 20)
    vocab = Vocab.MakeFromData(dataset.GetColumn('text'), min_count=min_count)
  # Build a separate vocabulary for each context variable.
  context_vocabs = {}
  for context_var in params.context_vars:
    v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)],
                           min_count=50, no_special_syms=True)
    context_vocabs[context_var] = v
    print('num {0}: {1}'.format(context_var, len(v)))

  # Persist both vocabularies to the experiment directory for later reuse.
  vocab.Save(os.path.join(args.expdir, 'word_vocab.pickle'))
  print('vocab size {0}'.format(len(vocab)))
  with open(os.path.join(args.expdir, 'context_vocab.pickle'), 'wb') as f:
    pickle.dump(context_vocabs, f)
else:
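  # Every other mode reloads the vocabularies saved during training.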
  vocab = Vocab.Load(os.path.join(args.expdir, 'word_vocab.pickle'))
  with open(os.path.join(args.expdir, 'context_vocab.pickle'), 'rb') as f:
    context_vocabs = pickle.load(f)
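
The code relies on the project's own `Dataset` and `Vocab` helpers, which are not shown here. As a rough illustration only, the following is a minimal, hypothetical sketch of the `Vocab` interface the snippet uses (`MakeFromData`, `Save`, `Load`, and `len()`); the method bodies and the special symbols are assumptions, not the project's actual implementation.

import collections
import pickle


class Vocab(object):
  """Hypothetical sketch of the vocabulary interface assumed above."""

  def __init__(self, words):
    self.word_to_id = {w: i for i, w in enumerate(words)}

  def __len__(self):
    return len(self.word_to_id)

  @classmethod
  def MakeFromData(cls, sequences, min_count=1, no_special_syms=False):
    # Count every token across all sequences; keep those that appear
    # at least min_count times.
    counts = collections.Counter(tok for seq in sequences for tok in seq)
    words = [] if no_special_syms else ['<unk>', '<s>', '</s>']  # assumed symbols
    words += sorted(w for w, c in counts.items() if c >= min_count)
    return cls(words)

  def Save(self, filename):
    with open(filename, 'wb') as f:
      pickle.dump(self, f)

  @classmethod
  def Load(cls, filename):
    with open(filename, 'rb') as f:
      return pickle.load(f)

Under this sketch, `Vocab.MakeFromData(dataset.GetColumn('text'), min_count=20)` keeps only tokens seen at least 20 times, matching the default threshold used in the training branch above.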