# Build the dataset, then set up the word vocab and one vocab per context
# variable.
#
# In 'train' mode the word vocab is either loaded from an explicit file
# (args.vocab) or built from the training text column, context vocabs are
# built from their columns, and everything is saved into args.expdir.
# In any other mode the vocabs saved at training time are loaded back from
# args.expdir so that token ids stay consistent with the trained model.
dataset = Dataset(max_len=params.max_len + 1,
                  preshuffle=args.mode == 'train',
                  batch_size=params.batch_size)
print('reading data')
# NOTE(review): `mode=mode` passes a bare `mode` name rather than args.mode;
# presumably `mode` is bound earlier in this file — confirm, otherwise this
# raises NameError.
dataset.ReadData(args.data, params.context_vars + ['text'],
                 mode=mode, splitter=params.splitter)

# Hoist the two vocab file paths so the train and load branches cannot
# silently diverge on filenames.
word_vocab_path = os.path.join(args.expdir, 'word_vocab.pickle')
context_vocab_path = os.path.join(args.expdir, 'context_vocab.pickle')

if args.mode == 'train':
    if args.vocab is not None:
        # An explicit pre-built vocab file overrides building from data.
        vocab = Vocab.Load(args.vocab)
    else:
        # getattr with a default replaces the hasattr-then-override dance;
        # 20 is the fallback frequency cutoff when params doesn't set one.
        min_count = getattr(params, 'min_vocab_count', 20)
        vocab = Vocab.MakeFromData(dataset.GetColumn('text'),
                                   min_count=min_count)

    # One vocab per context variable. Each column value becomes its own
    # single-item "sentence"; no special symbols are added and a higher
    # cutoff (50) is used since these are categorical values, not text.
    context_vocabs = {}
    for context_var in params.context_vars:
        v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)],
                               min_count=50, no_special_syms=True)
        context_vocabs[context_var] = v
        print('num {0}: {1}'.format(context_var, len(v)))

    vocab.Save(word_vocab_path)
    print('vocab size {0}'.format(len(vocab)))
    with open(context_vocab_path, 'wb') as f:
        pickle.dump(context_vocabs, f)
else:
    # Evaluation/decoding: reuse the vocabs saved during training.
    vocab = Vocab.Load(word_vocab_path)
    with open(context_vocab_path, 'rb') as f:
        context_vocabs = pickle.load(f)