# NOTE(review): this chunk begins mid-statement -- the `)` below closes a
# condition whose opening lines are outside this view, and the `else:`
# further down pairs with an `if` (presumably on args.mode) that is also not
# visible here.  Python 2 syntax (`print` statements) throughout the file.
) and params.context_var_types[i] == 'numerical':
      # Numerical context variables get no vocabulary entry; skip to the
      # next context variable.
      context_vocabs[context_var] = None
      continue
    # Build a vocabulary over this context variable's column values.
    # min_count=50 presumably drops values seen fewer than 50 times --
    # TODO confirm against Vocab.MakeFromData.
    v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)],
                           min_count=50, no_special_syms=True)
    context_vocabs[context_var] = v
    print 'num {0}: {1}'.format(context_var, len(v))

  # Persist the word vocab and all context vocabs to the experiment
  # directory so the reload path (the else branch below) can find them.
  vocab.Save(os.path.join(args.expdir, 'word_vocab.pickle'))
  print 'vocab size {0}'.format(len(vocab))
  with open(os.path.join(args.expdir, 'context_vocab.pickle'), 'wb') as f:
    pickle.dump(context_vocabs, f)

  dataset.Prepare(vocab, context_vocabs)
else:
  # Not building vocabularies in this mode: reload the ones saved earlier.
  vocab = Vocab.Load(os.path.join(args.expdir, 'word_vocab.pickle'))
  if params.splitter == 'word':
    # The word splitter also uses a character vocab; its save site is not
    # visible in this chunk -- presumably written alongside word_vocab.
    char_vocab = Vocab.Load(os.path.join(args.expdir, 'char_vocab.pickle'))
  else:
    char_vocab = None
  # NOTE(review): pickle.load is unsafe on untrusted input; acceptable here
  # only because this reads the experiment's own output directory.
  with open(os.path.join(args.expdir, 'context_vocab.pickle'), 'rb') as f:
    context_vocabs = pickle.load(f)

# Decide whether to use NCE loss: on by default when training, off for
# small vocabularies, and forced on in classify mode with a large vocab.
use_nce_loss = args.mode == 'train'
if len(vocab) < 5000:  # disable NCE for small vocabularies
  use_nce_loss = False
if args.mode == 'classify' and len(vocab) > 5000:
  use_nce_loss = True