# Driver set-up: choose the token separator, load the dataset for the
# requested mode and, when training, build the word and context vocabularies.
#
# NOTE(review): this section had lost its line breaks and indentation; the
# nesting below is reconstructed from the statement syntax -- confirm against
# the upstream file.  In particular, the batch-size assignment may belong to
# a conditional that begins above this section.
params.batch_size = 20

# SEPERATOR is '' for the 'char' splitter and ' ' for every other splitter.
SEPERATOR = '' if params.splitter == 'char' else ' '

if args.mode in ('train', 'eval', 'classify'):
    # A partition override replaces the run mode with 'all' for data reading.
    mode = 'all' if args.partition_override else args.mode

    dataset = Dataset(max_len=params.max_len + 1,
                      preshuffle=args.mode == 'train',
                      batch_size=params.batch_size)
    print('reading data')
    # Read every context column plus the 'text' column.
    dataset.ReadData(args.data,
                     params.context_vars + ['text'],
                     mode=mode,
                     splitter=params.splitter)

if args.mode == 'train':
    if args.vocab is None:
        # No vocabulary file given: build one from the 'text' column, using
        # params.min_vocab_count when that attribute exists and 20 otherwise.
        min_count = (params.min_vocab_count
                     if hasattr(params, 'min_vocab_count') else 20)
        vocab = Vocab.MakeFromData(dataset.GetColumn('text'),
                                   min_count=min_count)
    else:
        vocab = Vocab.Load(args.vocab)

    # One vocabulary per context variable; each value is wrapped in a
    # one-element list before being handed to Vocab.MakeFromData.
    context_vocabs = {}
    for context_var in params.context_vars:
        v = Vocab.MakeFromData(
            [[u] for u in dataset.GetColumn(context_var)],
            min_count=50, no_special_syms=True)
        context_vocabs[context_var] = v
        print('num {0}: {1}'.format(context_var, len(v)))
# Driver set-up: choose the token separator, load the dataset and, when
# training, prepare the word vocabulary (plus a character vocabulary when the
# splitter is 'word').
#
# NOTE(review): this section had lost its line breaks and indentation; the
# nesting below is reconstructed from the statement syntax -- confirm against
# the upstream file.  In particular, the batch-size assignment may belong to
# a conditional that begins above this section.
params.batch_size = 1

# SEPERATOR is '' for the 'char' splitter and ' ' for every other splitter.
SEPERATOR = '' if params.splitter == 'char' else ' '

if args.mode in ('train', 'eval', 'classify', 'uniclass', 'geoclass'):
    mode = args.mode

    dataset = Dataset(max_len=params.max_len + 1,
                      preshuffle=args.mode == 'train',
                      batch_size=params.batch_size)
    print('reading data')
    # Read every context column plus the 'text' column.
    dataset.ReadData(args.data,
                     params.context_vars + ['text'],
                     splitter=params.splitter,
                     valdata=args.valdata,
                     types=params.context_var_types)

if args.mode == 'train':
    # Word vocabulary: build it from the 'text' column unless a vocabulary
    # file was supplied.
    if args.vocab is None:
        vocab = Vocab.MakeFromData(dataset.GetColumn('text'),
                                   min_count=params.min_vocab_count)
    else:
        vocab = Vocab.Load(args.vocab)

    if params.splitter == 'word':
        # Character vocabulary: the graphemes of every vocabulary word,
        # wrapped in '{' and '}' markers, with min_count=1.
        graphemes = [['{'] + Vocab.Graphemes(x) + ['}']
                     for x in vocab.GetWords()]
        char_vocab = Vocab.MakeFromData(graphemes, min_count=1)