# unknown token. input_reader = DiscreteSequenceReader(field=args.input_field, top_k=args.vocab_size, at_least=args.vocab_min_count, lowercase=args.vocab_lower, left_pad=args.start_pad, right_pad=args.stop_pad, unknown_token=args.unknown_token, offset_output=True) # Read training and validation data from tsv files. # collect_stats=True for the training dataset so that vocab statistics # are collected and rare words can be replaced with unknown token. data_train = read_sequence_data(args.train, input_reader, skip_header=args.skip_header, collect_stats=True, batch_size=args.batch_size, gpu=args.gpu) data_valid = read_sequence_data(args.valid, input_reader, skip_header=args.skip_header, collect_stats=False, batch_size=args.batch_size, gpu=args.gpu) # Create the decoder input control logic. input_module classes can work # on an entire sequence (i.e. training) and work a timestep at a time # during prediction (using either greedy or beam search). decoder_input = input_module.InputGroup() decoder_input.add_discrete_sequence(input_reader.vocab.size,
# Recover the readers/vocabularies stored in the model's metadata at
# training time. An "rnnlm" model is an unconditional language model with
# no source sequence; any other model type is seq2seq with separate
# source and target readers.
if model.get_meta("model_type") == "rnnlm":
    source_reader = None
    source_vocab = None
    target_reader = model.get_meta("input_reader")
    # BUG FIX: was `input_reader.vocab`, but no `input_reader` is defined
    # at this point; use the reader just fetched from the model metadata,
    # matching the seq2seq branch below.
    target_vocab = target_reader.vocab
else:
    source_reader = model.get_meta("source_reader")
    source_vocab = source_reader.vocab
    target_reader = model.get_meta("target_reader")
    target_vocab = target_reader.vocab
# Feature readers are optional for both model types; the lookup is the
# same in either branch, so it is done once here.
feature_readers = model.meta.get("feature_readers", None)

# Read the evaluation data one example at a time (batch_size=1) so that
# each example can be tiled across every value of the swept feature below.
data = read_sequence_data(
    args.data, target_reader,
    skip_header=args.skip_header,
    input_sequence_readers=source_reader,
    feature_readers=feature_readers,
    collect_stats=False,
    batch_size=1,
    gpu=args.gpu)

# Sweep the first discrete feature over all of its possible values.
sweep_feature = 0
# TODO fix vocab size to know about zero offset
# Values start at 1 -- index 0 is presumably the reserved/offset slot
# (see TODO above).
feature_size = feature_readers[sweep_feature].vocab.size - 1
sweep_features = torch.LongTensor(list(range(1, feature_size + 1)))

for batch in data.iter_batch():
    # Tile the single example feature_size times, once per swept value.
    encoder_length = batch.encoder_length.repeat(feature_size)
    encoder_inputs = [batch.encoder_inputs[0].repeat(feature_size, 1)]
    # Seed the decoder with only the first timestep (the start token),
    # repeated once for each feature value.
    decoder_inputs = [Variable(
        batch.decoder_inputs[0].data[:, :1].repeat(feature_size, 1))]
    decoder_features = []
lowercase=args.target_vocab_lower,
    left_pad=args.dec_start_token,    # decoder start-of-sequence token
    right_pad=args.dec_stop_token,    # decoder end-of-sequence token
    unknown_token=args.unknown_token,
    # NOTE(review): offset_output=True presumably shifts output indices so
    # index 0 stays reserved (e.g. for padding) -- confirm in the reader.
    offset_output=True)

# One discrete feature reader per feature column named on the command
# line; missing values are mapped to args.missing_feature_value.
feature_readers = [DiscreteFeatureReader(
    field=field,
    missing_token=args.missing_feature_value)
    for field in args.feature_fields]

# Read training and validation data from tsv files.
# collect_stats=True for the training dataset so that vocab statistics
# are collected and rare words can be replaced with unknown token.
data_train = read_sequence_data(
    args.train, target_reader,
    input_sequence_readers=source_reader,
    feature_readers=feature_readers,
    skip_header=args.skip_header,
    collect_stats=True,
    batch_size=args.batch_size,
    gpu=args.gpu)
data_valid = read_sequence_data(
    args.valid, target_reader,
    input_sequence_readers=source_reader,
    feature_readers=feature_readers,
    skip_header=args.skip_header,
    collect_stats=False,   # reuse the stats collected from the training set
    batch_size=args.batch_size,
    gpu=args.gpu)

# Create the encoder/decoder input control logic.
# input_module classes can work
# on an entire sequence (i.e. training) and work a timestep at a time
# during prediction (using either greedy or beam search).
encoder_input = input_module.InputGroup()