Code example #1 (score: 0)
    # unknown token.
    input_reader = DiscreteSequenceReader(field=args.input_field,
                                          top_k=args.vocab_size,
                                          at_least=args.vocab_min_count,
                                          lowercase=args.vocab_lower,
                                          left_pad=args.start_pad,
                                          right_pad=args.stop_pad,
                                          unknown_token=args.unknown_token,
                                          offset_output=True)

    # Read training and validation data from tsv files.
    # collect_stats=True for the training dataset so that vocab statistics
    # are collected and rare words can be replaced with unknown token.
    data_train = read_sequence_data(args.train,
                                    input_reader,
                                    skip_header=args.skip_header,
                                    collect_stats=True,
                                    batch_size=args.batch_size,
                                    gpu=args.gpu)

    data_valid = read_sequence_data(args.valid,
                                    input_reader,
                                    skip_header=args.skip_header,
                                    collect_stats=False,
                                    batch_size=args.batch_size,
                                    gpu=args.gpu)

    # Create the decoder input control logic. input_module classes can work
    # on an entire sequence (i.e. training) and work a timestep at a time
    # during prediction (using either greedy or beam search).
    decoder_input = input_module.InputGroup()
    decoder_input.add_discrete_sequence(input_reader.vocab.size,
Code example #2 (score: 0)
File: complete_feature_sample.py -- Project: kedz/ntg
    if model.get_meta("model_type") == "rnnlm":
        source_reader = None
        source_vocab = None
        target_reader = model.get_meta("input_reader")
        target_vocab = input_reader.vocab
        feature_readers = model.meta.get("feature_readers", None)
    else:
        source_reader = model.get_meta("source_reader")
        source_vocab = source_reader.vocab
        target_reader = model.get_meta("target_reader")
        target_vocab = target_reader.vocab
        feature_readers = model.meta.get("feature_readers", None)

    data = read_sequence_data(
        args.data, target_reader, skip_header=args.skip_header,
        input_sequence_readers=source_reader,
        feature_readers=feature_readers,
        collect_stats=False, batch_size=1, gpu=args.gpu)

    sweep_feature = 0
    #TODO fix vocab size to know about zero offset
    feature_size = feature_readers[sweep_feature].vocab.size - 1

    sweep_features = torch.LongTensor([x for x in range(1, feature_size + 1)])

    for batch in data.iter_batch():

        encoder_length = batch.encoder_length.repeat(feature_size)
        encoder_inputs = [batch.encoder_inputs[0].repeat(feature_size, 1)]
        decoder_inputs = [Variable(batch.decoder_inputs[0].data[:,:1].repeat(feature_size, 1))]
        decoder_features = []
Code example #3 (score: 0)
        lowercase=args.target_vocab_lower, left_pad=args.dec_start_token, 
        right_pad=args.dec_stop_token,
        unknown_token=args.unknown_token,
        offset_output=True)

    feature_readers = [DiscreteFeatureReader(
                           field=field, 
                           missing_token=args.missing_feature_value)
                       for field in args.feature_fields]

    # Read training and validation data from tsv files.
    # collect_stats=True for the training dataset so that vocab statistics
    # are collected and rare words can be replaced with unknown token.
    data_train = read_sequence_data(
        args.train, target_reader, 
        input_sequence_readers=source_reader,
        feature_readers=feature_readers,
        skip_header=args.skip_header,
        collect_stats=True, batch_size=args.batch_size, gpu=args.gpu)

    data_valid = read_sequence_data(
        args.valid, target_reader, 
        input_sequence_readers=source_reader,
        feature_readers=feature_readers,
        skip_header=args.skip_header,
        collect_stats=False, batch_size=args.batch_size, gpu=args.gpu)

    # Create the encoder/decoder input control logic. 
    # input_module classes can work
    # on an entire sequence (i.e. training) and work a timestep at a time
    # during prediction (using either greedy or beam search).
    encoder_input = input_module.InputGroup()