Code example #1
0
File: inputter.py  Project: sylvestersly/OpenNMT-py
def build_dataset(fields,
                  data_type,
                  src,
                  src_dir=None,
                  tgt=None,
                  src_seq_len=50,
                  tgt_seq_len=50,
                  sample_rate=0,
                  window_size=0,
                  window_stride=0,
                  window=None,
                  normalize_audio=True,
                  use_filter_pred=True,
                  image_channel_size=3):
    """Build a dataset of the requested modality ('text', 'img' or 'audio').

    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None
    """
    dataset_classes = {
        'text': TextDataset,
        'img': ImageDataset,
        'audio': AudioDataset
    }
    assert data_type in dataset_classes
    assert src is not None

    # Build the source-example iterator for the requested modality.
    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(src, "src")
    elif data_type == 'img':
        # ImageDataset.make_examples also takes a truncate argument, but it
        # was historically always left at None, so it is omitted here.
        src_examples_iter = ImageDataset.make_examples(
            src, src_dir, 'src', channel_size=image_channel_size)
    else:  # 'audio'
        src_examples_iter = AudioDataset.make_examples(
            src, src_dir, "src", sample_rate, window_size, window_stride,
            window, normalize_audio, None)

    # The target side is always text, and may be absent at translation time.
    tgt_examples_iter = (None if tgt is None
                         else TextDataset.make_examples(tgt, "tgt"))

    # The second condition guarantees nothing gets filtered at translation
    # time, when no target data is available.
    filter_pred = None
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(filter_example,
                              use_src_len=data_type == 'text',
                              max_src_len=src_seq_len,
                              max_tgt_len=tgt_seq_len)

    return dataset_classes[data_type](fields,
                                      src_examples_iter,
                                      tgt_examples_iter,
                                      filter_pred=filter_pred)
Code example #2
0
def build_dataset(fields,
                  data_type,
                  src,
                  ans,
                  src_dir=None,
                  tgt=None,
                  src_seq_len=50,
                  tgt_seq_len=50,
                  ans_seq_len=50,
                  sample_rate=0,
                  window_size=0,
                  window_stride=0,
                  window=None,
                  use_filter_pred=True):
    """Build a text dataset with source, answer ('ans') and optional target sides.

    src: path to corpus file or iterator over source data
    ans: path to corpus file or iterator over answer data
    tgt: path to corpus file, iterator over target data, or None
    """
    dataset_classes = {'text': TextDataset}
    # Validate with real exceptions instead of ``assert``: the original bound
    # the src/ans iterators inside an ``if data_type == 'text':`` that merely
    # duplicated the assert, so under ``python -O`` (asserts stripped) a
    # non-text data_type surfaced as a NameError far from the cause.
    # AssertionError is kept as the exception type for backward compatibility.
    if data_type not in dataset_classes:
        raise AssertionError("unsupported data_type: %r" % (data_type,))
    if src is None:
        raise AssertionError("src must not be None")

    # Only 'text' is supported, so the iterators can be built unconditionally.
    src_examples_iter = TextDataset.make_examples(src, "src")
    ans_examples_iter = TextDataset.make_examples(ans, "ans")

    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(tgt, "tgt")

    # the second conjunct means nothing will be filtered at translation time
    # if there is no target data
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(filter_example,
                              use_src_len=data_type == 'text',
                              max_src_len=src_seq_len,
                              max_tgt_len=tgt_seq_len,
                              max_ans_len=ans_seq_len)
    else:
        filter_pred = None

    dataset_cls = dataset_classes[data_type]
    dataset = dataset_cls(fields,
                          src_examples_iter,
                          tgt_examples_iter,
                          ans_examples_iter,
                          filter_pred=filter_pred)
    return dataset
Code example #3
0
File: inputter.py  Project: quanc1989/NanoDecoder
def build_dataset(
        fields,
        data_type,
        src,
        src_dir=None,
        tgt=None,
        src_seq_len=50,
        tgt_seq_len=50,
        src_seq_length_trunc=0,
        tgt_seq_length_trunc=0,
        flag_fft=False,
        sample_rate=0,
        window_size=0,
        window_stride=0,
        window=None,
        normalize_audio=False,
        use_filter_pred=True,
        corpus_type='train'):
    """Build a NanoDataset from signal sources and (optional) text targets.

    src: path to corpus file or iterator over source data
    tgt: path to corpus file, iterator over target data, or None

    NOTE(review): ``src_seq_length_trunc`` is accepted for signature
    compatibility but is never used here; only the target side is truncated.
    """
    # Only the 'nano' modality is supported by this fork; the text/img/audio
    # paths that existed upstream were removed as dead (commented-out) code.
    dataset_classes = {'nano': NanoDataset}

    assert data_type in dataset_classes
    assert src is not None

    # Source side: signal examples, optionally FFT-transformed (flag_fft),
    # windowed and normalized according to the audio-style parameters.
    src_examples_iter = NanoDataset.make_examples(src, src_dir, "src",
                                                  flag_fft, sample_rate,
                                                  window_size, window_stride,
                                                  window, normalize_audio,
                                                  None, corpus_type)

    # Target side is plain text, truncated to tgt_seq_length_trunc tokens.
    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(tgt,
                                                      tgt_seq_length_trunc,
                                                      "tgt")

    # The second conjunct means nothing will be filtered at translation time
    # if there is no target data.  use_src_len is always False here because
    # data_type is 'nano', never 'text' — only target length is enforced.
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(filter_example,
                              use_src_len=data_type == 'text',
                              max_src_len=src_seq_len,
                              max_tgt_len=tgt_seq_len)
    else:
        filter_pred = None

    dataset = NanoDataset(fields,
                          src_examples_iter,
                          tgt_examples_iter,
                          dynamic_dict=False,
                          filter_pred=filter_pred)
    return dataset
Code example #4
0
def build_dataset(fields, data_type, src, knl,
                  src_dir=None, tgt=None,
                  knl_seq_len=800, src_seq_len=150, tgt_seq_len=50,
                  knl_seq_length_trunc=200, src_seq_length_trunc=50, tgt_seq_length_trunc=0,
                  dynamic_dict=False, sample_rate=0,
                  window_size=0, window_stride=0, window=None,
                  normalize_audio=True, use_filter_pred=True,
                  image_channel_size=3, corpus_type='train', model_mode='default'):
    """Build a dataset with source, knowledge ('knl') and optional target sides.

    src: path to corpus file or iterator over source data
    knl: path to corpus file or iterator over knowledge data
    tgt: path to corpus file, iterator over target data, or None
    """
    dataset_classes = {
        'text': TextDataset, 'img': ImageDataset, 'audio': AudioDataset
    }
    assert data_type in dataset_classes
    assert src is not None
    assert not dynamic_dict or data_type == 'text', \
        'it is not possible to use dynamic_dict with non-text input'

    # Only the text path produces a knowledge iterator.  Defaulting it here
    # fixes a NameError at the dataset_cls(...) call on the img/audio paths,
    # where knl_examples_iter was never bound.
    knl_examples_iter = None
    if data_type == 'text':
        src_examples_iter = TextDataset.make_examples(
            src, src_seq_length_trunc, "src", corpus_type, model_mode
        )
        knl_examples_iter = TextDataset.make_examples(
            knl, knl_seq_length_trunc, "knl", corpus_type, model_mode
        )
    elif data_type == 'img':
        # there is a truncate argument as well, but it was never set to
        # anything besides None before
        src_examples_iter = ImageDataset.make_examples(
            src, src_dir, 'src', channel_size=image_channel_size
        )
    else:
        src_examples_iter = AudioDataset.make_examples(
            src, src_dir, "src", sample_rate,
            window_size, window_stride, window,
            normalize_audio, None)

    if tgt is None:
        tgt_examples_iter = None
    else:
        tgt_examples_iter = TextDataset.make_examples(
            tgt, tgt_seq_length_trunc, "tgt", corpus_type, model_mode)

    # the second conjunct means nothing will be filtered at translation time
    # if there is no target data
    if use_filter_pred and tgt_examples_iter is not None:
        filter_pred = partial(
            filter_example, use_src_len=data_type == 'text', use_knl_len=data_type == 'text',
            max_src_len=src_seq_len, max_tgt_len=tgt_seq_len, max_knl_len=knl_seq_len
        )
    else:
        filter_pred = None

    # Leftover debug print statements were removed here.
    dataset_cls = dataset_classes[data_type]
    dataset = dataset_cls(
        fields, src_examples_iter, tgt_examples_iter, knl_examples_iter,
        dynamic_dict=dynamic_dict, filter_pred=filter_pred
    )
    return dataset