Code Example #1
File: DataLoader.py  Project: wkulczi/NABU
def LoadDataset(train_path, eval_path, test_path, vocab_path, sentencepiece):
    # load the train and eval datasets
    with open(train_path, 'rb') as f:
        train_set = pickle.load(f)
    with open(eval_path, 'rb') as f:
        eval_set = pickle.load(f)
    with open(test_path, 'rb') as f:
        test_set = pickle.load(f)

    train_inp, train_tgt = zip(*train_set)
    eval_inp, eval_tgt = zip(*eval_set)

    # load the vocab
    if sentencepiece == 'True':
        sp = spm.SentencePieceProcessor()
        sp.load(vocab_path)

        input_tensor = [sp.encode_as_ids(w) for w in train_inp]
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            input_tensor, padding='post')
        target_tensor = [sp.encode_as_ids(w) for w in train_tgt]
        target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            target_tensor, padding='post')
        eval_inp = [sp.encode_as_ids(w) for w in eval_inp]
        eval_inp = tf.keras.preprocessing.sequence.pad_sequences(
            eval_inp, padding='post')
        eval_tgt = [sp.encode_as_ids(w) for w in eval_tgt]
        eval_tgt = tf.keras.preprocessing.sequence.pad_sequences(
            eval_tgt, padding='post')

        test_inp = [sp.encode_as_ids(w) for w in test_set]
        test_inp = tf.keras.preprocessing.sequence.pad_sequences(
            test_inp, padding='post')

        return input_tensor, target_tensor, \
               eval_inp, eval_tgt, test_inp, sp, max_length(target_tensor)
    else:
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        input_tensor = _tensorize(vocab, train_inp)
        target_tensor = _tensorize(vocab, train_tgt)
        eval_inp = _tensorize(vocab, eval_inp)
        eval_tgt = _tensorize(vocab, eval_tgt)
        test_inp = _tensorize(vocab, test_set)

        return input_tensor, target_tensor, \
               eval_inp, eval_tgt, test_inp, vocab, max_length(target_tensor)
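
This snippet assumes the module-level imports of DataLoader.py (pickle, sentencepiece as spm, tensorflow as tf) and two helpers, _tensorize and max_length, defined elsewhere in the file. Below is a minimal sketch of what those helpers presumably look like, inferred only from how they are called here; it is a hypothetical reconstruction, not the project's actual code.

import pickle
import sentencepiece as spm
import tensorflow as tf


def _tensorize(vocab, texts):
    # Assumed: vocab is a tf.keras Tokenizer; map the texts to id sequences
    # and pad them to a common length, as the call sites above require.
    sequences = vocab.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(sequences,
                                                         padding='post')


def max_length(tensor):
    # Length of the longest sequence in the (possibly already padded) batch.
    return max(len(t) for t in tensor)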
Code Example #2
def ProcessMultilingualDataset(args, set=None):
    """
  Takes in the prepocessed Datasets and converts them
  into tensorflow tensors, Adds padding to make the
  targets uniform and packages the individual datasets
  as a combined tf.data.Dataset object.
  Also shuffles and batches the dataset.

  Note : The datasets are not concatenated into one big
  dataset if Knowledge Distillation is being used. We would
  require all datasets seperately to pass each batch through
  both the teacher model and student model. Then the fucntion
  returns a dict with all datasets.

  :param args: Args obj which contains paths to the preprocessed files
  :type args: ArgParse object
  :return: The multilingual dataset along with source and targer vocabs
  and their sizes and maximum target sequence length.
  :rtype: tf.data.Dataset object, vocab objects (src and tgt vocab),
          int ( max sequence length ), int (total buffer size,
          int ( steps per epoch, not much used )
  """

    multilingual_dataset = {}
    TRAIN_BUFFER_SIZE = 0
    EVAL_BUFFER_SIZE = 0

    if args.model == 'gat':
        dataset, src_vocab, tgt_vocab = LoadMultlingualDataset(args)
        for lang in languages:
            if args.sentencepiece == 'False':
                dataset[lang + '_train_tgt'] = _tensorize(
                    src_vocab, dataset[lang + '_train_tgt'])
                dataset[lang + '_eval_tgt'] = _tensorize(
                    src_vocab, dataset[lang + '_eval_tgt'])

            else:
                dataset[lang + '_train_tgt'] = [
                    tgt_vocab.encode_as_ids(w)
                    for w in dataset[lang + '_train_tgt']
                ]
                dataset[
                    lang +
                    '_train_tgt'] = tf.keras.preprocessing.sequence.pad_sequences(
                        dataset[lang + '_train_tgt'], padding='post')
                dataset[lang + '_eval_tgt'] = [
                    tgt_vocab.encode_as_ids(w)
                    for w in dataset[lang + '_eval_tgt']
                ]
                dataset[
                    lang +
                    '_eval_tgt'] = tf.keras.preprocessing.sequence.pad_sequences(
                        dataset[lang + '_eval_tgt'], padding='post')

            for part in ['train', 'eval', 'test']:
                dataset[lang + '_' + part + '_nodes'] = padding(
                    _tensorize(src_vocab,
                               dataset[lang + '_' + part + '_nodes']), 16)
                dataset[lang + '_' + part + '_labels'] = padding(
                    _tensorize(src_vocab,
                               dataset[lang + '_' + part + '_labels']), 16)
                dataset[lang + '_' + part + '_node1'] = padding(
                    _tensorize(src_vocab,
                               dataset[lang + '_' + part + '_node1']), 16)
                dataset[lang + '_' + part + '_node2'] = padding(
                    _tensorize(src_vocab,
                               dataset[lang + '_' + part + '_node2']), 16)

            TRAIN_BUFFER_SIZE += (dataset[lang + '_train_nodes']).shape[0]
            EVAL_BUFFER_SIZE += (dataset[lang + '_eval_nodes']).shape[0]

        MaxSeqSize = max(dataset['eng_train_tgt'].shape[1],
                         dataset['ger_train_tgt'].shape[1],
                         dataset['rus_train_tgt'].shape[1])

        MULTI_BUFFER_SIZE = 0
        BATCH_SIZE = args.batch_size
        for lang in languages:
            dataset[lang + '_train_tgt'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_train_tgt'], padding='post'), MaxSeqSize)
            dataset[lang + '_eval_tgt'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_eval_tgt'], padding='post'), MaxSeqSize)

            BUFFER_SIZE = len(dataset[lang + '_train_tgt'])
            MULTI_BUFFER_SIZE += BUFFER_SIZE
            dataset_size = dataset[lang + '_train_tgt'].shape[0]

            for part in ['train', 'eval']:
                if part == 'train':
                    multilingual_dataset[
                        lang + '_' + part +
                        '_set'] = tf.data.Dataset.from_tensor_slices(
                            (dataset[lang + '_' + part + '_nodes'],
                             dataset[lang + '_' + part + '_labels'],
                             dataset[lang + '_' + part + '_node1'],
                             dataset[lang + '_' + part + '_node2'],
                             dataset[lang + '_' + part + '_tgt']))
                    multilingual_dataset[lang + '_' + part +
                                         '_set'] = multilingual_dataset[
                                             lang + '_' + part +
                                             '_set'].shuffle(BUFFER_SIZE)
                else:
                    multilingual_dataset[
                        lang + '_' + part +
                        '_set'] = tf.data.Dataset.from_tensor_slices(
                            (dataset[lang + '_' + part + '_nodes'],
                             dataset[lang + '_' + part + '_labels'],
                             dataset[lang + '_' + part + '_node1'],
                             dataset[lang + '_' + part + '_node2'],
                             dataset[lang + '_' + part + '_tgt']))

            multilingual_dataset[
                lang + '_test_set'] = tf.data.Dataset.from_tensor_slices(
                    (dataset[lang + '_test_nodes'],
                     dataset[lang + '_test_labels'],
                     dataset[lang + '_test_node1'],
                     dataset[lang + '_test_node2']))

        final_dataset = {}
        for opt in ['train', 'test', 'eval']:
            final_dataset[opt + '_set'] = \
              multilingual_dataset['eng_' + opt + '_set'].concatenate(
                multilingual_dataset['ger_' + opt + '_set'].concatenate(
                  multilingual_dataset['rus_' + opt + '_set']))

        if args.sentencepiece == 'False':
            src_vocab_size = len(src_vocab.word_index) + 1
            tgt_vocab_size = args.vocab_size
        else:
            src_vocab_size = len(src_vocab.word_index) + 1
            tgt_vocab_size = tgt_vocab.get_piece_size()

        final_dataset['train_set'] = final_dataset['train_set'].shuffle(
            MULTI_BUFFER_SIZE)
        final_dataset['train_set'] = final_dataset['train_set'].batch(
            BATCH_SIZE, drop_remainder=True)
        final_dataset['eval_set'] = final_dataset['eval_set'].batch(
            BATCH_SIZE, drop_remainder=True)
        final_dataset['test_set'] = final_dataset['test_set'].batch(
            BATCH_SIZE, drop_remainder=False)
        steps_per_epoch = int(MULTI_BUFFER_SIZE // BATCH_SIZE)

        print('BUFFER SIZE ' + str(MULTI_BUFFER_SIZE))
        print("Dataset shapes : ")

        return (final_dataset, src_vocab, src_vocab_size, tgt_vocab,
                tgt_vocab_size, MULTI_BUFFER_SIZE, steps_per_epoch, MaxSeqSize)

    else:
        dataset, vocab = LoadMultlingualDataset(args)
        for lang in languages:
            if args.sentencepiece == 'False':
                dataset[lang + '_train_src'] = _tensorize(
                    vocab, dataset[lang + '_train_src'])
                dataset[lang + '_eval_src'] = _tensorize(
                    vocab, dataset[lang + '_eval_src'])
                dataset[lang + '_train_tgt'] = _tensorize(
                    vocab, dataset[lang + '_train_tgt'])
                dataset[lang + '_eval_tgt'] = _tensorize(
                    vocab, dataset[lang + '_eval_tgt'])
                dataset[lang + '_test_src'] = _tensorize(
                    vocab, dataset[lang + '_test_src'])

            else:
                dataset[lang + '_train_src'] = [
                    vocab.encode_as_ids(w)
                    for w in dataset[lang + '_train_src']
                ]
                dataset[
                    lang +
                    '_train_src'] = tf.keras.preprocessing.sequence.pad_sequences(
                        dataset[lang + '_train_src'], padding='post')
                dataset[lang + '_eval_src'] = [
                    vocab.encode_as_ids(w) for w in dataset[lang + '_eval_src']
                ]
                dataset[
                    lang +
                    '_eval_src'] = tf.keras.preprocessing.sequence.pad_sequences(
                        dataset[lang + '_eval_src'], padding='post')

                dataset[lang + '_train_tgt'] = [
                    vocab.encode_as_ids(w)
                    for w in dataset[lang + '_train_tgt']
                ]
                dataset[
                    lang +
                    '_train_tgt'] = tf.keras.preprocessing.sequence.pad_sequences(
                        dataset[lang + '_train_tgt'], padding='post')
                dataset[lang + '_eval_tgt'] = [
                    vocab.encode_as_ids(w) for w in dataset[lang + '_eval_tgt']
                ]
                dataset[
                    lang +
                    '_eval_tgt'] = tf.keras.preprocessing.sequence.pad_sequences(
                        dataset[lang + '_eval_tgt'], padding='post')
                dataset[lang + '_test_src'] = [
                    vocab.encode_as_ids(w) for w in dataset[lang + '_test_src']
                ]
                dataset[
                    lang +
                    '_test_src'] = tf.keras.preprocessing.sequence.pad_sequences(
                        dataset[lang + '_test_src'], padding='post')

            TRAIN_BUFFER_SIZE += (dataset[lang + '_train_src']).shape[0]
            EVAL_BUFFER_SIZE += (dataset[lang + '_eval_src']).shape[0]

        MaxSeqSize = max(dataset['eng_train_tgt'].shape[1],
                         dataset['ger_train_tgt'].shape[1],
                         dataset['rus_train_tgt'].shape[1])
        SrcSeqSize = max(dataset['eng_train_src'].shape[1],
                         dataset['ger_train_src'].shape[1],
                         dataset['rus_train_src'].shape[1])
        TestSeqSize = max(dataset['eng_test_src'].shape[1],
                          dataset['ger_test_src'].shape[1],
                          dataset['rus_test_src'].shape[1])

        MULTI_BUFFER_SIZE = 0
        BATCH_SIZE = args.batch_size
        for lang in languages:
            dataset[lang + '_train_src'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_train_src'], padding='post'), SrcSeqSize)
            dataset[lang + '_eval_src'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_eval_src'], padding='post'), SrcSeqSize)

            dataset[lang + '_train_tgt'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_train_tgt'], padding='post'), MaxSeqSize)
            dataset[lang + '_eval_tgt'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_eval_tgt'], padding='post'), MaxSeqSize)
            dataset[lang + '_test_src'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_test_src'], padding='post'), TestSeqSize)

            BUFFER_SIZE = len(dataset[lang + '_train_tgt'])
            MULTI_BUFFER_SIZE += BUFFER_SIZE

            for part in ['train', 'eval']:
                if part == 'train':
                    multilingual_dataset[
                        lang + '_' + part +
                        '_set'] = tf.data.Dataset.from_tensor_slices(
                            (dataset[lang + '_' + part + '_src'],
                             dataset[lang + '_' + part + '_tgt']))
                    multilingual_dataset[lang + '_' + part +
                                         '_set'] = multilingual_dataset[
                                             lang + '_' + part +
                                             '_set'].shuffle(BUFFER_SIZE)
                else:
                    multilingual_dataset[
                        lang + '_' + part +
                        '_set'] = tf.data.Dataset.from_tensor_slices(
                            (dataset[lang + '_' + part + '_src'],
                             dataset[lang + '_' + part + '_tgt']))

            multilingual_dataset[
                lang + '_test_set'] = tf.data.Dataset.from_tensor_slices(
                    dataset[lang + '_test_src'])

        final_dataset = {}
        for opt in ['train', 'test', 'eval']:
            final_dataset[opt + '_set'] = \
              multilingual_dataset['eng_' + opt + '_set'].concatenate(
                multilingual_dataset['ger_' + opt + '_set'].concatenate(
                  multilingual_dataset['rus_' + opt + '_set']))

        if args.sentencepiece == 'False':
            tgt_vocab_size = args.vocab_size
        else:
            tgt_vocab_size = vocab.get_piece_size()

        final_dataset['train_set'] = final_dataset['train_set'].shuffle(
            MULTI_BUFFER_SIZE)
        final_dataset['train_set'] = final_dataset['train_set'].batch(
            BATCH_SIZE, drop_remainder=True)
        final_dataset['eval_set'] = final_dataset['eval_set'].batch(
            BATCH_SIZE, drop_remainder=True)
        final_dataset['test_set'] = final_dataset['test_set'].batch(
            BATCH_SIZE, drop_remainder=False)
        steps_per_epoch = int(MULTI_BUFFER_SIZE // BATCH_SIZE)

        print('BUFFER SIZE ' + str(MULTI_BUFFER_SIZE))
        print("Dataset shapes : ")

        return (final_dataset, vocab, tgt_vocab_size, MULTI_BUFFER_SIZE,
                steps_per_epoch, MaxSeqSize)
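
A hedged usage sketch for the seq2seq (non-GAT) branch above. The attributes read from args inside the function are model, sentencepiece, batch_size and vocab_size; the Namespace construction, the concrete values, and whatever dataset/vocab paths LoadMultlingualDataset additionally expects on args are assumptions made only for illustration.

import argparse

# Hypothetical args object; LoadMultlingualDataset will most likely also
# expect dataset and vocab paths on it, which are omitted here.
args = argparse.Namespace(model='seq2seq', sentencepiece='True',
                          batch_size=32, vocab_size=16000)

(final_dataset, vocab, tgt_vocab_size, buffer_size,
 steps_per_epoch, max_seq_len) = ProcessMultilingualDataset(args)

# Each training batch is a (source, target) pair of padded id matrices.
for src_batch, tgt_batch in final_dataset['train_set'].take(1):
    print(src_batch.shape, tgt_batch.shape)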
Code Example #3
File: DataLoader.py  Project: wkulczi/NABU
def LoadGatDataset(train_path,
                   eval_path,
                   test_path,
                   srv_vocab,
                   tgt_vocab,
                   opt,
                   sentencepiece,
                   lang,
                   num_examples=None):
    train_ = {}
    eval_ = {}
    test_ = {}
    # load the train and eval datasets
    with open(train_path, 'rb') as f:
        train_set = pickle.load(f)
    with open(eval_path, 'rb') as f:
        eval_set = pickle.load(f)
    with open(test_path, 'rb') as f:
        test_set = pickle.load(f)

    # load vocab
    if sentencepiece == 'True':
        sp = spm.SentencePieceProcessor()
        sp.load(tgt_vocab)
    with open(srv_vocab, 'rb') as f:
        src_vocab = pickle.load(f)

    train_input, train_tgt = zip(*train_set)
    eval_input, eval_tgt = zip(*eval_set)
    (train_nodes, train_labels, train_node1, train_node2) = zip(*train_input)
    (eval_nodes, eval_labels, eval_node1, eval_node2) = zip(*eval_input)
    (test_nodes, test_labels, test_node1, test_node2) = zip(*test_set)

    train_["train_node_tensor"] = _tensorize(src_vocab, train_nodes)
    train_["train_label_tensor"] = _tensorize(src_vocab, train_labels)
    train_["train_node1_tensor"] = _tensorize(src_vocab, train_node1)
    train_["train_node2_tensor"] = _tensorize(src_vocab, train_node2)

    eval_["eval_node_tensor"] = _tensorize(src_vocab, eval_nodes)
    eval_["eval_label_tensor"] = _tensorize(src_vocab, eval_labels)
    eval_["eval_node1_tensor"] = _tensorize(src_vocab, eval_node1)
    eval_["eval_node2_tensor"] = _tensorize(src_vocab, eval_node2)

    test_["test_node_tensor"] = _tensorize(src_vocab, test_nodes)
    test_["test_label_tensor"] = _tensorize(src_vocab, test_labels)
    test_["test_node1_tensor"] = _tensorize(src_vocab, test_node1)
    test_["test_node2_tensor"] = _tensorize(src_vocab, test_node2)

    if sentencepiece == 'True':
        train_tgt_tensor = [sp.encode_as_ids(w) for w in train_tgt]
        train_[
            "train_tgt_tensor"] = tf.keras.preprocessing.sequence.pad_sequences(
                train_tgt_tensor, padding='post')
        eval_tgt_tensor = [sp.encode_as_ids(w) for w in eval_tgt]
        eval_[
            "eval_tgt_tensor"] = tf.keras.preprocessing.sequence.pad_sequences(
                eval_tgt_tensor, padding='post')
        target_vocab = sp
    else:
        train_tgt_tensor = src_vocab.texts_to_sequences(train_tgt)
        train_[
            "train_tgt_tensor"] = tf.keras.preprocessing.sequence.pad_sequences(
                train_tgt_tensor, padding='post')
        eval_tgt_tensor = src_vocab.texts_to_sequences(eval_tgt)
        eval_[
            "eval_tgt_tensor"] = tf.keras.preprocessing.sequence.pad_sequences(
                eval_tgt_tensor, padding='post')
        target_vocab = src_vocab

    return (train_, eval_, test_, src_vocab, target_vocab,
            max_length(train_tgt_tensor))
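
For completeness, a sketch of how the dicts returned by LoadGatDataset could be packaged into a shuffled, batched tf.data.Dataset, mirroring what ProcessMultilingualDataset does for the GAT model. The file paths, batch size, and language are placeholders, not values taken from the project.

train_, eval_, test_, src_vocab, tgt_vocab, max_tgt_len = LoadGatDataset(
    'data/eng_train.pkl', 'data/eng_eval.pkl', 'data/eng_test.pkl',  # placeholder paths
    'vocabs/src_vocab.pkl', 'vocabs/tgt.model',                      # placeholder vocab paths
    opt=None, sentencepiece='True', lang='eng')

# Slice the graph tensors and targets together, then shuffle and batch.
train_set = tf.data.Dataset.from_tensor_slices(
    (train_['train_node_tensor'], train_['train_label_tensor'],
     train_['train_node1_tensor'], train_['train_node2_tensor'],
     train_['train_tgt_tensor']))
train_set = train_set.shuffle(len(train_['train_tgt_tensor'])).batch(
    32, drop_remainder=True)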