import pickle

import sentencepiece as spm
import tensorflow as tf

# languages covered by the multilingual pipeline; the dataset dict keys
# below are prefixed with these codes
languages = ['eng', 'ger', 'rus']


def LoadDataset(train_path, eval_path, test_path, vocab_path, sentencepiece):
    # load the train, eval and test datasets
    with open(train_path, 'rb') as f:
        train_set = pickle.load(f)
    with open(eval_path, 'rb') as f:
        eval_set = pickle.load(f)
    with open(test_path, 'rb') as f:
        test_set = pickle.load(f)
    train_inp, train_tgt = zip(*train_set)
    eval_inp, eval_tgt = zip(*eval_set)

    # load the vocab
    if sentencepiece == 'True':
        sp = spm.SentencePieceProcessor()
        sp.load(vocab_path)
        input_tensor = [sp.encode_as_ids(w) for w in train_inp]
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            input_tensor, padding='post')
        target_tensor = [sp.encode_as_ids(w) for w in train_tgt]
        target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            target_tensor, padding='post')
        eval_inp = [sp.encode_as_ids(w) for w in eval_inp]
        eval_inp = tf.keras.preprocessing.sequence.pad_sequences(
            eval_inp, padding='post')
        eval_tgt = [sp.encode_as_ids(w) for w in eval_tgt]
        eval_tgt = tf.keras.preprocessing.sequence.pad_sequences(
            eval_tgt, padding='post')
        test_inp = [sp.encode_as_ids(w) for w in test_set]
        test_inp = tf.keras.preprocessing.sequence.pad_sequences(
            test_inp, padding='post')

        return (input_tensor, target_tensor, eval_inp, eval_tgt,
                test_inp, sp, max_length(target_tensor))
    else:
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)
        input_tensor = _tensorize(vocab, train_inp)
        target_tensor = _tensorize(vocab, train_tgt)
        eval_inp = _tensorize(vocab, eval_inp)
        eval_tgt = _tensorize(vocab, eval_tgt)
        test_inp = _tensorize(vocab, test_set)

        return (input_tensor, target_tensor, eval_inp, eval_tgt,
                test_inp, vocab, max_length(target_tensor))
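# Example usage of LoadDataset (a minimal sketch; the paths are hypothetical
# and the helpers `_tensorize` and `max_length` are assumed to be defined
# elsewhere in this module):
#
#   (input_tensor, target_tensor, eval_inp, eval_tgt, test_inp,
#    vocab, max_seq_len) = LoadDataset(
#       'data/train.pkl', 'data/eval.pkl', 'data/test.pkl',
#       'data/vocab.pkl', sentencepiece='False')
#
# With sentencepiece='True', vocab_path should point to a trained
# sentencepiece .model file and the returned vocab object is the
# SentencePieceProcessor instance.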
def ProcessMultilingualDataset(args, set=None):
    """
    Takes the preprocessed datasets and converts them into TensorFlow
    tensors, adds padding to make the targets uniform, and packages the
    individual datasets as a combined tf.data.Dataset object. Also
    shuffles and batches the dataset.

    Note: the datasets are not concatenated into one big dataset if
    Knowledge Distillation is being used; we need each dataset separately
    to pass every batch through both the teacher model and the student
    model, so in that case the function returns a dict with all datasets.

    :param args: Args obj which contains paths to the preprocessed files
    :type args: ArgParse object
    :return: The multilingual dataset along with the source and target
        vocabs, their sizes, the maximum target sequence length, the
        total buffer size and the steps per epoch (not much used)
    :rtype: tf.data.Dataset object, vocab objects (src and tgt vocab),
        int (max sequence length), int (total buffer size),
        int (steps per epoch)
    """
    multilingual_dataset = {}
    TRAIN_BUFFER_SIZE = 0
    EVAL_BUFFER_SIZE = 0

    if args.model == 'gat':
        dataset, src_vocab, tgt_vocab = LoadMultlingualDataset(args)
        for lang in languages:
            # tensorize the targets, either with the word-level vocab or
            # with the sentencepiece model
            if args.sentencepiece == 'False':
                dataset[lang + '_train_tgt'] = _tensorize(
                    src_vocab, dataset[lang + '_train_tgt'])
                dataset[lang + '_eval_tgt'] = _tensorize(
                    src_vocab, dataset[lang + '_eval_tgt'])
            else:
                for key in ['train_tgt', 'eval_tgt']:
                    dataset[lang + '_' + key] = [
                        tgt_vocab.encode_as_ids(w)
                        for w in dataset[lang + '_' + key]
                    ]
                    dataset[lang + '_' + key] = \
                        tf.keras.preprocessing.sequence.pad_sequences(
                            dataset[lang + '_' + key], padding='post')

            # tensorize the graph components and pad them to length 16
            for part in ['train', 'eval', 'test']:
                for comp in ['nodes', 'labels', 'node1', 'node2']:
                    key = lang + '_' + part + '_' + comp
                    dataset[key] = padding(
                        _tensorize(src_vocab, dataset[key]), 16)

            TRAIN_BUFFER_SIZE += dataset[lang + '_train_nodes'].shape[0]
            EVAL_BUFFER_SIZE += dataset[lang + '_eval_nodes'].shape[0]

        MaxSeqSize = max(dataset['eng_train_tgt'].shape[1],
                         dataset['ger_train_tgt'].shape[1],
                         dataset['rus_train_tgt'].shape[1])
        MULTI_BUFFER_SIZE = 0
        BATCH_SIZE = args.batch_size

        for lang in languages:
            # pad all targets to the same maximum length across languages
            dataset[lang + '_train_tgt'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_train_tgt'], padding='post'),
                MaxSeqSize)
            dataset[lang + '_eval_tgt'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_eval_tgt'], padding='post'),
                MaxSeqSize)
            BUFFER_SIZE = len(dataset[lang + '_train_tgt'])
            MULTI_BUFFER_SIZE += BUFFER_SIZE

            for part in ['train', 'eval']:
                multilingual_dataset[lang + '_' + part + '_set'] = \
                    tf.data.Dataset.from_tensor_slices(
                        (dataset[lang + '_' + part + '_nodes'],
                         dataset[lang + '_' + part + '_labels'],
                         dataset[lang + '_' + part + '_node1'],
                         dataset[lang + '_' + part + '_node2'],
                         dataset[lang + '_' + part + '_tgt']))
                if part == 'train':
                    multilingual_dataset[lang + '_' + part + '_set'] = \
                        multilingual_dataset[
                            lang + '_' + part + '_set'].shuffle(BUFFER_SIZE)
            multilingual_dataset[lang + '_test_set'] = \
                tf.data.Dataset.from_tensor_slices(
                    (dataset[lang + '_test_nodes'],
                     dataset[lang + '_test_labels'],
                     dataset[lang + '_test_node1'],
                     dataset[lang + '_test_node2']))

        # concatenate the per-language datasets into one
        final_dataset = {}
        for opt in ['train', 'test', 'eval']:
            final_dataset[opt + '_set'] = \
                multilingual_dataset['eng_' + opt + '_set'].concatenate(
                    multilingual_dataset['ger_' + opt + '_set'].concatenate(
                        multilingual_dataset['rus_' + opt + '_set']))

        src_vocab_size = len(src_vocab.word_index) + 1
        if args.sentencepiece == 'False':
            tgt_vocab_size = args.vocab_size
        else:
            tgt_vocab_size = tgt_vocab.get_piece_size()

        final_dataset['train_set'] = final_dataset['train_set'].shuffle(
            MULTI_BUFFER_SIZE)
        final_dataset['train_set'] = final_dataset['train_set'].batch(
            BATCH_SIZE, drop_remainder=True)
        final_dataset['eval_set'] = final_dataset['eval_set'].batch(
            BATCH_SIZE, drop_remainder=True)
        final_dataset['test_set'] = final_dataset['test_set'].batch(
            BATCH_SIZE, drop_remainder=False)
        steps_per_epoch = MULTI_BUFFER_SIZE // BATCH_SIZE
        print('BUFFER SIZE ' + str(MULTI_BUFFER_SIZE))

        return (final_dataset, src_vocab, src_vocab_size, tgt_vocab,
                tgt_vocab_size, MULTI_BUFFER_SIZE, steps_per_epoch,
                MaxSeqSize)
    else:
        dataset, vocab = LoadMultlingualDataset(args)
        for lang in languages:
            if args.sentencepiece == 'False':
                dataset[lang + '_train_src'] = _tensorize(
                    vocab, dataset[lang + '_train_src'])
                dataset[lang + '_eval_src'] = _tensorize(
                    vocab, dataset[lang + '_eval_src'])
                dataset[lang + '_train_tgt'] = _tensorize(
                    vocab, dataset[lang + '_train_tgt'])
                dataset[lang + '_eval_tgt'] = _tensorize(
                    vocab, dataset[lang + '_eval_tgt'])
                dataset[lang + '_test_src'] = _tensorize(
                    vocab, dataset[lang + '_test_src'])
            else:
                for key in ['train_src', 'eval_src', 'train_tgt',
                            'eval_tgt', 'test_src']:
                    dataset[lang + '_' + key] = [
                        vocab.encode_as_ids(w)
                        for w in dataset[lang + '_' + key]
                    ]
                    dataset[lang + '_' + key] = \
                        tf.keras.preprocessing.sequence.pad_sequences(
                            dataset[lang + '_' + key], padding='post')

            TRAIN_BUFFER_SIZE += dataset[lang + '_train_src'].shape[0]
            EVAL_BUFFER_SIZE += dataset[lang + '_eval_src'].shape[0]

        MaxSeqSize = max(dataset['eng_train_tgt'].shape[1],
                         dataset['ger_train_tgt'].shape[1],
                         dataset['rus_train_tgt'].shape[1])
        SrcSeqSize = max(dataset['eng_train_src'].shape[1],
                         dataset['ger_train_src'].shape[1],
                         dataset['rus_train_src'].shape[1])
        TestSeqSize = max(dataset['eng_test_src'].shape[1],
                          dataset['ger_test_src'].shape[1],
                          dataset['rus_test_src'].shape[1])
        MULTI_BUFFER_SIZE = 0
        BATCH_SIZE = args.batch_size

        for lang in languages:
            # pad sources and targets to common lengths across languages
            dataset[lang + '_train_src'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_train_src'], padding='post'),
                SrcSeqSize)
            dataset[lang + '_eval_src'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_eval_src'], padding='post'),
                SrcSeqSize)
            dataset[lang + '_train_tgt'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_train_tgt'], padding='post'),
                MaxSeqSize)
            dataset[lang + '_eval_tgt'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_eval_tgt'], padding='post'),
                MaxSeqSize)
            dataset[lang + '_test_src'] = padding(
                tf.keras.preprocessing.sequence.pad_sequences(
                    dataset[lang + '_test_src'], padding='post'),
                TestSeqSize)
            BUFFER_SIZE = len(dataset[lang + '_train_tgt'])
            MULTI_BUFFER_SIZE += BUFFER_SIZE

            for part in ['train', 'eval']:
                multilingual_dataset[lang + '_' + part + '_set'] = \
                    tf.data.Dataset.from_tensor_slices(
                        (dataset[lang + '_' + part + '_src'],
                         dataset[lang + '_' + part + '_tgt']))
                if part == 'train':
                    multilingual_dataset[lang + '_' + part + '_set'] = \
                        multilingual_dataset[
                            lang + '_' + part + '_set'].shuffle(BUFFER_SIZE)
            multilingual_dataset[lang + '_test_set'] = \
                tf.data.Dataset.from_tensor_slices(
                    dataset[lang + '_test_src'])

        final_dataset = {}
        for opt in ['train', 'test', 'eval']:
            final_dataset[opt + '_set'] = \
                multilingual_dataset['eng_' + opt + '_set'].concatenate(
                    multilingual_dataset['ger_' + opt + '_set'].concatenate(
                        multilingual_dataset['rus_' + opt + '_set']))

        if args.sentencepiece == 'False':
            tgt_vocab_size = args.vocab_size
        else:
            tgt_vocab_size = vocab.get_piece_size()

        final_dataset['train_set'] = final_dataset['train_set'].shuffle(
            MULTI_BUFFER_SIZE)
        final_dataset['train_set'] = final_dataset['train_set'].batch(
            BATCH_SIZE, drop_remainder=True)
        final_dataset['eval_set'] = final_dataset['eval_set'].batch(
            BATCH_SIZE, drop_remainder=True)
        final_dataset['test_set'] = final_dataset['test_set'].batch(
            BATCH_SIZE, drop_remainder=False)
        steps_per_epoch = MULTI_BUFFER_SIZE // BATCH_SIZE
        print('BUFFER SIZE ' + str(MULTI_BUFFER_SIZE))

        return (final_dataset, vocab, tgt_vocab_size, MULTI_BUFFER_SIZE,
                steps_per_epoch, MaxSeqSize)
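# Example usage of ProcessMultilingualDataset (a minimal sketch; the `args`
# fields shown are the ones the function reads, and all values here are
# hypothetical):
#
#   args.model = 'gat'
#   args.sentencepiece = 'True'
#   args.batch_size = 32
#   args.vocab_size = 16000  # only read when args.sentencepiece == 'False'
#   (final_dataset, src_vocab, src_vocab_size, tgt_vocab, tgt_vocab_size,
#    buffer_size, steps_per_epoch, max_seq_len) = \
#       ProcessMultilingualDataset(args)
#
#   # each train batch is a 5-tuple of [batch_size, seq_len] int tensors
#   for nodes, labels, node1, node2, tgt in final_dataset['train_set']:
#       ...
#
# For any other args.model the seq2seq branch runs and returns
# (final_dataset, vocab, tgt_vocab_size, buffer_size, steps_per_epoch,
# max_seq_len), yielding (src, tgt) pairs per batch instead.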
def LoadGatDataset(train_path, eval_path, test_path, srv_vocab, tgt_vocab,
                   opt, sentencepiece, lang, num_examples=None):
    train_ = {}
    eval_ = {}
    test_ = {}
    # load the train, eval and test datasets
    with open(train_path, 'rb') as f:
        train_set = pickle.load(f)
    with open(eval_path, 'rb') as f:
        eval_set = pickle.load(f)
    with open(test_path, 'rb') as f:
        test_set = pickle.load(f)

    # load the vocabs
    if sentencepiece == 'True':
        sp = spm.SentencePieceProcessor()
        sp.load(tgt_vocab)
    with open(srv_vocab, 'rb') as f:
        src_vocab = pickle.load(f)

    train_input, train_tgt = zip(*train_set)
    eval_input, eval_tgt = zip(*eval_set)
    (train_nodes, train_labels, train_node1, train_node2) = zip(*train_input)
    (eval_nodes, eval_labels, eval_node1, eval_node2) = zip(*eval_input)
    (test_nodes, test_labels, test_node1, test_node2) = zip(*test_set)

    # tensorize the graph components with the source vocab
    train_["train_node_tensor"] = _tensorize(src_vocab, train_nodes)
    train_["train_label_tensor"] = _tensorize(src_vocab, train_labels)
    train_["train_node1_tensor"] = _tensorize(src_vocab, train_node1)
    train_["train_node2_tensor"] = _tensorize(src_vocab, train_node2)

    eval_["eval_node_tensor"] = _tensorize(src_vocab, eval_nodes)
    eval_["eval_label_tensor"] = _tensorize(src_vocab, eval_labels)
    eval_["eval_node1_tensor"] = _tensorize(src_vocab, eval_node1)
    eval_["eval_node2_tensor"] = _tensorize(src_vocab, eval_node2)

    test_["test_node_tensor"] = _tensorize(src_vocab, test_nodes)
    test_["test_label_tensor"] = _tensorize(src_vocab, test_labels)
    test_["test_node1_tensor"] = _tensorize(src_vocab, test_node1)
    test_["test_node2_tensor"] = _tensorize(src_vocab, test_node2)

    # tensorize the targets, with the sentencepiece model or the word vocab
    if sentencepiece == 'True':
        train_tgt_tensor = [sp.encode_as_ids(w) for w in train_tgt]
        train_["train_tgt_tensor"] = \
            tf.keras.preprocessing.sequence.pad_sequences(
                train_tgt_tensor, padding='post')
        eval_tgt_tensor = [sp.encode_as_ids(w) for w in eval_tgt]
        eval_["eval_tgt_tensor"] = \
            tf.keras.preprocessing.sequence.pad_sequences(
                eval_tgt_tensor, padding='post')
        target_vocab = sp
    else:
        train_tgt_tensor = src_vocab.texts_to_sequences(train_tgt)
        train_["train_tgt_tensor"] = \
            tf.keras.preprocessing.sequence.pad_sequences(
                train_tgt_tensor, padding='post')
        eval_tgt_tensor = src_vocab.texts_to_sequences(eval_tgt)
        eval_["eval_tgt_tensor"] = \
            tf.keras.preprocessing.sequence.pad_sequences(
                eval_tgt_tensor, padding='post')
        target_vocab = src_vocab

    return (train_, eval_, test_, src_vocab, target_vocab,
            max_length(train_tgt_tensor))
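# Example usage of LoadGatDataset (a minimal sketch; the paths and the
# sentencepiece model file are hypothetical, and `opt`, `lang` and
# `num_examples` are accepted but not used by the function body):
#
#   train_, eval_, test_, src_vocab, target_vocab, max_len = LoadGatDataset(
#       'data/eng_train.pkl', 'data/eng_eval.pkl', 'data/eng_test.pkl',
#       'vocabs/src_vocab.pkl', 'vocabs/tgt.model',
#       opt=None, sentencepiece='True', lang='eng')
#
#   # train_ maps 'train_node_tensor', 'train_label_tensor',
#   # 'train_node1_tensor', 'train_node2_tensor' and 'train_tgt_tensor'
#   # to padded integer tensors; eval_ and test_ are analogous.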