def transform(sentence, l1, l2, op_lang):
    # Pick the checkpoint directory and the pickled sample batch for the
    # requested output language
    if op_lang == "hi":
        dir = "training_checkpoints"
        with open("tensors1.pkl", 'rb') as f:
            example_input_batch = pickle.load(f)
            example_target_batch = pickle.load(f)
    elif op_lang == "nep":
        dir = "training_nepali"
        with open("tensors_nep.pkl", 'rb') as f:
            example_input_batch = pickle.load(f)
            example_target_batch = pickle.load(f)
    elif op_lang == "pun":
        dir = "training_punjabi"
        with open("tensors_pun.pkl", 'rb') as f:
            example_input_batch = pickle.load(f)
            example_target_batch = pickle.load(f)
    print(example_input_batch)

    global input_lang, output_lang
    input_lang = l1
    output_lang = l2

    # Model hyperparameters (the Punjabi model was trained with larger GRUs)
    global BATCH_SIZE, units, embedding_dim
    BATCH_SIZE = 64
    units = 512
    if op_lang == "pun":
        units = 1024
    embedding_dim = 256
    vocab_inp_size = len(input_lang.word2index) + 1
    vocab_tar_size = len(output_lang.word2index) + 1

    global encoder, decoder, optimizer
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    # Build the model and run one sample batch through it so that all
    # variables are created before the checkpoint is restored
    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    sample_hidden = encoder.initialize_hidden_state()
    sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

    attention_layer = BahdanauAttention(10)
    attention_result, attention_weights = attention_layer(
        sample_hidden, sample_output)

    decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
    sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                          sample_hidden, sample_output)

    # Restore the latest checkpoint for the selected language
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)
    curr = pathlib.Path(__file__)
    print(encoder.summary())
    temp = os.path.join(curr.parent, "available_models")
    checkpoint_dir = os.path.join(temp, dir)
    print(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    print(checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)))
    # print(checkpoint.restore(checkpoint_dir))
    print(encoder.summary())

    return translate(sentence)
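# transform() ends by delegating to a translate() helper that is defined
# elsewhere in this module. The function below is only a sketch of what a
# greedy-decoding translate() typically looks like for this kind of model; it
# assumes a Lang-style vocabulary with word2index/index2word dicts, '<start>'
# and '<end>' markers, and fixed maximum lengths, none of which are confirmed
# by this file, and it relies on the globals set inside transform().
def translate_sketch(sentence, max_length_inp=40, max_length_targ=40):
    # Tokenize and pad the input sentence
    inputs = [input_lang.word2index[w] for w in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    # Encode once, then decode greedily one token at a time
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([output_lang.word2index['<start>']], 0)

    result = []
    for _ in range(max_length_targ):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = int(tf.argmax(predictions[0]).numpy())
        word = output_lang.index2word[predicted_id]
        if word == '<end>':
            break
        result.append(word)
        # Feed the prediction back in as the next decoder input
        dec_input = tf.expand_dims([predicted_id], 0)
    return ' '.join(result)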
import os

import anyconfig
import tensorflow as tf
from sklearn.model_selection import train_test_split

import dataset
from model import Encoder, Decoder, BahdanauAttention

if __name__ == "__main__":
    # Force CPU-only execution
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config = anyconfig.load(open("config.yaml", 'rb'))
    BATCH_SIZE = 1
    SEQ_LENGTH = config["trainer"]["sequnce_length"]

    # encoder
    encoder = Encoder(config["trainer"]["gru_units"], BATCH_SIZE)
    sample_hidden = encoder.initialize_hidden_state()

    # attention
    attention_layer = BahdanauAttention(config["trainer"]["attention_units"])

    # decoder
    decoder = Decoder(config["trainer"]["label_length"],
                      config["trainer"]["gru_units"], BATCH_SIZE)

    # reload the latest checkpoint
    optimizer = tf.keras.optimizers.Adam()
    checkpoint_dir = config["trainer"]["checkpoint_dir"]
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # test
    inp, targ = dataset.get_json_test("datasets/test/160230555682988.json")
    print(inp.shape, targ.shape)
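    # The script stops after loading one test example; the prediction pass is
    # not shown. Purely as a sketch -- assuming inp already carries a leading
    # batch dimension of 1, the decoder is called as decoder(x, hidden,
    # enc_output), and label index 0 serves as the start token (all of these
    # are assumptions, not taken from this file) -- greedy decoding could look
    # roughly like:
    enc_output, enc_hidden = encoder(inp, sample_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([0], 0)  # hypothetical start-token index
    predicted_ids = []
    for t in range(targ.shape[-1]):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        predicted_id = int(tf.argmax(predictions[0]).numpy())
        predicted_ids.append(predicted_id)
        # Feed the prediction back in for the next time step
        dec_input = tf.expand_dims([predicted_id], 0)
    print(predicted_ids)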
def train(args: Namespace):
    '''
    Trains the model according to the parameters passed when calling this
    function.

    args: holds all the required command-line arguments
    '''
    # Download the dataset fra-eng.zip and extract it (in my case under
    # C:\Users\theof\.keras\datasets\), but only if it has not already been
    # downloaded and extracted
    pathFile = tf.keras.utils.get_file(
        'fra-eng.zip',
        origin='http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip',
        extract=True)

    # Build the initial dataset
    inputTensor, inputVocab, inputSentencesSize, targetTensor, targetVocab, targetSentencesSize = loadAndCreateDataset(
        os.path.dirname(pathFile) + "/fra.txt", args.sentences_size)
    print('Datasets loaded')

    # Print some useful information to the terminal
    statsDataPreprocessed(inputTensor, targetTensor, inputVocab, targetVocab)

    epochSize = args.epoch
    batchSize = args.batch_size
    units = args.units
    bufferSize = len(inputTensor)
    embeddingDim = args.embedding_dim

    # Build the dataset used for training
    inputBatch, targetBatch, dataset = createDataset(inputTensor, targetTensor,
                                                     batchSize, bufferSize)

    # Store the vocabulary sizes
    vocabInputSize = len(inputVocab.word_index) + 1
    vocabTargetSize = len(targetVocab.word_index) + 1

    # Build the encoder with the chosen hyperparameters
    encoderModel = Encoder(vocabInputSize, embeddingDim, units, batchSize)
    # Initialize all hidden states to 0
    encodeHiddenLayers = encoderModel.initialize_hidden_state()
    # Run one sample batch through the encoder to get its outputs and hidden state
    encodedOutputLayers, encodeHiddenLayers = encoderModel(
        inputBatch, encodeHiddenLayers)
    print('Encoder output shape: (batch size, sequence length, units) {}'.format(
        encodedOutputLayers.shape))
    print('Encoder Hidden state shape: (batch size, units) {}'.format(
        encodeHiddenLayers.shape))

    # Build the attention mechanism with 10 attention units
    attention_layer = BahdanauAttention(10)
    attention_result, attention_weights = attention_layer(
        encodeHiddenLayers, encodedOutputLayers)
    print("Attention result shape: (batch size, units) {}".format(
        attention_result.shape))
    print("Attention weights shape: (batch size, sequence_length, 1) {}".format(
        attention_weights.shape))

    # Build the decoder with the chosen hyperparameters
    decoder = Decoder(vocabTargetSize, embeddingDim, units, batchSize)
    sampleDecoderOutput, _, _ = decoder(tf.random.uniform((batchSize, 1)),
                                        encodeHiddenLayers, encodedOutputLayers)
    print('Decoder output shape: (batch size, vocab size) {}'.format(
        sampleDecoderOutput.shape))

    # Adam optimizer, a stochastic gradient-based optimization algorithm
    optimizer = tf.keras.optimizers.Adam()

    # Object that computes the sparse categorical cross-entropy loss between
    # the targets and the predictions
    lossObject = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    # If no configuration name was passed as an argument
    if args.config_name is None:
        # Generate a random identifier between 0 and 9999
        configName = random.randint(0, 9999)
    # Otherwise use the provided name
    else:
        configName = args.config_name

    # Directory where the checkpoints and the configuration file of this
    # training run are stored
    checkpointDirectory = './outputs/{}/'.format(configName)
    checkpoint_prefix = os.path.join(checkpointDirectory, "ckpt")
    # Object that saves the weights of the optimizer, the encoder and the decoder
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoderModel,
                                     decoder=decoder)

    # Training loop
    for epoch in range(epochSize):
        start = time.time()
        # Reset the encoder hidden states to 0
        enc_hidden = encoderModel.initialize_hidden_state()
        total_loss = 0

        # For each batch of the dataset, fetch the input and target sentences
        # (they share the same position)
        for (batch, (inp, targ)) in enumerate(dataset.take(len(inputTensor))):
            # Run one training step and get the loss for this batch
            batch_loss = trainStep(inp, targ, enc_hidden, encoderModel, decoder,
                                   targetVocab, lossObject, optimizer, batchSize)
            # Accumulate the total loss of the epoch
            total_loss += batch_loss

            # Log the loss every 100 batches
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy()))

        # If training runs for more than one epoch
        if epochSize > 1:
            # Save a checkpoint every second epoch
            if (epoch + 1) % 2 == 0:
                checkpoint.save(file_prefix=checkpoint_prefix)
        # Otherwise save the single epoch (useful for tests)
        else:
            checkpoint.save(file_prefix=checkpoint_prefix)

        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / len(inputTensor)))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    # Collect everything future predictions will need into a configuration dict
    config = {}
    config['bufferSize'] = bufferSize
    config['batchSize'] = batchSize
    config['embeddingDim'] = embeddingDim
    config['units'] = units
    config['epoch'] = epochSize
    config['vocabInputSize'] = vocabInputSize
    config['vocabTargetSize'] = vocabTargetSize
    config['inputSentencesSize'] = inputSentencesSize
    config['targetSentencesSize'] = targetSentencesSize

    # Save the configuration file as JSON in the directory chosen by the user
    with open('{}config.json'.format(checkpointDirectory), 'w',
              encoding='UTF-8') as handle:
        json.dump(config, handle, indent=2, sort_keys=True)

    # Save the tokenizers of both languages, needed later for translation
    with open('{}inputVocab.pickle'.format(checkpointDirectory), 'wb') as handle:
        pickle.dump(inputVocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('{}targetVocab.pickle'.format(checkpointDirectory), 'wb') as handle:
        pickle.dump(targetVocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
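# train() delegates each batch to a trainStep helper defined elsewhere in the
# project. The function below is only a sketch of a teacher-forcing training
# step that matches the call signature used above; it assumes the target
# tokenizer contains a '<start>' token and that padding positions are encoded
# as 0, neither of which is confirmed by this file.
def trainStepSketch(inp, targ, encHidden, encoderModel, decoder, targetVocab,
                    lossObject, optimizer, batchSize):
    loss = 0
    with tf.GradientTape() as tape:
        encOutput, encHidden = encoderModel(inp, encHidden)
        decHidden = encHidden
        # First decoder input: the '<start>' token for every sentence of the batch
        decInput = tf.expand_dims(
            [targetVocab.word_index['<start>']] * batchSize, 1)

        # Teacher forcing: feed the ground-truth token as the next decoder input
        for t in range(1, targ.shape[1]):
            predictions, decHidden, _ = decoder(decInput, decHidden, encOutput)
            # Mask the loss on padded positions before averaging over the batch
            mask = tf.cast(tf.math.not_equal(targ[:, t], 0), tf.float32)
            loss += tf.reduce_mean(lossObject(targ[:, t], predictions) * mask)
            decInput = tf.expand_dims(targ[:, t], 1)

    batchLoss = loss / int(targ.shape[1])
    # Backpropagate through both the encoder and the decoder
    variables = encoderModel.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batchLoss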
def __init__(self):
    # TensorBoard summary writers for the training and test runs
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = './logs/gradient_tape/' + current_time + '/train'
    test_log_dir = './logs/gradient_tape/' + current_time + '/test'
    self.train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    self.test_summary_writer = tf.summary.create_file_writer(test_log_dir)

    self.m = tf.keras.metrics.SparseCategoricalAccuracy()
    # self.recall = tf.keras.metrics.Recall()
    self.recall = [0]
    # self.F1Score = 2*self.m.result()*self.recall.result()/(self.recall.result()+self.m.result())

    self.BATCH_SIZE = 128
    self.embedding_dim = 24
    self.units = 64

    # Try experimenting with datasets of different sizes
    stop_word_dir = './stop_words.utf8'
    self.stop_words = self.get_stop_words(stop_word_dir) + ['']
    num_examples = 30000
    QA_dir = './QA_data.txt'
    # QA_dir = 'C:/Users/Administrator/raw_chat_corpus/qingyun-11w/qinyun-11w.csv'
    self.input_tensor, self.target_tensor, self.inp_tokenizer, self.targ_tokenizer = self.load_dataset(
        QA_dir, num_examples)

    self.num_classes = len(self.targ_tokenizer.index_word)  # number of target word classes
    # Initialize the confusion matrices (one for training, one for testing)
    self.train_confusion_matrix = tfa.metrics.MultiLabelConfusionMatrix(
        num_classes=self.num_classes)
    self.test_confusion_matrix = tfa.metrics.MultiLabelConfusionMatrix(
        num_classes=self.num_classes)
    self.F1Score = tfa.metrics.F1Score(num_classes=len(
        self.targ_tokenizer.index_word), average="micro")
    # self.F1Score = tfa.metrics.F1Score(num_classes=self.max_length_targ, average="micro")

    # input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    #     self.input_tensor,
    #     self.target_tensor,
    #     test_size=0.2)
    # self.load_split_dataset(input_tensor_train, target_tensor_train)

    self.vocab_inp_size = len(self.inp_tokenizer.word_index) + 1
    self.vocab_tar_size = len(self.targ_tokenizer.word_index) + 1

    # Encoder initialization
    self.encoder = Encoder(self.vocab_inp_size, self.embedding_dim,
                           self.units, self.BATCH_SIZE)
    plot_model(self.encoder, to_file='encoder.png', show_shapes=True,
               show_layer_names=True, rankdir='TB', dpi=900, expand_nested=True)
    # Sample input
    # sample_hidden = self.encoder.initialize_hidden_state()
    # sample_output, sample_hidden = self.encoder.call(self.example_input_batch, sample_hidden)
    # print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
    # print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

    # Attention initialization
    attention_layer = BahdanauAttention(10)
    # attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
    plot_model(attention_layer, to_file='attention_layer.png', show_shapes=True,
               show_layer_names=True, rankdir='TB', dpi=900, expand_nested=True)
    # print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
    # print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

    # Decoder initialization
    self.decoder = Decoder(self.vocab_tar_size, self.embedding_dim,
                           self.units, self.BATCH_SIZE)
    plot_model(self.decoder, to_file='decoder.png', show_shapes=True,
               show_layer_names=True, rankdir='TB', dpi=900, expand_nested=True)
    # sample_decoder_output, _, _ = self.decoder(tf.random.uniform((self.BATCH_SIZE, 1)),
    #                                            sample_hidden, sample_output)
    # print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

    # Optimizer initialization
    self.optimizer = tf.keras.optimizers.Adam()
    self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    # Checkpoint initialization (saves the model as an object)
    self.checkpoint_dir = './training_checkpoints'
    self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "ckpt")
    self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                          encoder=self.encoder,
                                          decoder=self.decoder)
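# self.loss_object is created with reduction='none', which implies the
# per-token losses are masked and averaged in a separate helper. The method
# below is only a sketch of such a helper; it assumes padding positions are
# encoded as 0, which is not confirmed by this file.
def loss_function_sketch(self, real, pred):
    # Ignore positions where the target is the padding index 0
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = self.loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)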
print(vocab_target_size)

dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
example_input_batch, example_target_batch = next(iter(dataset))
print(example_input_batch.shape, example_target_batch.shape)

encoder = Encoder(vocab_input_size, embedding_dim, units, BATCH_SIZE)
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(
    sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(
    sample_hidden.shape))

attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print("Attention result shape: (batch size, units) {}".format(
    attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(
    attention_weights.shape))

decoder = Decoder(vocab_target_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)
print('Decoder output shape: (batch_size, vocab size) {}'.format(
    sample_decoder_output.shape))

# optimizer
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

# checkpoints
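# The snippet ends at the "# checkpoints" marker. Mirroring the pattern used in
# the other snippets above (the directory name './training_checkpoints' is an
# assumption here), the checkpoint setup would typically continue as:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)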