Code Example #1
def transform(sentence, l1, l2, op_lang):
    if op_lang == "hi":
        dir = "training_checkpoints"
        with open("tensors1.pkl", 'rb') as f:
            example_input_batch = pickle.load(f)
            example_target_batch = pickle.load(f)
    elif op_lang == "nep":
        dir = "training_nepali"
        with open("tensors_nep.pkl", 'rb') as f:
            example_input_batch = pickle.load(f)
            example_target_batch = pickle.load(f)

    elif op_lang == "pun":
        dir = "training_punjabi"
        with open("tensors_pun.pkl", 'rb') as f:
            example_input_batch = pickle.load(f)
            example_target_batch = pickle.load(f)
    print(example_input_batch)

    global input_lang, output_lang
    input_lang = l1
    output_lang = l2
    global BATCH_SIZE, units, embedding_dim
    BATCH_SIZE = 64
    units = 512
    if op_lang == "pun":
        units = 1024
    embedding_dim = 256
    vocab_inp_size = len(input_lang.word2index) + 1
    vocab_tar_size = len(output_lang.word2index) + 1
    global encoder, decoder, optimizer
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    sample_hidden = encoder.initialize_hidden_state()

    sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
    attention_layer = BahdanauAttention(10)
    attention_result, attention_weights = attention_layer(
        sample_hidden, sample_output)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
    sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                          sample_hidden, sample_output)

    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    curr = pathlib.Path(__file__)
    print(encoder.summary())
    temp = os.path.join(curr.parent, "available_models")
    checkpoint_dir = os.path.join(temp, dir)
    print(checkpoint_dir)
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    print(checkpoint_dir)
    print(checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)))
    # print(checkpoint.restore(checkpoint_dir))
    print(encoder.summary())
    return translate(sentence)
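
The `translate` helper called on the last line is defined elsewhere in this project. Below is a rough sketch of what such a greedy decoding loop usually looks like with this encoder/decoder/attention setup, relying on the globals that `transform` sets up (`encoder`, `decoder`, `units`, `input_lang`, `output_lang`). The `max_length` defaults, the `index2word` attribute, and the `<start>`/`<end>` token names are assumptions, not taken from the original file.

def translate_sketch(sentence, max_length_inp=40, max_length_targ=40):
    # NOTE: max lengths, index2word, and '<start>'/'<end>' token names are assumed here.
    # Tokenize and pad the input sentence using the source-language vocabulary.
    inputs = [input_lang.word2index[w] for w in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    # Encode once, then decode token by token, feeding back the argmax prediction.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([output_lang.word2index['<start>']], 0)

    result = []
    for _ in range(max_length_targ):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        word = output_lang.index2word[predicted_id]
        if word == '<end>':
            break
        result.append(word)
        dec_input = tf.expand_dims([predicted_id], 0)
    return ' '.join(result)
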
Code Example #2
import os

import anyconfig
import tensorflow as tf
from sklearn.model_selection import train_test_split

import dataset
from model import Encoder, Decoder, BahdanauAttention

if __name__ == "__main__":
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config = anyconfig.load(open("config.yaml", 'rb'))
    BATCH_SIZE = 1
    SEQ_LENGTH = config["trainer"]["sequnce_length"]
    # encoder
    encoder = Encoder(config["trainer"]["gru_units"], BATCH_SIZE)
    sample_hidden = encoder.initialize_hidden_state()
    # attention
    attention_layer = BahdanauAttention(config["trainer"]["attention_units"])
    # decoder
    decoder = Decoder(config["trainer"]["label_length"],
                      config["trainer"]["gru_units"], BATCH_SIZE)
    # reload checkpoint
    optimizer = tf.keras.optimizers.Adam()
    checkpoint_dir = config["trainer"]["checkpoint_dir"]
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # test
    inp, targ = dataset.get_json_test("datasets/test/160230555682988.json")
    print(inp.shape, targ.shape)
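
For reference, the keys this script reads from config.yaml imply a structure roughly like the following (shown here as the Python dict that anyconfig.load would return). The values are only illustrative placeholders, not the project's actual settings; the `sequnce_length` spelling is kept as it appears in the code.

# Illustrative structure of the loaded config.yaml; values are placeholders.
config = {
    "trainer": {
        "sequnce_length": 128,                        # placeholder
        "gru_units": 256,                             # placeholder
        "attention_units": 10,                        # placeholder
        "label_length": 100,                          # placeholder
        "checkpoint_dir": "./training_checkpoints",   # placeholder
    }
}
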
Code Example #3
def train(args: Namespace):
    '''
    Trains the model according to the different parameters passed when this function is called.

    args: Namespace providing all required arguments
    '''

    # Download the dataset fra-eng.zip and extract it (here to C:\Users\theof\.keras\datasets\), only if it has not already been downloaded and extracted
    pathFile = tf.keras.utils.get_file(
        'fra-eng.zip',
        origin=
        'http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip',
        extract=True)

    # Create the initial dataset
    inputTensor, inputVocab, inputSentencesSize, targetTensor, targetVocab, targetSentencesSize = loadAndCreateDataset(
        os.path.dirname(pathFile) + "/fra.txt", args.sentences_size)
    print('Datasets loaded')

    # Print some useful statistics to the terminal
    statsDataPreprocessed(inputTensor, targetTensor, inputVocab, targetVocab)

    epochSize = args.epoch
    batchSize = args.batch_size
    units = args.units
    bufferSize = len(inputTensor)
    embeddingDim = args.embedding_dim

    # Create the dataset used for training
    inputBatch, targetBatch, dataset = createDataset(inputTensor, targetTensor,
                                                     batchSize, bufferSize)

    # Store the vocabulary sizes
    vocabInputSize = len(inputVocab.word_index) + 1
    vocabTargetSize = len(targetVocab.word_index) + 1

    # Create the encoder with the chosen hyperparameters
    encoderModel = Encoder(vocabInputSize, embeddingDim, units, batchSize)
    # Initialize all hidden states to 0
    encodeHiddenLayers = encoderModel.initialize_hidden_state()
    # Run the encoder and retrieve its outputs and hidden state
    encodedOutputLayers, encodeHiddenLayers = encoderModel(
        inputBatch, encodeHiddenLayers)

    print(
        'Encoder output shape: (batch size, sequence length, units) {}'.format(
            encodedOutputLayers.shape))
    print('Encoder Hidden state shape: (batch size, units) {}'.format(
        encodeHiddenLayers.shape))

    # Create the attention mechanism with 10 attention units
    attention_layer = BahdanauAttention(10)
    attention_result, attention_weights = attention_layer(
        encodeHiddenLayers, encodedOutputLayers)

    print("Attention result shape: (batch size, units) {}".format(
        attention_result.shape))
    print(
        "Attention weights shape: (batch size, sequence_length, 1) {}".format(
            attention_weights.shape))

    # Create the decoder with the chosen hyperparameters
    decoder = Decoder(vocabTargetSize, embeddingDim, units, batchSize)
    sampleDecoderOutput, _, _ = decoder(tf.random.uniform(
        (batchSize, 1)), encodeHiddenLayers, encodedOutputLayers)

    print('Decoder output shape: (batch size, vocab size) {}'.format(
        sampleDecoderOutput.shape))

    # Use the Adam optimizer, a stochastic gradient-based optimization algorithm
    optimizer = tf.keras.optimizers.Adam()
    # Loss object that computes the sparse categorical cross-entropy between the targets and the predictions
    lossObject = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    # If no configuration name was passed in the arguments
    if args.config_name is None:
        # Generate a random number between 0 and 9999 to use as the name
        configName = random.randint(0, 9999)
    # Otherwise copy the value of the argument
    else:
        configName = args.config_name

    # Directory where the checkpoints and the configuration file of this training run are stored
    checkpointDirectory = './outputs/{}/'.format(configName)
    checkpoint_prefix = os.path.join(checkpointDirectory, "ckpt")

    # Checkpoint object that stores the weights of the optimizer, the encoder and the decoder
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoderModel,
                                     decoder=decoder)

    # For each training epoch
    for epoch in range(epochSize):
        start = time.time()
        # Reset the encoder hidden states to 0
        enc_hidden = encoderModel.initialize_hidden_state()
        total_loss = 0

        # For each batch of the dataset, retrieve the sentence pairs together with the batch index
        for (batch, (inp, targ)) in enumerate(dataset.take(len(inputTensor))):
            # Run one training step and get the batch loss
            batch_loss = trainStep(inp, targ, enc_hidden, encoderModel,
                                   decoder, targetVocab, lossObject, optimizer,
                                   batchSize)
            # Accumulate the total loss for this epoch
            total_loss += batch_loss
            # Print progress every 100 batches
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy()))
        # If more than one epoch is requested
        if (epochSize > 1):
            # Save a checkpoint every other epoch
            if (epoch + 1) % 2 == 0:
                checkpoint.save(file_prefix=checkpoint_prefix)
        # Otherwise save the single epoch (useful for testing)
        else:
            checkpoint.save(file_prefix=checkpoint_prefix)
        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / len(inputTensor)))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    # Build a configuration object containing everything needed for future predictions
    config = {}
    config['bufferSize'] = bufferSize
    config['batchSize'] = batchSize
    config['embeddingDim'] = embeddingDim
    config['units'] = units
    config['epoch'] = epochSize
    config['vocabInputSize'] = vocabInputSize
    config['vocabTargetSize'] = vocabTargetSize
    config['inputSentencesSize'] = inputSentencesSize
    config['targetSentencesSize'] = targetSentencesSize

    # Save the configuration file as JSON in the directory chosen by the user
    with open('{}config.json'.format(checkpointDirectory),
              'w',
              encoding='UTF-8') as handle:
        json.dump(config, handle, indent=2, sort_keys=True)
    # Save the tokenizers of both languages, needed later for translation
    with open('{}inputVocab.pickle'.format(checkpointDirectory),
              'wb') as handle:
        pickle.dump(inputVocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('{}targetVocab.pickle'.format(checkpointDirectory),
              'wb') as handle:
        pickle.dump(targetVocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
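
`trainStep` is called inside the epoch loop above but defined elsewhere in the project. Below is a minimal sketch of what such a teacher-forcing training step typically looks like with the encoder, decoder, loss object and optimizer created above; the `<start>` token name and the padding convention (index 0 is padding) are assumptions, not taken from the original file.

def trainStep(inp, targ, encHidden, encoderModel, decoder, targetVocab,
              lossObject, optimizer, batchSize):
    loss = 0
    with tf.GradientTape() as tape:
        encOutput, encHidden = encoderModel(inp, encHidden)
        decHidden = encHidden
        # Start every target sequence with the <start> token (assumed name).
        decInput = tf.expand_dims(
            [targetVocab.word_index['<start>']] * batchSize, 1)

        # Teacher forcing: feed the ground-truth token as the next decoder input.
        for t in range(1, targ.shape[1]):
            predictions, decHidden, _ = decoder(decInput, decHidden, encOutput)
            # Mask padding (index 0, assumed) so it does not contribute to the loss.
            mask = tf.cast(tf.math.not_equal(targ[:, t], 0), tf.float32)
            loss += tf.reduce_mean(lossObject(targ[:, t], predictions) * mask)
            decInput = tf.expand_dims(targ[:, t], 1)

    batchLoss = loss / int(targ.shape[1])
    variables = encoderModel.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batchLoss
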
Code Example #4
    def __init__(self):

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = './logs/gradient_tape/' + current_time + '/train'
        test_log_dir = './logs/gradient_tape/' + current_time + '/test'
        self.train_summary_writer = tf.summary.create_file_writer(
            train_log_dir)
        self.test_summary_writer = tf.summary.create_file_writer(test_log_dir)

        self.m = tf.keras.metrics.SparseCategoricalAccuracy()
        # self.recall = tf.keras.metrics.Recall()
        self.recall = [0]
        # self.F1Score = 2*self.m.result()*self.recall.result()/(self.recall.result()+self.m.result())
        self.BATCH_SIZE = 128
        self.embedding_dim = 24
        self.units = 64
        # Try experimenting with datasets of different sizes
        stop_word_dir = './stop_words.utf8'
        self.stop_words = self.get_stop_words(stop_word_dir) + ['']
        num_examples = 30000
        QA_dir = './QA_data.txt'
        # QA_dir = 'C:/Users/Administrator/raw_chat_corpus/qingyun-11w/qinyun-11w.csv'
        self.input_tensor, self.target_tensor, self.inp_tokenizer, self.targ_tokenizer = self.load_dataset(
            QA_dir, num_examples)
        self.num_classes = len(self.targ_tokenizer.index_word)  # number of target word classes
        # Initialize the confusion matrices (one for training, one for testing):
        self.train_confusion_matrix = tfa.metrics.MultiLabelConfusionMatrix(
            num_classes=self.num_classes)
        self.test_confusion_matrix = tfa.metrics.MultiLabelConfusionMatrix(
            num_classes=self.num_classes)

        self.F1Score = tfa.metrics.F1Score(num_classes=len(
            self.targ_tokenizer.index_word),
                                           average="micro")
        # self.F1Score = tfa.metrics.F1Score(num_classes=self.max_length_targ, average="micro")
        # input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        #     self.input_tensor,
        #     self.target_tensor,
        #     test_size=0.2)
        # self.load_split_dataset(input_tensor_train,target_tensor_train)
        self.vocab_inp_size = len(self.inp_tokenizer.word_index) + 1
        self.vocab_tar_size = len(self.targ_tokenizer.word_index) + 1

        # encoder initialization
        self.encoder = Encoder(self.vocab_inp_size, self.embedding_dim,
                               self.units, self.BATCH_SIZE)
        plot_model(self.encoder,
                   to_file='encoder.png',
                   show_shapes=True,
                   show_layer_names=True,
                   rankdir='TB',
                   dpi=900,
                   expand_nested=True)
        # sample input
        # sample_hidden = self.encoder.initialize_hidden_state()
        # sample_output, sample_hidden = self.encoder.call(self.example_input_batch, sample_hidden)
        # print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
        # print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

        # attention initialization
        attention_layer = BahdanauAttention(10)
        # attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
        plot_model(attention_layer,
                   to_file='attention_layer.png',
                   show_shapes=True,
                   show_layer_names=True,
                   rankdir='TB',
                   dpi=900,
                   expand_nested=True)

        # print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
        # print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

        # decoder initialization
        self.decoder = Decoder(self.vocab_tar_size, self.embedding_dim,
                               self.units, self.BATCH_SIZE)
        plot_model(self.decoder,
                   to_file='decoder.png',
                   show_shapes=True,
                   show_layer_names=True,
                   rankdir='TB',
                   dpi=900,
                   expand_nested=True)
        # sample_decoder_output, _, _ = self.decoder(tf.random.uniform((self.BATCH_SIZE, 1)),
        #                                       sample_hidden, sample_output)
        #
        # print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

        # optimizer initialization
        self.optimizer = tf.keras.optimizers.Adam()
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')

        # checkpoint & model-saving initialization
        self.checkpoint_dir = './training_checkpoints'
        self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "ckpt")
        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                              encoder=self.encoder,
                                              decoder=self.decoder)
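
The summary writers and the accuracy metric created in this constructor are normally consumed by the training loop. Below is a minimal sketch of such a logging helper for this class; the method name `log_train_metrics` and the `targ`/`predictions`/`batch_loss`/`step` arguments are hypothetical placeholders, not part of the original code.

    def log_train_metrics(self, targ, predictions, batch_loss, step):
        # Hypothetical helper: accumulate accuracy and write scalars to TensorBoard.
        self.m.update_state(targ, predictions)
        with self.train_summary_writer.as_default():
            tf.summary.scalar('loss', batch_loss, step=step)
            tf.summary.scalar('accuracy', self.m.result(), step=step)
        # Call self.m.reset_states() when a new epoch starts.
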
Code Example #5
File: main.py / Project: justeuer/sopro-nlpwithnn
print(vocab_target_size)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

example_input_batch, example_target_batch = next(iter(dataset))
print(example_input_batch.shape, example_target_batch.shape)

encoder = Encoder(vocab_input_size, embedding_dim, units, BATCH_SIZE)
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))
decoder = Decoder(vocab_target_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))
# optimizer
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# checkpoints
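
The snippet ends right after the `# checkpoints` comment. In this tutorial-style pipeline, the block that typically follows wraps the optimizer, encoder and decoder in a `tf.train.Checkpoint`; a sketch under the assumption that the checkpoint directory is named `./training_checkpoints` (the actual name in main.py may differ).

import os

checkpoint_dir = './training_checkpoints'   # assumed directory name
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
# During training, the weights would be persisted periodically with:
# checkpoint.save(file_prefix=checkpoint_prefix)
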