def __init__(self, n_target_vocab, n_decoder_units, n_attention_units,
             n_encoder_units, n_maxout_units):
    super(CRNNAttention, self).__init__()
    with self.init_scope():
        self.crnn = CRNN()
        self.decoder = Decoder(
            n_target_vocab,
            n_decoder_units,
            n_attention_units,
            n_encoder_units * 2,  # doubled because the encoder LSTM is bi-directional
            n_maxout_units,
        )
Example #2
def load_seq2seq(filename, map_location=None):
    """
    Loads a model from a filename. (This function requires that the model was
    saved with the save_model utils function.)
    :param filename: Filename to load from.
    :param map_location: Optional device remapping forwarded to torch.load.
    """
    from seq2seq import Encoder, Decoder
    model_dict = torch.load(filename, map_location=map_location)
    encoder = Encoder(**model_dict["encoder_init_args"])
    decoder = Decoder(**model_dict["decoder_init_args"])
    encoder.load_state_dict(model_dict["encoder_state"])
    decoder.load_state_dict(model_dict["decoder_state"])
    return encoder, decoder
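The save_model counterpart referenced in the docstring is not shown on this page; below is a minimal sketch implied by the keys read above (how the constructor kwargs are captured is an assumption):

import torch

def save_seq2seq(filename, encoder, decoder, encoder_init_args, decoder_init_args):
    # persist both the constructor kwargs and the learned weights so that
    # load_seq2seq can rebuild the modules before loading their states
    torch.save({
        "encoder_init_args": encoder_init_args,
        "decoder_init_args": decoder_init_args,
        "encoder_state": encoder.state_dict(),
        "decoder_state": decoder.state_dict(),
    }, filename)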
Example #3
def Main(vocab, vocab_inv):
    i = 0
    arret = False  # may be set to True by the program (step 3)
    contexte = None
    # set up the GRU encoder/decoder
    enc = Encoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad])
    dec = Decoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad], vocab[sos],
                  vocab[eos], vocab[unk])
    enc.to('cuda')
    dec.to('cuda')
    # load the weights
    path_enc = "encoder_9.pkl"
    path_dec = "decoder_9.pkl"
    encoder_state = torch.load(path_enc)
    decoder_state = torch.load(path_dec)
    enc.load_states(encoder_state)
    enc.eval()
    dec.load_states(dict(decoder_state))
    dec.eval()
    # set how many words are predicted at a time
    taille = int(input("how many words to predict at a time? : "))
    while not arret:
        phrase = takeInput(i)
        exit_c, contexte = callGRU(enc, dec, phrase, vocab, contexte, taille)
        sortie = posttreatment(exit_c, vocab_inv)
        # sortie = "David Albert Huffman est un petit garçon de 10 ans des plus intelligents. Cependant, son monde cours à sa perte lorsque Poupoune décide de s'emparer de l'Europe, alors en pleine crise politique, pour y imposer son monde rose et fleurissant.Avec son ami Lamy, David va devoir lutter contre des adversaires redoutables pour sauver le monde, entrer au MIT et repousser la plus grande menace du Siècle (pour le moment) pour rétablir l'équilibre dans le rapport de Force." #test enchaînement
        printResult(sortie)
        # contexte = exit_c
        i += 1
def load_model(path='files/model_De', device=device):
    checkpoint = torch.load(path, map_location=device)

    in_lang, out_lang, pairs = prepareData('En', 'De')
    # the Lang objects stored in the checkpoint override the freshly built ones
    in_lang = checkpoint['in_lang_class']
    out_lang = checkpoint['out_lang_class']

    hidden_size = checkpoint['hidden_size']

    encoder = Encoder(in_lang.n_words, hidden_size).to(device)
    decoder = Decoder(hidden_size, out_lang.n_words, dropout_p=0.1).to(device)

    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])

    return encoder, decoder, in_lang, out_lang
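For reference, a sketch of the checkpoint layout this loader expects, using only the keys read above; the training script that would write it is an assumption:

torch.save({
    'in_lang_class': in_lang,            # Lang object for the source language
    'out_lang_class': out_lang,          # Lang object for the target language
    'hidden_size': hidden_size,
    'encoder_state_dict': encoder.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
}, 'files/model_De')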
def main():
    # Dataset
    dataset = PadDataset(WORKING_DIR, EMBEDDING_SIZE, diff_vocab=DIFF_VOCAB,
                         embedding_path=EMBEDDING_PATH,
                         limit_encode=LIMIT_ENCODE, limit_decode=LIMIT_DECODE)
    encoder_vocab_size = dataset.length_vocab_encode()
    decoder_vocab_size = dataset.length_vocab_decode()
    print("Steps per epoch %d" % int(math.ceil(
        float(dataset.datasets["train"].number_of_samples) / float(BATCH_SIZE))))

    # Initialising the model
    embeddings_encoder = torch.Tensor(dataset.vocab.embeddings_encoder).cuda()
    embeddings_decoder = torch.Tensor(dataset.vocab.embeddings_decoder).cuda()
    content_encoder = Encoder(encoder_vocab_size, embeddings_encoder,
                              EMBEDDING_SIZE, HIDDEN_SIZE).cuda()
    query_encoder = Encoder(encoder_vocab_size, embeddings_encoder,
                            EMBEDDING_SIZE, HIDDEN_SIZE).cuda()
    decoder = Decoder(EMBEDDING_SIZE, embeddings_decoder, HIDDEN_SIZE,
                      decoder_vocab_size).cuda()
    seq2seqwattn = Seq2Seq(content_encoder, query_encoder, decoder).cuda()

    run_this = run_model(dataset, seq2seqwattn)
    run_this.run_training()
Example #6
def training(data_dir):
    news_train = NEWS(data_path=os.path.join(data_dir, '.train.pkl'),
                      vocab_path='../data/vocab.pkl')
    encoder, decoder = Encoder(emb_size=300,
                               hidden_size=512,
                               vocab_size=len(news_train.vocab)), Decoder()
    SAE = SentenceAE()
Example #7
def create_model(gen_config):
    encoder = Encoder(gen_config.vocab_size,
                      gen_config.emb_dim,
                      gen_config.hidden_size,
                      n_layers=2,
                      dropout=0.5)
    decoder = Decoder(gen_config.emb_dim,
                      gen_config.hidden_size,
                      gen_config.vocab_size,
                      n_layers=1,
                      dropout=0.5)
    seq2seq = Seq2Seq(encoder, decoder).cuda()
    optimizer = optim.Adam(seq2seq.parameters(), lr=gen_config.lr)
    return seq2seq, optimizer
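A hypothetical invocation; gen_config only needs the four attributes read above, the values here are illustrative, and a CUDA device is required because of the .cuda() call:

from types import SimpleNamespace

gen_config = SimpleNamespace(vocab_size=20000, emb_dim=128, hidden_size=256, lr=1e-3)
seq2seq, optimizer = create_model(gen_config)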
class CRNNAttention(chainer.Chain):
    def __init__(self, n_target_vocab, n_decoder_units, n_attention_units,
                 n_encoder_units, n_maxout_units):
        super(CRNNAttention, self).__init__()
        with self.init_scope():
            self.crnn = CRNN()
            self.decoder = Decoder(
                n_target_vocab,
                n_decoder_units,
                n_attention_units,
                n_encoder_units * 2,  # doubled because the encoder LSTM is bi-directional
                n_maxout_units,
            )

    def __call__(self, xs, ys):
        recurrent_output = self.crnn(xs)
        output = self.decoder(ys, recurrent_output)

        concatenated_os = F.concat(output, axis=0)
        concatenated_ys = F.flatten(ys.T)
        n_words = len(self.xp.where(concatenated_ys.data != PAD)[0])

        loss = F.sum(
            F.softmax_cross_entropy(concatenated_os,
                                    concatenated_ys,
                                    reduce='no',
                                    ignore_label=PAD))
        loss = loss / n_words
        chainer.report({'loss': loss.data}, self)
        # batch_size here is a module-level constant, not defined in this method
        perp = self.xp.exp(loss.data * batch_size / n_words)
        chainer.report({'perp': perp}, self)

        return loss

    def translate(self, xs, max_length=100):
        """Generate sentences based on xs.

        Args:
            xs: Source sentences.

        Returns:
            ys: Generated target sentences.

        """
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            hxs = self.crnn(xs)
            ys = self.decoder.translate(hxs, max_length)
        return ys
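The loss in __call__ is a per-word average: ignore_label zeroes the PAD positions and n_words counts only real targets, so perplexity can be recovered as exp(loss). A minimal NumPy sketch of that normalization, with an assumed PAD id:

import numpy as np

PAD = -1  # assumed; the real id comes from the surrounding module
token_losses = np.array([2.0, 1.5, 0.0, 0.7])  # per-token CE, PAD already zeroed
labels = np.array([5, 9, PAD, 3])
n_words = int(np.sum(labels != PAD))           # 3 real tokens
loss = token_losses.sum() / n_words            # average over real tokens only
perplexity = np.exp(loss)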
def train(article,
          title,
          word2idx,
          target2idx,
          source_lengths,
          target_lengths,
          args,
          val_article=None,
          val_title=None,
          val_source_lengths=None,
          val_target_lengths=None):

    if not os.path.exists('./temp/x.pkl'):
        size_of_val = int(len(article) * 0.05)
        val_article, val_title, val_source_lengths, val_target_lengths = \
            utils.sampling(article, title, source_lengths, target_lengths, size_of_val)

        utils.save_everything(article, title, source_lengths, target_lengths,
                              val_article, val_title, val_source_lengths,
                              val_target_lengths, word2idx)

    size_of_val = len(val_article)
    batch_size = args.batch
    train_size = len(article)
    val_size = len(val_article)
    max_a = max(source_lengths)
    max_t = max(target_lengths)
    print("source vocab size:", len(word2idx))
    print("target vocab size:", len(target2idx))
    print("max a:{}, max t:{}".format(max_a, max_t))
    print("train_size:", train_size)
    print("val size:", val_size)
    print("batch_size:", batch_size)
    print("-" * 30)
    use_coverage = False

    encoder = Encoder(len(word2idx))
    decoder = Decoder(len(target2idx), 50)
    if os.path.exists('decoder_model'):
        encoder.load_state_dict(torch.load('encoder_model'))
        decoder.load_state_dict(torch.load('decoder_model'))

    optimizer = torch.optim.Adam(list(encoder.parameters()) +
                                 list(decoder.parameters()),
                                 lr=0.001)
    n_epoch = 5
    print("Making word index and extend vocab")
    #article, article_tar, title, ext_vocab_all, ext_count = indexing_word(article, title, word2idx, target2idx)
    #article = to_tensor(article)
    #article_extend = to_tensor(article_extend)
    #title = to_tensor(title)
    print("preprocess done")

    if args.use_cuda:
        encoder.cuda()
        decoder.cuda()

    print("start training")
    for epoch in range(n_epoch):
        total_loss = 0
        batch_n = int(train_size / batch_size)
        if epoch > 0:
            use_coverage = True
        for b in range(batch_n):
            # initialization
            batch_x = article[b * batch_size:(b + 1) * batch_size]
            batch_y = title[b * batch_size:(b + 1) * batch_size]
            #batch_x_ext = article_extend[b*batch_size: (b+1)*batch_size]
            batch_x, batch_x_ext, batch_y, extend_vocab, extend_lengths = \
                utils.batch_index(batch_x, batch_y, word2idx, target2idx)

            if args.use_cuda:
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()
                batch_x_ext = batch_x_ext.cuda()
            x_lengths = source_lengths[b * batch_size:(b + 1) * batch_size]
            y_lengths = target_lengths[b * batch_size:(b + 1) * batch_size]

            # workaround: re-pad the extended batch so its length matches the packed input
            pack = pack_padded_sequence(batch_x_ext,
                                        x_lengths,
                                        batch_first=True)
            batch_x_ext_var, _ = pad_packed_sequence(pack, batch_first=True)
            current_loss = train_on_batch(encoder, decoder, optimizer, batch_x,
                                          batch_y, x_lengths, y_lengths,
                                          word2idx, target2idx,
                                          batch_x_ext_var, extend_lengths,
                                          use_coverage)

            batch_x = batch_x.cpu()
            batch_y = batch_y.cpu()
            batch_x_ext = batch_x_ext.cpu()

            print('epoch:{}/{}, batch:{}/{}, loss:{}'.format(
                epoch + 1, n_epoch, b + 1, batch_n, current_loss))
            if (b + 1) % args.show_decode == 0:
                torch.save(encoder.state_dict(), 'encoder_model')
                torch.save(decoder.state_dict(), 'decoder_model')
                batch_x_val, batch_x_ext_val, batch_y_val, extend_vocab, extend_lengths = \
                    utils.batch_index(val_article, val_title, word2idx, target2idx)
                for i in range(1):
                    idx = np.random.randint(0, val_size)
                    decode.beam_search(encoder, decoder,
                                       batch_x_val[idx].unsqueeze(0),
                                       batch_y_val[idx].unsqueeze(0), word2idx,
                                       target2idx, batch_x_ext_val[idx],
                                       extend_lengths[idx], extend_vocab[idx])

                batch_x_val = batch_x_val.cpu()
                batch_y_val = batch_y_val.cpu()
                batch_x_ext_val = batch_x_ext_val.cpu()

            total_loss += current_loss
            print('-' * 30)

    print()
    print("training finished")
Example #10
    parameters_dict = Load_Parameters(hyper_parameters_file)

    en_embedding_dim = parameters_dict['en_embedding_dim']
    de_embedding_dim = parameters_dict['de_embedding_dim']
    hidden_dim = parameters_dict['hidden_dim']
    num_layers = parameters_dict['num_layers']
    bidirectional = parameters_dict['bidirectional']
    use_lstm = parameters_dict['use_lstm']
    use_cuda = False
    batch_size = 1
    dropout_p = 0.0

    encoder = Encoder(en_embedding_dim, hidden_dim, en_vocab.n_items,
                      num_layers, dropout_p, bidirectional, use_lstm, use_cuda)
    decoder = Decoder(de_embedding_dim, hidden_dim, de_vocab.n_items,
                      num_layers, dropout_p, bidirectional, use_lstm, use_cuda)

    encoder.load_state_dict(torch.load(encoder_model_file, map_location='cpu'))
    decoder.load_state_dict(torch.load(decoder_model_file, map_location='cpu'))

    encoder.eval()
    decoder.eval()

    f_en_test = open('input.txt', 'r', encoding='utf-8')
    f_de_pred = open('output.txt', 'w', encoding='utf-8')

    while True:
        en_sent = f_en_test.readline()

        if not en_sent: break
Example #11
def train():
    # region Process Data
    path_to_files = tf.keras.utils.get_file(
        'english.txt',
        origin=
        'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en',
        extract=False)
    path_to_files = tf.keras.utils.get_file(
        'german.txt',
        origin=
        'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de',
        extract=False)
    path_to_files = os.path.dirname(path_to_files)

    input_tensor, target_tensor, inp_lang, targ_lang = load_wmt_dataset(
        path_to_files, num_examples, dict_size)
    max_length_targ, max_length_inp = max_length(target_tensor), max_length(
        input_tensor)
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)

    BUFFER_SIZE = len(input_tensor_train)
    steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    # endregion

    # region Model definition
    encoder = Encoder(dict_size, embedding_dim, units, BATCH_SIZE)
    attention_layer = BahdanauAttention(10)
    decoder = Decoder(dict_size, embedding_dim, units, BATCH_SIZE)
    optimizer = tf.keras.optimizers.Adam()

    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    @tf.function
    def train_step(inp, targ, enc_hidden):
        loss = 0

        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, enc_hidden)

            dec_hidden = enc_hidden

            dec_input = tf.expand_dims([targ_lang.word_index['<start>']] *
                                       BATCH_SIZE, 1)

            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden,
                                                     enc_output)

                loss += loss_function(targ[:, t], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))

        variables = encoder.trainable_variables + decoder.trainable_variables

        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

        return batch_loss

    for epoch in range(EPOCHS):
        start = time.time()

        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_loss += batch_loss

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy()))
        # saving (checkpoint) the model every 2 epochs
        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
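loss_function is used by train_step but not defined in this snippet; in the TensorFlow NMT-with-attention tutorial this code follows, it is a padding-masked sparse cross-entropy, sketched here (padding id 0 assumed):

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # 0 = padding id
    loss_ = loss_object(real, pred)
    loss_ *= tf.cast(mask, dtype=loss_.dtype)           # zero out padded steps
    return tf.reduce_mean(loss_)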
Example #12
    def __init__(self):
        def Load_Vocab(file):
            with open(file, 'rb') as fd:
                _vocab = pickle.load(fd)
            return _vocab

        def Load_Parameters(file):
            with open(file, 'rb') as fd:
                parameters_dict = pickle.load(fd)
            return parameters_dict

        torch.manual_seed(1)
        torch.set_num_threads(4)
        en_vocab_dur_file = './en_vocab_dur.pkl'
        de_vocab_dur_file = './de_vocab_dur.pkl'

        encoder_dur_model_file = './encoder_dur.10.pt'
        decoder_dur_model_file = './decoder_dur.10.pt'

        en_vocab_key_file = './en_vocab.pkl'
        de_vocab_key_file = './de_vocab.pkl'

        encoder_key_model_file = './encoder.10.pt'
        decoder_key_model_file = './decoder.10.pt'
        hyper_parameters_file = './parameters_dict.pkl'
        self.en_vocab_key = Load_Vocab(en_vocab_key_file)
        self.de_vocab_key = Load_Vocab(de_vocab_key_file)

        self.en_vocab_dur = Load_Vocab(en_vocab_dur_file)
        self.de_vocab_dur = Load_Vocab(de_vocab_dur_file)

        self.trf_key = Transfrom(self.en_vocab_key)
        self.trf_dur = Transfrom(self.en_vocab_dur)

        self.parameters_dict = Load_Parameters(hyper_parameters_file)

        en_embedding_dim = self.parameters_dict['en_embedding_dim']
        de_embedding_dim = self.parameters_dict['de_embedding_dim']
        hidden_dim = self.parameters_dict['hidden_dim']
        num_layers = self.parameters_dict['num_layers']
        bidirectional = self.parameters_dict['bidirectional']
        use_lstm = self.parameters_dict['use_lstm']
        self.use_cuda_dur = self.use_cuda_key = False
        batch_size = 1
        dropout_p = 0.0

        self.encoder_key = Encoder(en_embedding_dim, hidden_dim,
                                   self.en_vocab_key.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_key)
        self.decoder_key = Decoder(de_embedding_dim, hidden_dim,
                                   self.de_vocab_key.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_key)
        self.encoder_dur = Encoder(en_embedding_dim, hidden_dim,
                                   self.en_vocab_dur.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_dur)
        self.decoder_dur = Decoder(de_embedding_dim, hidden_dim,
                                   self.de_vocab_dur.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_dur)

        self.encoder_key.load_state_dict(
            torch.load(encoder_key_model_file, map_location='cpu'))
        self.decoder_key.load_state_dict(
            torch.load(decoder_key_model_file, map_location='cpu'))
        self.encoder_dur.load_state_dict(
            torch.load(encoder_dur_model_file, map_location='cpu'))
        self.decoder_dur.load_state_dict(
            torch.load(decoder_dur_model_file, map_location='cpu'))

        self.encoder_key.eval()
        self.decoder_key.eval()
        self.encoder_dur.eval()
        self.decoder_dur.eval()
        """ __init__ return the parameters: {self.trf_dur,self.trf_key;
Example #13
    # Convert the decoder input to indices
    x_decoder = preprocessor.convert_text_to_index(answer, word_to_index,
                                                   DECODER_INPUT)

    # Convert the decoder target to indices
    y_decoder = preprocessor.convert_text_to_index(answer, word_to_index,
                                                   DECODER_TARGET)

    # One-hot encoding
    y_decoder = preprocessor.one_hot_encode(y_decoder)

    # Define the training model's encoder and decoder
    # (the model was wrapped in classes, so we instantiate them as objects)
    encoder = Encoder(len(preprocessor.words))
    decoder = Decoder(encoder.states, encoder.len_of_words)

    # Almost identical to the GitHub code
    # Define the training model
    model = models.Model([encoder.inputs, decoder.inputs], decoder.outputs)

    # Same as the GitHub code
    # Configure how the model is trained
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Produces the same result as the GitHub code
    # Building the inference model was made a method of the object, hence the call below
    # Define the inference model
    encoder_model = encoder.get_predict_model()
#for i,batch in enumerate(valid_iterator):
#    print("Train Src Shape: ",str(batch.src.shape))
#    print("Train Trg Shape: ",str(batch.trg.shape))

# +
INPUT_DIM = len(src.vocab)
OUTPUT_DIM = len(trg.vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 1
ENC_DROPOUT = 0.0
DEC_DROPOUT = 0.0

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device=DEVICE).to(DEVICE)
model.apply(init_weights)
# -

optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]
criterion = nn.CrossEntropyLoss().to(DEVICE)
print(TRG_PAD_IDX)

# +
N_EPOCHS = 1000
CLIP = 1
counter = 0
patience = 200
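The counter/patience pair above suggests early stopping, but the snippet ends before the loop that uses them; a minimal sketch of what it would look like (train_epoch and evaluate are assumed helpers):

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss = train_epoch(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        counter = 0                      # reset patience on improvement
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        counter += 1
        if counter >= patience:          # stop after 200 epochs without improvement
            break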
Example #15
def train():

    encode_input, decode_input, decode_output = gen_datasets()
    encode_input, decode_input, decode_output = shuffle(
        encode_input, decode_input, decode_output)

    encoder_input = Input(shape=(config['max_num'], ), name='encode_input')
    embedded_input = Embedding(config['en_voc_size'],
                               100,
                               weights=[en_w2v_matrix],
                               trainable=False,
                               name="embedded_layer")(encoder_input)
    encoder = Encoder(3,
                      100,
                      5,
                      20,
                      config['max_num'],
                      mask_zero=True,
                      name='encoder')
    encoder_output = encoder([
        embedded_input, embedded_input, embedded_input, encoder_input,
        encoder_input
    ])

    # decoder
    decoder_input = Input(shape=(config['max_num'], ), name='decode_input')
    embedded_input2 = Embedding(config['ch_voc_size'],
                                100,
                                weights=[ch_w2v_matrix],
                                trainable=False,
                                name="embedded_layer2")(decoder_input)
    decoder = Decoder(3,
                      100,
                      5,
                      20,
                      config['max_num'],
                      mask_zero=True,
                      name='decoder')
    decoder_output = decoder([
        embedded_input2, encoder_output, encoder_output, decoder_input,
        encoder_input
    ])
    decoder_dense = Dense(config['ch_voc_size'],
                          activation='softmax',
                          name='dense_layer')
    decoder_output = decoder_dense(decoder_output)
    label_smooth = LabelSmooth(0.1, config['ch_voc_size'])
    decoder_output = label_smooth(decoder_output)
    model = Model([encoder_input, decoder_input], decoder_output)

    opt = Adam(0.001, 0.9, 0.98, epsilon=1e-9)
    # model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
    model.compile(optimizer=opt, loss=mask_loss, metrics=[mask_accuracy])

    model.summary()
    tb = TensorBoard(log_dir='./tb_logs/0125',
                     histogram_freq=0,
                     write_graph=True,
                     write_images=False,
                     embeddings_freq=0,
                     embeddings_layer_names=None,
                     embeddings_metadata=None)
    cp = ModelCheckpoint(
        './models/attention_seq2seq.{epoch:02d}-{val_loss:.2f}.hdf5',
        monitor='val_loss',
        verbose=0,
        save_best_only=False,
        save_weights_only=False,
        mode='auto',
        period=1)
    try:
        model.fit([encode_input, decode_input],
                  decode_output,
                  validation_split=0.2,
                  batch_size=128,
                  epochs=10,
                  callbacks=[tb, cp])
    except KeyboardInterrupt:
        # save partial progress if training is interrupted
        model.save('attention_seq2seq')
    else:
        model.save('attention_seq2seq')
Example #16
    parameters_dict = {}
    parameters_dict['en_embedding_dim'] = en_embedding_dim
    parameters_dict['de_embedding_dim'] = de_embedding_dim
    parameters_dict['hidden_dim'] = hidden_dim
    parameters_dict['num_layers'] = num_layers
    parameters_dict['bidirectional'] = bidirectional
    parameters_dict['use_lstm'] = use_lstm

    with open('parameters_dict.pkl', 'wb') as fd:
        pickle.dump(parameters_dict, fd)
    
    batch_total = sum([1 for _ in pl.gen_pairs(batch_size)])
    ones_matrix = autograd.Variable(torch.ones(1, de_vocab.n_items))
    
    encoder = Encoder(en_embedding_dim, hidden_dim, en_vocab.n_items, num_layers, dropout_p, bidirectional, use_lstm, use_cuda)
    decoder = Decoder(de_embedding_dim, hidden_dim, de_vocab.n_items, num_layers, dropout_p, bidirectional, use_lstm, use_cuda)

    
    encoder_model_file = 'encoder_rev.7.pt'
    decoder_model_file = 'decoder_rev.7.pt'
    encoder.load_state_dict(torch.load(encoder_model_file))
    decoder.load_state_dict(torch.load(decoder_model_file))

    '''
    #Load Pre-trained Embedding
    model_file = 'bi_gru.100.100.2.pt'
    if model_file != '' : model.load_state_dict(torch.load(model_file))
    else: model.load_pre_train_emb('cityu_training.char.emb.npy', 'cityu_training.char.dict', vocab)
    '''
    
    loss_function = nn.NLLLoss(reduction='sum', ignore_index=de_vocab.item2index['_PAD_'])
Example #17
class Text2song(object):
    def __init__(self):
        def Load_Vocab(file):
            with open(file, 'rb') as fd:
                _vocab = pickle.load(fd)
            return _vocab

        def Load_Parameters(file):
            with open(file, 'rb') as fd:
                parameters_dict = pickle.load(fd)
            return parameters_dict

        torch.manual_seed(1)
        torch.set_num_threads(4)
        en_vocab_dur_file = './en_vocab_dur.pkl'
        de_vocab_dur_file = './de_vocab_dur.pkl'

        encoder_dur_model_file = './encoder_dur.10.pt'
        decoder_dur_model_file = './decoder_dur.10.pt'

        en_vocab_key_file = './en_vocab.pkl'
        de_vocab_key_file = './de_vocab.pkl'

        encoder_key_model_file = './encoder.10.pt'
        decoder_key_model_file = './decoder.10.pt'
        hyper_parameters_file = './parameters_dict.pkl'
        self.en_vocab_key = Load_Vocab(en_vocab_key_file)
        self.de_vocab_key = Load_Vocab(de_vocab_key_file)

        self.en_vocab_dur = Load_Vocab(en_vocab_dur_file)
        self.de_vocab_dur = Load_Vocab(de_vocab_dur_file)

        self.trf_key = Transfrom(self.en_vocab_key)
        self.trf_dur = Transfrom(self.en_vocab_dur)

        self.parameters_dict = Load_Parameters(hyper_parameters_file)

        en_embedding_dim = self.parameters_dict['en_embedding_dim']
        de_embedding_dim = self.parameters_dict['de_embedding_dim']
        hidden_dim = self.parameters_dict['hidden_dim']
        num_layers = self.parameters_dict['num_layers']
        bidirectional = self.parameters_dict['bidirectional']
        use_lstm = self.parameters_dict['use_lstm']
        self.use_cuda_dur = self.use_cuda_key = False
        batch_size = 1
        dropout_p = 0.0

        self.encoder_key = Encoder(en_embedding_dim, hidden_dim,
                                   self.en_vocab_key.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_key)
        self.decoder_key = Decoder(de_embedding_dim, hidden_dim,
                                   self.de_vocab_key.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_key)
        self.encoder_dur = Encoder(en_embedding_dim, hidden_dim,
                                   self.en_vocab_dur.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_dur)
        self.decoder_dur = Decoder(de_embedding_dim, hidden_dim,
                                   self.de_vocab_dur.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_dur)

        self.encoder_key.load_state_dict(
            torch.load(encoder_key_model_file, map_location='cpu'))
        self.decoder_key.load_state_dict(
            torch.load(decoder_key_model_file, map_location='cpu'))
        self.encoder_dur.load_state_dict(
            torch.load(encoder_dur_model_file, map_location='cpu'))
        self.decoder_dur.load_state_dict(
            torch.load(decoder_dur_model_file, map_location='cpu'))

        self.encoder_key.eval()
        self.decoder_key.eval()
        self.encoder_dur.eval()
        self.decoder_dur.eval()
        """ __init__ return the parameters: {self.trf_dur,self.trf_key;
                                            self.encoder_dur,self.encoder_key;
                                            self.decoder_dur,self.decoder_key;
                                            self.en_vocab_dur,self.en_vocab_key;
                                            self.de_vocab_dur,self.de_vocab_key;
                                            self.use_cuda_dur,self,self.use_cuda_key.}"""

    def get_song(self, lyric):
        def stop_before_eos(li, length):
            if '_EOS_' in li:
                i = li.index('_EOS_')
                li = li[:i]
            while len(li) < length:
                li.append(li[-1])
            return li

        def important_function_in_while_loop(trf, sent, encoder, decoder,
                                             de_vocab, use_cuda, en_sent):
            en_seq, en_seq_len = trf.trans_input(sent)

            en_seq = torch.LongTensor(en_seq)
            encoder_input = en_seq
            encoder_output, encoder_state = encoder(encoder_input, en_seq_len)

            # initial decoder hidden
            decoder_state = decoder.init_state(encoder_state)

            # Start decoding
            decoder_inputs = torch.LongTensor([de_vocab.item2index['_START_']])

            pred_char = ''

            if use_cuda: decoder_inputs = decoder_inputs.cuda()
            decoder_outputs, decoder_state = decoder(decoder_inputs,
                                                     encoder_output,
                                                     decoder_state)

            max_len = len(en_sent.split())

            return (pred_char, encoder_output, decoder_outputs, decoder_state,
                    max_len)

        f_en_test = io.StringIO(lyric)

        pred_list = []

        while True:
            en_sent = f_en_test.readline()

            if not en_sent: break

            sent = en_sent.strip()
            pred_sent_dur = []
            pred_sent_key = []
            pred_char_key, encoder_output_key, decoder_outputs_key, decoder_state_key, max_len_key = \
                important_function_in_while_loop(self.trf_key, sent, self.encoder_key, self.decoder_key, self.de_vocab_key, self.use_cuda_key,
                                                 en_sent)

            pred_char_dur, encoder_output_dur, decoder_outputs_dur, decoder_state_dur, max_len_dur = \
                important_function_in_while_loop(self.trf_dur, sent, self.encoder_dur, self.decoder_dur, self.de_vocab_dur, self.use_cuda_dur,
                                                 en_sent)

            # Greedy search
            while pred_char_key != '_EOS_' and pred_char_dur != '_EOS_':
                log_prob_key, v_idx_key = decoder_outputs_key.data.topk(1)
                pred_char_key = self.de_vocab_key.index2item[v_idx_key.item()]
                pred_sent_key.append(pred_char_key)

                log_prob_dur, v_idx_dur = decoder_outputs_dur.data.topk(1)
                pred_char_dur = self.de_vocab_dur.index2item[v_idx_dur.item()]
                pred_sent_dur.append(pred_char_dur)

                if (len(pred_sent_dur) > max_len_dur
                        or len(pred_sent_dur) > max_len_key):
                    break

                decoder_inputs_dur = torch.LongTensor([v_idx_dur.item()])
                if self.use_cuda_dur:
                    decoder_inputs_dur = decoder_inputs_dur.cuda()
                decoder_outputs_dur, decoder_state_dur = self.decoder_dur(
                    decoder_inputs_dur, encoder_output_dur, decoder_state_dur)

                decoder_inputs_key = torch.LongTensor([v_idx_key.item()])
                if self.use_cuda_key:
                    decoder_inputs_key = decoder_inputs_key.cuda()
                decoder_outputs_key, decoder_state_key = self.decoder_key(
                    decoder_inputs_key, encoder_output_key, decoder_state_key)
            length = len(sent.split())
            pred_list.append({
                'lyrics': sent,
                'key': stop_before_eos(pred_sent_key, length),
                'duration': stop_before_eos(pred_sent_dur, length)
            })
            # pred_list.append({'lyrics': sent, 'key': pred_sent_key, 'duration': pred_sent_dur})

        return pred_list
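Hypothetical usage; the pickle and checkpoint files listed in __init__ must exist in the working directory:

t2s = Text2song()
pred = t2s.get_song("twinkle twinkle little star")
print(pred[0]['lyrics'], pred[0]['key'], pred[0]['duration'])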
def run_smiles_generator(test_file):

    src = Field(sequential=True,
                tokenize=tokenize_drug,
                init_token='<sos>',
                eos_token='<eos>')

    trg = Field(sequential=True,
                tokenize=tokenize_drug,
                init_token='<sos>',
                eos_token='<eos>')

    #Get the train and test set in torchtext format
    datafields = [
        ("src", src),
        ("trg", trg)
    ]
    train, test = TabularDataset.splits(path='../data/SMILES_Autoencoder/',
                                        train='all_smiles_revised_final.csv',
                                        test=test_file,
                                        format='csv',
                                        skip_header=True,
                                        fields=datafields)

    #Split the dataset into train and validation set
    train_data, valid_data = train.split(split_ratio=0.99)

    print(f"Number of examples: {len(train_data.examples)}")
    src.build_vocab(train_data, min_freq=2)
    trg.build_vocab(train_data, min_freq=2)

    #Total no of unique words in our vocabulary
    print(f"Unique tokens in source vocabulary: {len(src.vocab)}")
    print(f"Unique tokens in target vocabulary: {len(trg.vocab)}")
    TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]
    print("Padding Id: ", TRG_PAD_IDX)

    #Create the iterator to traverse over test samples for which we need to generate latent space
    BATCH_SIZE = 128
    (train_iterator, test_iterator) = BucketIterator.splits(
        (train_data, test),
        batch_size=BATCH_SIZE,
        device=DEVICE,
        sort=False,
        shuffle=False)
    print(src.vocab.stoi)
    print(trg.vocab.stoi)

    #Define the model once again
    INPUT_DIM = len(src.vocab)
    OUTPUT_DIM = len(trg.vocab)
    ENC_EMB_DIM = 128
    DEC_EMB_DIM = 128
    HID_DIM = 256
    N_LAYERS = 1
    ENC_DROPOUT = 0.0
    DEC_DROPOUT = 0.0

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

    model = Seq2Seq(enc, dec, device=DEVICE).to(DEVICE)
    model.apply(init_weights)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss().to(DEVICE)
    model.load_state_dict(
        torch.load('../models/lstm_out/torchtext_checkpoint.pt',
                   map_location=torch.device('cpu')))

    #Get latent space for all drugs
    model.eval()
    epoch_loss = 0

    ls_list = []
    encode_list = []
    decode_list = []
    error_list = []
    with torch.no_grad():
        for j, batch in enumerate(test_iterator):
            new_src = batch.src
            new_trg = batch.trg

            #Get output
            outputs = model(new_src, new_trg, 1)  #turn on teacher forcing
            output = outputs[0]
            hidden = outputs[1]
            cell_state = outputs[2]

            #Get latent space
            o1 = torch.argmax(torch.softmax(output, dim=2), dim=2)
            h1 = torch.mean(hidden, dim=0).cpu().detach().tolist()
            c1 = torch.mean(cell_state, dim=0).cpu().detach().tolist()

            for i in range(len(h1)):
                temp_ls = h1[i]
                temp_encode = new_trg[:, i].cpu().detach().tolist()
                temp_decode = o1[:, i].cpu().detach().tolist()
                try:
                    index_1 = temp_decode.index(1)
                except ValueError:
                    # token id 1 (padding) not found; use the full sequence
                    index_1 = len(temp_decode)
                temp_error = np.array(temp_encode) - np.array(temp_decode)
                error = sum(
                    np.absolute(temp_error[1:index_1]) > 0) / len(temp_error)
                error_list.append(error)
                ls_list.append(temp_ls)
                encode_list.append(temp_encode)
                decode_list.append(temp_decode)

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            rev_trg = new_trg[1:].view(-1)

            loss = criterion(output, rev_trg)
            print("Reconstruction Loss for iteration " + str(j) + " is :" +
                  str(round(loss.item(), 3)))
            epoch_loss += loss.item()

    #Print overall average error
    print("Average reconstruction error: ", epoch_loss / len(test_iterator))
    torch.cuda.empty_cache()

    final_list, only_smiles_list = [], []
    for i in range(len(encode_list)):
        temp_encode = encode_list[i]
        temp_decode = decode_list[i]
        temp_encode_str, temp_decode_str, temp_mol_str, temp_error_str = '', '', '', ''

        #Get original string
        for j in range(1, len(temp_encode)):

            #Break when it sees padding
            if (temp_encode[j] == 1):
                break

            #Don't pad end of sentence
            if (temp_encode[j] != 3):
                temp_encode_str += src.vocab.itos[temp_encode[j]]

        #Get decoded string
        for j in range(1, len(temp_decode)):

            if (temp_decode[j] == 1):
                break

            if (temp_decode[j] != 3):
                temp_decode_str += src.vocab.itos[temp_decode[j]]

        #m = Chem.MolFromSmiles(temp_decode_str)
        #if (m is not None):
        #    temp_mol_str = '1'
        #else:
        #    temp_mol_str = '0'

        #string_list = [temp_encode_str, temp_decode_str, temp_mol_str, str(error_list[i])]
        #only_smiles_list.append(string_list)
        #string_list_with_ls = string_list + ls_list[i]
        #final_list.append(string_list_with_ls)

    colids = ['LS_' + str(x) for x in range(len(ls_list[0]))]
    final_out_df = pd.DataFrame(ls_list, columns=colids)
    return final_out_df
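A hypothetical invocation; the file name is illustrative, and the CSV must sit under ../data/SMILES_Autoencoder/ with src/trg columns as configured in TabularDataset.splits above:

latent_df = run_smiles_generator('test_smiles.csv')
print(latent_df.shape)  # one row per molecule, columns LS_0 ... LS_255 (HID_DIM = 256)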
Example #19
tst_src_p = pickle.load(open('data/tst_src_p.pkl', 'rb'))
l_tst_tgt = pickle.load(open('data/l_tst_tgt.pkl', 'rb'))
tst_tgt_p = pickle.load(open('data/tst_tgt_p.pkl', 'rb'))

l_trn_src = pickle.load(open('data/l_trn_src.pkl', 'rb'))
trn_src_p = pickle.load(open('data/trn_src_p.pkl', 'rb'))
l_trn_tgt = pickle.load(open('data/l_trn_tgt.pkl', 'rb'))
trn_tgt_p = pickle.load(open('data/trn_tgt_p.pkl', 'rb'))

tst_src_t = torch.LongTensor(tst_src_p)
tst_tgt_t = torch.LongTensor(tst_tgt_p)
trn_src_t = torch.LongTensor(trn_src_p)
trn_tgt_t = torch.LongTensor(trn_tgt_p)

enc = Encoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad])
dec = Decoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad], vocab[sos],
              vocab[eos], vocab[unk])
enc.to('cuda')
dec.to('cuda')
opt_enc = torch.optim.Adam(enc.parameters())
opt_dec = torch.optim.Adam(dec.parameters())

n_batch = len(trn_src_p) // batch_size

for e in range(epochs):
    enc.train()
    dec.train()
    epoch_loss = 0
    for i in range(n_batch):
        opt_enc.zero_grad()
        opt_dec.zero_grad()
        lengths = torch.LongTensor(l_trn_src[batch_size * i:batch_size *