Code Example #1
def main():
    # Dataset
    dataset = PadDataset(WORKING_DIR, EMBEDDING_SIZE, diff_vocab=DIFF_VOCAB,
                         embedding_path=EMBEDDING_PATH,
                         limit_encode=LIMIT_ENCODE, limit_decode=LIMIT_DECODE)
    encoder_vocab_size = dataset.length_vocab_encode()
    decoder_vocab_size = dataset.length_vocab_decode()
    print("Steps per epoch %d" % int(math.ceil(
        float(dataset.datasets["train"].number_of_samples) / float(BATCH_SIZE))))

    # Initialise the model: the content and query encoders share the encoder
    # embedding matrix, while the decoder has its own.
    embeddings_encoder = torch.Tensor(dataset.vocab.embeddings_encoder).cuda()
    embeddings_decoder = torch.Tensor(dataset.vocab.embeddings_decoder).cuda()
    content_encoder = Encoder(encoder_vocab_size, embeddings_encoder,
                              EMBEDDING_SIZE, HIDDEN_SIZE).cuda()
    query_encoder = Encoder(encoder_vocab_size, embeddings_encoder,
                            EMBEDDING_SIZE, HIDDEN_SIZE).cuda()
    decoder = Decoder(EMBEDDING_SIZE, embeddings_decoder, HIDDEN_SIZE,
                      decoder_vocab_size).cuda()
    seq2seqwattn = Seq2Seq(content_encoder, query_encoder, decoder).cuda()

    run_this = run_model(dataset, seq2seqwattn)
    run_this.run_training()
Code Example #2
def Main(vocab, vocab_inv):
    i = 0
    arret = False  # may be set to True by the program (step 3)
    contexte = None
    # set up the GRU
    enc = Encoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad])
    dec = Decoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad], vocab[sos],
                  vocab[eos], vocab[unk])
    enc.to('cuda')
    dec.to('cuda')
    # load the weights
    path_enc = "encoder_9.pkl"
    path_dec = "decoder_9.pkl"
    encoder_state = torch.load(path_enc)
    decoder_state = torch.load(path_dec)
    enc.load_states(encoder_state)
    enc.eval()
    dec.load_states(dict(decoder_state))
    dec.eval()
    # configure the prediction length
    taille = int(input("number of words predicted at a time? : "))
    while not arret:
        phrase = takeInput(i)
        exit_c, contexte = callGRU(enc, dec, phrase, vocab, contexte, taille)
        sortie = posttreatment(exit_c, vocab_inv)
        # sortie = "David Albert Huffman is a most intelligent 10-year-old boy. However, his world heads for ruin when Poupoune decides to seize Europe, then in the middle of a political crisis, to impose her rosy, flowery world on it. With his friend Lamy, David will have to fight fearsome opponents to save the world, get into MIT, and push back the greatest threat of the Century (so far) to restore balance in the balance of Force."  # chaining test
        printResult(sortie)
        # contexte = exit_c
        i += 1
Code Example #3
File: utils.py  Project: ricardorei/Automatic-CS
def load_seq2seq(filename, map_location=None):
    """
    Loads a model from a filename (requires that the model was saved with the save_model utils function).
    :param filename: Filename to be used.
    :param map_location: Optional device remapping forwarded to torch.load.
    """
    from seq2seq import Encoder, Decoder
    model_dict = torch.load(filename, map_location=map_location)
    encoder = Encoder(**model_dict["encoder_init_args"])
    decoder = Decoder(**model_dict["decoder_init_args"])
    encoder.load_state_dict(model_dict["encoder_state"])
    decoder.load_state_dict(model_dict["decoder_state"])
    return encoder, decoder
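The docstring above presumes a matching save helper in the same utils module. A minimal sketch of what that counterpart could look like, using only the dictionary keys that load_seq2seq reads back (the function name and signature here are assumptions, not the project's confirmed API):

import torch

def save_seq2seq(filename, encoder, decoder, encoder_init_args, decoder_init_args):
    # Persist the constructor arguments alongside the weights so that
    # load_seq2seq can rebuild both modules from scratch.
    torch.save({
        "encoder_init_args": encoder_init_args,
        "decoder_init_args": decoder_init_args,
        "encoder_state": encoder.state_dict(),
        "decoder_state": decoder.state_dict(),
    }, filename)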
Code Example #4
def load_model(path='files/model_De', device=device):
    checkpoint = torch.load(path, map_location=device)

    # Rebuild the data, then override the vocabulary objects with the ones
    # stored in the checkpoint so the indices match the trained weights.
    in_lang, out_lang, pairs = prepareData('En', 'De')
    in_lang = checkpoint['in_lang_class']
    out_lang = checkpoint['out_lang_class']

    hidden_size = checkpoint['hidden_size']

    encoder = Encoder(in_lang.n_words, hidden_size).to(device)
    decoder = Decoder(hidden_size, out_lang.n_words, dropout_p=0.1).to(device)

    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])

    return encoder, decoder, in_lang, out_lang
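The save side is not shown in this example; a hedged sketch of a checkpoint writer that produces exactly the keys load_model accesses (the helper name, argument list, and default path are assumptions):

import torch

def save_model(encoder, decoder, in_lang, out_lang, hidden_size,
               path='files/model_De'):
    # Store the vocabulary objects and hidden size next to the weights so
    # load_model can rebuild Encoder/Decoder instances with matching shapes.
    torch.save({
        'in_lang_class': in_lang,
        'out_lang_class': out_lang,
        'hidden_size': hidden_size,
        'encoder_state_dict': encoder.state_dict(),
        'decoder_state_dict': decoder.state_dict(),
    }, path)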
Code Example #5
def training(data_dir):
    news_train = NEWS(data_path=os.path.join(data_dir, '.train.pkl'),
                      vocab_path='../data/vocab.pkl')
    encoder = Encoder(emb_size=300,
                      hidden_size=512,
                      vocab_size=len(news_train.vocab))
    decoder = Decoder()
    SAE = SentenceAE()
Code Example #6
    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size: int) -> None:
        self.encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        self.decoder = PeekyDecoder(vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads
Code Example #7
File: peeky_seq2seq.py  Project: bbbnodx/seq2seq
    def __init__(self, vocab_size, wordvec_size, hidden_size, ignore_index=-1):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = PeekyDecoder(V, D, H)
        self.loss = TimeSoftmaxWithLoss(ignore_index=ignore_index)

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads
Code Example #8
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = PeekyDecoder(V, D, H)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads
Code Example #9
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = PeekyDecoder(V, D, H)  # unlike the plain seq2seq, a PeekyDecoder is used
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads


# Peeky spreads the Encoder's information across the whole decoder.
# Using Peeky enlarges the weight matrices, so the amount of computation
# grows as well; that handicap has to be weighed against the accuracy gain.
# Note too that seq2seq accuracy is strongly affected by the hyperparameters.
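To make the "peeky" idea above concrete: the decoder sees the encoder's final hidden state at every time step, not just as its initial state. A minimal NumPy sketch of that broadcast-and-concatenate step (illustrative only, with assumed (N, T, D) batch/time/feature shapes; it is not taken from any of the projects listed here):

import numpy as np

def peeky_concat(h, decoder_embed_out):
    # h: (N, H) final encoder hidden state
    # decoder_embed_out: (N, T, D) embedded decoder inputs
    N, T, D = decoder_embed_out.shape
    H = h.shape[1]
    # Repeat h across all T decoder time steps...
    hs = np.repeat(h, T, axis=0).reshape(N, T, H)
    # ...and concatenate it onto every step so each LSTM input (and, in the
    # full PeekyDecoder, the final Affine layer) can peek at the context.
    return np.concatenate((hs, decoder_embed_out), axis=2)  # (N, T, H + D)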
Code Example #10
def create_model(gen_config):
    encoder = Encoder(gen_config.vocab_size,
                      gen_config.emb_dim,
                      gen_config.hidden_size,
                      n_layers=2,
                      dropout=0.5)
    decoder = Decoder(gen_config.emb_dim,
                      gen_config.hidden_size,
                      gen_config.vocab_size,
                      n_layers=1,
                      dropout=0.5)
    seq2seq = Seq2Seq(encoder, decoder).cuda()
    optimizer = optim.Adam(seq2seq.parameters(), lr=gen_config.lr)
    return seq2seq, optimizer
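create_model only reads attribute-style fields off gen_config, so any simple namespace will do. A hypothetical usage sketch (the values are illustrative, and a CUDA device is required because create_model calls .cuda()):

from types import SimpleNamespace

gen_config = SimpleNamespace(vocab_size=20000, emb_dim=128,
                             hidden_size=256, lr=1e-3)
seq2seq, optimizer = create_model(gen_config)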
Code Example #11
File: peeky_seq2seq.py  Project: saboteeeen/iLab2020
class PeekySeq2seq(Seq2seq):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        self.lstm_layers = []
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.hidden_size = hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = PeekyDecoder(V, D, H)
        self.softmax = TimeSoftmaxWithLoss()
        self.enlayers = [self.encoder.embed,
                         self.encoder.lstm]
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def reset_state(self):
        self.encoder.lstm.reset_state()
        self.decoder.lstm.reset_state()
        
    def save_params(self, file_name='PeekySeq2seq.pkl'):
        with open(file_name, 'wb') as f:
            pickle.dump(self.params, f)

    def load_params(self, file_name='PeekySeq2seq.pkl'):
        with open(file_name, 'rb') as f:
            self.params = pickle.load(f)

    def generate(self, start_id, xs, skip_ids=None, sample_size=100):
        word_ids = [start_id]
        hs = self.encoder.forward(xs)  # feed batch_x in as the encoder input
        x = np.array(start_id).reshape(1, 1)
        # hs = hs.reshape(1, self.hidden_size)
        sampled = self.decoder.generate(hs, x, sample_size)  # sample_size word ids come back
        for count in range(1, len(sampled)):
            word_id = int(sampled[count])  # int() cannot take a list, so index first
            if (skip_ids is None) or (word_id not in skip_ids):
                word_ids.append(word_id)
        return word_ids

    def enpredict(self, xs):  # run up to just before the Softmax layer (used for text generation)
        for layer in self.enlayers:
            xs = layer.forward(xs)
        return xs
Code Example #12
                                                   ENCODER_INPUT)

    # Convert the decoder input text to indices
    x_decoder = preprocessor.convert_text_to_index(answer, word_to_index,
                                                   DECODER_INPUT)

    # Convert the decoder target text to indices
    y_decoder = preprocessor.convert_text_to_index(answer, word_to_index,
                                                   DECODER_TARGET)

    # One-hot encoding
    y_decoder = preprocessor.one_hot_encode(y_decoder)

    # Define the training model's encoder and decoder.
    # The model was wrapped in classes, so they are instantiated as objects here.
    encoder = Encoder(len(preprocessor.words))
    decoder = Decoder(encoder.states, encoder.len_of_words)

    # Almost identical to the GitHub code: define the training model
    model = models.Model([encoder.inputs, decoder.inputs], decoder.outputs)

    # Same as the GitHub code: configure the training setup
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Produces the same result as the GitHub code.
    # Building the prediction model was made a method of the class, so it is
    # coded as follows.
    # Define the prediction model
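The one_hot_encode call above has to turn the index matrix into the 3-D target tensor that categorical_crossentropy expects. A hedged sketch of what such a preprocessor method might do (the standalone signature is an assumption; only the shape contract is implied by the surrounding code):

import numpy as np

def one_hot_encode(index_sequences, vocab_size):
    # (num_samples, seq_len) integer indices ->
    # (num_samples, seq_len, vocab_size) one-hot vectors
    index_sequences = np.asarray(index_sequences)
    one_hot = np.zeros(index_sequences.shape + (vocab_size,), dtype=np.float32)
    for i, seq in enumerate(index_sequences):
        for t, idx in enumerate(seq):
            one_hot[i, t, idx] = 1.0
    return one_hot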
Code Example #13
def train():

    encode_input, decode_input, decode_output = gen_datasets()
    encode_input, decode_input, decode_output = shuffle(
        encode_input, decode_input, decode_output)

    encoder_input = Input(shape=(config['max_num'], ), name='encode_input')
    embedded_input = Embedding(config['en_voc_size'],
                               100,
                               weights=[en_w2v_matrix],
                               trainable=False,
                               name="embedded_layer")(encoder_input)
    encoder = Encoder(3,
                      100,
                      5,
                      20,
                      config['max_num'],
                      mask_zero=True,
                      name='encoder')
    encoder_output = encoder([
        embedded_input, embedded_input, embedded_input, encoder_input,
        encoder_input
    ])

    # decoder
    decoder_input = Input(shape=(config['max_num'], ), name='decode_input')
    embedded_input2 = Embedding(config['ch_voc_size'],
                                100,
                                weights=[ch_w2v_matrix],
                                trainable=False,
                                name="embedded_layer2")(decoder_input)
    decoder = Decoder(3,
                      100,
                      5,
                      20,
                      config['max_num'],
                      mask_zero=True,
                      name='decoder')
    decoder_output = decoder([
        embedded_input2, encoder_output, encoder_output, decoder_input,
        encoder_input
    ])
    decoder_dense = Dense(config['ch_voc_size'],
                          activation='softmax',
                          name='dense_layer')
    decoder_output = decoder_dense(decoder_output)
    label_smooth = LabelSmooth(0.1, config['ch_voc_size'])
    decoder_output = label_smooth(decoder_output)
    model = Model([encoder_input, decoder_input], decoder_output)

    opt = Adam(0.001, 0.9, 0.98, epsilon=1e-9)
    # model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
    model.compile(optimizer=opt, loss=mask_loss, metrics=[mask_accuracy])

    model.summary()
    tb = TensorBoard(log_dir='./tb_logs/0125',
                     histogram_freq=0,
                     write_graph=True,
                     write_images=False,
                     embeddings_freq=0,
                     embeddings_layer_names=None,
                     embeddings_metadata=None)
    cp = ModelCheckpoint(
        './models/attention_seq2seq.{epoch:02d}-{val_loss:.2f}.hdf5',
        monitor='val_loss',
        verbose=0,
        save_best_only=False,
        save_weights_only=False,
        mode='auto',
        period=1)
    try:
        model.fit([encode_input, decode_input],
                  decode_output,
                  validation_split=0.2,
                  batch_size=128,
                  epochs=10,
                  callbacks=[tb, cp])
    except KeyboardInterrupt:
        pass  # an interrupted run still falls through to the save below
    model.save('attention_seq2seq')
Code Example #14
def train(article,
          title,
          word2idx,
          target2idx,
          source_lengths,
          target_lengths,
          args,
          val_article=None,
          val_title=None,
          val_source_lengths=None,
          val_target_lengths=None):

    # If no cached split exists yet, carve out 5% of the training data as a
    # validation set and save everything to ./temp for later runs.
    if not os.path.exists('./temp/x.pkl'):
        size_of_val = int(len(article) * 0.05)
        val_article, val_title, val_source_lengths, val_target_lengths = \
            utils.sampling(article, title, source_lengths, target_lengths, size_of_val)

        utils.save_everything(article, title, source_lengths, target_lengths,
                              val_article, val_title, val_source_lengths,
                              val_target_lengths, word2idx)

    size_of_val = len(val_article)
    batch_size = args.batch
    train_size = len(article)
    val_size = len(val_article)
    max_a = max(source_lengths)
    max_t = max(target_lengths)
    print("source vocab size:", len(word2idx))
    print("target vocab size:", len(target2idx))
    print("max a:{}, max t:{}".format(max_a, max_t))
    print("train_size:", train_size)
    print("val size:", val_size)
    print("batch_size:", batch_size)
    print("-" * 30)
    use_coverage = False

    encoder = Encoder(len(word2idx))
    decoder = Decoder(len(target2idx), 50)
    if os.path.exists('decoder_model'):
        encoder.load_state_dict(torch.load('encoder_model'))
        decoder.load_state_dict(torch.load('decoder_model'))

    optimizer = torch.optim.Adam(list(encoder.parameters()) +
                                 list(decoder.parameters()),
                                 lr=0.001)
    n_epoch = 5
    print("Making word index and extend vocab")
    #article, article_tar, title, ext_vocab_all, ext_count = indexing_word(article, title, word2idx, target2idx)
    #article = to_tensor(article)
    #article_extend = to_tensor(article_extend)
    #title = to_tensor(title)
    print("preprocess done")

    if args.use_cuda:
        encoder.cuda()
        decoder.cuda()

    print("start training")
    for epoch in range(n_epoch):
        total_loss = 0
        batch_n = int(train_size / batch_size)
        if epoch > 0:
            use_coverage = True
        for b in range(batch_n):
            # initialization
            batch_x = article[b * batch_size:(b + 1) * batch_size]
            batch_y = title[b * batch_size:(b + 1) * batch_size]
            #batch_x_ext = article_extend[b*batch_size: (b+1)*batch_size]
            batch_x, batch_x_ext, batch_y, extend_vocab, extend_lengths = \
                utils.batch_index(batch_x, batch_y, word2idx, target2idx)

            if args.use_cuda:
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()
                batch_x_ext = batch_x_ext.cuda()
            x_lengths = source_lengths[b * batch_size:(b + 1) * batch_size]
            y_lengths = target_lengths[b * batch_size:(b + 1) * batch_size]

            # work around to deal with length
            pack = pack_padded_sequence(batch_x_ext,
                                        x_lengths,
                                        batch_first=True)
            batch_x_ext_var, _ = pad_packed_sequence(pack, batch_first=True)
            current_loss = train_on_batch(encoder, decoder, optimizer, batch_x,
                                          batch_y, x_lengths, y_lengths,
                                          word2idx, target2idx,
                                          batch_x_ext_var, extend_lengths,
                                          use_coverage)

            batch_x = batch_x.cpu()
            batch_y = batch_y.cpu()
            batch_x_ext = batch_x_ext.cpu()

            print('epoch:{}/{}, batch:{}/{}, loss:{}'.format(
                epoch + 1, n_epoch, b + 1, batch_n, current_loss))
            if (b + 1) % args.show_decode == 0:
                torch.save(encoder.state_dict(), 'encoder_model')
                torch.save(decoder.state_dict(), 'decoder_model')
                batch_x_val, batch_x_ext_val, batch_y_val, extend_vocab, extend_lengths = \
                    utils.batch_index(val_article, val_title, word2idx, target2idx)
                for i in range(1):  # decode one random validation sample to eyeball progress
                    idx = np.random.randint(0, val_size)
                    decode.beam_search(encoder, decoder,
                                       batch_x_val[idx].unsqueeze(0),
                                       batch_y_val[idx].unsqueeze(0), word2idx,
                                       target2idx, batch_x_ext_val[idx],
                                       extend_lengths[idx], extend_vocab[idx])

                batch_x_val = batch_x_val.cpu()
                batch_y_val = batch_y_val.cpu()
                batch_x_ext_val = batch_x_ext_val.cpu()

            total_loss += current_loss
            print('-' * 30)

    print()
    print("training finished")
Code Example #15
    trf = Transfrom(en_vocab)

    parameters_dict = Load_Parameters(hyper_parameters_file)

    en_embedding_dim = parameters_dict['en_embedding_dim']
    de_embedding_dim = parameters_dict['de_embedding_dim']
    hidden_dim = parameters_dict['hidden_dim']
    num_layers = parameters_dict['num_layers']
    bidirectional = parameters_dict['bidirectional']
    use_lstm = parameters_dict['use_lstm']
    use_cuda = False
    batch_size = 1
    dropout_p = 0.0

    encoder = Encoder(en_embedding_dim, hidden_dim, en_vocab.n_items,
                      num_layers, dropout_p, bidirectional, use_lstm, use_cuda)
    decoder = Decoder(de_embedding_dim, hidden_dim, de_vocab.n_items,
                      num_layers, dropout_p, bidirectional, use_lstm, use_cuda)

    encoder.load_state_dict(torch.load(encoder_model_file, map_location='cpu'))
    decoder.load_state_dict(torch.load(decoder_model_file, map_location='cpu'))

    encoder.eval()
    decoder.eval()

    f_en_test = open('input.txt', 'r', encoding='utf-8')
    f_de_pred = open('output.txt', 'w', encoding='utf-8')

    while True:
        en_sent = f_en_test.readline()
Code Example #16
def train():
    # region Process Data
    # Download both sides of the WMT'14 en-de corpus; only the containing
    # directory is needed afterwards.
    tf.keras.utils.get_file(
        'english.txt',
        origin='https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en',
        extract=False)
    path_to_files = tf.keras.utils.get_file(
        'german.txt',
        origin='https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de',
        extract=False)
    path_to_files = os.path.dirname(path_to_files)

    input_tensor, target_tensor, inp_lang, targ_lang = load_wmt_dataset(
        path_to_files, num_examples, dict_size)
    max_length_targ, max_length_inp = max_length(target_tensor), max_length(
        input_tensor)
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)

    BUFFER_SIZE = len(input_tensor_train)
    steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    # endregion

    # region Model definition
    encoder = Encoder(dict_size, embedding_dim, units, BATCH_SIZE)
    attention_layer = BahdanauAttention(10)
    decoder = Decoder(dict_size, embedding_dim, units, BATCH_SIZE)
    optimizer = tf.keras.optimizers.Adam()

    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    @tf.function
    def train_step(inp, targ, enc_hidden):
        loss = 0

        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, enc_hidden)

            dec_hidden = enc_hidden

            dec_input = tf.expand_dims([targ_lang.word_index['<start>']] *
                                       BATCH_SIZE, 1)

            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden,
                                                     enc_output)

                loss += loss_function(targ[:, t], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))

        variables = encoder.trainable_variables + decoder.trainable_variables

        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

        return batch_loss

    for epoch in range(EPOCHS):
        start = time.time()

        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_loss += batch_loss

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy()))
        # saving (checkpoint) the model every 2 epochs
        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
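train_step above relies on a loss_function defined elsewhere in the script. In the TensorFlow NMT-with-attention tutorial this example closely mirrors, it is a sparse categorical cross-entropy that masks out padding positions (index 0); a sketch under that assumption:

import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    # Zero the loss wherever the target is the padding index 0,
    # then average over the batch.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    loss_ *= tf.cast(mask, dtype=loss_.dtype)
    return tf.reduce_mean(loss_)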
Code Example #17
class Text2song(object):
    def __init__(self):
        def Load_Vocab(file):
            with open(file, 'rb') as fd:
                _vocab = pickle.load(fd)
            return _vocab

        def Load_Parameters(file):
            with open(file, 'rb') as fd:
                parameters_dict = pickle.load(fd)
            return parameters_dict

        torch.manual_seed(1)
        torch.set_num_threads(4)
        en_vocab_dur_file = './en_vocab_dur.pkl'
        de_vocab_dur_file = './de_vocab_dur.pkl'

        encoder_dur_model_file = './encoder_dur.10.pt'
        decoder_dur_model_file = './decoder_dur.10.pt'

        en_vocab_key_file = './en_vocab.pkl'
        de_vocab_key_file = './de_vocab.pkl'

        encoder_key_model_file = './encoder.10.pt'
        decoder_key_model_file = './decoder.10.pt'
        hyper_parameters_file = './parameters_dict.pkl'
        self.en_vocab_key = Load_Vocab(en_vocab_key_file)
        self.de_vocab_key = Load_Vocab(de_vocab_key_file)

        self.en_vocab_dur = Load_Vocab(en_vocab_dur_file)
        self.de_vocab_dur = Load_Vocab(de_vocab_dur_file)

        self.trf_key = Transfrom(self.en_vocab_key)
        self.trf_dur = Transfrom(self.en_vocab_dur)

        self.parameters_dict = Load_Parameters(hyper_parameters_file)

        en_embedding_dim = self.parameters_dict['en_embedding_dim']
        de_embedding_dim = self.parameters_dict['de_embedding_dim']
        hidden_dim = self.parameters_dict['hidden_dim']
        num_layers = self.parameters_dict['num_layers']
        bidirectional = self.parameters_dict['bidirectional']
        use_lstm = self.parameters_dict['use_lstm']
        self.use_cuda_dur = self.use_cuda_key = False
        batch_size = 1
        dropout_p = 0.0

        self.encoder_key = Encoder(en_embedding_dim, hidden_dim,
                                   self.en_vocab_key.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_key)
        self.decoder_key = Decoder(de_embedding_dim, hidden_dim,
                                   self.de_vocab_key.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_key)
        self.encoder_dur = Encoder(en_embedding_dim, hidden_dim,
                                   self.en_vocab_dur.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_dur)
        self.decoder_dur = Decoder(de_embedding_dim, hidden_dim,
                                   self.de_vocab_dur.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_dur)

        self.encoder_key.load_state_dict(
            torch.load(encoder_key_model_file, map_location='cpu'))
        self.decoder_key.load_state_dict(
            torch.load(decoder_key_model_file, map_location='cpu'))
        self.encoder_dur.load_state_dict(
            torch.load(encoder_dur_model_file, map_location='cpu'))
        self.decoder_dur.load_state_dict(
            torch.load(decoder_dur_model_file, map_location='cpu'))

        self.encoder_key.eval()
        self.decoder_key.eval()
        self.encoder_dur.eval()
        self.decoder_dur.eval()
        """ __init__ return the parameters: {self.trf_dur,self.trf_key;
                                            self.encoder_dur,self.encoder_key;
                                            self.decoder_dur,self.decoder_key;
                                            self.en_vocab_dur,self.en_vocab_key;
                                            self.de_vocab_dur,self.de_vocab_key;
                                            self.use_cuda_dur,self,self.use_cuda_key.}"""

    def get_song(self, lyric):
        def stop_before_eos(li, length):
            # cut at the first '_EOS_', then pad by repeating the last item
            # until the prediction matches the number of lyric words
            if '_EOS_' in li:
                i = li.index('_EOS_')
                li = li[:i]
            while len(li) < length:
                li.append(li[-1])
            return li

        def important_function_in_while_loop(trf, sent, encoder, decoder,
                                             de_vocab, use_cuda, en_sent):
            en_seq, en_seq_len = trf.trans_input(sent)

            en_seq = torch.LongTensor(en_seq)
            encoder_input = en_seq
            encoder_output, encoder_state = encoder(encoder_input, en_seq_len)

            # initial decoder hidden
            decoder_state = decoder.init_state(encoder_state)

            # Start decoding
            decoder_inputs = torch.LongTensor([de_vocab.item2index['_START_']])

            pred_char = ''

            if use_cuda: decoder_inputs = decoder_inputs.cuda()
            decoder_outputs, decoder_state = decoder(decoder_inputs,
                                                     encoder_output,
                                                     decoder_state)

            max_len = len(en_sent.split())

            return (pred_char, encoder_output, decoder_outputs, decoder_state,
                    max_len)

        f_en_test = io.StringIO(lyric)

        pred_list = []

        while True:
            en_sent = f_en_test.readline()

            if not en_sent: break

            sent = en_sent.strip()
            pred_sent_dur = []
            pred_sent_key = []
            pred_char_key, encoder_output_key, decoder_outputs_key, decoder_state_key, max_len_key = \
                important_function_in_while_loop(self.trf_key, sent, self.encoder_key, self.decoder_key, self.de_vocab_key, self.use_cuda_key,
                                                 en_sent)

            pred_char_dur, encoder_output_dur, decoder_outputs_dur, decoder_state_dur, max_len_dur = \
                important_function_in_while_loop(self.trf_dur, sent, self.encoder_dur, self.decoder_dur, self.de_vocab_dur, self.use_cuda_dur,
                                                 en_sent)

            # Greedy search
            while pred_char_key != '_EOS_' and pred_char_dur != '_EOS_':
                log_prob_key, v_idx_key = decoder_outputs_key.data.topk(1)
                pred_char_key = self.de_vocab_key.index2item[v_idx_key.item()]
                pred_sent_key.append(pred_char_key)

                log_prob_dur, v_idx_dur = decoder_outputs_dur.data.topk(1)
                pred_char_dur = self.de_vocab_dur.index2item[v_idx_dur.item()]
                pred_sent_dur.append(pred_char_dur)

                if (len(pred_sent_dur) > max_len_dur
                        or len(pred_sent_dur) > max_len_key):
                    break

                decoder_inputs_dur = torch.LongTensor([v_idx_dur.item()])
                if self.use_cuda_dur:
                    decoder_inputs_dur = decoder_inputs_dur.cuda()
                decoder_outputs_dur, decoder_state_dur = self.decoder_dur(
                    decoder_inputs_dur, encoder_output_dur, decoder_state_dur)

                decoder_inputs_key = torch.LongTensor([v_idx_key.item()])
                if self.use_cuda_key:
                    decoder_inputs_key = decoder_inputs_key.cuda()
                decoder_outputs_key, decoder_state_key = self.decoder_key(
                    decoder_inputs_key, encoder_output_key, decoder_state_key)
            length = len(sent.split())
            pred_list.append({
                'lyrics': sent,
                'key': stop_before_eos(pred_sent_key, length),
                'duration': stop_before_eos(pred_sent_dur, length)
            })
            # pred_list.append({'lyrics': sent, 'key': pred_sent_key, 'duration': pred_sent_dur})

        return pred_list
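get_song consumes the lyric string line by line and returns one dict per line, so driving the class end to end looks roughly like this (a hypothetical usage; it assumes the pickled vocabularies and .pt checkpoints named in __init__ sit in the working directory, and the lyric text is made up):

t2s = Text2song()
song = t2s.get_song("twinkle twinkle little star\nhow I wonder what you are\n")
for line in song:
    print(line['lyrics'], line['key'], line['duration'])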
Code Example #18
def load_pickle(path):
    # small helper so every pickle file is opened and closed cleanly
    with open(path, 'rb') as f:
        return pickle.load(f)

l_tst_src = load_pickle('data/l_tst_src.pkl')
tst_src_p = load_pickle('data/tst_src_p.pkl')
l_tst_tgt = load_pickle('data/l_tst_tgt.pkl')
tst_tgt_p = load_pickle('data/tst_tgt_p.pkl')

l_trn_src = load_pickle('data/l_trn_src.pkl')
trn_src_p = load_pickle('data/trn_src_p.pkl')
l_trn_tgt = load_pickle('data/l_trn_tgt.pkl')
trn_tgt_p = load_pickle('data/trn_tgt_p.pkl')

tst_src_t = torch.LongTensor(tst_src_p)
tst_tgt_t = torch.LongTensor(tst_tgt_p)
trn_src_t = torch.LongTensor(trn_src_p)
trn_tgt_t = torch.LongTensor(trn_tgt_p)

enc = Encoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad])
dec = Decoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad], vocab[sos],
              vocab[eos], vocab[unk])
enc.to('cuda')
dec.to('cuda')
opt_enc = torch.optim.Adam(enc.parameters())
opt_dec = torch.optim.Adam(dec.parameters())

n_batch = len(trn_src_p) // batch_size

for e in range(epochs):
    enc.train()
    dec.train()
    epoch_loss = 0
    for i in range(n_batch):
        opt_enc.zero_grad()
Code Example #19
 
# for i, batch in enumerate(valid_iterator):
#     print("Train Src Shape: ", str(batch.src.shape))
#     print("Train Trg Shape: ", str(batch.trg.shape))

# +
INPUT_DIM = len(src.vocab)
OUTPUT_DIM = len(trg.vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 1
ENC_DROPOUT = 0.0
DEC_DROPOUT = 0.0

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device=DEVICE).to(DEVICE)
model.apply(init_weights)
# -

optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]
criterion = nn.CrossEntropyLoss().to(DEVICE)
print(TRG_PAD_IDX)

# +
N_EPOCHS = 1000
CLIP = 1
counter = 0
Code Example #20
def run_smiles_generator(test_file):

    src = Field(sequential=True,
                tokenize=tokenize_drug,
                init_token='<sos>',
                eos_token='<eos>')

    trg = Field(sequential=True,
                tokenize=tokenize_drug,
                init_token='<sos>',
                eos_token='<eos>')

    # Get the train and test sets in torchtext format; both columns hold
    # SMILES strings and share the same tokenizer
    datafields = [("src", src), ("trg", trg)]
    train, test = TabularDataset.splits(path='../data/SMILES_Autoencoder/',
                                        train='all_smiles_revised_final.csv',
                                        test=test_file,
                                        format='csv',
                                        skip_header=True,
                                        fields=datafields)

    #Split the dataset into train and validation set
    train_data, valid_data = train.split(split_ratio=0.99)

    print(f"Number of examples: {len(train_data.examples)}")
    src.build_vocab(train_data, min_freq=2)
    trg.build_vocab(train_data, min_freq=2)

    #Total no of unique words in our vocabulary
    print(f"Unique tokens in source vocabulary: {len(src.vocab)}")
    print(f"Unique tokens in target vocabulary: {len(trg.vocab)}")
    TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]
    print("Padding Id: ", TRG_PAD_IDX)

    #Create the iterator to traverse over test samples for which we need to generate latent space
    BATCH_SIZE = 128
    (train_iterator, test_iterator) = BucketIterator.splits(
        (train_data, test),
        batch_size=BATCH_SIZE,
        device=DEVICE,
        sort=False,
        shuffle=False)
    print(src.vocab.stoi)
    print(trg.vocab.stoi)

    #Define the model once again
    INPUT_DIM = len(src.vocab)
    OUTPUT_DIM = len(trg.vocab)
    ENC_EMB_DIM = 128
    DEC_EMB_DIM = 128
    HID_DIM = 256
    N_LAYERS = 1
    ENC_DROPOUT = 0.0
    DEC_DROPOUT = 0.0

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

    model = Seq2Seq(enc, dec, device=DEVICE).to(DEVICE)
    model.apply(init_weights)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss().to(DEVICE)
    model.load_state_dict(
        torch.load('../models/lstm_out/torchtext_checkpoint.pt',
                   map_location=torch.device('cpu')))

    #Get latent space for all drugs
    model.eval()
    epoch_loss = 0

    ls_list = []
    encode_list = []
    decode_list = []
    error_list = []
    with torch.no_grad():
        for j, batch in enumerate(test_iterator):
            new_src = batch.src
            new_trg = batch.trg

            #Get output
            outputs = model(new_src, new_trg, 1)  # teacher forcing ratio = 1
            output = outputs[0]
            hidden = outputs[1]
            cell_state = outputs[2]

            #Get latent space
            o1 = torch.argmax(torch.softmax(output, dim=2), dim=2)
            h1 = torch.mean(hidden, dim=0).cpu().detach().tolist()
            c1 = torch.mean(cell_state, dim=0).cpu().detach().tolist()

            for i in range(len(h1)):
                temp_ls = h1[i]
                temp_encode = new_trg[:, i].cpu().detach().tolist()
                temp_decode = o1[:, i].cpu().detach().tolist()
                try:
                    index_1 = temp_decode.index(1)  # first padding token
                except ValueError:
                    index_1 = len(temp_decode)  # no padding found
                temp_error = np.array(temp_encode) - np.array(temp_decode)
                error = sum(
                    np.absolute(temp_error[1:index_1]) > 0) / len(temp_error)
                error_list.append(error)
                ls_list.append(temp_ls)
                encode_list.append(temp_encode)
                decode_list.append(temp_decode)

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            rev_trg = new_trg[1:].view(-1)

            loss = criterion(output, rev_trg)
            print("Reconstruction Loss for iteration " + str(j) + " is :" +
                  str(round(loss.item(), 3)))
            epoch_loss += loss.item()

    #Print overall average error
    print("Average reconstruction error: ", epoch_loss / len(test_iterator))
    torch.cuda.empty_cache()

    final_list, only_smiles_list = [], []
    for i in range(len(encode_list)):
        temp_encode = encode_list[i]
        temp_decode = decode_list[i]
        temp_encode_str, temp_decode_str, temp_mol_str, temp_error_str = '', '', '', ''

        # Rebuild the original string
        for j in range(1, len(temp_encode)):

            # Stop at the first padding token (index 1)
            if temp_encode[j] == 1:
                break

            # Skip the end-of-sentence token (index 3)
            if temp_encode[j] != 3:
                temp_encode_str += src.vocab.itos[temp_encode[j]]

        # Rebuild the decoded string the same way
        for j in range(1, len(temp_decode)):

            if temp_decode[j] == 1:
                break

            if temp_decode[j] != 3:
                temp_decode_str += src.vocab.itos[temp_decode[j]]

        #m = Chem.MolFromSmiles(temp_decode_str)
        #if (m is not None):
        #    temp_mol_str = '1'
        #else:
        #    temp_mol_str = '0'

        #string_list = [temp_encode_str, temp_decode_str, temp_mol_str, str(error_list[i])]
        #only_smiles_list.append(string_list)
        #string_list_with_ls = string_list + ls_list[i]
        #final_list.append(string_list_with_ls)

    colids = ['LS_' + str(x) for x in range(len(ls_list[0]))]
    final_out_df = pd.DataFrame(ls_list, columns=colids)
    return final_out_df
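The returned frame holds one LS_* column per latent dimension, so extracting features for downstream models is a single call plus a save (a hypothetical usage; the test CSV name is made up and only needs to exist under ../data/SMILES_Autoencoder/):

latent_df = run_smiles_generator('test_smiles.csv')
latent_df.to_csv('smiles_latent_space.csv', index=False)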
Code Example #21
    
    parameters_dict = {
        'en_embedding_dim': en_embedding_dim,
        'de_embedding_dim': de_embedding_dim,
        'hidden_dim': hidden_dim,
        'num_layers': num_layers,
        'bidirectional': bidirectional,
        'use_lstm': use_lstm,
    }

    with open('parameters_dict.pkl', 'wb') as fd:
        pickle.dump(parameters_dict, fd)
    
    batch_total = sum([1 for _ in pl.gen_pairs(batch_size)])
    ones_matrix = autograd.Variable(torch.ones(1, de_vocab.n_items))
    
    encoder = Encoder(en_embedding_dim, hidden_dim, en_vocab.n_items, num_layers, dropout_p, bidirectional, use_lstm, use_cuda)
    decoder = Decoder(de_embedding_dim, hidden_dim, de_vocab.n_items, num_layers, dropout_p, bidirectional, use_lstm, use_cuda)

    
    encoder_model_file = 'encoder_rev.7.pt'
    decoder_model_file = 'decoder_rev.7.pt'
    encoder.load_state_dict(torch.load(encoder_model_file))
    decoder.load_state_dict(torch.load(decoder_model_file))

    '''
    #Load Pre-trained Embedding
    model_file = 'bi_gru.100.100.2.pt'
    if model_file != '' : model.load_state_dict(torch.load(model_file))
    else: model.load_pre_train_emb('cityu_training.char.emb.npy', 'cityu_training.char.dict', vocab)
    '''