Example 1
import random

import torch
import torch.nn as nn

# `Vocabulary` is a project-local class; see the later examples for how a
# vocabulary file is built and loaded.


class Encoder_Decoder(nn.Module):
    def __init__(self,
                 dim_emb,
                 dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        super(Encoder_Decoder, self).__init__()
        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.dim_hid = dim_hid
        self.word_embeddings = nn.Embedding(len(self.vocab), dim_emb)
        # self.gru = nn.GRU(dim_emb, dim_hid, batch_first=True)
        self.en_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)

        self.de_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        # Fully connected layer projecting the dim_hid-dimensional LSTM
        # hidden states to vocabulary-sized logits
        self.hidden2linear = nn.Linear(dim_hid, len(self.vocab))

    def forward(self, sequence, state=None):
        embedding = self.word_embeddings(sequence)
        # Encoder pass; hs would feed the (disabled) attention below.
        hs, (h, c) = self.en_lstm(embedding, state)

        # Decoder pass, initialized with the encoder's final (h, c).
        # Note that the decoder is fed the same embedded sequence as the encoder.
        output, (h, c) = self.de_lstm(embedding, (h, c))

        # Compute attention (currently disabled)
        # t_output = torch.transpose(output, 1, 2)
        # s = torch.bmm(hs, t_output)
        # attention_weight = self.softmax(s)

        output = self.hidden2linear(output)
        return output, (h, c)

    def generate(self, start=None, max_len=17):

        if start is None:
            start = random.choice(self.vocab.index2word)

        idx = self.word_embeddings.weight.new_full((1, 1),
                                                   self.vocab.get_index(start),
                                                   dtype=torch.long)
        decoded = [start]
        state = None
        unk = self.vocab.get_index('<unk>')
        while decoded[-1] != '<eos>' and len(decoded) < max_len:
            x, state = self.forward(idx, state)
            x[:, :, unk] = -float('inf')

            # prob = list(map(self.to_int, x.squeeze().tolist()))

            # idx = torch.tensor(random.choices(
            #     list(range(len(prob))), weights=prob, k=1)).view(1, -1)

            idx = torch.argmax(x, dim=-1)

            word = self.vocab.get_word(idx.item())
            decoded.append(word)
        return ' '.join(decoded)
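
A minimal usage sketch (hedged: the dimensions are hypothetical, and the default vocab file must exist on disk):

model = Encoder_Decoder(dim_emb=128, dim_hid=128)
model.eval()
with torch.no_grad():
    print(model.generate(max_len=17))  # seeds with a random vocabulary word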
Example 2
import collections

# `make_wakati` (a tokenizer wrapper) and `MyDataset` are project-local
# helpers.


def make_data_set_and_vocab(trainpath=None, vectorpath=None, threshold=0):
    vocab = Vocabulary()
    if vectorpath is not None:
        vocab.load(vectorpath)

    counter = collections.Counter()
    with open(trainpath, 'r') as f:
        for line in f:
            words = make_wakati(line.strip())
            for word in words:
                counter[word] += 1

    # for word, _ in counter.most_common(self.n_max_word - 2):
    for word, cnt in counter.most_common():
        if cnt <= threshold:
            break
        if word not in vocab:
            vocab.add_word(word)
    vocab.save('vocab')

    # Build the dataset from here
    data_set = MyDataset(trainpath=trainpath, vocab=vocab)

    return data_set, vocab
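
A hedged usage sketch (the training-file path is hypothetical, and MyDataset is assumed to be a torch.utils.data.Dataset):

import torch

data_set, vocab = make_data_set_and_vocab(trainpath='./data/train.txt',
                                          threshold=1)
loader = torch.utils.data.DataLoader(data_set, batch_size=32, shuffle=True)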
Example 4
def load_vocab(file):
    v = Vocabulary()
    v.load(file)
    return v
Example 5
import numpy as np
import tensorflow as tf  # TensorFlow 1.x graph-mode API

# `Vocabulary` and `LyricsPredictor` are project-local helpers.


def sample(train_settings, data_settings, input_settings, model_settings):
    print("Sampling from model...")
    # Load vocab
    vocab = Vocabulary.load(data_settings["input_vocab"])

    # INPUT PIPELINE
    input = tf.placeholder(tf.int32, shape=[None],
                           name="input")  # Integers representing characters
    # Create state placeholders - 2 for each lstm cell.
    state_placeholders = list()
    initial_states = list()
    for i in range(0, model_settings["num_layers"]):
        state_placeholders.append(
            tuple([
                tf.placeholder(tf.float32,
                               shape=[1, model_settings["lstm_size"]],
                               name="lstm_state_c_" +
                               str(i)),  # Batch size x State size
                tf.placeholder(tf.float32,
                               shape=[1, model_settings["lstm_size"]],
                               name="lstm_state_h_" + str(i))
            ]))  # Batch size x State size
        initial_states.append(
            tuple([
                np.zeros(shape=[1, model_settings["lstm_size"]],
                         dtype=np.float32),
                np.zeros(shape=[1, model_settings["lstm_size"]],
                         dtype=np.float32)
            ]))
    state_placeholders = tuple(state_placeholders)
    initial_states = tuple(initial_states)

    # MODEL
    # Copy the settings so the caller's model_settings is not mutated
    inference_settings = dict(model_settings)
    inference_settings["batch_size"] = 1  # Sample from one example at a time
    inference_settings["num_unroll"] = 1  # Sample one character at a time
    model = LyricsPredictor(inference_settings,
                            vocab.size + 1)  # Include EOS token
    probs, state = model.sample(input, state_placeholders)

    # LOOP
    # Start a prefetcher in the background, initialize variables
    sess = tf.Session()
    tf.train.start_queue_runners(sess=sess)
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # CHECKPOINTING
    # Load the pretrained model to sample from
    latestCheckpoint = tf.train.latest_checkpoint(
        train_settings["checkpoint_dir"])
    restorer = tf.train.Saver(tf.global_variables(),
                              write_version=tf.train.SaverDef.V2)
    restorer.restore(sess, latestCheckpoint)
    print('Pre-trained model restored')

    inference = [probs, state]

    current_seq = "never"
    current_seq_ind = vocab.char2index(current_seq)

    # Warm up RNN with initial sequence
    s = initial_states
    for ind in current_seq_ind:
        # Create feed dict for states
        feed = dict()
        for i in range(0, model_settings["num_layers"]):
            for c in range(0, len(s[i])):
                feed[state_placeholders[i][c]] = s[i][c]

        feed[input] = [ind]  # Add new input symbol to feed
        [p, s] = sess.run(inference, feed_dict=feed)

    # Sample until we receive an end-of-lyrics token
    iteration = 0
    while iteration < 100000:
        # Now p contains probability of upcoming char, as estimated by model, and s the last RNN state
        ind_sample = np.random.choice(range(0, vocab.size + 1),
                                      p=np.squeeze(p))
        if ind_sample == vocab.size:  # EOS token
            print("Model decided to stop generating!")
            break

        current_seq_ind.append(ind_sample)

        # Create feed dict for states
        feed = dict()
        for i in range(0, model_settings["num_layers"]):
            for c in range(0, len(s[i])):
                feed[state_placeholders[i][c]] = s[i][c]

        feed[input] = [ind_sample]  # Add new input symbol to feed
        [p, s] = sess.run(inference, feed_dict=feed)

        iteration += 1

    c_sample = vocab.index2char(current_seq_ind)
    print("".join(c_sample))

    sess.close()
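
The feed-dict construction above appears twice verbatim; a small helper could factor it out (a sketch under the same assumptions as sample() above):

def make_state_feed(state_placeholders, states, num_layers):
    # Map each layer's (c, h) state arrays onto their placeholders.
    feed = dict()
    for i in range(num_layers):
        for c in range(len(states[i])):
            feed[state_placeholders[i][c]] = states[i][c]
    return feed

Both loops would then reduce to feed = make_state_feed(state_placeholders, s, model_settings["num_layers"]) followed by setting feed[input].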
Example 6
import random
from bisect import bisect
from itertools import accumulate

import numpy as np
import torch

# `Vocabulary` is a project-local class; `device` is assumed to be a
# torch.device defined at module level.


class LM(torch.nn.Module):
    def __init__(self,
                 dim_emb,
                 dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        super().__init__()

        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.embed = torch.nn.Embedding(len(self.vocab), dim_emb)
        self.rnn1 = torch.nn.LSTM(dim_emb, dim_hid, batch_first=True)
        self.rnn2 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        #         self.rnn3 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        #         self.rnn4 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        self.out = torch.nn.Linear(dim_hid, len(self.vocab))

    def forward(self, x, state1=None, state2=None):
        out = self.embed(x)
        out, state1 = self.rnn1(out, state1)
        out, state2 = self.rnn2(out, state2)
        #         out, (h, c) = self.rnn3(out, None)
        #         out, (h, c) = self.rnn4(out, None)
        out = self.out(out)
        return out, state1, state2

    # def to_int(self, a):
    #     if a == -float('inf'):
    #         return 0
    #     else:
    #         return int(1e9*a)

    def generate(self, prefix, max_len=30):
        cost = 0
        softmax = torch.nn.Softmax(dim=-1)
        start = '<bos>'

        idx = self.embed.weight.new_full((1, 1),
                                         self.vocab.get_index(start),
                                         dtype=torch.long)
        decoded = [start]
        state1, state2 = None, None
        unk = self.vocab.get_index('<unk>')
        while decoded[-1] != '<eos>' and len(decoded) < max_len:
            x, state1, state2 = self.forward(idx, state1, state2)

            if len(prefix) > 0:
                # pop() consumes the prefix from the end, so the caller is
                # expected to pass the prefix words in reverse order.
                word = prefix.pop()
                idx = self.vocab.get_index(word)
                idx = torch.tensor(idx).view(1, 1).to(device)
            else:
                x[:, :, unk] = -float('inf')
                x = softmax(x)
                # idx = torch.argmax(x, dim=-1)
                x = x.squeeze().to('cpu').detach().numpy()
                accum = list(accumulate(x))
                idx = bisect(accum, random.random() * accum[-1])
                # word = self.vocab.get_word(idx.item())
                cost += np.log2(x[idx])
                word = self.vocab.get_word(idx)
                idx = torch.tensor(idx).view(1, 1).to(device)

            decoded.append(word)
        cost /= len(decoded)  # average log2-probability per token
        return ' '.join(decoded), cost
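
A minimal usage sketch (hypothetical dimensions; the prefix is reversed because generate() pops words from the end of the list):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lm = LM(dim_emb=256, dim_hid=512).to(device)
lm.eval()
with torch.no_grad():
    text, cost = lm.generate(prefix=list(reversed(['hello', 'world'])))
print(text)
print(cost)  # average log2-probability of the sampled tokens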