import random

import torch
import torch.nn as nn


class Encoder_Decoder(nn.Module):
    def __init__(self, dim_emb, dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        super(Encoder_Decoder, self).__init__()
        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.dim_hid = dim_hid
        self.word_embeddings = nn.Embedding(len(self.vocab), dim_emb)
        # self.gru = nn.GRU(dim_emb, dim_hid, batch_first=True)
        self.en_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        self.de_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        # Fully connected layer mapping the dim_hid-dimensional LSTM hidden
        # state to vocabulary-size logits.
        self.hidden2linear = nn.Linear(dim_hid, len(self.vocab))

    def forward(self, sequence, state=None):
        embedding = self.word_embeddings(sequence)
        hs, (h, c) = self.en_lstm(embedding, state)
        output, (h, c) = self.de_lstm(embedding, (h, c))
        # Compute attention (disabled):
        # t_output = torch.transpose(output, 1, 2)
        # s = torch.bmm(hs, t_output)
        # attention_weight = self.softmax(s)
        output = self.hidden2linear(output)
        return output, (h, c)

    def generate(self, start=None, max_len=17):
        if start is None:
            start = random.choice(self.vocab.index2word)
        # The embedding layer is named word_embeddings in __init__ (the
        # original referenced a non-existent self.embed here).
        idx = self.word_embeddings.weight.new_full(
            (1, 1), self.vocab.get_index(start), dtype=torch.long)
        decoded = [start]
        state = None
        unk = self.vocab.get_index('<unk>')
        while decoded[-1] != '<eos>' and len(decoded) < max_len:
            x, state = self.forward(idx, state)
            # Mask out <unk> so it is never generated.
            x[:, :, unk] = -float('inf')
            # prob = list(map(self.to_int, x.squeeze().tolist()))
            # idx = torch.tensor(random.choices(
            #     list(range(len(prob))), weights=prob, k=1)).view(1, -1)
            idx = torch.argmax(x, dim=-1)
            word = self.vocab.get_word(idx.item())
            decoded.append(word)
        return ' '.join(decoded)
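# Usage sketch (not from the original source): a minimal teacher-forced
# training step for Encoder_Decoder. The optimizer, the loss function, and
# the padding index 0 passed to ignore_index are assumptions.
def train_step_example(model, optimizer, criterion, src, tgt):
    """Run one training step on (batch, seq_len) index tensors src/tgt."""
    optimizer.zero_grad()
    logits, _ = model(src)  # logits: (batch, seq_len, vocab_size)
    loss = criterion(logits.reshape(-1, logits.size(-1)), tgt.reshape(-1))
    loss.backward()
    optimizer.step()
    return loss.item()

# model = Encoder_Decoder(dim_emb=256, dim_hid=128)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# criterion = nn.CrossEntropyLoss(ignore_index=0)  # assumes 0 is the pad index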
import collections


def make_data_set_and_vocab(trainpath=None, vectorpath=None, threshold=0):
    vocab = Vocabulary()
    if vectorpath is not None:
        vocab.load(vectorpath)
    # Count word frequencies over the tokenized training corpus.
    counter = collections.Counter()
    with open(trainpath, 'r') as f:
        for line in f:
            words = make_wakati(line.strip())
            for word in words:
                counter[word] += 1
    # for word, _ in counter.most_common(self.n_max_word - 2):
    # most_common() is sorted by descending count, so we can stop at the
    # first word whose count falls at or below the threshold.
    for word, cnt in counter.most_common():
        if cnt <= threshold:
            break
        if word not in vocab:
            vocab.add_word(word)
    vocab.save('vocab')
    # Build the dataset from here.
    data_set = MyDataset(trainpath=trainpath, vocab=vocab)
    return data_set, vocab
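# Minimal, self-contained sketch of the frequency-threshold logic used above:
# words are kept while their count exceeds `threshold`, which relies on
# Counter.most_common() returning entries in descending count order.
def frequent_words(tokens, threshold=0):
    counter = collections.Counter(tokens)
    kept = []
    for word, cnt in counter.most_common():
        if cnt <= threshold:
            break  # counts only decrease from here, so stop early
        kept.append(word)
    return kept

# frequent_words('a a a b b c'.split(), threshold=1) -> ['a', 'b']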
def load_vocab(file):
    v = Vocabulary()
    v.load(file)
    return v
import numpy as np
import tensorflow as tf


def sample(train_settings, data_settings, input_settings, model_settings):
    print("Sampling from model...")
    # Load vocab
    vocab = Vocabulary.load(data_settings["input_vocab"])

    # INPUT PIPELINE
    input = tf.placeholder(tf.int32, shape=[None],
                           name="input")  # Integers representing characters

    # Create state placeholders - 2 for each LSTM cell (cell state c, hidden state h).
    state_placeholders = list()
    initial_states = list()
    for i in range(0, model_settings["num_layers"]):
        state_placeholders.append(tuple([
            tf.placeholder(tf.float32,
                           shape=[1, model_settings["lstm_size"]],
                           name="lstm_state_c_" + str(i)),  # Batch size x State size
            tf.placeholder(tf.float32,
                           shape=[1, model_settings["lstm_size"]],
                           name="lstm_state_h_" + str(i))   # Batch size x State size
        ]))
        initial_states.append(tuple([
            np.zeros(shape=[1, model_settings["lstm_size"]], dtype=np.float32),
            np.zeros(shape=[1, model_settings["lstm_size"]], dtype=np.float32)
        ]))
    state_placeholders = tuple(state_placeholders)
    initial_states = tuple(initial_states)

    # MODEL
    inference_settings = model_settings
    inference_settings["batch_size"] = 1  # Only sample from one example at a time
    inference_settings["num_unroll"] = 1  # Only sample one character at a time
    model = LyricsPredictor(inference_settings, vocab.size + 1)  # Include EOS token
    probs, state = model.sample(input, state_placeholders)

    # LOOP
    # Start a prefetcher in the background, initialize variables
    sess = tf.Session()
    tf.train.start_queue_runners(sess=sess)
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # CHECKPOINTING
    # Load pretrained model to sample from
    latestCheckpoint = tf.train.latest_checkpoint(train_settings["checkpoint_dir"])
    restorer = tf.train.Saver(tf.global_variables(),
                              write_version=tf.train.SaverDef.V2)
    restorer.restore(sess, latestCheckpoint)
    print('Pre-trained model restored')

    inference = [probs, state]
    current_seq = "never"
    current_seq_ind = vocab.char2index(current_seq)

    # Warm up RNN with the initial sequence
    s = initial_states
    for ind in current_seq_ind:
        # Create feed dict for states
        feed = dict()
        for i in range(0, model_settings["num_layers"]):
            for c in range(0, len(s[i])):
                feed[state_placeholders[i][c]] = s[i][c]
        feed[input] = [ind]  # Add new input symbol to feed
        [p, s] = sess.run(inference, feed_dict=feed)

    # Sample until we receive an end-of-lyrics token
    iteration = 0
    while iteration < 100000:
        # p contains the probability of the upcoming char, as estimated by
        # the model, and s the last RNN state.
        ind_sample = np.random.choice(range(0, vocab.size + 1), p=np.squeeze(p))
        if ind_sample == vocab.size:  # EOS token
            print("Model decided to stop generating!")
            break
        current_seq_ind.append(ind_sample)

        # Create feed dict for states
        feed = dict()
        for i in range(0, model_settings["num_layers"]):
            for c in range(0, len(s[i])):
                feed[state_placeholders[i][c]] = s[i][c]
        feed[input] = [ind_sample]  # Add new input symbol to feed
        [p, s] = sess.run(inference, feed_dict=feed)
        iteration += 1

    c_sample = vocab.index2char(current_seq_ind)
    print("".join(c_sample))
    sess.close()
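# Refactoring sketch (an assumption, not in the original): the feed-dict
# construction in sample() appears twice, once for warm-up and once in the
# sampling loop; a helper like this removes the duplication. Names mirror
# the local variables of sample().
def make_feed(input_ph, state_placeholders, state, symbol, num_layers):
    """Map the current LSTM state tuples and the next input symbol
    to a feed dict for sess.run()."""
    feed = {input_ph: [symbol]}
    for i in range(num_layers):
        for c in range(len(state[i])):  # c == 0: cell state, c == 1: hidden state
            feed[state_placeholders[i][c]] = state[i][c]
    return feed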
import random
from bisect import bisect
from itertools import accumulate

import numpy as np
import torch

# Assumes a module-level `device`, e.g. torch.device('cuda') or 'cpu'.


class LM(torch.nn.Module):
    def __init__(self, dim_emb, dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        super().__init__()
        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.embed = torch.nn.Embedding(len(self.vocab), dim_emb)
        self.rnn1 = torch.nn.LSTM(dim_emb, dim_hid, batch_first=True)
        self.rnn2 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        # self.rnn3 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        # self.rnn4 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        self.out = torch.nn.Linear(dim_hid, len(self.vocab))

    def forward(self, x, state1=None, state2=None):
        out = self.embed(x)
        out, state1 = self.rnn1(out, state1)
        out, state2 = self.rnn2(out, state2)
        # out, (h, c) = self.rnn3(out, None)
        # out, (h, c) = self.rnn4(out, None)
        out = self.out(out)
        return out, state1, state2

    # def to_int(self, a):
    #     if a == -float('inf'):
    #         return 0
    #     else:
    #         return int(1e9 * a)

    def generate(self, prefix, max_len=30):
        cost = 0
        softmax = torch.nn.Softmax(dim=-1)
        start = '<bos>'
        idx = self.embed.weight.new_full(
            (1, 1), self.vocab.get_index(start), dtype=torch.long)
        decoded = [start]
        state1, state2 = None, None
        unk = self.vocab.get_index('<unk>')
        while decoded[-1] != '<eos>' and len(decoded) < max_len:
            x, state1, state2 = self.forward(idx, state1, state2)
            if 0 < len(prefix):
                # Force the next word from the prefix. Note that pop() takes
                # from the end of the list, so callers must pass the prefix
                # in reverse order.
                word = prefix.pop()
                idx = self.vocab.get_index(word)
                idx = torch.tensor(idx).view(1, 1).to(device)
            else:
                # Mask out <unk>, then sample from the softmax distribution.
                x[:, :, unk] = -float('inf')
                x = softmax(x)
                # idx = torch.argmax(x, dim=-1)
                x = x.squeeze().to('cpu').detach().numpy()
                # Inverse-transform sampling over the cumulative distribution.
                accum = list(accumulate(x))
                idx = bisect(accum, random.random() * accum[-1])
                # word = self.vocab.get_word(idx.item())
                cost += np.log2(x[idx])
                word = self.vocab.get_word(idx)
                idx = torch.tensor(idx).view(1, 1).to(device)
            decoded.append(word)
        # Average log2-probability per token (more negative = less likely
        # under the model).
        cost /= len(decoded)
        return ' '.join(decoded), cost
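# Usage sketch (an assumption, not from the source): sampling one line from a
# trained LM. The checkpoint path and dimensions are hypothetical. Because
# generate() pops forced words from the END of `prefix`, the prefix below is
# passed reversed, so the forced words after <bos> are "hello friend".
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = LM(dim_emb=256, dim_hid=512).to(device)
# model.load_state_dict(torch.load('./models/lm.pt', map_location=device))
# model.eval()
# with torch.no_grad():
#     text, avg_log2_prob = model.generate(prefix=['friend', 'hello'])
# print(text, avg_log2_prob)  # avg_log2_prob: mean log2-probability per token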