def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper from COCO caption annotations.

    Args:
        json: path to the COCO annotation file (currently unused — see NOTE).
        threshold: minimum word frequency; rarer words are discarded.

    Returns:
        A ``Vocabulary`` containing the special tokens ``<pad>``, ``<start>``,
        ``<end>``, ``<unk>`` followed by every caption word whose count is at
        least ``threshold``.
    """
    # NOTE(review): `coco` and `ids` are referenced but never defined in this
    # function, and the `json` parameter is never used. Presumably something
    # like `coco = COCO(json); ids = coco.anns.keys()` was intended — confirm
    # against the rest of the project before relying on this function.
    counter = Counter()
    for i, ann_id in enumerate(ids):
        caption = str(coco.anns[ann_id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)
        # Periodic progress report for large annotation sets.
        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
def build_vocabulary(self, threshold):
    """Build a Vocabulary from image descriptions and story captions.

    Side effects: widens ``self.input_maxlen`` / ``self.output_maxlen`` to
    fit the longest tokenized sequence plus two slots (room for the
    ``<start>``/``<end>`` tokens), and pickles the result to
    ``./Dataset/vocabulary.pkl``.

    Args:
        threshold: minimum word frequency; rarer words are discarded.

    Returns:
        The populated ``Vocabulary``.
    """
    counter = Counter()
    # Hoisted out of the loops: the tokenizer is loop-invariant.
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    # Count words in the per-image descriptions, tracking the longest one.
    for image_id in self.image_desc:
        caption = self.image_desc[image_id]
        tokens = tokenizer.tokenize(caption.lower())
        if len(tokens) > self.input_maxlen - 2:
            self.input_maxlen = len(tokens) + 2
        counter.update(tokens)

    # Count words in story sequences; each story contributes an input side
    # (image descriptions referenced by seq[1]) and an output side (seq[2]).
    for story_id in self.story_data:
        temp_in = 0
        temp_out = 0
        for seq in self.story_data[story_id]:
            caption = seq[2]
            tokens = tokenizer.tokenize(caption.lower())
            counter.update(tokens)
            temp_out = temp_out + len(tokens)
            caption_in = self.image_desc[seq[1]]
            tokens = tokenizer.tokenize(caption_in.lower())
            temp_in = temp_in + len(tokens)
            counter.update(tokens)
        if temp_out > self.output_maxlen - 2:
            self.output_maxlen = temp_out + 2
        if temp_in > self.input_maxlen - 2:
            # BUG FIX: the original assigned `temp_out + 2` here, clobbering
            # the input length bound with the *output* length.
            self.input_maxlen = temp_in + 2

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocabulary.add_word(word)

    # Persist so later runs can load the vocabulary instead of rebuilding it.
    with open("./Dataset/vocabulary.pkl", "wb") as f:
        pickle.dump(vocabulary, f)
    return vocabulary
def make_data_set_and_vocab(trainpath=None, vectorpath=None, threshhold=0):
    """Create a MyDataset and its Vocabulary from a training corpus.

    Args:
        trainpath: path to the training text, one example per line.
        vectorpath: optional path of a pre-built vocabulary to load first.
        threshhold: words whose count is <= this value are excluded.

    Returns:
        A ``(data_set, vocab)`` tuple; the vocabulary is also saved as 'vocab'.
    """
    vocab = Vocabulary()
    if vectorpath is not None:
        vocab.load(vectorpath)

    # Tally word frequencies over the whole corpus.
    counter = collections.Counter()
    with open(trainpath, 'r') as f:
        for line in f:
            counter.update(make_wakati(line.strip()))

    # most_common() yields counts in descending order, so once a count drops
    # to the threshold every remaining word is also too rare — stop there.
    for word, freq in counter.most_common():
        if freq <= threshhold:
            break
        if word not in vocab:
            vocab.add_word(word)
    vocab.save('vocab')

    # Build the dataset on top of the finished vocabulary.
    data_set = MyDataset(trainpath=trainpath, vocab=vocab)
    return data_set, vocab
def build_vocabulary(self, threshold):
    """Build a Vocabulary from the image descriptions.

    Side effect: widens ``self.max_length`` to fit the longest tokenized
    caption plus two slots (room for the ``<start>``/``<end>`` tokens).

    Args:
        threshold: minimum word frequency; rarer words are discarded.

    Returns:
        The populated ``Vocabulary``.
    """
    counter = Counter()
    for image_id in self.image_desc:
        caption = self.image_desc[image_id]
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocabulary.add_word(word)
    return vocabulary
def build_vocabulary(self, threshold):
    """Build a Vocabulary from image descriptions plus COCO annotations.

    Side effects: widens ``self.max_length`` to fit the longest tokenized
    caption plus two slots (room for the ``<start>``/``<end>`` tokens), and
    pickles the result to ``./Dataset/vocabulary.pkl``.

    Args:
        threshold: minimum word frequency; rarer words are discarded.

    Returns:
        The populated ``Vocabulary``.

    TODO(review): a disabled code path loaded a cached vocabulary from
    ./Dataset/vocabulary.pkl instead of rebuilding; re-enable it if the
    rebuild cost matters.
    """
    counter = Counter()

    # Count words in the per-image descriptions, tracking the longest one.
    for image_id in self.image_desc:
        caption = self.image_desc[image_id]
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    # Also count words from the COCO caption annotations.
    for annot in self.coco_desc:
        caption = annot['caption']
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        if len(tokens) > self.max_length - 2:
            self.max_length = len(tokens) + 2
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocabulary = Vocabulary()
    vocabulary.add_word('<pad>')
    vocabulary.add_word('<start>')
    vocabulary.add_word('<end>')
    vocabulary.add_word('<unk>')

    # Add the frequent words to the vocabulary.
    for word in words:
        vocabulary.add_word(word)

    # Persist so later runs can load the vocabulary instead of rebuilding it.
    with open("./Dataset/vocabulary.pkl", "wb") as f:
        pickle.dump(vocabulary, f)
    return vocabulary