Example 1
import nltk
from collections import Counter
from pycocotools.coco import COCO


def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    # Load the COCO annotation file given by `json`.
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
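Every example in this listing assumes a Vocabulary wrapper class that is not shown. A minimal sketch of such a wrapper, assuming the usual word2idx / idx2word dictionary implementation (the exact class in each project may differ):

class Vocabulary(object):
    """Simple vocabulary wrapper mapping words to integer ids and back."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # Words already in the vocabulary are ignored.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Unknown words map to the '<unk>' token added in build_vocab.
        return self.word2idx.get(word, self.word2idx['<unk>'])

    def __len__(self):
        return len(self.word2idx)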
Example 2
    def build_vocabulary(self, threshold):
        vocabulary = Vocabulary()
        counter = Counter()
        for id in self.image_desc:
            caption = self.image_desc[id]
            tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(
                caption.lower())
            if len(tokens) > self.input_maxlen - 2:
                self.input_maxlen = len(tokens) + 2
            counter.update(tokens)

        for id in self.story_data:
            # Running token totals for this story's input (image descriptions)
            # and output (story captions), used to size the maxlen fields below.
            temp_in = 0
            temp_out = 0
            for seq in self.story_data[id]:
                caption = seq[2]
                tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(
                    caption.lower())
                counter.update(tokens)
                temp_out = temp_out + len(tokens)
                caption_in = self.image_desc[seq[1]]
                tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(
                    caption_in.lower())
                temp_in = temp_in + len(tokens)
                counter.update(tokens)
            if temp_out > self.output_maxlen - 2:
                self.output_maxlen = temp_out + 2
            if temp_in > self.input_maxlen - 2:
                self.input_maxlen = temp_in + 2

        words = [word for word, cnt in counter.items() if cnt >= threshold]

        # Create a vocab wrapper and add some special tokens.
        vocabulary = Vocabulary()
        vocabulary.add_word('<pad>')
        vocabulary.add_word('<start>')
        vocabulary.add_word('<end>')
        vocabulary.add_word('<unk>')

        # Add the words to the vocabulary.
        for word in words:
            vocabulary.add_word(word)

        with open("./Dataset/vocabulary.pkl", "wb") as f:
            pickle.dump(vocabulary, f)
        return vocabulary
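The +2 bookkeeping on input_maxlen and output_maxlen above reserves room for the '<start>' and '<end>' tokens. A minimal usage sketch, assuming the vocabulary is callable as vocabulary(word) -> id as in the wrapper sketched after Example 1 (encode_caption is a hypothetical helper, not part of the original class):

def encode_caption(vocabulary, tokens, maxlen):
    # Wrap the tokens with '<start>' and '<end>', then right-pad to maxlen.
    ids = [vocabulary('<start>')]
    ids += [vocabulary(token) for token in tokens]
    ids.append(vocabulary('<end>'))
    ids += [vocabulary('<pad>')] * (maxlen - len(ids))
    return ids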
Example 3
import collections


def make_data_set_and_vocab(trainpath=None, vectorpath=None, threshold=0):
    vocab = Vocabulary()
    if vectorpath is not None:
        vocab.load(vectorpath)

    counter = collections.Counter()
    with open(trainpath, 'r') as f:
        for line in f:
            words = make_wakati(line.strip())
            for word in words:
                counter[word] += 1

    # for word, _ in counter.most_common(self.n_max_word - 2):
    for word, cnt in counter.most_common():
        if cnt <= threshold:
            break
        if word not in vocab:
            vocab.add_word(word)
    vocab.save('vocab')

    # Build the dataset from here on.
    data_set = MyDataset(trainpath=trainpath, vocab=vocab)

    return data_set, vocab
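make_wakati is not defined in this example; the name points to Japanese word segmentation (wakati-gaki). A minimal sketch assuming MeCab is the tokenizer (the original project may segment text differently):

import MeCab

_tagger = MeCab.Tagger("-Owakati")

def make_wakati(text):
    # Segment a Japanese sentence into space-separated tokens with MeCab.
    return _tagger.parse(text).strip().split()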
Example 4
    def build_vocabulary(self, threshold):
        vocabulary = Vocabulary()
        counter = Counter()
        for id in self.image_desc:
            caption = self.image_desc[id]
            tokens = nltk.tokenize.word_tokenize(caption.lower())
            if len(tokens) > self.max_length - 2:
                self.max_length = len(tokens) + 2
            counter.update(tokens)
        words = [word for word, cnt in counter.items() if cnt >= threshold]

        # Create a vocab wrapper and add some special tokens.
        vocabulary = Vocabulary()
        vocabulary.add_word('<pad>')
        vocabulary.add_word('<start>')
        vocabulary.add_word('<end>')
        vocabulary.add_word('<unk>')

        # Add the words to the vocabulary.
        for word in words:
            vocabulary.add_word(word)
        return vocabulary
Example 5
    def build_vocabulary(self, threshold):
        '''if os.path.exists("./Dataset/vocabulary.pkl"):
            f = open("./Dataset/vocabulary.pkl","rb")
            vocabulary = pickle.load(f)
            return vocabulary'''
        vocabulary = Vocabulary()
        counter = Counter()
        for id in self.image_desc:
            caption = self.image_desc[id]
            tokens = nltk.tokenize.word_tokenize(caption.lower())
            if len(tokens) > self.max_length - 2:
                self.max_length = len(tokens) + 2
            counter.update(tokens)

        for annot in self.coco_desc:
            caption = annot['caption']
            tokens = nltk.tokenize.word_tokenize(caption.lower())
            if len(tokens) > self.max_length - 2:
                self.max_length = len(tokens) + 2
            counter.update(tokens)

        words = [word for word, cnt in counter.items() if cnt >= threshold]

        # Create a vocab wrapper and add some special tokens.
        vocabulary = Vocabulary()
        vocabulary.add_word('<pad>')
        vocabulary.add_word('<start>')
        vocabulary.add_word('<end>')
        vocabulary.add_word('<unk>')

        # Add the words to the vocabulary.
        for word in words:
            vocabulary.add_word(word)

        with open("./Dataset/vocabulary.pkl", "wb") as f:
            pickle.dump(vocabulary, f)

        return vocabulary
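Examples 2 and 5 pickle the finished vocabulary to ./Dataset/vocabulary.pkl, and the commented-out block at the top of Example 5 hints at loading it back as a cache. A short sketch of that read path:

import pickle

# Load the previously built vocabulary instead of rebuilding it.
with open("./Dataset/vocabulary.pkl", "rb") as f:
    vocabulary = pickle.load(f)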