Code Example #1
    def from_dataframe(cls, review_df, cutoff=25):
        """
        Instantiate the vectorizer from the dataset dataframe.

        Args:
            review_df (pandas.DataFrame): the review dataset
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # Add ratings
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        # Add top words if count > provided count
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)
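
A minimal usage sketch of the classmethod above. It assumes ReviewVectorizer and its Vocabulary dependency are importable, that from_dataframe is registered as a classmethod in the full class definition, and uses hypothetical toy data with the review and rating columns the method reads.

import pandas as pd

# Toy dataframe with the two columns the classmethod iterates over.
review_df = pd.DataFrame({
    "review": ["great food and friendly staff", "terrible slow service"],
    "rating": ["positive", "negative"],
})

# cutoff=0 keeps every non-punctuation token that appears at least once.
vectorizer = ReviewVectorizer.from_dataframe(review_df, cutoff=0)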
Code Example #2
def build_vocabs():
    train, dev, test = load_boknilev()
    samples = [
        s for r in train + dev + test
        for s in boknilev_record_to_hcpd_samples(r)
    ]

    gold_pos_vocab = Vocabulary('GOLD_POS')
    gold_pos_vocab.add_words(
        set([hc.next_pos for s in samples for hc in s.x.head_cands]))
    gold_pos_vocab.add_word(None)

    words_vocab = Vocabulary('WORDS')
    words_vocab.add_words(
        set([hc.word for s in samples for hc in s.x.head_cands]))
    words_vocab.add_words(set([s.x.pp.word for s in samples]))
    words_vocab.add_words(set([s.x.child.word for s in samples]))
    words_vocab.add_word(None)

    words_to_lemmas = {}
    words_to_lemmas.update({s.x.child.word: s.x.child.lemma for s in samples})
    words_to_lemmas.update(
        {hc.word: hc.lemma
         for s in samples for hc in s.x.head_cands})

    return [gold_pos_vocab, words_vocab, words_to_lemmas]
Code Example #3
 def __init__(self, mode=None):
     super(RegexModel, self).__init__()
     self.mode = mode
     if mode is None or mode == "replacements":
         self.replacements = self._load_replacements()
     if mode is None or mode == "vocabulary":
         self.wikipedia_voc = Vocabulary("wikipedia_lower")
         self.ingredients_voc = Vocabulary(
             "ingredients_fr_tokens") | Vocabulary("ingredients_fr")
Code Example #4
 def __init__(self, cfg):
     """
     :param  cfg:  config
     """
     vocab_in_path = f'{cfg.rsc_src}/vocab.in'
     self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, SPECIAL_CHARS)
     vocab_out_path = f'{cfg.rsc_src}/vocab.out'
     self.vocab_out = Vocabulary(vocab_out_path, 0, None)
     restore_dic_path = f'{cfg.rsc_src}/restore.dic'
     self.restore_dic = self._load_restore_dic(restore_dic_path)
Code Example #5
    def __init__(self,
                 data_file,
                 character_level=None,
                 phoneme_level=None,
                 vocabulary=None,
                 transform=None):
        self.data_file = data_file
        self.data = joblib.load(open(self.data_file, 'rb'))
        self.character_level = character_level
        self.phoneme_level = phoneme_level
        self.transcription_processor = lambda words: words

        if self.character_level:
            characters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
            characters += [' ']
            character_vocab = Vocabulary()
            for character in characters:
                character_vocab.add_word(character)
            self.vocabulary = character_vocab
            self.transcription_processor = self._character_level_transcription_processor
        elif self.phoneme_level:
            cmu_phones = list(map(lambda x: x[0], cmudict.phones()))
            cmu_phones += [' ']
            phones_vocab = Vocabulary(custom_unk_word=' ')
            for phone in cmu_phones:
                phones_vocab.add_word(phone)
            self.vocabulary = phones_vocab
            self.phones_dict = cmudict.dict()
            self.transcription_processor = self._phone_level_transcription_processor
        elif vocabulary is None:
            data_file_dir = os.path.dirname(self.data_file)
            data_file_prefix = os.path.splitext(self.data_file)[0]
            pickle_file_name = f'{data_file_prefix}_SpeechDataset.pickle'
            pickle_file_path = os.path.join(data_file_dir, pickle_file_name)
            if not os.path.isfile(pickle_file_path):
                dataset_info = self.build_vocabulary_from_dataset(self.data)
                pickle.dump(dataset_info, open(pickle_file_path, 'wb'))
            else:
                dataset_info = pickle.load(open(pickle_file_path, 'rb'))
            self.vocabulary = dataset_info['vocabulary']
        else:
            self.vocabulary = vocabulary
        self.transform = transform
        self.max_transcription_length = max([
            len(transcription)
            for transcription in self.data['transcription_tokens']
        ])
        self.max_input_length = max([
            spectrogram.shape[1]
            for spectrogram in self.data['audio_spectrograms']
        ])
Code Example #6
def read_instances_from_file(files, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''

    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)
    sets = []

    for file in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            for l in lines:
                l = l.strip().split('\t')
                if len(l) < 2:
                    continue
                label = l[0]
                sent = l[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)

        assert len(sents) == len(labels)

        sets.append({'sents': sents, 'labels': labels})

        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.info(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Class: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))

    return sets, vocab, lb_vocab
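
A small usage sketch for the reader above, assuming the snippet and its logger/Vocabulary dependencies are in scope: each input file is tab-separated with the label in the first column and the sentence in the second (the file name and contents below are hypothetical).

# Write a tiny hypothetical TSV: "<label>\t<sentence>" per line.
with open("train.tsv", "w") as f:
    f.write("pos\tA charming , well-acted little film .\n")
    f.write("neg\tFlat and lifeless from start to finish .\n")

sets, vocab, lb_vocab = read_instances_from_file(["train.tsv"], max_len=400, keep_case=False)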
Code Example #7
 def __init__(self, coco_ann_file, train, vocabThreshold=None, transforms = None):
     super().__init__()
     self.train = train
     self.ann_file = coco_ann_file
     self.tokenizer = RegexpTokenizer(r'\w+')
     self.transform = transforms
     if train:
         self.caption_len, self._coco = getcaption_len(self.ann_file, self.tokenizer, train=True)
         self.vocab = Vocabulary(True,self._coco, vocabThreshold)
         self.ids   = list(self._coco.anns.keys())
     else:
         self._coco = COCO(coco_ann_file)
         self.ids  = list(self._coco.anns.keys())
         self.vocab = Vocabulary(train=False)
Code Example #8
def create_index(args):
    reader = DocumentStreamReader(args[2:])
    if args[1] == 'varbyte':
        vocabulary = Vocabulary(Simple9)
    elif args[1] == 'simple9':
        vocabulary = Vocabulary(Simple9)
    else:
        raise AssertionError('Expected varbyte|simple9 as a compressor')

    for doc in reader:
        for word in extract_words(doc.text):
            vocabulary.append(word, doc.url)

    dump(args[0], vocabulary)
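
A hypothetical invocation, inferred from how the positional arguments are indexed above: args[0] is the output index path, args[1] selects the compressor, and args[2:] are the document files. Note that both branches in the original construct Vocabulary(Simple9); the file names below are placeholders.

create_index(["index.bin", "simple9", "docs-part-01.gz", "docs-part-02.gz"])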
Code Example #9
File: __init__.py Project: bityangke/ikelos
def to_vocab(data, frequency_cutoff=None, size_cutoff=None):
    if not utils.xor(frequency_cutoff, size_cutoff):
        raise Exception("one or the other cutoffs please")

    counter = Counter(word for sent in data for word in sent)

    if frequency_cutoff is not None:
        print("Using a frequency of {} to reduce vocabulary size.".format(
            frequency_cutoff))
        words = [
            word for word, count in counter.most_common()
            if count > frequency_cutoff
        ]
        print("Vocabulary size reduced. {} -> {}".format(
            len(counter), len(words)))

    elif size_cutoff is not None:
        print("Using a cutoff of {} to reduce vocabulary size.".format(
            size_cutoff))
        words = [word for word, count in counter.most_common(size_cutoff)]
        print("Vocabulary size reduced. {} -> {}".format(
            len(counter), len(words)))

    else:
        raise Exception("should never happen...")

    vocab = Vocabulary(use_mask=True)
    vocab.add_many(['<START>', "<END>"])
    vocab.add_many(words)
    return vocab
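
A usage sketch with a hypothetical toy corpus: exactly one of the two cutoffs may be passed, since the function raises when both or neither are given.

data = [["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "cat", "ran"]]

# Keep words seen more than once ("the", "cat", "sat").
vocab_by_freq = to_vocab(data, frequency_cutoff=1)

# Keep only the two most common words.
vocab_by_size = to_vocab(data, size_cutoff=2)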
Code Example #10
File: data_loader.py Project: koren-v/Image2Text
    def __init__(self, transform, mode, batch_size, vocab_threshold,
                 vocab_file, glove_file, start_word, end_word, unk_word,
                 annotations_file, vocab_from_file, img_folder):
        self.transform = transform
        self.mode = mode
        self.batch_size = batch_size
        self.vocab = Vocabulary(vocab_threshold,
                                vocab_file,
                                glove_file,
                                start_word,
                                end_word,
                                unk_word,
                                annotations_file,
                                vocab_from_file,
                                dataset='coco')
        self.img_folder = img_folder
        self.sel_length = None

        if self.mode == 'train' or self.mode == 'val':
            self.coco = COCO(annotations_file)
            self.ids = list(self.coco.anns.keys())
            print('Obtaining caption lengths...')
            all_tokens = [
                nltk.tokenize.word_tokenize(
                    str(self.coco.anns[self.ids[index]]['caption']).lower())
                for index in tqdm(np.arange(len(self.ids)))
            ]
            self.caption_lengths = [len(token) for token in all_tokens]
Code Example #11
 def __init__(self, transform, mode, batch_size, vocab_threshold,
              vocab_file, start_word, end_word, unk_word, annotations_file,
              vocab_from_file, img_folder):
     self.transform = transform
     self.mode = mode
     self.batch_size = batch_size
     self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word,
                             end_word, unk_word, annotations_file,
                             vocab_from_file)
     self.img_folder = img_folder
     if self.mode == 'train':
         self.coco = COCO(annotations_file)
         self.ids = list(self.coco.anns.keys())
         print('Obtaining caption lengths...')
         all_tokens = [
             nltk.tokenize.word_tokenize(
                 str(self.coco.anns[self.ids[index]]['caption']).lower())
             for index in tqdm(np.arange(len(self.ids)))
         ]
         self.caption_lengths = [len(token) for token in all_tokens]
     else:
         self.coco = COCO(annotations_file)
         self.ids = list(self.coco.anns.keys())
         print('Obtaining caption lengths...')
         all_tokens = [
             nltk.tokenize.word_tokenize(
                 str(self.coco.anns[self.ids[index]]['caption']).lower())
             for index in tqdm(np.arange(len(self.ids)))
         ]
         self.caption_lengths = [len(token) for token in all_tokens]
         test_info = json.loads(open(annotations_file).read())
         self.paths = [item['file_name'] for item in test_info['images']]
Code Example #12
def main():
    os.makedirs(args.wiki_preprocess, exist_ok=True)

    vocab = Vocabulary(os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))

    paths = list(glob.glob(os.path.join(args.wiki_dump, '*.xml-*')))
    paths = sorted(paths,
                   key=lambda p: int(os.path.basename(p).split('-')[4][11:-4]))
    params = [(path, vocab) for path in paths]

    inlinks = dict()
    total_pages = 0
    with mp.Pool(processes=args.cpu) as pool, \
            tqdm(total=len(paths), dynamic_ncols=True) as pbar:
        for i, res in enumerate(pool.imap_unordered(process_stream, params)):
            part_inlinks, page_counter = res
            # update
            for e, links in part_inlinks.items():
                if e not in inlinks:
                    inlinks[e] = set()
                inlinks[e].update(links)
            # dump
            if i % 10 == 0:
                dump(inlinks)
            # log
            total_pages += page_counter
            pbar.write(f'pages: {total_pages}, '
                       f'vocab size: {len(vocab)}, '
                       f'len(inlinks): {len(inlinks)}')
            pbar.update()
            del part_inlinks
    dump(inlinks)
Code Example #13
 def from_file(cls,
               text_file: str,
               num_prev_chars: int,
               vocab: Vocabulary = None):
     examples = []
     counter: Counter = Counter()
     with open(text_file, "r") as reviews:
         for line in reviews:
             string = line.strip("\n")
             counter.update(string)
             # prepend BOS (num_prev_chars times) and EOS to each line
             chars = ([SSTLanguageModelingDataset.BOS] * num_prev_chars +
                      list(string) + [SSTLanguageModelingDataset.EOS])
             examples.extend(examples_from_characters(
                 chars, num_prev_chars))
     if not vocab:
         vocab = Vocabulary(
             counter,
             special_tokens=(
                 Vocabulary.UNK,
                 SSTLanguageModelingDataset.BOS,
                 SSTLanguageModelingDataset.EOS,
             ),
         )
     return cls(examples, vocab)
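
A brief usage sketch (the file name is hypothetical): each line of the text file becomes a character sequence padded with num_prev_chars BOS symbols and terminated with EOS, and a Vocabulary is built from the character counts unless one is supplied.

# Build the dataset from a plain-text file, with three characters of left context.
dataset = SSTLanguageModelingDataset.from_file("sst_train.txt", num_prev_chars=3)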
Code Example #14
def main():
    """Quick tests."""

    a = Attribute('hour', ['0,...,23'])
    a2 = Attribute('minute', ['0,...,59'])
    r_ahead = Relation('R1(h1,m1,h2,m2) <=> h1 > h2 or (h1 = h2 and m1 > m2)',
                       ['hour', 'minute', 'hour', 'minute'], 1)
    r_behind = Relation('R2(h1,m1,h2,m2) <=> h1 < h2 or (h1 = h2 and m1 < m2)',
                        ['hour', 'minute', 'hour', 'minute'], 2)
    r_pm = Relation('R3(h1) <=> h1 > 12', ['hour'], 3)
    r_am = Relation('R4(h1) <=> h1 < 12', ['hour'], 4)
    attribute_structure = AttributeStructure(a, a2, r_ahead, r_behind, r_pm,
                                             r_am)

    ahead_rs = RelationSymbol('Ahead', 4)
    behind_rs = RelationSymbol('Behind', 4)
    pm_rs = RelationSymbol('PM', 1)
    vocabulary = Vocabulary(['C1', 'C2'], [ahead_rs, behind_rs, pm_rs],
                            ['V1', 'V2'])

    profiles = [[
        ahead_rs, ('hour', 1), ('minute', 1), ('hour', 2), ('minute', 2)
    ], [behind_rs, ('hour', 1), ('minute', 1), ('hour', 2), ('minute', 2)],
                [pm_rs, ('hour', 1)]]

    mapping = {ahead_rs: 1, behind_rs: 2, pm_rs: 3}

    ai = AttributeInterpretation(vocabulary, attribute_structure, mapping,
                                 profiles)
    print(ai == ai)
Code Example #15
def vocabularyBuilding(config):
    LOG.log('Building Vocabulary')
    setting = {
        "cased": False,
        "rmDigit": True,
        "sortBy": "output",
        "minFreq": 5,
        "dim": 100,
        "initPath": "others/glove.6B.100d.txt",
        "inputCorpus": [config.train_prefix + ".Ndocument",
                        config.valid_prefix + ".Ndocument"],
        "outputCorpus": [config.train_prefix + ".Nsummary",
                         config.valid_prefix + ".Nsummary"]
    }
    Vocab = Vocabulary(setting)
    saveToPKL('settings/vocab/newData.Vocab', Vocab)
    f = open('newData.i2w', 'w', encoding='utf-8')
    for item in Vocab.i2w:
        if item in ('<pad>', '<unk>', '<bos>', '<eos>', '<mask>'):
            print(item, 'NAN', file=f)
        else:
            print(item, Vocab.typeFreq[item], file=f)
    setting["full_size"] = Vocab.full_size
    setting["input_size"] = Vocab.n_in
    setting["output_size"] = Vocab.n_out
    setting["savePath"] = "settings/vocab/newData.json"
    saveToJson(setting["savePath"], setting)
    return setting
Code Example #16
def build_vocab(caption, threshold):
    """Build a simple vocabulary wrapper."""
    counter = Counter()
    n = len(caption.keys())
    for i, key in enumerate(caption.keys()):
        for sentence in caption[key]:
            tokens = nltk.tokenize.word_tokenize(sentence)
            counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, n))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Creates a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Adds the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
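
Usage sketch: caption is assumed to be a dict mapping a key (for example a video or image id) to a list of caption strings. The toy data below is hypothetical, nltk needs its punkt tokenizer data installed, and the Vocabulary wrapper from the example is assumed to be importable.

caption = {
    "vid001": ["a man is playing a guitar", "someone plays a guitar"],
    "vid002": ["a cat jumps onto a table"],
}

# With threshold=1 every tokenized word is kept alongside the special tokens.
vocab = build_vocab(caption, threshold=1)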
Code Example #17
    def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file, start_word,
                 end_word, unk_word, annotations_file, vocab_from_file, img_folder):
        # Preprocessing transform
        self.transform = transform
        # Train, valid, or test
        self.mode = mode
        self.batch_size = batch_size
        # Dictionaries of stoi and itos for words
        self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word,
                                end_word, unk_word, annotations_file, vocab_from_file)
        # Where the images are
        self.img_folder = img_folder

        # Train on all captions
        if self.mode == "train":
            self.coco = COCO(annotations_file)
            self.ids = list(self.coco.anns.keys())

            print("Obtaining caption lengths...")

            all_tokens = [
                nltk.tokenize.word_tokenize(str(self.coco.anns[self.ids[index]]['caption']).lower())
                for index in np.arange(len(self.ids))
            ]

            self.caption_lengths = [len(token) for token in all_tokens]

        # Caption all images
        else:
            self.coco = COCO(annotations_file)
            self.img_ids = list(self.coco.imgToAnns.keys()) if self.mode == "valid" else self.coco.getImgIds()
Code Example #18
def build_vocab(cleaned_captions):
    """ 
    Parses training set token file captions and builds a Vocabulary object
    Args:
        cleaned_captions (str list): cleaned list of human captions to build vocab with

    Returns:
        vocab (Vocabulary): Vocabulary object
    """

    # QUESTION 1.1
    # collect word counts over all cleaned captions
    word_count = {}
    for caption in cleaned_captions:
        for word in caption.split():
            if word not in word_count:
                word_count[word] = 0
            word_count[word] += 1

    # create a vocab instance
    vocab = Vocabulary()

    # add the token words
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # add the remaining words that appear more than 3 times
    for word, n in word_count.items():
        if n > 3:
            vocab.add_word(word)

    return vocab
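
A toy usage sketch of the function above, assuming the Vocabulary wrapper it uses is importable: only words that appear more than three times across the captions make it into the vocabulary, and everything else falls back to <unk> at lookup time.

cleaned_captions = [
    "a dog runs in the park",
    "a dog plays with a ball",
    "a dog sleeps on the couch",
    "a dog barks at a cat",
]

# "a" and "dog" clear the count threshold; all other words map to <unk>.
vocab = build_vocab(cleaned_captions)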
Code Example #19
File: utils.py Project: ZVengin/Conversation-Model
def test(args):

    vocab = Vocabulary()
    vocab.load_vocab(os.path.join(args['data_dir'], 'vocabulary.json'))
    args['voca_size'] = vocab.get_vocab_size()
    test_data = get_dataloader(
        os.path.join(args['data_dir'], 'encoded_test_dialogue_pair.json'),
        os.path.join(args['data_dir'], 'vocabulary.json'), 1)
    test_sent_pair_list = []

    model = Seq2Seq(args).eval()
    if torch.cuda.is_available():
        model = model.cuda()

    path = Checkpoint.get_latest_checkpoint(args['exp_dir'])
    model.load_state_dict(torch.load(os.path.join(path, 'model.pt')))

    for batch_idx, (sour, sour_len, targ, targ_len) in enumerate(test_data):
        if torch.cuda.is_available():
            sour = sour.cuda()
            targ = targ.cuda()
        enco_hidd_state = model.encoder.encoder_forward(sour, sour_len)
        out_prob = model.decoder.decoder_forward(targ, targ_len,
                                                 enco_hidd_state, 0)
        sent_list = [(out_prob.topk(1)[1].view(-1).tolist(), 0)]
        test_sent_pair_list += process_sent_list(vocab, sour, targ, sent_list)
#   logger.info('batch_idx:{} \nsent:{}'.format(batch_idx,test_sent_pair_list))

    save_test_sent(args['exp_data'], 'generated_test_sent.txt',
                   test_sent_pair_list)
Code Example #20
def build_vocabulary(train_recipes):
    """
    Building vocabulary.
    In our case, we replace words with <unk> when they have less than 5 frequency.
    """
    counter = Counter()

    for recipe in train_recipes:
        context = recipe["context"]
        for step in context:
            token = step["token"]
            counter.update(token)

    min_count = 5
    word_counts = [x for x in counter.items() if x[1] >= min_count]
    word_counts.sort(key=lambda x: x[1], reverse=True)
    print("# Words in Vocabulary : ", len(word_counts))
    reverse_vocab = [x[0] for x in word_counts]

    unk_id = len(reverse_vocab)
    sos_id = len(reverse_vocab) + 1
    eos_id = len(reverse_vocab) + 2
    pad_id = len(reverse_vocab) + 3

    vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
    vocab = Vocabulary(vocab_dict, unk_id, sos_id, eos_id, pad_id)
    return vocab
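
Sketch of the input structure the function assumes: each recipe carries a "context" list of steps, and each step a pre-tokenized "token" list. The toy data is hypothetical; since min_count is fixed at 5, a corpus this small keeps no words, so a realistically sized training set is needed.

train_recipes = [
    {"context": [{"token": ["preheat", "the", "oven"]},
                 {"token": ["mix", "the", "flour", "and", "sugar"]}]},
    {"context": [{"token": ["bake", "for", "twenty", "minutes"]}]},
]

vocab = build_vocabulary(train_recipes)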
Code Example #21
    def __init__(self, transform, mode, batch_size, vocab_threshold, vocab_file, start_word, 
        end_word, unk_word, annotations_file, vocab_from_file, img_folder):
        self.transform = transform
        self.mode = mode
        self.batch_size = batch_size
        self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word,
            end_word, unk_word, annotations_file, vocab_from_file)
        self.img_folder = img_folder

        test_info = json.loads(open(annotations_file).read())
        
        self.ids, self.paths, self.cap_dict = [], [], {}
        
        for item in test_info['images']:
            self.ids.append(item['id'])
            self.cap_dict[item['id']] = {'file_name':item['file_name'], 'captions':[]}
            self.paths.append(item['file_name'])
        
        for item in test_info['annotations']:
            #if item['image_id'] in self.dict:
            tokens = nltk.tokenize.word_tokenize(str(item['caption']).lower())
            caption = []
            caption.extend([self.vocab(token) for token in tokens])
            caption = torch.Tensor(caption).long()
            self.cap_dict[item['image_id']]['captions'].append(caption)
Code Example #22
 def from_corpus(cls, corpus, vocab_size):
     vocab = Vocabulary()
     for token in corpus:
         vocab.add(token)
     vocab_subset = vocab.get_topk_subset(vocab_size)
     vocab_subset.shuffle()
     return cls(vocab_subset)
Code Example #23
 def __init__(self, transform, mode, batch_size,
              threshold, sample_size, file, start_seq, end_seq, unk_word,
              annotations_file, load_vocab, image_dir):
     self.transform = transform
     self.mode = mode
     self.batch_size = batch_size
     self.vocab = Vocabulary(threshold,file,start_seq,end_seq,unk_word,annotations_file,load_vocab)
     self.image_dir = image_dir
     self.sample_size = sample_size
     
     if mode in ['train','val']:
         self.coco = COCO(annotations_file)
         self.ids = list(self.coco.anns.keys())
         print("IDS",len(self.ids))
         self.ids = self.ids[:self.sample_size]
         
         tokens = list()
         for idx in tqdm(np.arange(len(self.ids))):
             caption = str(self.coco.anns[self.ids[idx]]['caption']).lower()
             tokens.append(nltk.tokenize.word_tokenize(caption))
         
         self.caption_lengths=[len(token) for token in tokens]
         
     else:
         test_anns = json.load(open(annotations_file))
         self.paths = [item['file_name'] for item in test_anns['images']]
Code Example #24
    def __init__(self, transform, mode, batch_size, vocab_threshold,
                 vocab_file, start_word, end_word, unk_word, annotations_file,
                 vocab_from_file, img_folder):
        # transform - image transformation pipeline (e.g. Rescale, ToTensor, RandomCrop, etc.)
        self.transform = transform
        self.mode = mode
        self.batch_size = batch_size
        self.vocab = Vocabulary(vocab_threshold, vocab_file, start_word,
                                end_word, unk_word, annotations_file,
                                vocab_from_file)
        self.img_folder = img_folder

        # if training and validation
        if self.mode == 'train' or self.mode == 'valid':
            # JSON file, where the annotations are stored
            self.coco = COCO(annotations_file)
            # each annotation contains multiple attributes, such as the task
            # (e.g. segmentation), image_id, bounding box, etc.
            # to load an image (for instance its URL), use
            # self.coco.loadImgs(image_id) with the image id
            self.ids = list(self.coco.anns.keys())
            print('Obtaining caption lengths...')
            # get all_tokens - a big list of lists. Each is a list of tokens for specific caption
            all_tokens = [
                nltk.tokenize.word_tokenize(
                    str(self.coco.anns[self.ids[index]]['caption']).lower())
                for index in tqdm(np.arange(len(self.ids)))
            ]
            # list of token lengths (number of words for each caption)
            self.caption_lengths = [len(token) for token in all_tokens]
        else:
            # if we are in testing mode
            test_info = json.loads(open(annotations_file).read())
            self.paths = [item['file_name'] for item in test_info['images']]
Code Example #25
def vocabulary_demo():
    # We used up a few lines in the previous example, so we set up
    # our data generator again.
    corpus = smart_reader(train_e_path)

    # Let's create a vocabulary given our (tokenized) corpus
    vocabulary = Vocabulary(corpus=corpus)
    print("Original vocabulary size: {}".format(len(vocabulary)))

    # Now we only keep the highest-frequency words
    vocabulary_size = 1000
    vocabulary.trim(vocabulary_size)
    print("Trimmed vocabulary size: {}".format(len(vocabulary)))

    # Now we can get token indexes using vocabulary.get_token_id():
    for t in ["<PAD>", "<UNK>", "the"]:
        print("The index of \"{}\" is: {}".format(t,
                                                  vocabulary.get_token_id(t)))

    # And the inverse too, using vocabulary.get_token(i):
    for i in range(10):
        print("The token with index {} is: {}".format(i,
                                                      vocabulary.get_token(i)))

    # Now let's try to get a word ID for a word not in the vocabulary
    # we should get 1 (so, <UNK>)
    for t in ["!@!_not_in_vocab_!@!"]:
        print("The index of \"{}\" is: {}".format(t,
                                                  vocabulary.get_token_id(t)))
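
The demo above exercises a small interface: construction from a tokenized corpus, trim, get_token_id, and get_token. A minimal self-contained sketch of that interface, not the project's actual Vocabulary implementation, might look like this:

from collections import Counter


class MinimalVocabulary:
    """Minimal sketch of the interface exercised by vocabulary_demo above;
    not the project's actual Vocabulary implementation."""

    PAD, UNK = "<PAD>", "<UNK>"

    def __init__(self, corpus=None):
        # corpus is assumed to be an iterable of tokenized sentences (lists of tokens).
        self.counts = Counter(tok for sent in corpus for tok in sent) if corpus else Counter()
        self._rebuild(self.counts)

    def _rebuild(self, tokens):
        # Specials first, then tokens ordered by descending frequency.
        self.i2t = [self.PAD, self.UNK] + sorted(tokens, key=lambda t: -self.counts[t])
        self.t2i = {t: i for i, t in enumerate(self.i2t)}

    def trim(self, max_size):
        # Keep only the most frequent tokens, leaving room for the two specials.
        kept = [t for t, _ in self.counts.most_common(max_size - 2)]
        self._rebuild(kept)

    def get_token_id(self, token):
        # Unknown tokens map to the <UNK> index (1).
        return self.t2i.get(token, self.t2i[self.UNK])

    def get_token(self, index):
        return self.i2t[index]

    def __len__(self):
        return len(self.i2t)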
Code Example #26
    def __init__(self, params, mode, batch_size, vocab_from_file, img_folder,
                 annotations_file):
        self.params = params
        self.mode = mode
        self.batch_size = batch_size
        self.vocab = Vocabulary(params, vocab_from_file, annotations_file)
        self.img_folder = img_folder
        if self.mode == 'train':
            self.coco = COCO(annotations_file)
            self.ids = list(self.coco.anns.keys())
            print('Obtaining caption lengths...')

            # nltk.tokenize.word_tokenize(str(caption).lower()) lowercases a caption
            # string and splits it into a list of word tokens.
            # np.arange is used to index the annotation ids; a plain range would work as well.
            all_tokens = [
                nltk.tokenize.word_tokenize(
                    str(self.coco.anns[self.ids[index]]['caption']).lower())
                for index in tqdm(np.arange(len(self.ids)))
            ]

            self.caption_lengths = [len(token) for token in all_tokens]
        else:
            test_info = json.loads(open(annotations_file).read())

            # paths stores names of all images in test set (e.g. COCO_test2014_000000264794.jpg)
            self.paths = [item['file_name'] for item in test_info['images']]
Code Example #27
File: data_loader.py Project: koren-v/Image2Text
 def __init__(self, transform, mode, batch_size, vocab_threshold,
              vocab_file, glove_file, start_word, end_word, unk_word,
              annotations_file, vocab_from_file, img_folder):
     self.transform = transform
     self.mode = mode
     self.batch_size = batch_size
     self.vocab = Vocabulary(vocab_threshold,
                             vocab_file,
                             glove_file,
                             start_word,
                             end_word,
                             unk_word,
                             annotations_file,
                             vocab_from_file,
                             dataset='insta')
     self.img_folder = img_folder
     self.sel_length = None
     if self.mode == 'train' or self.mode == 'val':
         import time
         start = time.time()
         print('Start reading...')
         self.insta = pickle.load(open(annotations_file, 'rb'))
         print('Done: ', time.time() - start)
         self.ids = list(self.insta.keys())
         print('Obtaining caption lengths...')
         all_tokens = [
             regex_tokenizer.tokenize(
                 str(self.insta[index]['caption']).lower())
             for index in tqdm(self.ids)
         ]
         self.caption_lengths = [len(token) for token in all_tokens]
Code Example #28
 def build_vocabulary_from_dataset(self, data):
     vocabulary = Vocabulary(custom_unk_word=' ')
     for transcription in data['transcription_tokens']:
         for word in transcription:
             vocabulary.add_word(word)
     dataset_info = {'vocabulary': vocabulary}
     return dataset_info
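
For reference, a sketch of the data layout this helper walks: a dict with a 'transcription_tokens' entry holding one token list per utterance. The toy data and the dataset variable below are hypothetical.

data = {"transcription_tokens": [["hello", "world"], ["good", "morning", "world"]]}

# dataset is assumed to be an instance of the class this method belongs to.
dataset_info = dataset.build_vocabulary_from_dataset(data)
vocabulary = dataset_info["vocabulary"]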
Code Example #29
File: 01-build_vocab.py Project: JoshuaZe/img2tags
def build_vocab(annotation_path, threshold):
    """Build a simple vocabulary wrapper."""
    df_annotation = pd.read_csv(annotation_path, keep_default_na=False)
    counter = Counter()
    for _, each_annotation in df_annotation.iterrows():
        attribute_tags = each_annotation['attribute_tags']
        tokens = list(re.split('[,]', attribute_tags))
        if len(tokens) > 0:
            tokens = [
                token.strip() for token in tokens if not has_numbers(token)
            ]
            counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]
    # print(words)

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
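
A usage sketch: the annotation file is assumed to be a CSV with an 'attribute_tags' column of comma-separated tags, and the snippet's has_numbers helper and Vocabulary wrapper are assumed to be in scope (the toy file below is hypothetical).

import pandas as pd

# Write a tiny hypothetical annotation file.
pd.DataFrame({"attribute_tags": ["red,cotton,shirt", "red,shirt", "blue,denim"]}) \
    .to_csv("toy_annotations.csv", index=False)

# threshold=2 keeps 'red' and 'shirt'; the remaining tags are discarded.
vocab = build_vocab("toy_annotations.csv", threshold=2)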
Code Example #30
def main(args):

    # Create vocabulary. Assumes that the `vocab.pkl` file already exists.
    vocabulary = Vocabulary(vocab_from_file=True)

    # Create transformations
    transformations = transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor()
    ])    

    # Create the model instance.
    model = CaptioningModel(
        embed_size=256,
        hidden_size=256,
        vocab_size = len(vocabulary),
        num_layers=2,
        use_pretrained_encoder=False
    )

    # Load state dict for encoder and decoder
    model.encoder.load_state_dict(torch.load('./models/encoder.pkl')['encoder_state_dict'])
    model.decoder.load_state_dict(torch.load('./models/decoder.pkl')['decoder_state_dics'])

    # Set model to eval state
    model.eval()

    image = Image.open(args.image_path)
    tensor = transformations(image).unsqueeze(0)

    result = model.sample(tensor)
    text = result_to_text(result, vocabulary)

    print(text)
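
The script above reads args.image_path; a hypothetical argparse entry point (not part of the original snippet) could drive it like this:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Caption a single image.")
    parser.add_argument("image_path", help="path to the image to caption")
    main(parser.parse_args())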