def build_vocab(cls, json, tokenized_captions, threshold):
        print("Building vocabulary")
        coco = COCO(json)
        counter = Counter()
        ids = coco.anns.keys()
        for id in ids:
            # Previously the caption was re-tokenized here:
            #   caption = str(coco.anns[id]['caption'])
            #   tokens = CocoDataset.tokenize(caption)
            tokens = tokenized_captions[id]
            counter.update(tokens)

        # If the word frequency is less than 'threshold', then the word is discarded.
        words = [word for word, cnt in counter.items() if cnt >= threshold]

        # Create a vocab wrapper (no special tokens are added in this variant).
        vocab = Vocabulary()

        # Adds the words to the vocabulary.
        for word in words:
            vocab.add_word(word)

        print("Total vocabulary size: %d" % len(vocab))
        return vocab
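Several of the captioning snippets in this listing only ever call `vocab.add_word(word)` and `len(vocab)` on the wrapper. A minimal sketch of such a wrapper, inferred from that usage rather than taken from any one of the projects:

class Vocabulary(object):
    """Minimal word <-> index wrapper inferred from how the examples use it."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # Register a word only the first time it is seen.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Unknown words fall back to '<unk>' when that token has been added.
        if word not in self.word2idx:
            return self.word2idx.get('<unk>', 0)
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)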
Example #2
    def load_embeddings(self, src_embeddings, tgt_embeddings,
                        vocabulary: Vocabulary):
        aligned_embeddings = torch.div(torch.randn(vocabulary.size(), 300), 10)
        found_count = 0
        for i in range(len(vocabulary.index2word)):
            word = vocabulary.get_word(i)
            language = vocabulary.get_language(i)
            if language == "src" and word in src_embeddings.wv:
                aligned_embeddings[i] = torch.FloatTensor(
                    src_embeddings.wv[word])
                found_count += 1
            elif language == "src" and word.lower() in src_embeddings.wv:
                aligned_embeddings[i] = torch.FloatTensor(
                    src_embeddings.wv[word.lower()])
                found_count += 1

            if language == "tgt" and word in tgt_embeddings.wv:
                aligned_embeddings[i] = torch.FloatTensor(
                    tgt_embeddings.wv[word])
                found_count += 1
            elif language == "tgt" and word.lower() in tgt_embeddings.wv:
                aligned_embeddings[i] = torch.FloatTensor(
                    tgt_embeddings.wv[word.lower()])
                found_count += 1
        logger.info("Embeddings filled: " + str(found_count) + " of " +
                    str(vocabulary.size()))

        enable_training = self.encoder.embedding.weight.requires_grad
        self.encoder.embedding.weight = nn.Parameter(
            aligned_embeddings, requires_grad=enable_training)
        self.decoder.embedding.weight = nn.Parameter(
            aligned_embeddings, requires_grad=enable_training)
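A hedged usage sketch for the method above: it indexes `src_embeddings.wv` and `tgt_embeddings.wv`, so any gensim model exposing a `.wv` attribute with 300-dimensional vectors fits (300 is hard-coded in `aligned_embeddings`). The paths and the `model`/`vocabulary` objects below are placeholders, not names from the original project.

from gensim.models import Word2Vec

# Placeholder paths; the models must provide 300-dimensional vectors,
# because `aligned_embeddings` above is allocated with width 300.
src_embeddings = Word2Vec.load("embeddings/src.w2v.model")
tgt_embeddings = Word2Vec.load("embeddings/tgt.w2v.model")

# `model` is assumed to be the seq2seq module defining load_embeddings above,
# and `vocabulary` its Vocabulary instance.
model.load_embeddings(src_embeddings, tgt_embeddings, vocabulary)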
    def setup(self, stage: Optional[str] = None):
        if not path.exists(path.join(self._dataset_dir,
                                     Vocabulary.vocab_file)):
            Vocabulary.build_from_scratch(
                path.join(self._dataset_dir,
                          f"{self._config.dataset}.{self._train}.jsonl"))
        self._vocabulary = Vocabulary(self._config)
def prepare_train_data(config):
    """ Prepare the data for training the model. """

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Processing the captions...")

    annotations = pd.read_csv(config.temp_annotation_file)
    captions = annotations['caption'].values
    image_ids = annotations['image_id'].values
    image_files = annotations['image_file'].values

    data = np.load(config.temp_data_file).item()
    word_idxs = data['word_idxs']
    masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" % (len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size, word_idxs,
                      masks, True, True)
    print("Dataset built.")
    return dataset
def main2():
    vocabs = [Vocabulary.load("./vocabs/v{0}".format(i)) for i in range(8)]
    print("loaded vocabs!")
    master_vocab = Vocabulary.merge_vocabularies(vocabs)
    master_vocab.save("./vocabs/final_vocab")
    import pdb
    pdb.set_trace()
def _counters_to_vocab(config: Dict,
                       counters: Dict[str, TypeCounter[str]]) -> Vocabulary:
    additional_tokens = [SOS, EOS, PAD, UNK
                         ] if config["token"]["is_wrapped"] else [PAD, UNK]
    token_to_id = _counter_to_dict(counters["token"],
                                   config["token"]["vocabulary_size"],
                                   additional_tokens)
    additional_targets = [SOS, EOS, PAD, UNK
                          ] if config["target"]["is_wrapped"] else [PAD, UNK]
    label_to_id = _counter_to_dict(counters["target"],
                                   config["target"]["vocabulary_size"],
                                   additional_targets)
    additional_nodes = [SOS, EOS, PAD, UNK
                        ] if config["path"]["is_wrapped"] else [PAD, UNK]
    node_to_id = _counter_to_dict(counters["path"],
                                  config["path"]["vocabulary_size"],
                                  additional_nodes)

    vocabulary = Vocabulary(token_to_id, node_to_id, label_to_id)
    if "type" in counters:
        additional_types = [SOS, EOS, PAD, UNK
                            ] if config["type"]["is_wrapped"] else [PAD, UNK]
        vocabulary.type_to_id = _counter_to_dict(
            counters["type"], config["type"]["vocabulary_size"],
            additional_types)
    return vocabulary
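`_counter_to_dict` is not shown in this listing; from its call sites it receives a token counter, the configured vocabulary size, and a list of special tokens, and returns a token-to-id dictionary. A plausible sketch under exactly those assumptions:

from collections import Counter
from typing import Dict, List, Optional

def _counter_to_dict(counter: Counter,
                     max_size: Optional[int] = None,
                     additional_tokens: Optional[List[str]] = None) -> Dict[str, int]:
    # Special tokens (PAD, UNK, and optionally SOS/EOS) get the first, stable ids.
    token_to_id = {token: i for i, token in enumerate(additional_tokens or [])}
    # Most frequent tokens follow, truncated to the configured vocabulary size.
    for token, _ in counter.most_common(max_size):
        if token not in token_to_id:
            token_to_id[token] = len(token_to_id)
    return token_to_id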
Example #7
    def init_mapping(bi_dict_filename: str, vocabulary: Vocabulary, first_lang,
                     second_lang):
        mapping = defaultdict(set)
        with open(bi_dict_filename, "r", encoding='utf-8') as r:
            for line in r:
                first_word, second_word = line.strip().split()

                first_index = vocabulary.get_unk(first_lang)
                if vocabulary.has_word(first_word, first_lang):
                    first_index = vocabulary.get_index(first_word, first_lang)
                elif vocabulary.has_word(first_word.capitalize(), first_lang):
                    first_index = vocabulary.get_index(first_word.capitalize(),
                                                       first_lang)

                second_index = vocabulary.get_unk(second_lang)
                if vocabulary.has_word(second_word, second_lang):
                    second_index = vocabulary.get_index(
                        second_word, second_lang)
                elif vocabulary.has_word(second_word.capitalize(),
                                         second_lang):
                    second_index = vocabulary.get_index(
                        second_word.capitalize(), second_lang)

                mapping[first_index].add(second_index)
        return mapping
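The bilingual dictionary read by `init_mapping` is a plain text file with one whitespace-separated `first_word second_word` pair per line; everything in the small writer below (file name and word pairs) is hypothetical.

# Hypothetical bilingual dictionary; one "source target" pair per line.
pairs = [("house", "maison"), ("cat", "chat"), ("dog", "chien")]
with open("en-fr.dict.txt", "w", encoding="utf-8") as w:
    for first_word, second_word in pairs:
        w.write("{0} {1}\n".format(first_word, second_word))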
def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." % (i, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Adds the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
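A typical driver for the function above; the annotation path, output file, and threshold below are placeholders rather than values from the original script.

import pickle

caption_path = 'data/annotations/captions_train2014.json'  # placeholder COCO annotations
vocab_path = 'data/vocab.pkl'                               # placeholder output file

vocab = build_vocab(json=caption_path, threshold=4)
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: %d" % len(vocab))
print("Saved the vocabulary wrapper to '%s'" % vocab_path)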
Example #9
    def build_corpus(self):
        print(f'Loading training trees from `{self.train_path}`...')
        if self.multitask == 'ccg':
            train_treebank = ccg.fromfile(self.train_path)
        else:
            with open(self.train_path) as f:
                train_treebank = [fromstring(line.strip()) for line in f]

        print(f'Loading development trees from `{self.dev_path}`...')
        with open(self.dev_path) as f:
            dev_treebank = [fromstring(line.strip()) for line in f]

        print(f'Loading test trees from `{self.test_path}`...')
        with open(self.test_path) as f:
            test_treebank = [fromstring(line.strip()) for line in f]

        if self.multitask == 'spans':
            # need trees with span-information
            train_treebank = [tree.convert() for tree in train_treebank]
            dev_treebank = [tree.convert() for tree in dev_treebank]
            test_treebank = [tree.convert() for tree in test_treebank]

        print("Constructing vocabularies...")
        if self.vocab_path is not None:
            print(f'Using word vocabulary specified in `{self.vocab_path}`')
            with open(self.vocab_path) as f:
                vocab = json.load(f)
            words = [word for word, count in vocab.items() for _ in range(count)]
        else:
            words = [word for tree in train_treebank for word in tree.words()]

        if self.multitask == 'none':
            labels = []
        else:
            labels = [label for tree in train_treebank for label in tree.labels()]

        if self.multitask == 'none':
            words = [UNK, START] + words
        else:
            words = [UNK, START, STOP] + words

        word_vocab = Vocabulary.fromlist(words, unk_value=UNK)
        label_vocab = Vocabulary.fromlist(labels)

        self.word_vocab = word_vocab
        self.label_vocab = label_vocab

        self.train_treebank = train_treebank
        self.dev_treebank = dev_treebank
        self.test_treebank = test_treebank

        print('\n'.join((
            'Corpus statistics:',
            f'Vocab: {word_vocab.size:,} words, {label_vocab.size:,} nonterminals',
            f'Train: {len(train_treebank):,} sentences',
            f'Dev: {len(dev_treebank):,} sentences',
            f'Test: {len(test_treebank):,} sentences')))
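When `vocab_path` is given, the file is read with `json.load` and expanded via `for _ in range(count)`, so it must be a JSON object mapping each word to an integer count. A hypothetical way to produce such a file:

import json

# Hypothetical word counts in the format build_corpus expects.
vocab_counts = {"the": 51234, "cat": 87, "parses": 3}
with open("train_vocab.json", "w") as f:
    json.dump(vocab_counts, f)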
    def get_vocabulary(cls, vocab_path, captions_path, tokenized_captions, threshold=1):
        # Load or construct vocabulary
        if os.path.exists(vocab_path):
            vocab = Vocabulary.load(vocab_path)
        else:
            vocab = cls.build_vocab(captions_path, tokenized_captions, threshold)
            # TODO: check if saving is safe
            Vocabulary.save(vocab, vocab_path)
            print("Saved the vocabulary to '%s'" % vocab_path)
        return vocab
Example #11
    def from_serializable(cls, contents):
        """Instantiate a ReviewVectorizer from a serializable dictionary

        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])

        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)
Example #12
    def from_dataframe(cls, review_df, cutoff=25):
        """Instantiate the vectorizer from the dataset dataframe

        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # Add ratings
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        # Add top words if count > provided count
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)
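A short usage sketch: the 'review' and 'rating' column names come from the method above, but the CSV file, the `review_vocab`/`rating_vocab` attribute names, and `__len__` support on `Vocabulary` are assumptions.

import pandas as pd

review_df = pd.read_csv("reviews.csv")  # hypothetical file with 'review' and 'rating' columns
vectorizer = ReviewVectorizer.from_dataframe(review_df, cutoff=25)
# Assumes the Vocabulary wrapper defines __len__.
print(len(vectorizer.review_vocab), "review tokens,",
      len(vectorizer.rating_vocab), "rating labels")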
Example #13
    def __init__(self, args):
        self.args = args
        train = DataLoader(self.args.trainpath)
        dev = DataLoader(self.args.devpath)

        self.train_words, self.train_poss, self.train_chunks, self.train_labels = train.get_all_train_tokens(
        )
        self.train_max_sentence_len, self.train_max_word_len = train.get_required_max_len(
        )
        self.dev_words, self.dev_poss, self.dev_chunks, self.dev_labels = dev.get_all_train_tokens(
        )
        self.dev_max_sentence_len, self.dev_max_word_len = dev.get_required_max_len(
        )

        vocabulary = Vocabulary(self.train_words)
        self.vocab = vocabulary.get_word_vocab()
        self.char_vocab = vocabulary.get_char_vocab()

        self.train_vect = Vectorizer(self.train_max_sentence_len,
                                     self.train_max_word_len, self.vocab,
                                     self.char_vocab, self.train_words)
        self.dev_vect = Vectorizer(self.train_max_sentence_len,
                                   self.train_max_word_len, self.vocab,
                                   self.char_vocab, self.dev_words)

        self.poss_vect = LabelEncoderModel(self.train_poss,
                                           self.train_max_sentence_len)
        self.chunks_vect = LabelEncoderModel(self.train_chunks,
                                             self.train_max_sentence_len)
        self.labels_vect = LabelEncoderModel(self.train_labels,
                                             self.train_max_sentence_len)

        # something seems wrong here
        self.pos_emb_weights = self.poss_vect.get_emb_weights()
        self.chunk_emb_weights = self.chunks_vect.get_emb_weights()
        self.word_emb_weights, self.word_emb_dimensions = PretrainedEmbedder(
            self.vocab, self.args.pretrained_path).pretrained_embedder()
        self.model = ModelTraining(
            self.args.dropout,
            self.args.lr,
            len(set(sum(self.train_labels, []))),
            len(self.vocab),
            len(self.char_vocab),
            self.train_max_word_len,
            len(set(sum(self.train_poss, []))),
            len(set(sum(self.train_chunks, []))),
            word_emb_dimensions=self.word_emb_dimensions,
            word_emb_weights=self.word_emb_weights,
            pos_emb_weights=self.pos_emb_weights,
            chunk_emb_weights=self.chunk_emb_weights).model_build()
    def train(self, epoch):
        trace('making vocabularies ...')
        self.trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

        trace('making model ...')

        trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(self.encdec)
        opt.add_hook(optimizer.GradientClipping(5))
        gen1 = gens.word_list(self.target)
        gen = gens.batch(gen1, self.minibatch)

        for trg_batch in gen:
            self.batch_size = len(trg_batch)
            self.trg_batch = fill_batch(trg_batch)
            if len(trg_batch) != self.minibatch:
                break
            self.encdec.clear(self.batch_size)
            self.__forward_img()
            self.encdec.reset(self.batch_size)
            loss, hyp_batch = self.__forward_word(self.trg_batch, self.encdec, True, 0)
            loss.backward()
            opt.update()
            K = len(self.trg_batch) - 2
            self.print_out(K, hyp_batch, epoch)
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)

    if config.is_person_model == 'Y':
        file_data = pd.read_csv(config.person_eval_caption_file)
        image_ids = file_data['image_id'].values
        image_files = file_data['image_file'].values
    else:
        image_ids = list(coco.imgs.keys())
        image_files = [os.path.join(config.eval_image_dir,
                                    coco.imgs[image_id]['file_name'])
                       for image_id in image_ids]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")

    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #16
    def train(self, epoch):
        trace('making vocabularies ...')
        self.trg_vocab = Vocabulary.new(gens.word_list(self.target),
                                        self.vocab)

        trace('making model ...')

        trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
        opt = optimizers.AdaGrad(lr=0.01)
        opt.setup(self.encdec)
        opt.add_hook(optimizer.GradientClipping(5))
        gen1 = gens.word_list(self.target)
        gen = gens.batch(gen1, self.minibatch)

        for trg_batch in gen:
            self.batch_size = len(trg_batch)
            self.trg_batch = fill_batch(trg_batch)
            if len(trg_batch) != self.minibatch:
                break
            self.encdec.clear(self.batch_size)
            self.__forward_img()
            self.encdec.reset(self.batch_size)
            loss, hyp_batch = self.__forward_word(self.trg_batch, self.encdec,
                                                  True, 0)
            loss.backward()
            opt.update()
            K = len(self.trg_batch) - 2
            self.print_out(K, hyp_batch, epoch)
    def test_forward(self):
        with initialize_config_dir(config_dir=get_config_directory()):
            data_folder, dataset_name = get_test_data_info()
            config = compose("main",
                             overrides=[
                                 f"data_folder={data_folder}",
                                 f"dataset.name={dataset_name}"
                             ])

        dataset_folder = join(config.data_folder, config.dataset.name)
        vocabulary = Vocabulary.load_vocabulary(
            join(dataset_folder, config.vocabulary_name))
        data_file_path = join(
            dataset_folder,
            f"{config.dataset.name}.{config.train_holdout}.c2s")
        dataset = PathContextDataset(data_file_path, config, vocabulary, False)
        batch = PathContextBatch(
            [dataset[i] for i in range(config.hyper_parameters.batch_size)])

        model = PathEncoder(
            config.encoder,
            config.decoder.decoder_size,
            len(vocabulary.token_to_id),
            vocabulary.token_to_id[PAD],
            len(vocabulary.node_to_id),
            vocabulary.node_to_id[PAD],
        )
        output = model(batch.contexts)

        true_shape = (sum(batch.contexts_per_label),
                      config.decoder.decoder_size)
        self.assertTupleEqual(true_shape, output.shape)
def main():
  args = get_args()

  vocab = Vocabulary.load(args.vocab_prefix.strip())
  output_dir = path.realpath(args.output_dir.strip())
  if args.soseos:
    line2arr = partial(line2arr_with_soseos, vocab)
    print("sos-eos!!!")
  else:
    line2arr = partial(line2arr_no_soseos, vocab)

  counter = -1
  for line in sys.stdin.readlines():
    counter += 1
#    print(counter)
#    sys.stdout.flush()
#    counter += 1
    if counter % 100 == 0:
      print(counter)
      sys.stdout.flush()
    fname = line.strip()
    with open(fname, 'r') as in_f:
      lines = in_f.readlines()
    stripped = map(lambda x: x.strip(), lines)
    non_empty = filter(lambda x: x != "", stripped)
    file_arr = [line2arr(line) for line in non_empty]
    np_arr = np.array(file_arr)
    try:
      new_fname = path.join(output_dir, path.split(fname)[1].replace(".tok", ".npy"))
      np.save(new_fname, np_arr)
    except Exception as e:
      print("errored out on: {0} ({1})".format(fname, e))
def prepare_eval_data(config):
    """ Prepare the data for evaluating the model. """
    coco = COCO(config.eval_caption_file)
    image_ids = list(coco.imgs.keys())
    image_files = [
        os.path.join(config.eval_image_dir, coco.imgs[image_id]['file_name'])
        for image_id in image_ids
    ]

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    if (config.eval_data_count_limit > 0):
        print("-----------------------------------------------")
        print("Restricting Sz:\t", config.eval_data_count_limit)
        print("Batch Sz:\t", config.batch_size)
        image_ids = image_ids[0:config.eval_data_count_limit]
        image_files = image_files[0:config.eval_data_count_limit]
        """ Dump the image paths to a file """
        filepath = 'eval_images.csv'
        with open(filepath, 'w') as file_handler:
            for i in range(0, config.eval_data_count_limit):
                file_handler.write("{}\n".format(image_files[i]))
        #print(image_files)
        print("-----------------------------------------------")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return coco, dataset, vocabulary
Example #20
    def __init__(self, args, data_mode='train', single_pass=False, batch_size=None):
        self.args = copy.deepcopy(args)
        if batch_size is not None:
            self.args.batch_size = batch_size
        self.vocab = Vocabulary(self.args)
        self.data_mode = data_mode
        self.single_pass = single_pass
Example #21
def get_huffman_tree(params):
    if "huff_tree_loc" in params:
        with open(params["huff_tree_loc"], 'rb') as f:
            huff_tree = pickle.load(f)
    else:
        vocab_size = params["n_vocab"]
        soseos_counts_estim = [40114695 for i in range(2)]
        vocab = Vocabulary.load(
            "/hdd/data/nlp/raw/unzipped/ff15_book/vocabs/final_vocab")
        sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
        sorted_counts = [x[1] for x in sorted_vocab]
        cutoff_counts = sorted_counts[0:vocab_size]
        oov_counts = [sum(sorted_counts[vocab_size:])]
        #    print("#words: {0}".format(len(sorted_vocab)))
        #    print("cutoff oov = {0}".format(sorted_vocab[vocab_size]))
        #    print("oov words right after cutoff:")
        #    print([x[0] for x in sorted_vocab[vocab_size:vocab_size + 50]])
        #    print("randomly sampled oov words:")
        #    print(random.sample([x[0] for x in sorted_vocab[vocab_size:]], 50))
        oov_percent = (100.0 * oov_counts[0]) / sum(cutoff_counts)
        #    print("oov % = {0:.5f}".format(oov_percent))
        all_counts = soseos_counts_estim + cutoff_counts + oov_counts
        params["vocab_counts"] = all_counts
        as_hash = {i: v for (i, v) in enumerate(all_counts)}
        huff_tree = chainer.links.BinaryHierarchicalSoftmax.create_huffman_tree(
            as_hash)
        print("loaded huffman tree")
    return huff_tree
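For context, `create_huffman_tree` is a static method of chainer's `BinaryHierarchicalSoftmax` link; it takes a `{key: frequency}` dict, and the resulting tree is normally handed back to the link's constructor. A minimal hedged sketch with made-up frequencies:

import chainer

# Made-up frequencies keyed by word id, mirroring `as_hash` above.
freqs = {0: 40114695, 1: 40114695, 2: 1000000, 3: 250000}
tree = chainer.links.BinaryHierarchicalSoftmax.create_huffman_tree(freqs)
# The tree parameterizes the hierarchical softmax output layer (in_size is a placeholder).
hsm = chainer.links.BinaryHierarchicalSoftmax(in_size=200, tree=tree)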
Example #22
def prepare_test_data(config, image_path=None):
    """ Prepare the data for testing the model. """
    if image_path is None:
        files = os.listdir(config.test_image_dir)
        image_files = [
            os.path.join(config.test_image_dir, f) for f in files
            if f.lower().endswith('.jpg') or f.lower().endswith('.jpeg')
        ]
    else:
        image_files = [image_path]

    image_ids = list(range(len(image_files)))

    print("Building the vocabulary...")
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))

    print("Building the dataset...")
    dataset = DataSet(image_ids, image_files, config.batch_size)
    print("Dataset built.")
    return dataset, vocabulary
Example #23
def dataset():
    vocab = Vocabulary(args)
    dataset = Dataset(args, vocab)
    source_files = sorted(glob.glob(args.dataset_file_path + 'train_source*.dat'))
    target_files = sorted(glob.glob(args.dataset_file_path + 'train_target*.dat'))

    print('========== Begin something about vocabulary:')
    print('Vocab Size:', dataset.vocab.vocab_size)
    print('First 10 Word2cnt:', list(dataset.vocab._word2cnt.items())[:10])
    print()

    print('========== Begin something about dataset:')
    X_lens = [len(sen.split()) for source_file in source_files for sen in open(source_file)]
    y_lens = [len(sen.split()) for target_file in target_files for sen in open(target_file)]
    print('Number of Source Sentences:', len(X_lens))
    print('Number of Target Sentences:', len(y_lens))
    print()
    print('Mean Length of Source Sentences:', np.mean(X_lens))
    print('Max Length of Source Sentences:', np.max(X_lens))
    print('Min Length of Source Sentences:', np.min(X_lens))
    print()
    print('Mean Length of Target Sentences:', np.mean(y_lens))
    print('Max Length of Target Sentences:', np.max(y_lens))
    print('Min Length of Target Sentences:', np.min(y_lens))
    print()
Example #24
def main3():
    master_vocab = Vocabulary.load("./vocabs/final_vocab")
    offsets = master_vocab.get_offsets()
    with open("./vocabs/offsets.pkl", 'wb') as out_f:
        pickle.dump(offsets, out_f)
    import pdb
    pdb.set_trace()
    print(32)
Example #25
def build_vocabulary(config):
    """ Build the vocabulary from the training data and save it to a file. """
    vqa = VQA(config.train_answer_file, config.train_question_file)
    vqa.filter_by_ques_len(config.max_question_length)
    vqa.filter_by_ans_len(1)

    question_ids = list(vqa.qa.keys())
    questions = [vqa.qqa[k]['question'] for k in question_ids]
    answers = [vqa.qa[k]['best_answer'] for k in question_ids]

    vocabulary = Vocabulary()
    for question in tqdm(questions):
        vocabulary.add_words(word_tokenize(question))
    for answer in tqdm(answers):
        vocabulary.add_words(word_tokenize(answer))
    vocabulary.compute_frequency()
    vocabulary.save(config.vocabulary_file)
    return vocabulary
Example #26
def train(config: DictConfig):
    filter_warnings()
    print_config(config)
    seed_everything(config.seed)

    known_models = {"code2seq": get_code2seq, "code2class": get_code2class, "typed-code2seq": get_typed_code2seq}
    if config.name not in known_models:
        print(f"Unknown model: {config.name}, try one of {known_models.keys()}")
        return

    vocabulary = Vocabulary.load_vocabulary(join(config.data_folder, config.dataset.name, config.vocabulary_name))
    model, data_module = known_models[config.name](config, vocabulary)

    # define logger
    wandb_logger = WandbLogger(
        project=f"{config.name}-{config.dataset.name}", log_model=True, offline=config.log_offline
    )
    wandb_logger.watch(model)
    # define model checkpoint callback
    checkpoint_callback = ModelCheckpoint(
        dirpath=wandb_logger.experiment.dir,
        filename="{epoch:02d}-{val_loss:.4f}",
        period=config.save_every_epoch,
        save_top_k=-1,
    )
    upload_checkpoint_callback = UploadCheckpointCallback(wandb_logger.experiment.dir)
    # define early stopping callback
    early_stopping_callback = EarlyStopping(
        patience=config.hyper_parameters.patience, monitor="val_loss", verbose=True, mode="min"
    )
    # define callback for printing intermediate result
    print_epoch_result_callback = PrintEpochResultCallback("train", "val")
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateMonitor("step")
    trainer = Trainer(
        max_epochs=config.hyper_parameters.n_epochs,
        gradient_clip_val=config.hyper_parameters.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.val_every_epoch,
        log_every_n_steps=config.log_every_epoch,
        logger=wandb_logger,
        gpus=gpu,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        callbacks=[
            lr_logger,
            early_stopping_callback,
            checkpoint_callback,
            upload_checkpoint_callback,
            print_epoch_result_callback,
        ],
        resume_from_checkpoint=config.resume_from_checkpoint,
    )

    trainer.fit(model=model, datamodule=data_module)
    trainer.test()
Example #27
    def __init__(self,
                 root,
                 split,
                 vocabulary='./utils/vocabulary.txt',
                 transform=None):
        self.root = root
        self.split = split

        with open(os.path.join(self.root, 'talk2car_w_rpn_no_duplicates.json'),
                  'rb') as f:
            data = json.load(f)[self.split]
            self.data = {int(k): v for k, v in data.items()}  # Map to int
        self.img_dir = os.path.join(self.root, 'images')
        self.transform = transform
        self.vocabulary = Vocabulary(vocabulary)

        if self.split in ['val', 'train']:
            self.add_train_annos = True  # Add extra info when reading out items for training
        else:
            self.add_train_annos = False

        self.ignore_index = 255  # Ignore index when all RPNs < 0.5 IoU
        self.num_rpns_per_image = 8  # We only use the top 8 RPNs per image
        # self.text_encoder = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

        # Filter out rpns we are not going to use
        # RPNS were obtained from center after soft NMS
        # We order the scores, and take the top k.
        assert (self.num_rpns_per_image < 65)
        rpns = {k: sample['centernet'] for k, sample in self.data.items()}
        rpns_score_ordered_idx = {
            k: np.argsort([rpn['score'] for rpn in v])
            for k, v in rpns.items()
        }
        rpns = {
            k: [
                v[idx]
                for idx in rpns_score_ordered_idx[k][-self.num_rpns_per_image:]
            ]
            for k, v in rpns.items()
        }
        for k in self.data.keys():
            self.data[k]['centernet'] = rpns[k]
Example #28
def build_vocabulary(config, max_ann_num=None):
    """ Build the vocabulary from the training data and save it to a file. """
    coco = COCO(config.train_caption_file, config.max_train_ann_num)
    coco.filter_by_cap_len(config.max_caption_length)

    vocabulary = Vocabulary(config.vocabulary_size)
    if not config.max_train_ann_num:
        vocabulary.build(coco.all_captions())
    else:
        vocabulary.build((coco.all_captions())[:config.max_train_ann_num])
    vocabulary.save(config.vocabulary_file)
    return vocabulary
Example #29
def build_vocabulary(config, captions, oracle_file):
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size, config.ctrl_symbols)
    if True:  #not os.path.exists(config.vocabulary_file):
        vocabulary.build(captions)
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    #print("Vocabulary built.")
    print("Number of words = %d" % (vocabulary.size))
    #return vocabulary

    print("NUM CAPTIONS: " + str(len(captions)))
    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        sent_lens = []
        for caption in captions:
            current_word_idxs, current_length = vocabulary.process_sentence(
                caption)
            current_num_words = min(config.max_caption_length - 2,
                                    current_length)

            pad_length = config.max_caption_length - current_length - 2
            current_word_idxs = [config._START_
                                 ] + current_word_idxs[:current_num_words] + [
                                     config._END_
                                 ] + [config._PAD_] * pad_length

            word_idxs.append(current_word_idxs)
            sent_lens.append(current_num_words + 2)
        word_idxs = np.array(word_idxs)
        data = {'word_idxs': word_idxs, 'sentence_len': sent_lens}
        np.save(config.temp_data_file, data)
    else:
        data = np.load(config.temp_data_file).item()
        word_idxs = data['word_idxs']
        sent_lens = data['sentence_len']

    if oracle_file is not None:
        with open(oracle_file, 'w') as outfile:
            for line in word_idxs:
                # Write one caption per line as space-separated word indices.
                paras = ' '.join(str(word) for word in line) + '\n'
                outfile.write(paras)

    return vocabulary
def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size,
                                config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))
    return vocabulary
Example #31
def prepare_test_data(config):
    """ Prepare the data for testing the model. """
    image_files = [config.test_file_name]
    image_ids = list(range(len(image_files)))
    if os.path.exists(config.vocabulary_file):
        vocabulary = Vocabulary(config.vocabulary_size, config.vocabulary_file)
    else:
        vocabulary = build_vocabulary(config)
    dataset = DataSet(image_ids, image_files, config.batch_size)
    return dataset, vocabulary
def prepare_train_data(config):
    """ Prepare the data for training the model. """
    if not os.path.exists(config.prepare_annotation_dir):
        os.mkdir(config.prepare_annotation_dir)
    coco = COCO(config, config.train_caption_file, config.val_caption_file)
    
    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        coco.filter_by_cap_len(config.max_caption_length)
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
        vocabulary.save_counts(config.word_count_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))

    
    print("Processing the captions...")
    if not os.path.exists(config.train_csv_file):
                    
        coco.filter_by_words(set(vocabulary.words))
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [
            os.path.join(
                config.dataset_image_dir,
                'train' if coco.imgs[image_id]['file_name'].find('train2014') >= 0 else 'val',
                coco.imgs[image_id]['file_name'])
            for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.train_csv_file)
    else:
        annotations = pd.read_csv(config.train_csv_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values
def build_vocab():
    vocab = Vocabulary(args)
    with open(processed_cnn_vocab_file, 'r') as vocab_f:
        for line in vocab_f:
            pieces = line.split()
            if len(pieces) != 2:
                print('Warning: incorrectly formatted line in vocabulary file: %s\n' % line)
                continue
            w = pieces[0]
            cnt = int(pieces[1])
            if w in [opt.SENTENCE_START, opt.SENTENCE_END, opt.UNKNOWN_TOKEN, opt.PAD_TOKEN, opt.BOS, opt.EOS]:
                raise Exception(
                    '<s>, </s>, [UNK], [PAD], [START] and [STOP] shouldn\'t be in the vocab file, but %s is' % w)
            if w in vocab._word2id:
                raise Exception('Duplicated word in vocabulary file: %s' % w)
            vocab._word2id[w] = vocab.vocab_size
            vocab._id2word[vocab.vocab_size] = w
            vocab.vocab_size += 1
            vocab._word2cnt[w] = cnt
            if vocab.args.max_vocab_size != 0 and vocab.vocab_size >= vocab.args.max_vocab_size:
                print("max_size of vocab was specified as %i; we now have %i words. Stopping reading." % (vocab.args.max_vocab_size, vocab.vocab_size))
                vocab.save_vocab()
                break
    print("Finished constructing vocabulary of %i total words. Last word added: %s" % (vocab.vocab_size, vocab._id2word[vocab.vocab_size - 1]))
    def __init__(self, use_gpu, gpu_id):
        self.parameter_dict = {}
        train_path = APP_ROOT + "/../../Chainer_Image_Caption_Neural_Network/Code/Data/"
        self.resize_image_path = APP_ROOT + "/../../Chainer_Image_Caption_Neural_Network/Code/"

        self.parameter_dict["id2image"]         = train_path + "index2img_exclude.txt"
        self.parameter_dict["id2caption"]       = train_path + "index2caption.txt"
        self.parameter_dict["target"]           = train_path + "index2caption.txt"
        self.parameter_dict["vocab"]            = 5000
        self.parameter_dict["embed"]            = 300
        self.parameter_dict["hidden"]           = 200
        self.parameter_dict["epoch"]            = 20
        self.parameter_dict["minibatch"]        = 110 
        self.parameter_dict["generation_limit"] = 256
        self.parameter_dict["use_gpu"]          = use_gpu
        self.parameter_dict["gpu_id"]           = gpu_id
        self.parameter_dict["choose_model"] = "Alex_Model"

        if self.parameter_dict["choose_model"] == "Alex_Model":
            self.insize = 224
        if self.parameter_dict["choose_model"] == "AlexBn_Model":
            self.insize = 227

        mean_image = pickle.load(open("mean.npy", 'rb'))

        cropwidth = 256 - self.insize
        self.start = cropwidth // 2
        self.stop = self.start + self.insize
        self.mean_image = mean_image[:, self.start:self.stop, self.start:self.stop].copy()

        self.x_batch = np.ndarray((self.parameter_dict["minibatch"], 3,
                                   self.insize, self.insize), dtype=np.float32)
        self.y_batch = np.ndarray((self.parameter_dict["minibatch"]),
                                  dtype=np.int32)

        self.trg_vocab = Vocabulary.new(gens.word_list(self.parameter_dict["target"]), self.parameter_dict["vocab"])
        self.read_data = Read_Data(self.parameter_dict["id2image"],
                                   "Data/val2014_resize",
                                   self.parameter_dict["id2caption"])
        self.read_data.load_image_list()
        self.read_data.load_caption_list()
    def test(self):
        trace('loading model ...')
        self.trg_vocab = Vocabulary.load("model/" + self.model + '.trgvocab')
        self.batch_size = len(trg_batch)
        encdec = EncoderDecoder.load_spec("model/" + self.model + '.spec')
        serializers.load_hdf5("model/" + self.model + '.weights', encdec)

        trace('generating translation ...')
        generated = 0

        with open(self.target, 'w') as fp:
            self.__forward_img()
            trace('sample %8d ...' % (generated + 1))
            hyp_batch = self.__forward_word(self.trg_batch, encdec, False, self.generation_limit)

            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[:hyp.index('</s>')]
                print('hyp : ' +''.join(hyp))
                print(' '.join(hyp), file=fp)

        trace('finished.')
    def test(self):
        trace('loading model ...')
        trg_vocab = Vocabulary.load(self.model + '.trgvocab')
        self.encdec = EncoderDecoderAttention.load_spec(self.model + '.spec')
        serializers.load_hdf5(self.model + '.weights', self.encdec)

        trace('generating translation ...')
        generated = 0

        trace('sample %8d - %8d ...' % (generated + 1, generated))
        hyp_batch = self.forward(trg_vocab, False, self.generation_limit)

        source_count = 0
        with open(self.target, 'w') as fp:
            for hyp in hyp_batch:
                hyp.append('</s>')
                hyp = hyp[: hyp.index('</s>')]
                print('hyp : ' + ''.join(hyp))
                fp.write(' '.join(hyp))
                source_count = source_count + 1

        trace('finished.')
    def train(self):
        trace('making vocabularies ...')
        trg_vocab = Vocabulary.new(gens.word_list(self.target), self.vocab)

        trace('making model ...')

        for epoch in range(self.epoch):
            trace('epoch %d/%d: ' % (epoch + 1, self.epoch))
            trained = 0
            opt = optimizers.AdaGrad(lr=0.01)
            opt.setup(self.encdec)
            opt.add_hook(optimizer.GradientClipping(5))
            gen1 = gens.word_list(self.target)
            gen = gens.batch(gen1, self.minibatch)

            random_number = random.randint(0, self.minibatch - 1)
            for trg_batch in gen:
                self.trg_batch = fill_batch(trg_batch)
                if len(self.trg_batch) != self.minibatch:
                    break
                hyp_batch, loss = self.forward(trg_vocab, self.use_gpu, self.gpu_id)
                loss.backward()
                opt.update()
                K = len(self.trg_batch)

                if trained == 0:
                    self.print_out(random_number, epoch, trained, hyp_batch)

                trained += K

        trace('saving model ...')
        prefix = self.model
        trg_vocab.save(prefix + '.trgvocab')
        self.encdec.save_spec(prefix + '.spec')
        serializers.save_hdf5(prefix + '.weights', self.encdec)

        trace('finished.')
Example #38
class Dataset(object):

    def __init__(self, args, data_mode='train', single_pass=False, batch_size=None):
        self.args = copy.deepcopy(args)
        if batch_size is not None:
            self.args.batch_size = batch_size
        self.vocab = Vocabulary(self.args)
        self.data_mode = data_mode
        self.single_pass = single_pass

    @property
    def examples(self):
        source_files = sorted(glob.glob(self.args.dataset_file_path + self.data_mode + '_source*.dat'))
        target_files = sorted(glob.glob(self.args.dataset_file_path + self.data_mode + '_target*.dat'))
        extract_id_files = sorted(glob.glob(self.args.dataset_file_path + self.data_mode + '_tag*.dat'))
        assert len(source_files) != 0
        assert len(source_files) == len(target_files)
        assert len(extract_id_files) == len(source_files)
        example_files = list(zip(source_files, target_files, extract_id_files))

        if self.data_mode == 'test':
            example_files = sorted(example_files)
        else:
            random.shuffle(example_files)

        for (source_file, target_file, extract_id_file) in example_files:
            if self.args.original_result:
                source_file = './tmp/cnn_dm_processed/dataset/train_source_108.dat'
                target_file = './tmp/cnn_dm_processed/dataset/train_target_108.dat'
            # source_file = './tmp/cnn_dm_processed/dataset/train_source_108.dat'
            # target_file = './tmp/cnn_dm_processed/dataset/train_target_108.dat'
            # source_file = './tmp/cnn_dm_processed/dataset/train_source_83.dat'
            # target_file = './tmp/cnn_dm_processed/dataset/train_target_83.dat'  # debug for specify file
            # source_file = './tmp/byte_cup_2018/dataset/test_source_59.dat'
            # target_file = './tmp/byte_cup_2018/dataset/test_target_59.dat'
            print(self.data_mode + 'ing:', source_file)
            print(self.data_mode + 'ing:', target_file)
            print(self.data_mode + 'ing:', extract_id_file)

            with open(source_file) as f:
                train_X_list = [sentence.strip() for sentence in f]
            with open(target_file) as f:
                train_y_list = [sentence.strip() for sentence in f]
            with open(extract_id_file) as f:
                train_extract_list = [sentence.strip() for sentence in f]

            if not self.args.original_result:
                train_unity = sorted(zip(train_X_list, train_y_list, train_extract_list), key=lambda x: len(x[0].split()), reverse=True)
                train_X_list, train_y_list, train_extract_list = zip(*train_unity)  # tuple

            for (article, abstract, extract_id_str) in zip(train_X_list, train_y_list, train_extract_list):
                if self.args.target_split:
                    abstract_sentences = [sent.strip() for sent in self.vocab.abstract2sents(abstract)]  # Use the <s> and </s> tags in abstract to get a list of sentences.
                    abstract = ' '.join(abstract_sentences)  # abstract_sentences will be used in beam_search, not clear...
                example = Example(self.args, article, abstract, extract_id_str, self.vocab)  # Process into an Example.
                # if example.X_len < self.args.min_source_length \
                #         or example.X_len > self.args.max_source_length:

                # Originally the cutoff was on sentence length; here it is based on the number of extracted sentences.
                if len(example.extract_ids) < 4 or len(example.extract_ids) > self.args.pos_num:  # doesn't care max_source_length because of sentence selector
                    continue
                yield example

    @property
    def batches(self):
        example_generator = self.examples
        while True:
            try:
                if self.data_mode != 'train' and self.single_pass:  # beam search decode mode single example repeated in the batch
                    ex = next(example_generator)
                    batch = [ex for _ in range(self.args.batch_size)]
                    yield Batch(self.args, batch, self.vocab)
                else:
                    inputs = [next(example_generator) for _ in
                              range(self.args.batch_size * (self.args.bucket_cache_size if self.data_mode == 'train' else 1))]
                    inputs = sorted(inputs, key=lambda inp: inp.X_len, reverse=True)  # sort by length of encoder sequence
                    batches = [inputs[i:i + self.args.batch_size] for i in range(0, len(inputs), self.args.batch_size)]
                    shuffle(batches)
                    for batch in batches:  # each b is a list of Example objects
                        batch = Batch(self.args, batch, self.vocab)
                        if sum(batch.X_doc_lens) == sum([len(ex_extract_ids) for ex_extract_ids in batch.batch_extract_ids]):  # maybe they are different
                            yield batch
                        else:
                            print("my warning: number of batch_extract_ids doesn't match X_doc_lens")
            except StopIteration:
                print('StopIteration, Examples of this epoch is done in dataset batches, data mode', self.data_mode)
                break
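A hedged iteration sketch over the class above; `args` is whatever argparse namespace the surrounding project builds (batch_size, dataset_file_path, bucket_cache_size, ...), so it is only assumed here, as are the project's Example and Batch classes.

# `args`, Example, and Batch come from the surrounding project and are assumed.
train_set = Dataset(args, data_mode='train')
for step, batch in enumerate(train_set.batches):
    # Each `batch` is a Batch built from bucket-sorted Examples.
    print('batch', step, 'source tokens:', sum(batch.X_doc_lens))
    if step >= 2:
        break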