Example #1
    def get_tokens(self, sentences):
        if not isinstance(sentences, list):
            sentences = [sentences]
        final_sentences = []
        for sentence in sentences:
            tokens = tokenize(sentence)
            final_sentences.append(tokens)

        return final_sentences
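The pattern above accepts either a single sentence or a list of sentences and always returns a list of token lists. A minimal standalone sketch of the same idea, using a lowercase/whitespace split as a stand-in for the library's tokenize (the real tokenizer also handles punctuation):

def get_tokens(sentences):
    # Wrap a bare string so the loop below always sees a list.
    if not isinstance(sentences, list):
        sentences = [sentences]
    # Stand-in tokenizer: lowercase and split on whitespace.
    return [sentence.lower().split() for sentence in sentences]

print(get_tokens("What color is the ball?"))
# [['what', 'color', 'is', 'the', 'ball?']]
print(get_tokens(["First sentence.", "Second sentence."]))
# [['first', 'sentence.'], ['second', 'sentence.']]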
Example #2
def get_imdb(ann_path: str, quest_path: str, split: str,
             answer_vocab_path: str) -> np.ndarray:

    imdb = [{"dataset_name": "okvqa"}]

    with PathManager.open(answer_vocab_path, "r") as f:
        answer_vocab = set(f.read().splitlines())

    with PathManager.open(ann_path, "r") as f:
        annotations = json.load(f)["annotations"]

    with PathManager.open(quest_path, "r") as f:
        questions = json.load(f)["questions"]

    gt_answers = {}
    for ann in annotations:
        gt_answers[ann["question_id"]] = ann["answers"]

    count = 0
    for quest in tqdm(questions):
        image_name = f"COCO_{split}_{quest['image_id']:012d}"
        q_id = quest["question_id"]
        all_answers = [item['answer'] for item in gt_answers[q_id]]
        answers = [ans for ans in all_answers if ans in answer_vocab]

        if len(answers) == 0:
            answers = ["<unk>"]
            count += 1

        entry = {
            "image_name": image_name,
            "image_id": quest["image_id"],
            "feature_path": f"{image_name}.npy",
            "question_id": q_id,
            "question_str": quest["question"],
            "question_tokens": tokenize(quest["question"]),
            "answers": answers,
            "all_answers": all_answers,
        }

        imdb.append(entry)
    print("Unknown questions:", count)

    return np.array(imdb)
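get_imdb returns a NumPy object array whose first element is a metadata dict; reloading such an array later requires allow_pickle=True. A minimal sketch, with a toy imdb built the same way and an illustrative file name:

import numpy as np

imdb = np.array([{"dataset_name": "okvqa"},
                 {"question_id": 1, "answers": ["<unk>"]}])

np.save("imdb_okvqa_val.npy", imdb)  # hypothetical output path
loaded = np.load("imdb_okvqa_val.npy", allow_pickle=True)
print(loaded[0])  # {'dataset_name': 'okvqa'}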
Example #3
    def extract(self):
        os.makedirs(self.out_dir, exist_ok=True)

        word_count = Counter()

        texts = self.get_text()
        text_lengths = [None] * len(texts)

        for inx, text in enumerate(texts):
            words = tokenize(text)
            text_lengths[inx] = len(words)
            word_count.update(words)

        # The UNK token will be added on the fly if you use the Vocab class in core/text.
        vocabulary = [word for word, count in word_count.items() if count >= self.min_freq]
        vocabulary.sort()

        self.save_vocabulary(vocabulary)

        print("min text len=", min(text_lengths))
        print("max text len=", max(text_lengths))
Example #4
    def __getitem__(self, idx):
        data = self.questions[idx]

        # Each call to __getitem__ from the dataloader returns a Sample object,
        # which our batch collator collates into a SampleList: in layman's terms,
        # an attribute-based batch.
        current_sample = Sample()

        question = data["question"]
        tokens = tokenize(question, keep=[";", ","], remove=["?", "."])
        processed = self.text_processor({"tokens": tokens})
        current_sample.text = processed["text"]

        processed = self.answer_processor({"answers": [data["answer"]]})
        current_sample.answers = processed["answers"]
        current_sample.targets = processed["answers_scores"]

        image_path = os.path.join(self.image_path, data["image_filename"])
        image = np.true_divide(Image.open(image_path).convert("RGB"), 255)
        image = image.astype(np.float32)
        current_sample.image = torch.from_numpy(image.transpose(2, 0, 1))

        return current_sample
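The image handling in this __getitem__ is a common preprocessing pattern: scale pixel values to [0, 1], move channels first, and convert to a tensor. A standalone sketch of just that step (the image path is illustrative):

import numpy as np
import torch
from PIL import Image

image = np.true_divide(Image.open("example.jpg").convert("RGB"), 255)  # HWC, float64 in [0, 1]
image = image.astype(np.float32)
tensor = torch.from_numpy(image.transpose(2, 0, 1))  # CHW layout, as the model expects
print(tensor.shape)  # torch.Size([3, H, W])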
Example #5
    def test_tokenize(self):
        tokens = text_utils.tokenize(self.TOKENIZE_EXAMPLE)

        self.assertEqual(list(tokens), self.TOKENS)
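The constants TOKENIZE_EXAMPLE and TOKENS are defined on the test class and are not shown here. A hypothetical, self-contained equivalent with unittest, using a lowercase/whitespace tokenizer as a stand-in for text_utils.tokenize (the input string and expected tokens are illustrative, not the actual constants):

import unittest


def tokenize(sentence):
    # Stand-in for text_utils.tokenize: lowercase and split on whitespace.
    return sentence.lower().split()


class TestTextUtils(unittest.TestCase):
    TOKENIZE_EXAMPLE = "The quick brown fox"
    TOKENS = ["the", "quick", "brown", "fox"]

    def test_tokenize(self):
        tokens = tokenize(self.TOKENIZE_EXAMPLE)
        self.assertEqual(list(tokens), self.TOKENS)


if __name__ == "__main__":
    unittest.main()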