コード例 #1
0
    def test_contains_verb_noun_pair_5(self):
        """Check a caption that contains both the target noun and the target verb.

        The caption uses the form "sitting" while the verb set holds "sit";
        the check is expected to report all three flags as True.
        """
        # Only the first sentence of the pipeline output is inspected.
        tagged_sentence = self.nlp_pipeline("a man is sitting on a chair.").sentences[0]

        result = contains_verb_noun_pair(tagged_sentence, {"man"}, {"sit"})

        self.assertEqual((True, True, True), result)
コード例 #2
0
def count_verb_noun_pairs(nouns_file, verbs_file, preprocessed_data_folder):
    """Count, per image, the captions containing the nouns, the verbs, and both.

    Loads the nouns and verbs from JSON, loads the POS-tagged captions pickle
    from ``preprocessed_data_folder`` and runs ``contains_verb_noun_pair`` on
    every caption of every image. The per-image counters (plus the nouns and
    verbs themselves) are written to ``<first_verb>_<first_noun>.json`` in the
    current working directory, and a summary for thresholds 1 through 5 is
    printed.

    Args:
        nouns_file: Path to a JSON file with the nouns; its first entry is
            used in the output file name.
        verbs_file: Path to a JSON file with the verbs; its first entry is
            used in the output file name.
        preprocessed_data_folder: Folder containing the
            ``POS_TAGGED_CAPTIONS_FILENAME`` pickle.

    Returns:
        None. Results are written to disk and printed.

    Raises:
        IndexError: If the loaded noun or verb list is empty.
        OSError: If one of the input files or the output file cannot be opened.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding when reading it.
    with open(nouns_file, "r", encoding="utf-8") as json_file:
        nouns = json.load(json_file)

    with open(verbs_file, "r", encoding="utf-8") as json_file:
        verbs = json.load(json_file)

    # Resolve the output name up front so that empty input fails before the
    # (potentially large) captions pickle is loaded.
    first_noun = nouns[0]
    first_verb = verbs[0]
    data_path = "{}_{}.json".format(first_verb, first_noun)

    # NOTE(security): pickle.load can execute arbitrary code. Only point this
    # function at a preprocessed folder you produced yourself or fully trust.
    captions_path = os.path.join(
        preprocessed_data_folder, POS_TAGGED_CAPTIONS_FILENAME
    )
    with open(captions_path, "rb") as pickle_file:
        captions = pickle.load(pickle_file)

    print("Looking for pairs: {} - {}".format(verbs, nouns))

    occurrence_data = {}
    for coco_id, tagged_caption in tqdm(captions.items()):
        counts = {
            PAIR_OCCURENCES: 0,
            VERB_OCCURRENCES: 0,
            NOUN_OCCURRENCES: 0,
            DATA_COCO_SPLIT: tagged_caption[DATA_COCO_SPLIT],
        }
        occurrence_data[coco_id] = counts

        for caption in tagged_caption["pos_tagged_captions"]:
            (
                noun_is_present,
                verb_is_present,
                combination_is_present,
            ) = contains_verb_noun_pair(caption, nouns, verbs)

            if combination_is_present:
                # Echo every matching caption so matches can be eyeballed.
                print(" ".join([token.text for token in caption.tokens]))
                counts[PAIR_OCCURENCES] += 1
            if verb_is_present:
                counts[VERB_OCCURRENCES] += 1
            if noun_is_present:
                counts[NOUN_OCCURRENCES] += 1

    data = {NOUNS: nouns, VERBS: verbs, OCCURRENCE_DATA: occurrence_data}

    print("\nSaving results to {}".format(data_path))
    with open(data_path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file)

    # Summary: number of images with at least n matching captions, n = 1..5.
    for n in range(1, 6):
        noun_occurences = sum(
            1 for d in occurrence_data.values() if d[NOUN_OCCURRENCES] >= n
        )
        verb_occurences = sum(
            1 for d in occurrence_data.values() if d[VERB_OCCURRENCES] >= n
        )
        pair_occurences = sum(
            1 for d in occurrence_data.values() if d[PAIR_OCCURENCES] >= n
        )

        print(
            "\nFound {}\timages where the noun occurs at least {} time(s).".format(
                noun_occurences, n
            )
        )
        print(
            "Found {}\timages where the verb occurs at least {} time(s).".format(
                verb_occurences, n
            )
        )
        print(
            "Found {}\timages where the pair occurs at least {} time(s).".format(
                pair_occurences, n
            )
        )