def test_contains_verb_noun_pair_5(self):
    """Check the flags returned for "a man is sitting on a chair.".

    With nouns {"man"} and verbs {"sit"}, all three values returned by
    contains_verb_noun_pair are expected to be True.
    """
    # Only the first sentence of the pipeline output is inspected.
    sentence = self.nlp_pipeline("a man is sitting on a chair.").sentences[0]
    result = contains_verb_noun_pair(sentence, {"man"}, {"sit"})
    self.assertEqual((True, True, True), result)
def count_verb_noun_pairs(nouns_file, verbs_file, preprocessed_data_folder):
    """Count noun, verb and verb-noun pair occurrences per image.

    Reads the nouns and verbs from two JSON files and the pickled POS-tagged
    captions from ``preprocessed_data_folder``. For every image the captions
    are checked with ``contains_verb_noun_pair`` and three counters (noun,
    verb, pair) are updated. Each caption containing the pair is printed.

    The collected data is written as JSON to ``"<first verb>_<first noun>.json"``
    (a relative path, so it lands in the current working directory), and a
    summary is printed for thresholds 1 through 5.

    Args:
        nouns_file: Path to a JSON file holding the nouns (indexed with [0]).
        verbs_file: Path to a JSON file holding the verbs (indexed with [0]).
        preprocessed_data_folder: Folder containing the pickled captions file
            named by POS_TAGGED_CAPTIONS_FILENAME.

    Returns:
        None. Results are printed and written to disk.
    """
    with open(nouns_file, "r") as nouns_handle:
        nouns = json.load(nouns_handle)
    with open(verbs_file, "r") as verbs_handle:
        verbs = json.load(verbs_handle)

    captions_path = os.path.join(
        preprocessed_data_folder, POS_TAGGED_CAPTIONS_FILENAME
    )
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(captions_path, "rb") as captions_handle:
        captions = pickle.load(captions_handle)

    # The first noun and verb are only used to name the output file.
    first_noun = nouns[0]
    first_verb = verbs[0]

    print("Looking for pairs: {} - {}".format(verbs, nouns))

    per_image = {}
    for coco_id, image_entry in tqdm(captions.items()):
        counts = {
            PAIR_OCCURENCES: 0,
            VERB_OCCURRENCES: 0,
            NOUN_OCCURRENCES: 0,
            DATA_COCO_SPLIT: image_entry[DATA_COCO_SPLIT],
        }
        per_image[coco_id] = counts

        for sentence in image_entry["pos_tagged_captions"]:
            has_noun, has_verb, has_pair = contains_verb_noun_pair(
                sentence, nouns, verbs
            )
            if has_pair:
                # Echo every caption in which the pair was found.
                print(" ".join(token.text for token in sentence.tokens))
                counts[PAIR_OCCURENCES] += 1
            if has_verb:
                counts[VERB_OCCURRENCES] += 1
            if has_noun:
                counts[NOUN_OCCURRENCES] += 1

    results = {NOUNS: nouns, VERBS: verbs, OCCURRENCE_DATA: per_image}

    output_path = "{}_{}.json".format(first_verb, first_noun)
    print("\nSaving results to {}".format(output_path))
    with open(output_path, "w") as output_handle:
        json.dump(results, output_handle)

    def images_with_at_least(key, threshold):
        # Number of images whose counter `key` reaches `threshold`.
        return sum(
            1 for image_counts in per_image.values()
            if image_counts[key] >= threshold
        )

    for n in range(1, 6):
        noun_images = images_with_at_least(NOUN_OCCURRENCES, n)
        verb_images = images_with_at_least(VERB_OCCURRENCES, n)
        pair_images = images_with_at_least(PAIR_OCCURENCES, n)

        print(
            "\nFound {}\timages where the noun occurs at least {} time(s).".format(
                noun_images, n
            )
        )
        print(
            "Found {}\timages where the verb occurs at least {} time(s).".format(
                verb_images, n
            )
        )
        print(
            "Found {}\timages where the pair occurs at least {} time(s).".format(
                pair_images, n
            )
        )