def test_1(self):
    filename = "ANC__110CYL067"
    fnparsed_reader = FNParsedReader()
    parsed_conll_file = options.framenet_parsed / (filename + ".conll")
    reader = framenetreader.FulltextReader(
        options.fulltext_corpus / (filename + ".xml"), False)

    frame = reader.frames[1]
    sentence_id, sentence_text, tree = list(
        fnparsed_reader.sentence_trees(parsed_conll_file))[frame.sentence_id]
    self.assertEqual(
        headwordextractor.headword(frame.args[0], tree),
        {'top_headword': ('PRP', 'you'), 'content_headword': ('PRP', 'you')})

    frame = reader.frames[25]
    sentence_id, sentence_text, tree = list(
        fnparsed_reader.sentence_trees(parsed_conll_file))[frame.sentence_id]
    self.assertEqual(
        headwordextractor.headword(frame.args[0], tree),
        {'top_headword': ('NNS', 'people'), 'content_headword': ('NNS', 'people')})

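# The "materialize all sentence trees, then index by frame.sentence_id"
# pattern above repeats in every test. A minimal refactoring sketch; the
# helper name _tree_for_frame is hypothetical and not part of the module,
# only FNParsedReader.sentence_trees and frame.sentence_id come from the
# tests above:
def _tree_for_frame(fnparsed_reader, parsed_conll_file, frame):
    """Return the syntactic tree of the sentence a frame belongs to."""
    sentence_id, sentence_text, tree = list(
        fnparsed_reader.sentence_trees(parsed_conll_file))[frame.sentence_id]
    return tree
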
def add_syntactic_information(self, frame, sentence_tree):
    """
    Add information to a frame using the syntactic annotation: whether it is
    passive or not, and the headword of each of its arguments.

    In some cases (five for the training set), our parser produces multiple
    roots, which means the resulting tree covers only one part of the
    sentence. In those cases the predicate cannot be found in the tree, a
    PredicateNotFound exception is raised and the frame is not handled by our
    labeler.

    :param frame: The frame
    :type frame: FrameInstance
    :param sentence_tree: The syntactic tree of the frame's sentence
    """
    # Search the predicate's verb in the tree and record its passive status
    try:
        search = frame.predicate.text.split()[0].lower()
        predicate_node = [node for node in frame.tree
                          if node.word == search][0]
        frame.passive = FNAllReader.is_passive(predicate_node)
    except IndexError:
        raise PredicateNotFound(
            "\nframenetparsedreader : predicate \"{}\" not found in "
            "sentence {}".format(search, frame.tree.flat()))

    # Read headwords of every instanciated argument
    for i, arg in enumerate(frame.args):
        if not arg.instanciated:
            continue
        frame.headwords[i] = headwordextractor.headword(arg, sentence_tree)

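# How a caller might cope with the multiple-root sentences mentioned in the
# docstring: a minimal sketch of a hypothetical driver loop. Only
# add_syntactic_information and PredicateNotFound come from the code above;
# annotate_frames, frames and trees_by_sentence are illustrative placeholders.
def annotate_frames(self, frames, trees_by_sentence):
    annotated = []
    for frame in frames:
        try:
            self.add_syntactic_information(
                frame, trees_by_sentence[frame.sentence_id])
        except PredicateNotFound:
            # The parse tree does not cover the predicate (multiple-root
            # sentence): skip the frame, the labeler will not handle it.
            continue
        annotated.append(frame)
    return annotated
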
def test_classes(self):
    filename = "ANC__110CYL067"
    fnparsed_reader = FNParsedReader()
    parsed_conll_file = options.framenet_parsed / (filename + ".conll")
    reader = framenetreader.FulltextReader(
        options.fulltext_annotations[0], False)

    for frame in reader.frames:
        sentence_id, sentence_text, tree = list(
            fnparsed_reader.sentence_trees(parsed_conll_file))[frame.sentence_id]
        for arg in frame.args:
            headwordextractor.headword(arg, tree)

    self.assertEqual(headwordextractor.get_class("soda"), "physical_entity.n.01")
    # self.assertEqual(headwordextractor.get_class("i"), "pronoun")

    # get_class should return None for words out of WordNet
    self.assertEqual(headwordextractor.get_class("abcde"), None)

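# The "physical_entity.n.01" value asserted above is a WordNet synset name.
# A minimal sketch of how such a class could be derived with NLTK's WordNet
# interface; this is an illustration only, not the actual implementation of
# headwordextractor.get_class, and get_class_sketch is a hypothetical name:
from nltk.corpus import wordnet as wn

def get_class_sketch(word):
    synsets = wn.synsets(word, pos=wn.NOUN)
    if not synsets:
        # Words that are not in WordNet get no class, as the test expects
        return None
    # Walk the first hypernym path (root first) and keep the synset right
    # below the WordNet root "entity.n.01", e.g. "physical_entity.n.01"
    path = synsets[0].hypernym_paths()[0]
    return path[1].name() if len(path) > 1 else path[0].name()
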
def _sentence_predicates_iterator(self, sentence_id, sentence, tree, filename):
    """ Extract frames from one sentence and iterate over them """
    logger.debug("_sentence_predicates_iterator %s" % sentence_id)
    for node in tree:
        # For every verb, look for its infinitive form in VerbNet and
        # build a frame occurrence if it is found
        logger.debug("_sentence_predicates_iterator on %s" % node.lemma)
        if node.lemma not in self.frames_for_verb:
            # logger.debug("_sentence_predicates_iterator node.lemma {} not in frames_for_verb".format(node.lemma))
            continue

        if self._is_predicate(node):
            logger.debug(
                "_sentence_predicates_iterator node.lemma {} is a predicate"
                .format(node.lemma))
            predicate = Predicate(
                node.begin_word,
                node.begin_word + len(node.word) - 1,
                node.word, node.lemma, node.word_id)

            if options.Options.heuristic_rules:
                args = [self._nodeToArg(x, node) for x in find_args(node)]
            else:
                args = self._find_args(node)
            args = [x for x in args
                    if self._is_good_phrase_type(x.phrase_type)]

            # Read headwords of every instanciated argument
            headwords = [None] * len(args)
            for i, arg in enumerate(args):
                if not arg.instanciated:
                    continue
                headwords[i] = headwordextractor.headword(arg, tree)

            logger.debug(
                '_sentence_predicates_iterator yielding {} {}…'.format(
                    predicate, args))
            yield FrameInstance(
                sentence=sentence,
                predicate=predicate,
                args=args,
                words=[Word(x.begin, x.end, x.pos) for x in tree],
                frame_name="",
                sentence_id=sentence_id,
                filename=filename,
                tree=tree,
                headwords=headwords)

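# A sketch of how the iterator above might be consumed; the wrapper name and
# the extractor/conll_sentences variables are hypothetical, only the
# FrameInstance attributes come from the code above.
def print_detected_headwords(extractor, conll_sentences, filename):
    for sentence_id, sentence, tree in conll_sentences:
        for frame_instance in extractor._sentence_predicates_iterator(
                sentence_id, sentence, tree, filename):
            # headwords is aligned with args: headwords[i] describes args[i],
            # and stays None for non-instanciated arguments
            for headword in frame_instance.headwords:
                print(headword)
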
def bootstrap_algorithm(vn_frames, probability_model, verbnet_classes):
    # See Swier and Stevenson, Unsupervised Semantic Role Labelling, 2004,
    # section 5.4 for information about the parameters' values
    log_ratio = 8
    log_ratio_step = 0.5
    min_evidence = [1, 1, 10]
    # [1, 3, 10] -> [17, 65, 2076]
    # [3, 5, 10] -> [17, 65, 2076]
    total = [0, 0, 0]

    while log_ratio >= 1:
        # Update the probability model with resolved slots (only one role)
        for frame_occurrence in vn_frames:
            for slot_position, role_set in enumerate(frame_occurrence.roles):
                if len(role_set) != 1:
                    continue
                headword = headwordextractor.headword(
                    frame_occurrence.args[slot_position],
                    frame_occurrence.tree)['content_headword'][1]
                probability_model.add_data_bootstrap(
                    next(iter(role_set)),
                    frame_occurrence.predicate,
                    verbnet_classes[frame_occurrence.predicate],
                    frame_occurrence.slot_types[slot_position],
                    frame_occurrence.slot_preps[slot_position],
                    headword,
                    headwordextractor.get_class(headword))

        # According to the article, there is no longer a minimum evidence
        # threshold when log_ratio reaches 1
        if log_ratio == 1:
            min_evidence = [1, 1, 1]

        # Try to resolve the remaining ambiguous slots
        for frame_occurrence in vn_frames:
            for slot_position in range(frame_occurrence.num_slots):
                role_set = frame_occurrence.roles[slot_position]
                if len(role_set) <= 1:
                    continue

                headword = headwordextractor.headword(
                    frame_occurrence.args[slot_position],
                    frame_occurrence.tree)['content_headword'][1]

                role = None
                for backoff_level in [0, 1, 2]:
                    role1, role2, ratio = probability_model.best_roles_bootstrap(
                        role_set,
                        frame_occurrence.predicate,
                        # Choosing the first class here is arbitrary
                        verbnet_classes[frame_occurrence.predicate],
                        frame_occurrence.slot_types[slot_position],
                        frame_occurrence.slot_preps[slot_position],
                        headword,
                        headwordextractor.get_class(headword),
                        backoff_level, min_evidence[backoff_level])

                    if (role1 is not None and
                            ((role2 is not None and log(ratio) > log_ratio) or
                             log_ratio <= 1)):
                        role = role1
                        total[backoff_level] += 1
                        break

                if role is not None:
                    frame_occurrence.restrict_slot_to_role(slot_position, role)

            frame_occurrence.select_likeliest_matches()

        log_ratio -= log_ratio_step

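# A minimal driver sketch for the bootstrap pass above: run it, then count
# how many slots ended up with exactly one candidate role. run_bootstrap is a
# hypothetical name; the counting loop only relies on frame_occurrence.roles
# being a list of role sets, as used above.
def run_bootstrap(vn_frames, probability_model, verbnet_classes):
    bootstrap_algorithm(vn_frames, probability_model, verbnet_classes)
    resolved = sum(
        1
        for frame_occurrence in vn_frames
        for role_set in frame_occurrence.roles
        if len(role_set) == 1)
    print("{} slots resolved to a single role".format(resolved))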