def train(self, graphs):
    """
    Trains a ProbabilisticDependencyGrammar based on the list of input
    DependencyGraphs.  This model is an implementation of Eisner's (1996)
    Model C, which derives its statistics from head-word, head-tag,
    child-word, and child-tag relationships.

    :param graphs: A list of dependency graphs to train from.
    :type graphs: list(DependencyGraph)
    """
    productions = []
    events = defaultdict(int)
    tags = {}
    for dg in graphs:
        for node_index in range(1, len(dg.nodes)):
            # 'deps' maps each relation label to a list of dependent node
            # indices; flatten it so children can be indexed by their
            # left-to-right position (indexing the mapping directly with
            # integer positions was wrong).
            children = list(
                chain.from_iterable(dg.nodes[node_index]['deps'].values())
            )
            nr_left_children = dg.left_children(node_index)
            nr_right_children = dg.right_children(node_index)
            nr_children = nr_left_children + nr_right_children
            # Iterate one position past the children on each side so an
            # explicit STOP event is generated at both boundaries.
            for child_index in range(
                0 - (nr_left_children + 1), nr_right_children + 2
            ):
                head_word = dg.nodes[node_index]['word']
                head_tag = dg.nodes[node_index]['tag']
                # Record every tag observed for this head word.
                if head_word in tags:
                    tags[head_word].add(head_tag)
                else:
                    tags[head_word] = {head_tag}
                # Sentinels: STOP marks the generation boundary, START the
                # absence of a previously generated sibling.  Only the
                # previous sibling's *tag* feeds the events, so no
                # prev_word variable is kept.
                child = 'STOP'
                child_tag = 'STOP'
                prev_tag = 'START'
                if child_index < 0:
                    # Modifiers to the left of the head.
                    array_index = child_index + nr_left_children
                    if array_index >= 0:
                        child = dg.nodes[children[array_index]]['word']
                        child_tag = dg.nodes[children[array_index]]['tag']
                    if child_index != -1:
                        prev_tag = dg.nodes[children[array_index + 1]]['tag']
                    if child != 'STOP':
                        productions.append(DependencyProduction(head_word, [child]))
                    head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (
                        child,
                        child_tag,
                        prev_tag,
                        head_word,
                        head_tag,
                    )
                    # NOTE(review): the trailing '))' looks unbalanced but is
                    # kept byte-identical -- these strings are opaque event
                    # keys that must match their consumers elsewhere.
                    mod_event = '(mods (%s, %s, %s) left))' % (
                        prev_tag,
                        head_word,
                        head_tag,
                    )
                    events[head_event] += 1
                    events[mod_event] += 1
                elif child_index > 0:
                    # Modifiers to the right of the head.
                    array_index = child_index + nr_left_children - 1
                    if array_index < nr_children:
                        child = dg.nodes[children[array_index]]['word']
                        child_tag = dg.nodes[children[array_index]]['tag']
                    if child_index != 1:
                        prev_tag = dg.nodes[children[array_index - 1]]['tag']
                    if child != 'STOP':
                        productions.append(DependencyProduction(head_word, [child]))
                    head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (
                        child,
                        child_tag,
                        prev_tag,
                        head_word,
                        head_tag,
                    )
                    mod_event = '(mods (%s, %s, %s) right))' % (
                        prev_tag,
                        head_word,
                        head_tag,
                    )
                    events[head_event] += 1
                    events[mod_event] += 1
    self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)
def train(self, graphs):
    """
    Trains a ProbabilisticDependencyGrammar based on the list of input
    DependencyGraphs.  This model is an implementation of Eisner's (1996)
    Model C, which derives its statistics from head-word, head-tag,
    child-word, and child-tag relationships.

    :param graphs: A list of dependency graphs to train from.
    :type graphs: list(DependencyGraph)
    """
    productions = []
    events = defaultdict(int)
    tags = {}
    for dg in graphs:
        for node_index in range(1, len(dg.nodes)):
            # 'deps' maps each relation label to a list of dependent node
            # indices; flatten it so children can be indexed by their
            # left-to-right position.
            children = list(
                chain.from_iterable(dg.nodes[node_index]["deps"].values())
            )
            nr_left_children = dg.left_children(node_index)
            nr_right_children = dg.right_children(node_index)
            nr_children = nr_left_children + nr_right_children
            # Iterate one position past the children on each side so an
            # explicit STOP event is generated at both boundaries.
            for child_index in range(
                0 - (nr_left_children + 1), nr_right_children + 2
            ):
                head_word = dg.nodes[node_index]["word"]
                head_tag = dg.nodes[node_index]["tag"]
                # Record every tag observed for this head word.
                if head_word in tags:
                    tags[head_word].add(head_tag)
                else:
                    tags[head_word] = {head_tag}
                # Sentinels: STOP marks the generation boundary, START the
                # absence of a previously generated sibling.  Only the
                # previous sibling's *tag* feeds the events, so no
                # prev_word variable is kept.
                child = "STOP"
                child_tag = "STOP"
                prev_tag = "START"
                if child_index < 0:
                    # Modifiers to the left of the head.
                    array_index = child_index + nr_left_children
                    if array_index >= 0:
                        child = dg.nodes[children[array_index]]["word"]
                        child_tag = dg.nodes[children[array_index]]["tag"]
                    if child_index != -1:
                        prev_tag = dg.nodes[children[array_index + 1]]["tag"]
                    if child != "STOP":
                        productions.append(DependencyProduction(head_word, [child]))
                    head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format(
                        child,
                        child_tag,
                        prev_tag,
                        head_word,
                        head_tag,
                    )
                    # NOTE(review): the trailing '))' looks unbalanced but is
                    # kept byte-identical -- these strings are opaque event
                    # keys that must match their consumers elsewhere.
                    mod_event = "(mods ({}, {}, {}) left))".format(
                        prev_tag,
                        head_word,
                        head_tag,
                    )
                    events[head_event] += 1
                    events[mod_event] += 1
                elif child_index > 0:
                    # Modifiers to the right of the head.
                    array_index = child_index + nr_left_children - 1
                    if array_index < nr_children:
                        child = dg.nodes[children[array_index]]["word"]
                        child_tag = dg.nodes[children[array_index]]["tag"]
                    if child_index != 1:
                        prev_tag = dg.nodes[children[array_index - 1]]["tag"]
                    if child != "STOP":
                        productions.append(DependencyProduction(head_word, [child]))
                    head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format(
                        child,
                        child_tag,
                        prev_tag,
                        head_word,
                        head_tag,
                    )
                    mod_event = "(mods ({}, {}, {}) right))".format(
                        prev_tag,
                        head_word,
                        head_tag,
                    )
                    events[head_event] += 1
                    events[mod_event] += 1
    self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)