def __init__(self, rule=None, ants=None, features=None):
    """Create a deduction and attach its antecedents.

    rule:     the rule from which this deduction is generated
    ants:     antecedent items; each one is attached with add_tail()
    features: the Feature object used to score this deduction
    """
    Edge.__init__(self)
    antecedents = () if ants is None else ants
    for antecedent in antecedents:
        self.add_tail(antecedent)
    self.rule, self.features = rule, features
    # Feature values: stateful features first, then stateless features.
    self.fcosts = []
    self.cost = 0
def deserialize(self, line):
    """Populate this deduction from one serialized line.

    The text before '|||||' is handed to Edge.deserialize() and the text
    after it to Rule.fromstr().  `self.ngrams` is reset to empty tables.
    Returns the (tail_ids, head_id) pair produced by Edge.deserialize().
    """
    deduction_part, rule_part = line.split('|||||')

    # The whole deduction field is used as the edge string.
    # NOTE(review): the serialize() variant that writes ngrams emits
    # 'edge ||| ngrams ||| fcosts' in front of '|||||'; that entire text
    # would reach Edge.deserialize() here -- confirm round-tripping works.
    tail_ids, head_id = Edge.deserialize(self, deduction_part)

    # One (initially empty) ngram table per ngram order.
    max_n = 3  # TODO: remove magic number
    self.ngrams = [{} for _ in range(max_n)]

    # Parsing of the ngram and feature-cost fields is currently disabled:
    #   edge_str, ngram_str, feature_str = deduction_str.split('|||')
    #   wordlist = []
    #   for token in ngram_str.split():
    #       if len(token) >= 3 and \
    #          token.startswith('$') and \
    #          token.endswith('$'):
    #           # end of ngram
    #           count = int(token[1:-1])
    #           ngram = tuple(wordlist)
    #           wordlist = []
    #           self.ngrams[len(ngram)-1][ngram] = count
    #       else:
    #           # add another word to current ngram
    #           wordlist.append(token)
    #   self.fcosts = []
    #   for fcost in feature_str.split():
    #       self.fcosts.append(float(fcost))

    # Load the rule.
    rule = self.rule = Rule()
    rule.fromstr(rule_part)
    return tail_ids, head_id
def serialize(self):
    """Extend hypergraph.Edge.serialize() with ngram, feature and rule data.

    Without an `ngrams` attribute the result is
        '<edge> ||||| <rule>'
    otherwise it is
        '<edge> ||| <ngrams> ||| <feature costs> ||||| <rule>'
    where every ngram is written as its words followed by '$<count>$ '.
    """
    edge_str = Edge.serialize(self)
    if not hasattr(self, 'ngrams'):
        return '%s ||||| %s' % (edge_str, self.rule)

    # Collect the pieces and join once instead of growing a string with +=.
    ngram_pieces = []
    for table in self.ngrams:
        for ngram, count in table.items():
            ngram_pieces.append('%s $%s$ ' % (' '.join(ngram), count))
    ngram_str = ''.join(ngram_pieces)

    feature_str = ' '.join(str(fcost) for fcost in self.fcosts)
    deduction_str = ' ||| '.join([edge_str, ngram_str, feature_str])
    return ' ||||| '.join([deduction_str, str(self.rule)])
def make_path(self, subpaths):
    """Extend Edge.make_path() with translation and feature information.

    The returned Path carries:
      1) weight (set by the base class),
      2) the composed rule and its translation hypothesis,
      3) the list of accumulated feature values.
    """
    path = Edge.make_path(self, subpaths)

    if FLAGS.preprocess:
        self.rule.align_special_symbols()

    sub_rules = [sub.composed_rule for sub in subpaths]
    path.composed_rule = self.rule.compose(sub_rules)
    path.translation = path.composed_rule.e

    # Start from a copy of this edge's costs, then add each subpath's costs
    # position by position.
    totals = path.fcosts = list(self.fcosts)
    # NOTE(review): composed_rule is read from every subpath above, yet None
    # subpaths are skipped here -- confirm whether None entries can occur.
    for sub in subpaths:
        if sub is None:
            continue
        for idx, value in enumerate(sub.fcosts):
            totals[idx] += value
    return path
# Constructor: performs only the base-class setup through Edge.__init__ and
# sets no attributes of its own.
def __init__(self): Edge.__init__(self)
def deserialize(self, line):
    """Rebuild this edge and its rule from one serialized line.

    line: text expected to look like '<edge> ||||| <rule>'.

    The part before '|||||' is passed to Edge.deserialize() and the part
    after it to Rule.fromstr().  Returns the (tail_ids, head_id) pair from
    Edge.deserialize().  Raises ValueError when `line` does not split into
    exactly two parts on '|||||'.
    """
    # `line` used to be read without being a parameter, which raised
    # NameError; it is now an explicit argument.
    edge_str, rule_str = line.split('|||||')
    tail_ids, head_id = Edge.deserialize(self, edge_str)
    self.rule = Rule()
    self.rule.fromstr(rule_str)
    return tail_ids, head_id
def serialize(self):
    """Return '<edge> ||||| <rule>' for this edge.

    The edge part comes from Edge.serialize() and the rule part from
    str(self.rule).
    """
    fields = (Edge.serialize(self), str(self.rule))
    return ' ||||| '.join(fields)
def __init__(self, rule=None):
    """Run the base Edge setup and remember the given rule.

    rule: stored as-is on `self.rule`; defaults to None.
    """
    Edge.__init__(self)
    self.rule = rule
def add_experimental_virtual_edges(target_tree, source_tree, s2t_node_alignments,
                                   t2s_node_alignments, target_terminals):
    """Add composed edges to source_tree and virtual edges to target_tree.

    For every source edge, each tail is replaced by its minimal derivations
    (the tail itself when it is a terminal or has exactly one aligned target
    node, otherwise combinations of such nodes found below it).  Each
    resulting tail sequence that differs from the original edge is added to
    source_tree as a composed edge.  For each target node aligned to the
    edge's head, and each choice of aligned target nodes for the
    non-terminal source tails, an edge is added to target_tree; positions of
    the target head's span not covered by those tails are filled with
    entries of target_terminals.

    target_tree:         receives the virtual edges through add().
    source_tree:         provides topsort(), head_index and edges; receives
                         the composed edges through add().
    s2t_node_alignments: maps a source node to the collection of target
                         nodes aligned to it.
    t2s_node_alignments: not used by this function; kept in the signature.
    target_terminals:    indexed by target position to obtain a terminal.

    Returns None.
    """
    def project(source_node):
        # A node projects only when it has exactly one aligned target node.
        # TODO: Could unaligned words invalidate the "at most one" assumption?
        alignments = s2t_node_alignments[source_node]
        return list(alignments)[0] if len(alignments) == 1 else None

    # derivations[source_node] holds the minimal way(s) of representing
    # source_node using minimal constituents, as (nodes, skipped_edges) pairs.
    # For terminals and well-aligned NTs there is only one such way: the node
    # itself.  For NTs that are not node aligned, we collect sets of minimally
    # aligned children that cover source_node.
    derivations = {}
    for source_node in source_tree.topsort():
        options = derivations[source_node] = []
        if source_node.is_terminal_flag or project(source_node) is not None:
            options.append(((source_node,), []))
            continue
        # Tails are looked up in `derivations`, so they must already have been
        # visited -- presumably topsort() yields tails before their heads.
        for edge in source_tree.head_index[source_node]:
            for subset in enumerate_subsets([derivations[tail] for tail in edge.tails]):
                derivation = reduce(operator.add, [nodes for nodes, _ in subset])
                skipped_edges = reduce(operator.add, [edges for _, edges in subset])
                for node in derivation:
                    assert len(s2t_node_alignments[node]) >= 1 or node.is_terminal_flag
                options.append((derivation, [edge] + skipped_edges))

    # Iterate over a copy: composed edges are added to source_tree in the loop.
    for edge in source_tree.edges.copy():
        source_head = edge.head
        for target_head in s2t_node_alignments[source_head]:
            for source_subset in enumerate_subsets([derivations[tail] for tail in edge.tails]):
                source_tails = reduce(operator.add, [nodes for nodes, _ in source_subset])
                composed_edge = Edge(source_head, source_tails)
                skipped_edges = reduce(operator.add, [edges for _, edges in source_subset])
                if skipped_edges:
                    composed_edge.composed_edges = tuple([edge] + skipped_edges)
                    composed_edge.is_composed = True
                    assert len(edge.composed_edges) == 0
                if composed_edge != edge:
                    assert len(skipped_edges) > 0
                    source_tree.add(composed_edge)

                aligned_tails = [list(s2t_node_alignments[tail])
                                 for tail in source_tails
                                 if not tail.is_terminal_flag]
                for target_subset in enumerate_subsets(aligned_tails):
                    # NOTE(review): the list yielded by enumerate_subsets is
                    # extended in place below -- confirm that is safe.
                    target_tails = target_subset
                    # Fill every position of the head span that no tail covers
                    # with the terminal at that position.
                    for i in range(*target_head.span):
                        covered = any(tail.span.start <= i < tail.span.end
                                      for tail in target_tails)
                        if not covered:
                            target_tails.append(target_terminals[i])
                    target_tails = tuple(sorted(target_tails,
                                                key=lambda node: node.span.start))
                    virtual_edge = Edge(target_head, target_tails)
                    target_tree.add(virtual_edge)