Beispiel #1
0
 def __init__(self, rule=None, ants=None, features=None):
     """Create a deduction, optionally attached to its antecedents.

     rule: the rule from which this deduction is generated
     ants: antecedent items; each one is attached through add_tail
           (None means there is nothing to attach)
     features: the Feature object used to score this deduction
     """
     Edge.__init__(self)
     antecedents = () if ants is None else ants
     for antecedent in antecedents:
         self.add_tail(antecedent)
     self.rule, self.features = rule, features
     # Feature values: stateful features come first, stateless ones after.
     self.fcosts = []
     self.cost = 0
Beispiel #2
0
    def deserialize(self, line, max_n=3):
        """Populate this deduction from one serialized line.

        The line is expected to look like ``<edge> ||||| <rule>``. The
        edge part is handed unchanged to Edge.deserialize and the rule
        part is parsed into a fresh Rule via Rule.fromstr.

        line: the serialized deduction.
        max_n: how many empty dicts to create in self.ngrams. Defaults
            to 3, the value that was previously hard-coded here.

        Returns the (tail_ids, head_id) pair produced by
        Edge.deserialize.

        Raises ValueError when splitting the line on '|||||' does not
        yield exactly two parts.
        """
        deduction_str, rule_str = line.split('|||||')

        # Only the edge section is read here: n-gram counts and feature
        # costs are not restored from the line.
        tail_ids, head_id = Edge.deserialize(self, deduction_str)

        # Start out with empty n-gram tables.
        self.ngrams = [{} for _ in range(max_n)]

        self.rule = Rule()
        self.rule.fromstr(rule_str)

        return tail_ids, head_id
Beispiel #3
0
 def serialize(self):
     """extends hypergraph.Edge.serialize()

     Without an `ngrams` attribute the result is
     ``<edge> ||||| <rule>``. With one, the result is
     ``<edge> ||| <ngrams> ||| <fcosts> ||||| <rule>`` where every
     n-gram is written as its space-joined words followed by ``$count$``
     and a trailing space, and the feature costs are space-separated.
     """
     edge_str = Edge.serialize(self)
     if not hasattr(self, 'ngrams'):
         return '%s ||||| %s' % (edge_str, self.rule)

     # Collect the pieces and join once instead of growing a string
     # with += inside nested loops.
     ngram_str = ''.join(
         '%s $%s$ ' % (' '.join(ngram), count)
         for counts in self.ngrams
         for ngram, count in counts.items())
     feature_str = ' '.join(str(fcost) for fcost in self.fcosts)
     deduction_str = ' ||| '.join([edge_str, ngram_str, feature_str])
     return ' ||||| '.join([deduction_str, str(self.rule)])
Beispiel #4
0
 def make_path(self, subpaths):
     """Build the Path for this deduction from the paths of its tails.

     The Path object itself comes from Edge.make_path (which, per the
     original author's note, takes care of the weight). This method
     then attaches:
         composed_rule: this edge's rule composed with the composed
             rules of the subpaths,
         translation: the `e` attribute of that composed rule,
         fcosts: a copy of this edge's feature costs with the feature
             costs of every non-None subpath added position by position.
     """
     path = Edge.make_path(self, subpaths)
     if FLAGS.preprocess:
         self.rule.align_special_symbols()

     sub_rules = [sub.composed_rule for sub in subpaths]
     composed = self.rule.compose(sub_rules)
     path.composed_rule = composed
     path.translation = composed.e

     # Work on a copy so the edge's own feature costs stay untouched.
     path.fcosts = list(self.fcosts)
     for sub in subpaths:
         if sub is None:
             continue
         for idx, value in enumerate(sub.fcosts):
             path.fcosts[idx] += value
     return path
Beispiel #5
0
 def __init__(self):
     """Initialise the instance by delegating to Edge.__init__."""
     Edge.__init__(self)
 def deserialize(self, line):
     """Load this edge and its rule from one serialized line.

     The line is expected to look like ``<edge> ||||| <rule>``: the
     edge part is passed to Edge.deserialize and the rule part is
     parsed into a fresh Rule via Rule.fromstr.

     line: the serialized edge. It used to be read from an undefined
         name, so it is now an explicit parameter.

     Returns the (tail_ids, head_id) pair produced by Edge.deserialize.

     Raises ValueError when splitting the line on '|||||' does not
     yield exactly two parts.
     """
     edge_str, rule_str = line.split('|||||')
     tail_ids, head_id = Edge.deserialize(self, edge_str)
     self.rule = Rule()
     self.rule.fromstr(rule_str)
     return tail_ids, head_id
 def serialize(self):
     """Return ``<edge> ||||| <rule>``.

     The edge part comes from Edge.serialize and the rule part is
     str(self.rule).
     """
     parts = (Edge.serialize(self), str(self.rule))
     return ' ||||| '.join(parts)
 def __init__(self, rule=None):
     """Initialise via Edge.__init__ and remember the given rule.

     rule: stored as self.rule (defaults to None).
     """
     Edge.__init__(self)
     self.rule = rule
def add_experimental_virtual_edges(target_tree, source_tree, s2t_node_alignments, t2s_node_alignments, target_terminals):
	"""Add composed edges to source_tree and matching virtual edges to target_tree.

	For every edge of source_tree, each tail is replaced by one of its
	"derivations" (see below). When that replacement differs from the
	original edge, the resulting composed edge is added to source_tree.
	For every target node aligned to the edge's head, a virtual edge is
	then added to target_tree whose tails are target nodes aligned to the
	non-terminal source tails, padded with the entries of
	target_terminals for any position of the head's span that no tail
	covers, and ordered by span start.

	target_tree, source_tree: objects exposing add(); source_tree also
		provides topsort(), head_index and edges.
	s2t_node_alignments: mapping from a source node to the collection of
		target nodes aligned to it.
	t2s_node_alignments: not used by this function.
	target_terminals: indexable by target position.

	Returns None; both trees are modified in place.

	NOTE(review): enumerate_subsets is defined elsewhere. It presumably
	yields every way of picking one element from each input list --
	confirm against its definition.
	"""
	def project(source_node):
		"""Return the only target node aligned to source_node, else None."""
		alignments = s2t_node_alignments[source_node]
		# TODO: Could unaligned words invalidate the "at most one alignment" assumption?
		return list(alignments)[0] if len(alignments) == 1 else None

	# derivations[source_node] will hold the minimal way(s) of representing source_node using minimal constituents.
	# For terminals and well-aligned NTs, there is only one such way: using the node itself.
	# For NTs that are not node aligned, we will find sets of minimally aligned children that cover source_node.
	# Each entry is a pair (tuple of nodes, list of edges that were skipped to reach them).
	derivations = {}
	for source_node in source_tree.topsort():
		derivations[source_node] = []
		if source_node.is_terminal_flag or project(source_node) is not None:
			derivations[source_node].append(((source_node,), []))
			continue
		for edge in source_tree.head_index[source_node]:
			for subset in enumerate_subsets([derivations[tail] for tail in edge.tails]):
				derivation = reduce(operator.add, [nodes for nodes, _ in subset])
				skipped_edges = reduce(operator.add, [edges for _, edges in subset])
				for node in derivation:
					assert len(s2t_node_alignments[node]) >= 1 or node.is_terminal_flag
				derivations[source_node].append((derivation, [edge] + skipped_edges))

	# Iterate over a copy because composed edges are added to source_tree inside the loop.
	for edge in source_tree.edges.copy():
		source_head = edge.head
		for target_head in s2t_node_alignments[source_head]:
			for source_subset in enumerate_subsets([derivations[tail] for tail in edge.tails]):
				source_tails = reduce(operator.add, [nodes for nodes, _ in source_subset])
				composed_edge = Edge(source_head, source_tails)
				skipped_edges = reduce(operator.add, [edges for _, edges in source_subset])
				if skipped_edges:
					composed_edge.composed_edges = tuple([edge] + skipped_edges)
					composed_edge.is_composed = True
					assert len(edge.composed_edges) == 0
				if composed_edge != edge:
					assert len(skipped_edges) > 0
					source_tree.add(composed_edge)
				for target_subset in enumerate_subsets([list(s2t_node_alignments[tail]) for tail in source_tails if not tail.is_terminal_flag]):
					# Copy before appending so the object yielded by enumerate_subsets is never mutated.
					target_tails = list(target_subset)
					# Fill every position of the head's span that no tail covers with its terminal.
					for i in range(*target_head.span):
						if not any(tail.span.start <= i < tail.span.end for tail in target_tails):
							target_tails.append(target_terminals[i])
					target_tails = tuple(sorted(target_tails, key=lambda node: node.span.start))
					target_tree.add(Edge(target_head, target_tails))