def extract(self, graph_pair):
    """
    Extract features from aligned graph pair, limiting scope to selected
    nodes

    @keyword graph_pair: an instance of GraphMapping. NB Any node
        alignment involving a filtered node (cf. node_selector) is
        removed from the alignment!

    @return: a numpy record array with an instance for each possible
        pair of selected source node and target node.
    """
    graphs = graph_pair.get_graphs()
    self._apply_pp_graph_hooks(graphs)
    instances = self._empty_instances(graphs)

    # source and target node counters, counting from one
    n_count = Pair(0, 0)
    # instance counter
    inst_count = 0

    for source_node in graphs.source:
        source_selected = self.node_selector(source_node, graphs.source)

        if source_selected:
            n_count.source += 1
            n_count.target = 0

        for target_node in graphs.target:
            nodes = Pair(source_node, target_node)
            target_selected = self.node_selector(target_node,
                                                 graphs.target)

            if source_selected and target_selected:
                self._apply_pp_node_hooks(nodes, graphs, graph_pair)
                n_count.target += 1

                for feat in self.descriptor:
                    # Each feature function is called with the node
                    # counters, a pair of nodes, a pair of graphs, and
                    # an alignment
                    instances[inst_count][feat.name] = feat.function(
                        n_count=n_count,
                        nodes=nodes,
                        graphs=graphs,
                        alignment=graph_pair)

                inst_count += 1
            else:
                # Remove alignment (if any) between skipped nodes from
                # alignment, because it should not occur in the true pgc.
                # A not so elegant side effect, but more efficient than
                # updating the alignment afterwards.
                try:
                    graph_pair.del_align(nodes)
                except networkx.NetworkXError:
                    pass

    # original ndarray was n x m, but actual size may be less due to node
    # selection, so here we get rid of empty rows at the bottom
    return instances[:inst_count]
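# A minimal usage sketch for extract(), assuming `extractor` is an instance
# of the feature-extracting class above and `corpus` is a
# ParallelGraphCorpus (both hypothetical names; the real setup may differ):

def extract_corpus(extractor, corpus):
    # collect one record array of candidate node pairs per graph pair
    return [extractor.extract(graph_pair) for graph_pair in corpus]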
def test_1(self):
    gb = SparseGraphBank("data/source-gb-1.xml", "alpino")

    # create a strong reference to the graph stub object,
    # otherwise it will vanish immediately :-)
    graph_stub1 = gb.get_graph_stub("s100")
    graph_stub2 = gb.get_graph_stub("s200")

    graph_pair = GraphPair(Pair(gb, gb), Pair(graph_stub1, graph_stub2))

    # add a backlink to graph_pair
    graph_stub1.add_client(graph_pair)
    graph_stub2.add_client(graph_pair)

    self.assertTrue(isinstance(gb.get_graph("s100"), GraphStub))
    self.assertTrue(isinstance(gb.get_graph("s200"), GraphStub))

    gb.load()

    self.assertEqual(len(gb), 2)
    self.assertTrue(isinstance(graph_pair._graphs.source, AlpinoGraph))
    self.assertTrue(isinstance(graph_pair._graphs.target, AlpinoGraph))
    self.assertTrue(isinstance(gb.get_graph("s100"), AlpinoGraph))
    self.assertTrue(isinstance(gb.get_graph("s200"), AlpinoGraph))

    del graph_pair
    # force garbage collection
    gc.collect()

    # make sure the graphs are gone now that the referring graph pair is
    # no longer alive
    self.assertEqual(len(gb), 0)
    self.assertRaises(KeyError, gb.get_graph, "s100")
    self.assertRaises(KeyError, gb.get_graph, "s200")
def test_roots_share_suffix(self):
    graphs = Pair(AlpinoGraph(), AlpinoGraph())
    graphs.source.add_node(1, "x")
    graphs.target.add_node(2, "y")
    nodes = Pair(1, 2)

    # no roots
    self.assertEqual(ff_roots_share_suffix(nodes, graphs), "-")

    graphs.source.node[1]["root"] = "woon_wagen"
    graphs.target.node[2]["root"] = "eet_tafel"
    self.assertEqual(ff_roots_share_suffix(nodes, graphs), "F")

    graphs.source.node[1]["root"] = "woon_wagen"
    graphs.target.node[2]["root"] = "woon_wagen"
    self.assertEqual(ff_roots_share_suffix(nodes, graphs), "F")

    graphs.source.node[1]["root"] = "woon_wagen"
    graphs.target.node[2]["root"] = "wagen"
    self.assertEqual(ff_roots_share_suffix(nodes, graphs), "F")

    graphs.source.node[1]["root"] = "woon_wagen_bewoner"
    graphs.target.node[2]["root"] = "wagen_bewoner"
    self.assertEqual(ff_roots_share_suffix(nodes, graphs), "F")

    graphs.source.node[1]["root"] = "woon_wagen"
    graphs.target.node[2]["root"] = "mest_wagen"
    self.assertEqual(ff_roots_share_suffix(nodes, graphs), "T")

    graphs.source.node[1]["root"] = "woon_wagen_trekker"
    graphs.target.node[2]["root"] = "mest_wagen_trekker"
    self.assertEqual(ff_roots_share_suffix(nodes, graphs), "T")
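# The tests above pin down the intended semantics: "T" only when the two
# roots share a common (underscore-separated) suffix while neither root is
# equal to or a suffix of the other. A hypothetical reconstruction of
# ff_roots_share_suffix consistent with these tests (the actual
# implementation may differ):

def ff_roots_share_suffix_sketch(nodes, graphs, **kwargs):
    source_root = graphs.source.node[nodes.source].get("root")
    target_root = graphs.target.node[nodes.target].get("root")

    if not source_root or not target_root:
        return "-"

    sparts = source_root.split("_")
    tparts = target_root.split("_")

    # count trailing segments shared by both roots
    n = 0
    while (n < len(sparts) and n < len(tparts) and
           sparts[-(n + 1)] == tparts[-(n + 1)]):
        n += 1

    # a suffix is shared and neither root subsumes the other
    if 0 < n < len(sparts) and n < len(tparts):
        return "T"

    return "F"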
def align_corpus(self, corpus, doc_trees=None, clear=True):
    """
    Align a parallel text corpus

    @param corpus: parallel text corpus instance (HitaextDoc)

    @keyword doc_trees: pair of source and target document trees; only
        useful in experiments to prevent repeated rereading of the
        document trees

    @keyword clear: if true, all existing alignments involving elements
        with focus tags are removed

    Alignments are added to the <alignment> section of the corpus.
    """
    if clear:
        clear_alignments(corpus, self.focus_tags)

    if not doc_trees:
        doc_trees = Pair(
            get_doc_tree(corpus, "from", self.ignore_tags.source),
            get_doc_tree(corpus, "to", self.ignore_tags.target))

    # copy alignments from <alignment> section in corpus
    # to "_alignments" attribute on elements
    corpus.inject_alignments(doc_trees.source, doc_trees.target)

    if self.scope_tags:
        scope_tags = self.scope_tags
    else:
        # when scope is not specified, assume that scope tag is root tag,
        # and that roots are aligned
        source_root = doc_trees.source.getroot()
        target_root = doc_trees.target.getroot()
        scope_tags = Pair([source_root.tag], [target_root.tag])
        source_root.set("_alignments", [target_root])

    # TODO: semantics not entirely clear.
    # - What happens if scope tags are embedded?
    # - What happens if scope elements are aligned 1-to-n?
    for source_scope_elem in doc_trees.source.getiterator():
        if source_scope_elem.tag not in scope_tags.source:
            continue

        for target_scope_elem in source_scope_elem.get("_alignments"):
            if target_scope_elem.tag not in scope_tags.target:
                continue

            scope_elems = Pair(source_scope_elem, target_scope_elem)
            self._align_within_scope(doc_trees, scope_elems)

    # finally copy alignment from "_alignments" attribute on elements
    # to <alignment> section in corpus
    corpus.extract_alignments(doc_trees.source, doc_trees.target)
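# A minimal usage sketch, assuming a corpus file with sentence-level focus
# tags (the filename is hypothetical; HitaextDoc and TextAligner are shown
# elsewhere in this code):

corpus = HitaextDoc(file="parallel-text.xml")
aligner = TextAligner(focus_tags=Pair("s", "s"))
aligner.align_corpus(corpus)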
def greedy_align_phrases(corpus):
    # greedily align phrases with the same lower-cased words as "equals"
    # and phrases with the same lower-cased roots as "restates"
    for graph_pair in corpus:
        graph_pair.clear()
        graphs = graph_pair.get_graphs()

        lc_roots(graphs.source, graphs.source.root)
        lc_roots(graphs.target, graphs.target.root)

        target_nodes = [
            tn for tn in graphs.target
            if (not graphs.target.node_is_punct(tn) and
                not graphs.target.node_is_empty(tn))]
        target_words = [
            graphs.target.get_node_token_string(tn).lower()
            for tn in target_nodes]
        target_roots = [
            graphs.target.node[tn].get("_lc_roots", [])
            for tn in target_nodes]

        for sn in graphs.source:
            if (graphs.source.node_is_punct(sn) or
                    graphs.source.node_is_empty(sn)):
                continue

            sw = graphs.source.get_node_token_string(sn).lower()
            sr = graphs.source.node[sn].get("_lc_roots")

            try:
                j = target_words.index(sw)
            except ValueError:
                try:
                    j = target_roots.index(sr)
                except ValueError:
                    continue
                else:
                    tn = target_nodes[j]
                    graph_pair.add_align(Pair(sn, tn), "restates")
                    # print "RESTATES"
                    # print " ".join(sr)
                    # print " ".join(target_roots[j])
                    del target_nodes[j]
                    del target_words[j]
                    del target_roots[j]
            else:
                tn = target_nodes[j]
                graph_pair.add_align(Pair(sn, tn), "equals")
                # print "EQUALS"
                # print sw
                # print target_words[j]
                del target_nodes[j]
                del target_words[j]
                del target_roots[j]
def __init__(self):
    self._corpus = ParallelGraphCorpus()  # the domain model
    self._changed = False
    self._filename = None
    self._graph_pair = None
    self._graph_pair_index = None
    self._graphs = Pair(None, None)
    self._nodes = Pair(None, None)
    # the special relation which stands for "no relation"
    self._no_relation = "none"
    self._co_node_selection = False
class Test_(unittest.TestCase):

    def setUp(self):
        self.p = Pair("x", "y")

    def test__repr__(self):
        print repr(self.p)

    def test__iter__(self):
        self.assertTrue(list(iter(self.p)))

    def test__eq__(self):
        p2 = Pair("x", "y")
        self.assertEqual(self.p, p2)

    def test_set(self):
        self.p.set(3, 4)
        self.assertEqual(self.p.source, 3)
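# The tests above imply the interface of Pair. A minimal sketch consistent
# with them (the real class may add more, e.g. slots or copying support):

class PairSketch(object):

    def __init__(self, source=None, target=None):
        self.source = source
        self.target = target

    def set(self, source, target):
        self.source = source
        self.target = target

    def __iter__(self):
        return iter((self.source, self.target))

    def __eq__(self, other):
        return (self.source == other.source and
                self.target == other.target)

    def __repr__(self):
        return "Pair(%r, %r)" % (self.source, self.target)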
def ff_same_parent_lc_phrase(nodes, graphs, **kwargs):
    """
    parent nodes have same lower-cased phrase
    """
    parent_nodes = Pair(graphs.source.get_parent_node(nodes.source),
                        graphs.target.get_parent_node(nodes.target))
    return ff_same_lc_phrase(parent_nodes, graphs)
def goto_graph_pair(self, index):
    # don't use try-except here, because negative index is allowed for list
    if 0 <= index < len(self._corpus):
        self._graph_pair = self._corpus[index]
        self._graph_pair_index = index
        self._graphs = self._graph_pair.get_graphs()
        self._nodes = Pair(None, None)
        send(self.goto_graph_pair, "newGraphPair.viz")
        send(self.goto_graph_pair, "newGraphPair.gui")
def test_roots_subsumption(self):
    graphs = Pair(AlpinoGraph(), AlpinoGraph())
    graphs.source.add_node(1, "x")
    graphs.target.add_node(2, "y")
    nodes = Pair(1, 2)

    # no roots
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "-")

    graphs.source.node[1]["root"] = "wagen"
    graphs.target.node[2]["root"] = "wagen"
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "equals")

    graphs.source.node[1]["root"] = "brandweer_wagen"
    graphs.target.node[2]["root"] = "brandweer"
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "has_prefix")

    graphs.source.node[1]["root"] = "brandweer"
    graphs.target.node[2]["root"] = "brandweer_wagen"
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "is_prefix")

    graphs.source.node[1]["root"] = "brandweer_wagen"
    graphs.target.node[2]["root"] = "wagen"
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "has_suffix")

    graphs.source.node[1]["root"] = "wagen"
    graphs.target.node[2]["root"] = "brandweer_wagen"
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "is_suffix")

    graphs.source.node[1]["root"] = "woon_wagen_bewoners_kamp_ingang"
    graphs.target.node[2]["root"] = "wagen_bewoners"
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "has_infix")

    graphs.source.node[1]["root"] = "wagen_bewoners"
    graphs.target.node[2]["root"] = "woon_wagen_bewoners_kamp_ingang"
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "is_infix")

    # no subsumption
    graphs.source.node[1]["root"] = "brandweer_wagen"
    graphs.target.node[2]["root"] = "kamp_ingang"
    self.assertEqual(ff_roots_subsumption(nodes, graphs), "none")
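# Again the tests pin down the semantics: one root may equal the other, or
# contain/be contained in it as a prefix, suffix, or infix of
# underscore-separated segments. A hypothetical reconstruction of
# ff_roots_subsumption consistent with these tests (the real
# implementation may differ):

def _find_segments(parts, sub):
    # index at which segment list `sub` occurs within `parts`, or -1
    for i in range(len(parts) - len(sub) + 1):
        if parts[i:i + len(sub)] == sub:
            return i
    return -1

def ff_roots_subsumption_sketch(nodes, graphs, **kwargs):
    source_root = graphs.source.node[nodes.source].get("root")
    target_root = graphs.target.node[nodes.target].get("root")

    if not source_root or not target_root:
        return "-"

    if source_root == target_root:
        return "equals"

    sparts = source_root.split("_")
    tparts = target_root.split("_")

    # does the source root contain the target root?
    i = _find_segments(sparts, tparts)
    if i == 0:
        return "has_prefix"
    elif i > 0:
        if i + len(tparts) == len(sparts):
            return "has_suffix"
        return "has_infix"

    # does the target root contain the source root?
    i = _find_segments(tparts, sparts)
    if i == 0:
        return "is_prefix"
    elif i > 0:
        if i + len(sparts) == len(tparts):
            return "is_suffix"
        return "is_infix"

    return "none"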
def get_doc_trees(self, search=False, update=True):
    """
    Get pair of document trees
    """
    from_tree = self.get_doc_tree("from", search=search)
    to_tree = self.get_doc_tree("to", search=search)

    if update:
        from_tree.update()
        to_tree.update()

    return Pair(from_tree, to_tree)
def _determine_focus_elems(self, scope_elems):
    source_list = [
        elem for elem in
        scope_elems.source.findall(".//" + self.focus_tags.source)
        if not elem.get("_ignore")]
    target_list = [
        elem for elem in
        scope_elems.target.findall(".//" + self.focus_tags.target)
        if not elem.get("_ignore")]
    return Pair(source_list, target_list)
def _score_sim(self, focus_elem_lists):
    scores = []

    for source_focus_elem in focus_elem_lists.source:
        for target_focus_elem in focus_elem_lists.target:
            sim = self.sim_func(
                source_focus_elem.get("_terms"),
                target_focus_elem.get("_terms"),
                source_focus_elem.get("_weights"),
                target_focus_elem.get("_weights"))
            focus_elems = Pair(source_focus_elem, target_focus_elem)
            scores.append((sim, focus_elems))

    return scores
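# One plausible consumer of the scored candidates above: sort by similarity
# and greedily align each focus element at most once. This is a hedged
# sketch, not necessarily how _align_within_scope actually works:

def greedy_select(scores):
    # highest similarity first; the key avoids comparing Pair objects
    # on score ties
    scores.sort(key=lambda item: item[0], reverse=True)
    taken = Pair(set(), set())
    selected = []

    for sim, focus_elems in scores:
        if (focus_elems.source not in taken.source and
                focus_elems.target not in taken.target):
            selected.append(focus_elems)
            taken.source.add(focus_elems.source)
            taken.target.add(focus_elems.target)

    return selected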
def __init__(self, focus_tags, scope_tags=None, ignore_tags=None):
    """
    Create a new TextAligner instance

    @param focus_tags: a pair of source and target focus tags

    @keyword scope_tags: a pair of source and target scope tag lists;
        defaults to the labels of the roots of the source and target
        document trees.

    @keyword ignore_tags: a pair of source and target ignore tag lists
    """
    self.focus_tags = focus_tags
    self.scope_tags = scope_tags
    self.ignore_tags = ignore_tags or Pair([], [])
def merge(self, graph_inst, graph_pair):
    """
    Merges matched relations from graph instances into a graph pair as
    node alignments

    @param graph_inst: a numpy record array containing the instances for
        a pair of graphs; it should contain the fields source_node,
        target_node and match_relation

    @param graph_pair: a GraphPair instance
    """
    assert isinstance(graph_inst, numpy.ndarray)
    assert isinstance(graph_pair, GraphPair)

    for inst in graph_inst:
        if inst["match_relation"] != self.no_rel:
            nodes = Pair(inst["source_node"], inst["target_node"])
            graph_pair.add_align(nodes, inst["match_relation"])
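# A sketch of the kind of record array merge() expects, with a hypothetical
# minimal dtype (the real descriptor carries many more feature fields):

import numpy

graph_inst = numpy.array(
    [("4", "4", "equals"),
     ("8", "11", "none")],
    dtype=[("source_node", "S8"),
           ("target_node", "S8"),
           ("match_relation", "S16")])

# assuming merger.no_rel == "none", only the first row would be added to
# the graph pair as an alignment:
#     merger.merge(graph_inst, graph_pair)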
def dump(self, graph_pair, instances):
    graphs = graph_pair.get_graphs()
    feat_names = [t[0] for t in instances.dtype.descr[4:-4]]

    for i, inst in enumerate(instances):
        nodes = Pair(inst["source_node"], inst["target_node"])

        print "instance:", i
        print "source: %s: %s: %s" % (
            nodes.source,
            graphs.source.node[nodes.source]["label"],
            graphs.source.get_node_token_string(nodes.source))
        print "target: %s: %s: %s" % (
            nodes.target,
            graphs.target.node[nodes.target]["label"],
            graphs.target.get_node_token_string(nodes.target))

        for fn in feat_names:
            print "%s: %s" % (fn, inst[fn])

        print 40 * "-"
def test_merge_corpus(self):
    st = create_setting()

    corpus_inst = CorpusInst()
    inst_fname = st.dev_inst_fns[0]
    corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)

    true_fname = st.dev_true_fns[0]
    true_corpus = ParallelGraphCorpus(inf=true_fname,
                                      graph_loading=LOAD_NONE)

    pred_corpus = merge_corpus(corpus_inst, true_corpus, Merger())
    self.assertTrue(len(pred_corpus))

    for graph_inst, graph_pair in zip(corpus_inst, pred_corpus):
        for inst in graph_inst:
            rel = inst["match_relation"]

            if rel != str(None):
                nodes = Pair(inst["source_node"], inst["target_node"])
                self.assertEqual(graph_pair.get_align(nodes), rel)
def greedy_align_equal_words_roots(corpus):
    # if words are equal, align as equals;
    # elif roots are equal, align as restates
    for graph_pair in corpus:
        graph_pair.clear()
        graphs = graph_pair.get_graphs()

        target_nodes = graphs.target.terminals(with_punct=False,
                                               with_empty=False)
        target_words = [graphs.target.node[tn]["word"].lower()
                        for tn in target_nodes]
        target_roots = [graphs.target.node[tn]["root"]
                        for tn in target_nodes]

        for sn in graphs.source.terminals_iter(with_punct=False,
                                               with_empty=False):
            sw = graphs.source.node[sn]["word"].lower()
            sr = graphs.source.node[sn]["root"]

            try:
                j = target_words.index(sw)
            except ValueError:
                try:
                    j = target_roots.index(sr)
                except ValueError:
                    continue
                else:
                    relation = "restates"
            else:
                relation = "equals"

            tn = target_nodes[j]
            graph_pair.add_align(Pair(sn, tn), relation)

            del target_nodes[j]
            del target_words[j]
            del target_roots[j]
def test_parser_node_pairs(self):
    """
    check if all node pairs are correctly read
    """
    parser = PGCParser()
    pg_corpus = parser.parse("data/corpus-2.pgc")

    true_align = [(Pair("4", "4"), "equals"),
                  (Pair("8", "11"), "equals"),
                  (Pair("5", "5"), "equals"),
                  (Pair("11", "10"), "intersects"),
                  (Pair("19", "8"), "intersects"),
                  (Pair("1", "1"), "restates"),
                  (Pair("0", "0"), "restates")]

    read_align = pg_corpus[1].alignments()
    self.assertEqual(len(read_align), len(true_align))

    for e in read_align:
        true_align.remove(e)

    self.assertFalse(true_align)
def greedy_align_equal_words(corpus):
    for graph_pair in corpus:
        graph_pair.clear()
        graphs = graph_pair.get_graphs()

        target_nodes = graphs.target.terminals(with_punct=False,
                                               with_empty=False)
        target_words = [graphs.target.node[tn]["word"].lower()
                        for tn in target_nodes]

        for sn in graphs.source.terminals_iter(with_punct=False,
                                               with_empty=False):
            sw = graphs.source.node[sn]["word"].lower()

            try:
                j = target_words.index(sw)
            except ValueError:
                continue

            tn = target_nodes[j]
            graph_pair.add_align(Pair(sn, tn), "equals")

            del target_nodes[j]
            del target_words[j]
def pp_term_align(nodes, graphs, alignment, **kwargs):
    """
    A node preprocessing function that computes the number of aligned
    terminals for a given pair of source and target nodes. Assumes the
    "_yield" attribute on nodes as computed by the pp_yield graph
    preprocessing functions.

    Provides the node attributes:
        _inside: terminals aligned to terminals inside the other node
        _outside: aligned outside the other node or aligned to
            non-terminals
        _none: unaligned terminals
    """
    sn_attr = graphs.source.node[nodes.source]
    tn_attr = graphs.target.node[nodes.target]

    # handle source node
    sn_attr["_inside"] = {}
    sn_attr["_outside"] = []
    sn_attr["_none"] = []

    for st in sn_attr["_yield"]:
        # find aligned target node, if any
        tt = alignment.get_aligned_target_node(st)

        if tt:
            if tt in tn_attr["_yield"]:
                relation = alignment.get_align(Pair(st, tt))

                try:
                    sn_attr["_inside"][relation].append(st)
                except KeyError:
                    sn_attr["_inside"][relation] = [st]
            else:
                # if non-terminal alignments are available, this includes
                # all cases where a source terminal is aligned to a target
                # *non-terminal*, even if it is within the scope of
                # nodes.target!
                sn_attr["_outside"].append(st)
        else:
            sn_attr["_none"].append(st)

    # handle target node
    # the aligned-inside count is by definition identical for the source
    # and target node
    tn_attr["_inside"] = sn_attr["_inside"]
    tn_attr["_outside"] = []
    tn_attr["_none"] = []

    for tt in tn_attr["_yield"]:
        # find aligned source node, if any
        st = alignment.get_aligned_source_node(tt)

        if st:
            if st not in sn_attr["_yield"]:
                # if non-terminal alignments are available, this includes
                # all cases where a target terminal is aligned to a source
                # *non-terminal*, even if it is within the scope of
                # nodes.source!
                tn_attr["_outside"].append(tt)
        else:
            tn_attr["_none"].append(tt)
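# A hypothetical feature function built on the attributes that
# pp_term_align provides; a sketch of how _inside and _yield might be
# consumed, not part of the original feature set:

def ff_frac_aligned_inside(nodes, graphs, **kwargs):
    # fraction of the source node's terminals aligned inside the target
    sn_attr = graphs.source.node[nodes.source]
    n_inside = sum(len(terms) for terms in sn_attr["_inside"].values())
    n_yield = len(sn_attr["_yield"])
    return n_inside / float(n_yield) if n_yield else 0.0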
    type=int,
    metavar="N",
    default=0,
    help='minimum difference in tokens allowed between the aligned '
         'sentences (default is 0)')

args = parser.parse_args()

if len(args.parallel_text_corpora) != len(args.source_graphbanks):
    exit("Error: too few or too many source graphbanks")

if len(args.parallel_text_corpora) != len(args.target_graphbanks):
    exit("Error: too few or too many target graphbanks")

for text_corpus, source_graphbank, target_graphbank in zip(
        args.parallel_text_corpora,
        args.source_graphbanks,
        args.target_graphbanks):
    graph_corpus = pgc_from_ptc(text_corpus,
                                source_graphbank,
                                target_graphbank,
                                focus_tags=Pair(args.source_tag,
                                                args.target_tag),
                                graph_formats=Pair(args.source_format,
                                                   args.target_format),
                                relations=args.relations,
                                min_token_diff=args.min_token_diff,
                                max_token_len=args.max_token_len)

    outfn = os.path.splitext(os.path.basename(text_corpus))[0] + ".pgc"
    graph_corpus.write(outfn, pprint=True)
def greedy_align_words(corpus):
    # if words are equal -> equals
    # elif roots are equal -> restates
    # elif source root in target root and len(source word) > 3 -> generalizes
    # elif target root in source root and len(target word) > 3 -> specifies
    # elif target and source root share a morph segment -> intersects
    for graph_pair in corpus:
        graph_pair.clear()
        graphs = graph_pair.get_graphs()

        target_nodes = graphs.target.terminals(with_punct=False,
                                               with_empty=False)
        target_words = [graphs.target.node[tn]["word"].lower()
                        for tn in target_nodes]
        target_roots = [graphs.target.node[tn]["root"]
                        for tn in target_nodes]

        for sn in graphs.source.terminals_iter(with_punct=False,
                                               with_empty=False):
            sw = graphs.source.node[sn]["word"].lower()
            relation = None

            # align identical words
            for i, tw in enumerate(target_words):
                if sw == tw:
                    relation = "equals"
                    break

            if not relation:
                sr = graphs.source.node[sn]["root"]

                # align identical roots
                for i, tr in enumerate(target_roots):
                    if sr == tr:
                        relation = "restates"
                        break

                if not relation:
                    sparts = set(sr.split("_"))

                    # check for spec, gen, or intersect
                    for i, tr in enumerate(target_roots):
                        tw = target_words[i]

                        if sr in tr and len(sw) > 3:
                            relation = "generalizes"
                            break
                        elif tr in sr and len(tw) > 3:
                            relation = "specifies"
                            break
                        # check if roots share a morphological segment
                        elif sparts.intersection(tr.split("_")):
                            relation = "intersects"
                            break

            if relation:
                tn = target_nodes[i]
                graph_pair.add_align(Pair(sn, tn), relation)

                del target_nodes[i]
                del target_words[i]
                del target_roots[i]
def pgc_from_ptc(text_corpus_file, source_graphbank_file,
                 target_graphbank_file, focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS, min_token_diff=0,
                 max_token_len=99999):
    """
    Create a new parallel graph corpus from a parallel text corpus and a
    pair of graphbanks

    @param text_corpus_file: parallel text corpus filename

    @param source_graphbank_file: source graphbank filename

    @param target_graphbank_file: target graphbank filename

    @keyword focus_tags: pair of focus tags

    @keyword graph_formats: pair of graphbank formats

    @keyword relations: list of alignment relations

    @keyword min_token_diff: minimum number of different tokens

    @keyword max_token_len: maximum number of tokens per focus element

    @return: ParallelGraphCorpus object
    """
    # read parallel text corpus
    text_corpus = HitaextDoc(file=text_corpus_file)
    doc_trees = text_corpus.get_doc_trees(search=True)

    # read graph banks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    graph_banks = Pair(source_bank, target_bank)

    # create an empty parallel graph corpus
    graph_corpus = ParallelGraphCorpus(relations=relations)

    for alignment in text_corpus.alignment:
        if (alignment.get("from_tag") != focus_tags.source or
                alignment.get("to_tag") != focus_tags.target):
            continue

        source_tokens = _get_elem_tokens(doc_trees.source,
                                         focus_tags.source,
                                         alignment.get("from_id"))
        target_tokens = _get_elem_tokens(doc_trees.target,
                                         focus_tags.target,
                                         alignment.get("to_id"))

        if (len(source_tokens) > max_token_len or
                len(target_tokens) > max_token_len):
            continue

        if (min_token_diff and
                _token_diff(source_tokens,
                            target_tokens) < min_token_diff):
            continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks
        source_graph_id = alignment.get("from_id")
        target_graph_id = alignment.get("to_id")
        graphs = Pair(source_bank.get_graph(source_graph_id),
                      target_bank.get_graph(target_graph_id))

        graph_pair = GraphPair(graph_banks, graphs)
        graph_corpus.append(graph_pair)

    return graph_corpus
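# A minimal usage sketch (the filenames are hypothetical); the write() call
# mirrors the command-line driver shown elsewhere in this code:

graph_corpus = pgc_from_ptc("news-corpus.xml",
                            "news-source-bank.xml",
                            "news-target-bank.xml",
                            min_token_diff=2,
                            max_token_len=30)
graph_corpus.write("news-corpus.pgc", pprint=True)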
class AlignServer(object):

    # the maximum number of times that the parser for Alpino XML will be
    # reused
    max_alpino_parser_reuse = 1000

    # regexp to detect an unescaped ampersand, that is, one which is not
    # part of an entity such as &amp;, &apos;, etc.
    regexp = re.compile(r"&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+);)")

    def __init__(self, tokenizer=None, alpino=None, graph_aligner=None):
        self.init_tokenizer(tokenizer)
        self.init_alpino(alpino)
        self.init_graph_xml_parser()
        self.init_graph_aligner(graph_aligner)
        self.init_others()
        # a pair of graphbank dummies, which are needed when creating a
        # new GraphMapping instance
        self._graphbanks = Pair(GraphBank("", "alpino"),
                                GraphBank("", "alpino"))

    def init_tokenizer(self, tokenizer=None):
        self._tokenizer = tokenizer

    def init_alpino(self, alpino):
        # "if alpino: ..." does not work because of peculiarities of the
        # xml-rpc implementation
        if alpino is None:
            host = "http://%s:%d" % (ALPINO_HOST, ALPINO_PORT)
            self._alpino = ServerProxy(host, encoding="iso-8859-1")
        else:
            self._alpino = alpino

        self._alpino.parse("test")

    def init_graph_xml_parser(self):
        self._alpino_xml_parser = AlpinoParser()
        self._alpino_parser_reused = 0
        # feed fake root node to the xml parser
        self._alpino_xml_parser.parse_string(
            '<?xml version="1.0" encoding="utf-8"?>\n<treebank>')

    def init_graph_aligner(self, graph_aligner):
        if graph_aligner:
            self._graph_aligner = graph_aligner
        else:
            self._graph_aligner = GraphAligner()

        self.no_rel = self._graph_aligner.descriptor.no_rel

    def init_others(self):
        # hook for subclasses
        pass

    def align(self, source_sent, target_sent):
        # the strings received here after transport through XML-RPC are
        # either plain ascii or unicode
        sent_pair = Pair(source_sent, target_sent)
        tok_sent_pair = self._tokenize(sent_pair)
        parse_pair = self._parse(tok_sent_pair)
        graph_pair = self._load_graphs(parse_pair)
        instances = self._align_graphs(graph_pair)
        parse_align, phrase_align = self._get_alignment(instances,
                                                        graph_pair)
        return dict(source_sent=sent_pair.source,
                    target_sent=sent_pair.target,
                    source_tok=tok_sent_pair.source,
                    target_tok=tok_sent_pair.target,
                    source_parse=parse_pair.source,
                    target_parse=parse_pair.target,
                    parse_align=parse_align,
                    phrase_align=phrase_align)

    def _tokenize(self, sent_pair):
        if self._tokenizer:
            return self._tokenizer(sent_pair)
        else:
            return sent_pair

    def _parse(self, tok_sent_pair):
        return Pair(self._parse_single_sent(tok_sent_pair.source),
                    self._parse_single_sent(tok_sent_pair.target))

    def _parse_single_sent(self, tok_sent):
        # Sentence will be of type unicode if the original sentence passed
        # to the server proxy (client) contained any non-ascii chars, but
        # will be of type str otherwise. Input to the alpino server proxy
        # must be iso-8859-1 encoded, so we have to convert
        tok_sent = tok_sent.encode("iso-8859-1")
        graph = self._alpino.parse(tok_sent)
        # The returned parse is a string of type unicode or str,
        # regardless of what the xml header produced by alpino says.
        # First we get rid of this xml header.
        return graph.split("\n", 1)[1]

    def _load_graphs(self, parse_pair):
        # The AlpinoParser instance can be reused to avoid the overhead of
        # creating a new one. It seems that there is a maximum to the
        # number of lines though. After that we get an error like:
        #
        #   xml.parsers.expat.ExpatError: not well-formed (invalid token):
        #   line 2654543, column 300
        #
        # We therefore count the number of reuses and create a new
        # instance when self.max_alpino_parser_reuse is reached.
        if self._alpino_parser_reused < self.max_alpino_parser_reuse:
            self._alpino_parser_reused += 1
        else:
            self.init_graph_xml_parser()

        # The xml parser for graphbanks wants utf-8, so we encode as utf-8
        xml_string = (parse_pair.source.encode("utf-8") +
                      parse_pair.target.encode("utf-8"))

        # Alpino outputs ill-formed xml because some "&" are not escaped,
        # e.g. <node begin="0" cat="mwu" end="3" id="1"
        #      mwu_root="erwin & mireille" mwu_sense="erwin & mireille"
        #      rel="-->
        # This is a hack to correct that.
        xml_string = self.regexp.sub("&amp;", xml_string)

        try:
            id2graph = self._alpino_xml_parser.parse_string(xml_string)
        except ExpatError, inst:
            sys.stderr.write("Error:%s\nInput:\n%s\n" % (inst, xml_string))
            # reset parser
            sys.stderr.write("Resetting Alpino output parser\n")
            self.init_graph_xml_parser()
            # the exception surfaces as an xmlrpc fault, but subsequent
            # calls to the align method should work
            raise inst

        graph_pair = Pair(*id2graph.values())
        return GraphMatching(banks=self._graphbanks, graphs=graph_pair)
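# A quick check of the escaping hack above (the sample string is made up):
# unescaped ampersands get escaped, existing entities are left alone.

import re

regexp = re.compile(r"&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+);)")

sample = 'mwu_root="erwin & mireille" &amp; &#38;'
print regexp.sub("&amp;", sample)
# prints: mwu_root="erwin &amp; mireille" &amp; &#38;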