def outputEdgeAlignment(tok_seq, amr, edge_to_toks, tok2rels):
    for edge_index in edge_to_toks:
        edge_label = amr.edges[edge_index].label
        for tok_idx in edge_to_toks[edge_index]:
            logger.writeln("Relation align: align %s to %s" % (tok_seq[tok_idx], edge_label))
            tok2rels[tok_idx].add(edge_index)
def parse_flags():
    try:
        argv = FLAGS(sys.argv)  # parse flags
    except gflags.FlagsError as e:
        logger.writeln('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)
    return argv
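# Hypothetical wiring for parse_flags() with python-gflags (a sketch, not part
# of the original module): flags are defined at module level, and FLAGS(sys.argv)
# both parses them and returns the remaining positional arguments. The flag
# names below are illustrative only.
import sys
import gflags

FLAGS = gflags.FLAGS
gflags.DEFINE_string('output', 'output.txt', 'output file')
gflags.DEFINE_integer('bin_size', 100, 'beam size per chart bin')

if __name__ == '__main__':
    argv = parse_flags()   # exits with a usage message on a bad flag
    inputs = argv[1:]      # non-flag arguments (argv[0] is the program name)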
def retrieveUnaligned():
    num_nodes = len(amr.nodes)
    for node_idx in range(num_nodes):
        if node_idx in aligned_nodes:
            continue
        freq = amr.getFreq(node_idx)
        curr_node = amr.nodes[node_idx]
        node_str = curr_node.node_str()
        if node_str in role_rel_concepts:
            continue
        if (freq and freq < 100) or amr.is_predicate(curr_node):
            for (idx, word) in enumerate(tok_seq):
                if idx not in aligned_toks:
                    lem = lem_seq[idx]
                    if similar(node_str, word) or similar(node_str, lem):
                        logger.writeln("Retrieved concept map: %s, %s ; %s" % (word, lem, node_str))
                        category = "TOKEN"
                        if isNumber(node_str) or isNumber(word):
                            category = "NUMBER"
                        all_alignments[node_idx].append((idx, idx + 1, node_str, category))
                        aligned_nodes.add(node_idx)
                        aligned_toks.add(idx)
def single_worker_decode():
    jobs = get_jobs(FLAGS.do)
    njobs = len(jobs)
    fout = open(FLAGS.output, 'w')
    if FLAGS.output_kbest:
        fkbest = open(FLAGS.kbest_output, 'w')
    totaltime = 0
    joblogger = JobLogger()
    while jobs:
        # finished jobs need to be discarded because jobs save the hypergraph
        job = jobs.pop(0)
        job.run()
        totaltime += job.time
        joblogger.log(job.id)
    if logger.level >= 1 and FLAGS.show_time:
        logger.writeln('total time: %.2f seconds (%.2f seconds/sentence)' %
                       (totaltime, totaltime / njobs))
    joblogger.finish()
    if FLAGS.consensus_training:
        consensus_trainer = ConsensusTrainer(FLAGS.lm_order,
                                             decoding_features,
                                             FLAGS.run_dir,
                                             refs)
        consensus_trainer.optimize()
def dump(self):
    """Rules are sorted by the English side. Remember to call this before
    finishing."""
    if self.parallel:
        name = "%04d.%04d" % (self.parallel[0], self.n_dump)
    else:
        name = "%04d" % self.n_dump
    if logger.level >= 1:
        logger.writeln('dumping %s...' % name)
    self.dumped += len(self.gram)
    lines = []
    for r in self.iter_rules():
        lines.append("%s ||| %s\n" % (' '.join(str(s) for s in r.e), str(r)))
    lines.sort()
    outfile = open(os.path.join(self.outputdir, "extract.%s" % name), "w")
    for line in lines:
        outfile.write(line)
    outfile.close()
    if logger.level >= 1:
        logger.writeln('dumped: %s' % self.dumped)
    if FLAGS.accumulate:
        self.gram = {}
    else:
        self.gram = []
    self.n_dump += 1
def align_semeval_sentence(tok_seq, lemma_seq, alignment_seq, amr, verb_list, multi_map):
    node_alignment, _ = initializeAlignment(amr)
    entity_toks = set()
    aligned_toks = set()
    all_alignments = defaultdict(list)
    node_to_toks, temp_aligned = extractNodeMapping(alignment_seq, amr)
    unaligned_set = set(xrange(len(tok_seq))) - temp_aligned

    alignEntities(tok_seq, amr, alignment_seq, entity_toks, aligned_toks,
                  all_alignments, unaligned_set, node_alignment)

    #Verbalization list
    verb_map = defaultdict(set)
    alignVerbalization(tok_seq, lemma_seq, amr, verb_list, all_alignments,
                       verb_map, aligned_toks, node_alignment, multi_map)

    aligned_nodes = set([node_idx for (node_idx, aligned) in enumerate(node_alignment) if aligned])
    alignOtherConcepts(tok_seq, lemma_seq, amr, aligned_toks, aligned_nodes,
                       node_to_toks, all_alignments, multi_map)

    ##Based on the alignment from node index to spans in the string
    unaligned_set = set(xrange(len(tok_seq))) - aligned_toks
    unaligned_idxs = sorted(list(unaligned_set))
    logger.writeln("Unaligned tokens: %s" % (" ".join([tok_seq[i] for i in unaligned_idxs])))

    unaligned_nodes = amr.unaligned_nodes(aligned_nodes)
    logger.writeln("Unaligned vertices: %s" % " ".join([node.node_str() for node in unaligned_nodes]))
    return all_alignments
def timed(l):
    prev = time.time()
    for i, x in enumerate(l, 1):
        if i % FLAGS.time_interval == 0:
            logger.writeln('%s (%s/sec)' % (i, FLAGS.time_interval / (time.time() - prev)))
            prev = time.time()
        yield x
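# Hypothetical usage of timed() (a sketch, not part of the original module):
# it wraps any iterable, logs throughput every FLAGS.time_interval items, and
# yields the items unchanged.
def count_tokens(sentences):
    total = 0
    for sent in timed(sentences):
        total += len(sent)
    return total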
def test_inside_exp_outside_exp_log(self):
    self.hp.set_semiring(LOGPROB)
    self.hp.set_functions(lambda x: log(x.prob), lambda x: 1, None)
    self.hp.inside()
    self.hp.outside()
    self.hp.inside_exp()
    self.hp.outside_exp()
    logger.writeln(self.hp.dot())
def dump_trees(samples, filename):
    logger.writeln('dump trees')
    treefile = TreeFile(filename)
    for s in timed(samples):
        # call this before dumping rules for each sample!
        LEXICAL_WEIGHTER.compute_lexical_weights(s.a)
        treefile.dump(s)
    treefile.close()
def test_inside_exp_outside_exp(self):
    self.hp.set_semiring(INSIDE)
    self.hp.set_functions(lambda x: x.prob, lambda x: 1, None)
    self.hp.inside()
    self.hp.outside()
    self.hp.inside_exp()
    self.hp.outside_exp()
    logger.writeln(self.hp.dot())
def log(self, jid):
    """read and write result of one job"""
    if logger.level >= 1:
        fname = '%s/%s_%s' % (FLAGS.run_dir, 'log', str(jid).rjust(5, '0'))
        f = open(fname)
        for line in f:
            logger.write(line)
        logger.writeln()
        f.close()
def serve(self):
    if logger.level >= 1:
        logger.writeln('start server')
    self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.sock.bind((self.host, FLAGS.port))
    self.sock.listen(5)
    while self.nfinished != self.njobs or \
            self.nterminated != len(self.nodes):
        conn, addr = self.sock.accept()
        self.handle(conn)
def scan(self, i, j):
    if logger.level >= 3:
        logger.writeln('Scan: [%s, %s]' % (i, j))
    for dotitem in self.bins[i][j - 1]:
        word = self.chart.fwords[j - 1]
        next_node = dotitem.node.get(word)
        if next_node is not None:
            new_dotitem = DotItem(next_node, i, j, dotitem.ants)
            if logger.level >= 4:
                logger.writeln(new_dotitem)
            self.add(new_dotitem)
def complete(self, i, k, j):
    if logger.level >= 3:
        logger.writeln('Complete: %s %s %s' % (i, k, j))
    for dotitem in self.bins[i][k]:
        for var, bin in self.chart.iter_items_by_nts(k, j):
            next_node = dotitem.node.get(var)
            if next_node is not None:
                new_dotitem = DotItem(next_node, i, j, dotitem.ants + (bin, ))
                if logger.level >= 4:
                    logger.writeln('new dotitem: %s' % new_dotitem)
                self.add(new_dotitem)
def start_slave(self, sid, host):
    if logger.level >= 1:
        logger.writeln('start slave %s on %s' % (sid, host))
    cmd = ' '.join(sys.argv)
    # slaves inherit master options but it's important to override _Parallel
    # and _Slave to make them work in slave mode
    # they write their detailed translation report to the same log file
    # but their stdout and stderr are still conveniently connected to the
    # master terminal
    options = "--noparallel --slave --slave_id=%s \
               --log=slaves.log \
               --server=%s" % (sid, socket.gethostname())
    system(r'ssh %s "cd %s; nohup %s %s" &' % (host, os.getcwd(), cmd, options))
def test_best_paths(self):
    self.hp.set_semiring(INSIDE)
    self.hp.set_functions(lambda x: x.prob, lambda x: 1, None)
    self.hp.assert_done('topo_sort')
    logger.writeln(self.hp.root.best_paths()[0].tree_str())
    logger.writeln(self.hp.root.best_paths()[0].weight)
    logger.writeln(self.hp.root.best_paths()[1].tree_str())
    logger.writeln(self.hp.root.best_paths()[1].weight)
def binary_expand(self, i, j):
    if logger.level >= 4:
        logger.writeln('span %s %s' % (i, j))
    new_items = Cube()
    for k in range(i + 1, j):
        for lvar, lbin in self.chart.iter_items_by_nts(i, k):
            for rvar, rbin in self.chart.iter_items_by_nts(k, j):
                for grammar in self.grammars:
                    rulebin = grammar.itg.get_sorted_rules((lvar, rvar))
                    if rulebin:
                        new_items.add_cube((rulebin, lbin, rbin),
                                           self.get_cube_op(i, j))
    for new_item in new_items.iter_top(FLAGS.bin_size):
        if logger.level >= 4:
            logger.writeln(new_item)
        self.chart.add(new_item)
def load_features(self, features_weights):
    features = []
    weights = []
    for s in features_weights:
        feature_name, weight_str = s.split(':')
        weight = float(weight_str)
        feature_class = getattr(feature_lib, feature_name, None)
        if feature_class is None:
            logger.writeln('unknown feature: %s' % feature_name)
        else:
            if feature_name.endswith('LM'):
                feature = feature_class(FLAGS.lm_order, FLAGS.lm)
            else:
                feature = feature_class()
            features.append(feature)
            weights.append(weight)
    return features, weights
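# Hypothetical input for load_features() (a sketch, not part of the original
# module): each entry is "FeatureName:weight", where FeatureName must be a
# class defined in feature_lib (names ending in 'LM' are constructed with
# (FLAGS.lm_order, FLAGS.lm)). The names below are placeholders and are not
# guaranteed members of feature_lib.
example_features_weights = ['WordPenalty:-1.0', 'MyLM:0.5']
# features, weights = decoder.load_features(example_features_weights)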
def init_split(samples, split=True):
    global SAMPLER
    logger.writeln('initialization. split=%s' % split)
    SAMPLER = init_sampler()
    for sample in timed(samples):
        if split:
            for node in sample.hg.nodes:
                node.pnt = node.nt
                node.nt = random.choice(child_symbols(node.pnt))
        for n, rule in sample.composed_rules_under(sample.hg.root):
            SAMPLER.count(rule)
            if FLAGS.type:
                # mapping from rules to nodes, and from nodes to rules
                CUT_INDEX.add(rule, sample, n)
                n.composed_rule = rule
        if FLAGS.check_index:
            CUT_INDEX.check(sample)
def em_step(self, iteration):
    ffile = open(self.ffilename)
    efile = open(self.efilename)
    afile = open(self.afilename)
    alignments = Alignment.reader_pharaoh(ffile, efile, afile)
    dirname = os.path.join(self.outputdir,
                           'iter_%s' % str(iteration + 1).rjust(3, '0'))
    os.mkdir(dirname)
    if logger.level >= 1:
        logger.writeln('\niteration %s' % (iteration + 1))
    likelihood = 0
    starttime = time.time()
    for i, alignment in enumerate(alignments, 1):
        if i % FLAGS.emtrain_log_interval == 0:
            logger.writeln('%s sentences at %s secs/sent' %
                           (i, (time.time() - starttime) / i))
            starttime = time.time()
        extractor = Extractor(maxabslen=100000,
                              maxlen=10000,
                              minhole=1,
                              maxvars=100000,
                              lexical_weighter=self.lexical_weighter,
                              forbid_adjacent=self.forbid_adjacent,
                              maximize_derivation=self.maximize_derivation,
                              require_aligned_terminal=self.require_aligned_terminal)
        hg = extractor.extract_hypergraph(alignment)
        if hg is None:
            continue
        # compute expected counts
        self.compute_expected_counts(hg)
        likelihood += hg.root.inside
        treefilename = os.path.join(dirname, 'tree_%s' % str(i).rjust(8, '0'))
        self.write_viterbi_tree(hg, treefilename)
        #for edge in hg.edges():
        #    logger.writeln('%s %s' % (self.counter.get_prob(edge.rule),
        #                              edge.rule))
    if logger.level >= 1:
        logger.writeln('likelihood: %s' % likelihood)
    if logger.level >= 1:
        logger.writeln('normalizing...')
    self.counter.normalize_vbdp(self.alpha, self.threshold)
    if logger.level >= 1:
        logger.writeln('prob table size: %s' % len(self.counter.prob))
def compute_gradient(self):
    self.collect_expected_feature_counts()
    self.collect_expected_products()
    result = []
    for i in range(self.feature_n):
        # print('feature %s' % i)
        gradient = 0
        tmp = 0
        for n in range(self.max_n):
            # print('clip count %s-gram gradient' % (n+1))
            if self.expected_clipped_counts[n] == 0:
                continue
            clipped_count_grad = \
                self.ep_clipped_count_feature[n][i] - \
                self.expected_clipped_counts[n] * \
                self.expected_feature_counts[i]
            # print(clipped_count_grad)
            tmp += clipped_count_grad / self.expected_clipped_counts[n]
        gradient += tmp / self.max_n
        tmp = 0
        for n in range(self.max_n):
            # print('count %s-gram gradient' % (n+1))
            if self.expected_counts[n] == 0:
                continue
            count_grad = self.ep_count_feature[n][i] - \
                self.expected_counts[n] * self.expected_feature_counts[i]
            # print(count_grad)
            tmp += count_grad / self.expected_counts[n]
        gradient -= tmp / self.max_n
        # brevity penalty
        if self.expected_counts[0] < self.ref_length:
            gradient += (self.ep_count_feature[0][i] -
                         self.expected_counts[0] * self.expected_feature_counts[i]) * \
                self.ref_length / self.expected_counts[0] ** 2
        result.append(gradient)
    if logger.level >= 1:
        logger.writeln('gradient: %s' % result)
    return result
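# For reference (an interpretation, not part of the original code): each
# "ep_*[n][i] - expected_*[n] * expected_feature_counts[i]" term above is the
# standard gradient identity for an expectation under a log-linear model,
#
#     d/dtheta_i E[c] = E[c * f_i] - E[c] * E[f_i],
#
# so dividing by expected_*[n] gives d/dtheta_i log E[c]. The two loops then
# average these log-count gradients over n-gram orders, which is consistent
# with differentiating a BLEU-like log-precision objective, and the final
# term adds the brevity-penalty part.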
def handle(self, conn):
    msg = conn.recv(1024).decode()
    sid, status = msg.split()
    if status == 'ready':
        if self.i <= self.njobs:
            conn.send(('%s\n' % self.i).encode())
            self.sid2job[sid] = (self.i, time())
            self.i += 1
        else:
            conn.send('0'.encode())
            self.nterminated += 1
            if logger.level >= 1:
                logger.writeln()
                logger.writeln('slave %s told to terminate' % sid)
    elif status == 'ok':
        self.nfinished += 1
        jid, start_time = self.sid2job[sid]
        self.total_time += time() - start_time
        self.joblogger.log(jid)
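# A minimal sketch of the slave side of the protocol implied by serve() and
# handle() above (not part of the original module): the slave announces
# "<sid> ready", receives a job id ('0' means terminate), runs the job, then
# reports "<sid> ok". One connection per message and the run_job callback are
# assumptions here.
import socket

def slave_loop(sid, server_host, port, run_job):
    while True:
        conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        conn.connect((server_host, port))
        conn.send(('%s ready' % sid).encode())
        jid = int(conn.recv(1024).decode().strip())
        conn.close()
        if jid == 0:  # master has no more jobs; terminate
            break
        run_job(jid)
        conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        conn.connect((server_host, port))
        conn.send(('%s ok' % sid).encode())
        conn.close()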
def __init__(self, m, lmfile):
    Feature.__init__(self)
    self.stateless = False
    self.m = m
    self.lmfile = lmfile
    self.ngram_enum = NgramEnumerator(self.m)
    if FLAGS.use_python_lm:
        from python_lm import LanguageModel
    else:
        from swig_lm import LanguageModel
    logger.writeln('reading LM: %s' % self.lmfile)
    if FLAGS.use_python_lm:
        self.lm = LanguageModel(self.lmfile)
        self.getcost = self.lm.get
    else:
        self.lm = LanguageModel(self.m, self.lmfile)
        self.getcost = self.lm
def pop(self):
    "Return None if agenda is empty"
    while True:
        try:
            h, item = heappop(self.items)
        except IndexError:  # empty heap, return None
            break
        if item.dead:  # item pruned in chart
            if logger.level >= 5:
                logger.writeln('pop dead item: %s' % item)
                logger.writeln(item)
                logger.writeln(item.incoming[0].rule)
            self.deadpop += 1
        else:
            if logger.level >= 4:
                logger.writeln('pop: %s' % item)
                logger.writeln(item.incoming[0].rule)
            self.popped += 1
            return item
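# A standalone sketch of the lazy-deletion idiom used by this agenda (an
# illustration, not part of the original module): items pruned elsewhere are
# only flagged as dead and are skipped when popped, instead of being removed
# from the heap eagerly.
from heapq import heappush, heappop

class LazyAgenda(object):
    def __init__(self):
        self.items = []

    def push(self, priority, item):
        heappush(self.items, (priority, item))

    def pop(self):
        while self.items:
            _, item = heappop(self.items)
            if not getattr(item, 'dead', False):
                return item
        return None  # empty agenda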
def probe(self, cpu_reserve, mem_reserve):
    """return number of decoder instances that can run on this node."""
    result = 0
    if self.node:
        status = system('ssh %s :' % self.node)
        if status != 0:
            logger.writeln('%s down' % self.node)
        else:
            cpu = self.cpu_usage()
            mem = self.mem_free()
            if logger.level >= 1:
                logger.writeln('cpu usage: %.1f%%' % cpu)
                logger.writeln('mem free: %s kB' % mem)
            result = int(min((100 - cpu) / cpu_reserve, mem / mem_reserve))
            if logger.level >= 1:
                logger.writeln('%s decoder instances will start on %s' %
                               (result, self.node))
                logger.writeln()
            system('rm -f %s' % self.tmp_stat)
            system('rm -f %s' % self.tmp_meminfo)
    return result
def parse_agenda(self):
    while len(self.agenda) > 0:
        item = self.agenda.pop()
        if logger.level >= 4:
            logger.writeln('pop: %s' % item)
        for item1, item2, inverted in self.neighboring_pairs(item):
            # avoid duplicated edges. note that in ABC grammar,
            # if the boxes of item1 and item2 are given, the nt of the
            # new item is fixed
            if logger.level >= 4:
                logger.writeln('neighbors: %s %s' % (item1, item2))
            key = (item1.nt, item1.fi, item1.fj, item1.ei, item1.ej,
                   item2.nt, item2.fi, item2.fj, item2.ei, item2.ej)
            if key not in self.edge_index:
                self.edge_index.add(key)
                new_item = self.make_item(item1, item2, inverted)
                if self.chart_add(new_item):
                    self.agenda.append(new_item)
                    self.neighbor_index.add(new_item)
                    self.glue_nodes.append(new_item)
                    if logger.level >= 4:
                        logger.writeln('push: %s' % new_item)
        # self.stats()
    root = self.final_glue()
    self.hg = Hypergraph(root)
    self.hg.topo_sort()
    self.stats()
    return self.hg
def load(self, filename):
    if logger.level >= 1:
        logger.writeln('loading rules from %s...' % filename)
    percent_counter = PercentCounter(input=filename, file=logger.file)
    f = open(filename)
    for i, line in enumerate(f):
        if logger.level >= 1:
            percent_counter.print_percent(i)
        try:
            rule = Rule()
            rule.fromstr(line)
        except AssertionError:
            logger.write('bad rule: %s %s: %s\n' % (filename, i, line))
            self.nbadrules += 1
            continue
        rule.grammar = self  # used in computing feature scores
        self.features.score_rule(rule)
        if rule.arity == 0:
            self.lexgrammar.add(rule)
        else:
            self.itg.add(rule)
    f.close()
    if logger.level >= 1:
        logger.writeln()
        logger.writeln(self.stats())
def alignVerbalization(tok_seq, lemma_seq, amr, verb_list, all_alignments,
                       verb_map, aligned_toks, node_alignment, multi_map):
    matched_tuples = set()
    for (idx, curr_tok) in enumerate(tok_seq):
        if idx in aligned_toks:
            continue
        if not curr_tok in verb_list:
            curr_tok = lemma_seq[idx]
        if curr_tok in verb_list:
            for subgraph in verb_list[curr_tok]:
                matched_frags = amr.matchSubgraph(subgraph)
                if matched_frags:
                    subgraph_repr = subgraph_str(subgraph)
                    if len(matched_frags) > 1:
                        logger.writeln("Verbalize %s to more than 1 occurrence!" % curr_tok)
                    for frag_tuples in matched_frags:
                        valid = True
                        for (head, rel, tail) in frag_tuples:
                            if (head, rel, tail) in matched_tuples:
                                valid = False
                                break
                            matched_tuples.add((head, rel, tail))
                        if valid:
                            logger.writeln("Verbalize %d-%d, %s to %s!" %
                                           (idx, idx + 1, curr_tok, subgraph_repr))
                            aligned_toks.add(idx)
                            for (head, rel, tail) in frag_tuples:
                                verb_map[head].add((head, rel, tail))
                                node_alignment[head] = 1
                                node_alignment[tail] = 1
                            all_alignments[head].append((idx, idx + 1, subgraph_repr, "MULT"))
                            head = frag_tuples[0][0]
                            # head_concept = amr.nodes[head].node_str()
                            multi_map[subgraph_repr] += 1
                            break
def add(self, item):
    added = False
    bin_idx = self.key(item)
    if bin_idx:  # discard items with None key
        bin = self.bins.setdefault(bin_idx,
                                   self.binclass(FLAGS.bin_size, self))
        # preprune
        if not FLAGS.use_simple_bin and item.rank_cost() > bin.cutoff:
            if logger.level >= 4:
                logger.writeln('prepruned: %s' % item)
            self.prepruned += 1
        # TODO: a hack: ban unary negative deduction,
        # only for ghkm rules
        elif item.incoming[0].rule.arity == 1 and len(item.incoming[0].rule.f) == 1 and \
                item.incoming[0].cost <= 0 and \
                item.incoming[0].rule.grammar is not None and \
                'ghkm' in item.incoming[0].rule.grammar.name:
            if logger.level >= 4:
                logger.write('negative unary deduction for ghkm banned: %s' % item)
            self.neg_unary_pruned += 1
        # ban negative deduction
        elif FLAGS.ban_negative_deduction and item.incoming[0].cost <= 0:
            if logger.level >= 4:
                logger.writeln('negative deduction banned: %s' % item.incoming[0])
            self.negcost_pruned += 1
        # unary cycle banned
        elif item.unary_cycle():
            if logger.level >= 4:
                logger.writeln('unary cycle broken: %s' % item)
            self.unary_cycle_broken += 1
        # merging needed
        elif (not FLAGS.use_simple_bin) and item in self.index:
            oldcost, olditem = self.index[item]
            item_merged = item.merge(olditem)
            if item_merged:  # old item better
                if logger.level >= 4:
                    logger.writeln('merged: %s' % item)
            else:  # new item better
                bin.add(item)
                if not FLAGS.use_simple_bin:
                    bin.ndead += 1
                added = True
            self.merged += 1
        # no need to merge
        else:
            bin.add(item)
            added = True
    return added
def push(self, item):
    if logger.level >= 4:
        logger.writeln('push:')
        logger.writeln(item)
        logger.writeln(item.incoming[0])
    h = self.heuristic(item)
    heappush(self.items, (h, item))
    self.pushed += 1
def linearize_amr(args):
    logger.file = open(os.path.join(args.run_dir, 'logger'), 'w')

    amr_file = os.path.join(args.data_dir, 'aligned_amr_nosharp')
    alignment_file = os.path.join(args.data_dir, 'alignment')
    sent_file = os.path.join(args.data_dir, 'sentence')
    tok_file = os.path.join(args.data_dir, 'token')
    #lemma_file = os.path.join(args.data_dir, 'lemma')
    pos_file = os.path.join(args.data_dir, 'pos')

    amr_graphs = load_amr_graphs(amr_file)
    alignments = [line.strip().split() for line in open(alignment_file, 'r')]
    sents = [line.strip().split() for line in open(sent_file, 'r')]
    toks = [line.strip().split() for line in open(tok_file, 'r')]
    #lemmas = [line.strip().split() for line in open(lemma_file, 'r')]
    poss = [line.strip().split() for line in open(pos_file, 'r')]

    assert len(amr_graphs) == len(alignments) and len(amr_graphs) == len(sents) and \
        len(amr_graphs) == len(toks) and len(amr_graphs) == len(poss), \
        '%d %d %d %d %d' % (len(amr_graphs), len(alignments), len(sents), len(toks), len(poss))
    #assert len(amr_graphs) == len(alignments) and len(amr_graphs) == len(sents) and len(amr_graphs) == len(toks) and len(amr_graphs) == len(lemmas) and len(amr_graphs) == len(poss), '%d %d %d %d %d %d' % (len(amr_graphs), len(alignments), len(sents), len(toks), len(lemmas), len(poss))

    #lemma_map = initialize_lemma(args.lemma)

    num_self_cycle = 0
    used_sents = 0

    amr_statistics = AMR_stats()

    for (sent_index, (sent_seq, tok_seq, pos_seq, alignment_seq, amr_graph)) in enumerate(
            zip(sents, toks, poss, alignments, amr_graphs)):
        logger.writeln('Sentence #%d' % (sent_index + 1))
        logger.writeln(str(amr_graph))

        #if sent_index > 100:
        #    break

        edge_alignment = bitarray(len(amr_graph.edges))
        if edge_alignment.count() != 0:
            edge_alignment ^= edge_alignment
        assert edge_alignment.count() == 0

        has_cycle = False
        if amr_graph.check_self_cycle():
            num_self_cycle += 1
            has_cycle = True
            #logger.writeln('self cycle detected')

        amr_graph.set_sentence(tok_seq)
        #amr_graph.set_lemmas(lemma_seq)
        amr_graph.set_poss(pos_seq)

        aligned_fragments = []
        reentrancies = {}  #Map multiple spans as reentrancies, keeping only one as original, others as connections

        has_multiple = False
        no_alignment = False

        aligned_set = set()

        #all_frags = []
        #(opt_toks, role_toks, aligned_fragments) = extract_fragments(alignment_seq, amr_graph)
        ##logger.writeln(str(opt_toks))
        ##logger.writeln(str(role_toks))
        #if not aligned_fragments:
        #    logger.writeln('wrong alignments')
        #    continue

        #temp_aligned = set(aligned_fragments.keys())
        #aligned_fragments = sorted(aligned_fragments.items(), key=lambda frag: frag[0])

        #temp_unaligned = set(xrange(len(pos_seq))) - temp_aligned

        (entity_frags, root2entityfrag, root2entitynames) = amr_graph.extract_all_entities()

        new_graph = AMRGraph.collapsed_graph(amr_graph, root2entityfrag, root2entitynames)
        logger.writeln(str(new_graph))
        #logger.writeln(amr_graph.collapsed_form(root2entityfrag, root2entitynames))

        (relation_nums, entity_nums, predicate_nums, variable_nums, const_nums, reentrancy_nums) = \
            amr_graph.statistics(root2entityfrag, root2entitynames)
        amr_statistics.update(reentrancy_nums, predicate_nums, variable_nums, const_nums, entity_nums, relation_nums)
def linearize_amr(args):
    logger.file = open(os.path.join(args.run_dir, 'logger'), 'w')

    amr_file = os.path.join(args.data_dir, 'amr')
    alignment_file = os.path.join(args.data_dir, 'alignment')
    if args.use_lemma:
        tok_file = os.path.join(args.data_dir, 'lemmatized_token')
    else:
        tok_file = os.path.join(args.data_dir, 'token')
    pos_file = os.path.join(args.data_dir, 'pos')

    amr_graphs = load_amr_graphs(amr_file)
    alignments = [line.strip().split() for line in open(alignment_file, 'r')]
    toks = [line.strip().split() for line in open(tok_file, 'r')]
    poss = [line.strip().split() for line in open(pos_file, 'r')]

    assert len(amr_graphs) == len(alignments) and len(amr_graphs) == len(toks) and \
        len(amr_graphs) == len(poss), \
        '%d %d %d %d' % (len(amr_graphs), len(alignments), len(toks), len(poss))

    num_self_cycle = 0
    used_sents = 0

    amr_statistics = AMR_stats()

    if args.use_stats:
        amr_statistics.loadFromDir(args.stats_dir)
        #print amr_statistics
    else:
        os.system('mkdir -p %s' % args.stats_dir)
        amr_statistics.collect_stats(amr_graphs)
        amr_statistics.dump2dir(args.stats_dir)

    if args.parallel:
        singleton_num = 0.0
        multiple_num = 0.0
        total_num = 0.0
        empty_num = 0.0

        amr_seq_file = os.path.join(args.run_dir, 'amrseq')
        tok_seq_file = os.path.join(args.run_dir, 'tokseq')
        map_seq_file = os.path.join(args.run_dir, 'train_map')

        amrseq_wf = open(amr_seq_file, 'w')
        tokseq_wf = open(tok_seq_file, 'w')
        mapseq_wf = open(map_seq_file, 'w')

        for (sent_index, (tok_seq, pos_seq, alignment_seq, amr)) in enumerate(
                zip(toks, poss, alignments, amr_graphs)):
            logger.writeln('Sentence #%d' % (sent_index + 1))
            logger.writeln(' '.join(tok_seq))

            amr.setStats(amr_statistics)

            edge_alignment = bitarray(len(amr.edges))
            if edge_alignment.count() != 0:
                edge_alignment ^= edge_alignment
            assert edge_alignment.count() == 0

            has_cycle = False
            if amr.check_self_cycle():
                num_self_cycle += 1
                has_cycle = True

            amr.set_sentence(tok_seq)
            amr.set_poss(pos_seq)

            aligned_fragments = []
            reentrancies = {}  #Map multiple spans as reentrancies, keeping only one as original, others as connections

            has_multiple = False
            no_alignment = False

            aligned_set = set()

            (opt_toks, role_toks, node_to_span, edge_to_span, temp_aligned) = extractNodeMapping(alignment_seq, amr)

            temp_unaligned = set(xrange(len(pos_seq))) - temp_aligned

            all_frags = []
            all_alignments = defaultdict(list)

            ####Extract named entities#####
            for (frag, wiki_label) in amr.extract_entities():
                if len(opt_toks) == 0:
                    logger.writeln("No alignment for the entity found")

                (aligned_indexes, entity_spans) = all_aligned_spans(frag, opt_toks, role_toks, temp_unaligned)
                root_node = amr.nodes[frag.root]

                entity_mention_toks = root_node.namedEntityMention()

                total_num += 1.0
                if entity_spans:
                    entity_spans = removeRedundant(tok_seq, entity_spans, entity_mention_toks)
                    if len(entity_spans) == 1:
                        singleton_num += 1.0
                        logger.writeln('Single fragment')
                        for (frag_start, frag_end) in entity_spans:
                            logger.writeln(' '.join(tok_seq[frag_start:frag_end]))
                            all_alignments[frag.root].append((frag_start, frag_end, wiki_label))
                            temp_aligned |= set(xrange(frag_start, frag_end))
                    else:
                        multiple_num += 1.0
                        logger.writeln('Multiple fragment')
                        logger.writeln(aligned_indexes)
                        logger.writeln(' '.join([tok_seq[index] for index in aligned_indexes]))

                        for (frag_start, frag_end) in entity_spans:
                            logger.writeln(' '.join(tok_seq[frag_start:frag_end]))
                            all_alignments[frag.root].append((frag_start, frag_end, wiki_label))
                            temp_aligned |= set(xrange(frag_start, frag_end))
                else:
                    empty_num += 1.0

            ####Process date entities
            date_entity_frags = amr.extract_all_dates()
            for frag in date_entity_frags:
                all_date_indices, index_to_attr = getDateAttr(frag)
                covered_toks, non_covered, index_to_toks = getSpanSide(tok_seq, alignment_seq, frag, temp_unaligned)

                covered_set = set(covered_toks)

                all_spans = getContinuousSpans(covered_toks, temp_unaligned, covered_set)
                if all_spans:
                    temp_spans = []
                    for span_start, span_end in all_spans:
                        if span_start > 0 and (span_start - 1) in temp_unaligned:
                            if tok_seq[span_start - 1] in str(frag) and tok_seq[0] in '0123456789':
                                temp_spans.append((span_start - 1, span_end))
                            else:
                                temp_spans.append((span_start, span_end))
                        else:
                            temp_spans.append((span_start, span_end))
                    all_spans = temp_spans
                    all_spans = removeDateRedundant(all_spans)
                    for span_start, span_end in all_spans:
                        all_alignments[frag.root].append((span_start, span_end, None))
                        temp_aligned |= set(xrange(span_start, span_end))
                        if len(non_covered) == 0:
                            print 'Dates: %s' % ' '.join(tok_seq[span_start:span_end])
                else:
                    for index in temp_unaligned:
                        curr_tok = tok_seq[index]
                        found = False
                        for un_tok in non_covered:
                            if curr_tok[0] in '0123456789' and curr_tok in un_tok:
                                print 'recovered: %s' % curr_tok
                                found = True
                                break
                        if found:
                            all_alignments[frag.root].append((index, index + 1, None))
                            temp_aligned.add(index)
                            print 'Date: %s' % tok_seq[index]

            #Verbalization list
            verb_map = {}
            for (index, curr_tok) in enumerate(tok_seq):
                if curr_tok in VERB_LIST:
                    for subgraph in VERB_LIST[curr_tok]:
                        matched_frags = amr.matchSubgraph(subgraph)
                        if matched_frags:
                            temp_aligned.add(index)
                            for (node_index, ex_rels) in matched_frags:
                                all_alignments[node_index].append((index, index + 1, None))
                                verb_map[node_index] = subgraph

            #####Load verbalization list #####
            for node_index in node_to_span:
                if node_index in all_alignments:
                    continue
                all_alignments[node_index] = node_to_span[node_index]

            ##Based on the alignment from node index to spans in the string
            temp_unaligned = set(xrange(len(pos_seq))) - temp_aligned

            assert len(tok_seq) == len(pos_seq)

            amr_seq, cate_tok_seq, map_seq = categorizeParallelSequences(
                amr, tok_seq, all_alignments, temp_unaligned, verb_map,
                args.min_prd_freq, args.min_var_freq)
            print >> amrseq_wf, ' '.join(amr_seq)
            print >> tokseq_wf, ' '.join(cate_tok_seq)
            print >> mapseq_wf, '##'.join(map_seq)  #To separate single space

        amrseq_wf.close()
        tokseq_wf.close()
        mapseq_wf.close()

        #print "one to one alignment: %lf" % (singleton_num/total_num)
        #print "one to multiple alignment: %lf" % (multiple_num/total_num)
        #print "one to empty alignment: %lf" % (empty_num/total_num)
    else:  #Only build the linearized token sequence
        mle_map = loadMap(args.map_file)
        if args.use_lemma:
            tok_file = os.path.join(args.data_dir, 'lemmatized_token')
        else:
            tok_file = os.path.join(args.data_dir, 'token')

        ner_file = os.path.join(args.data_dir, 'ner')
        date_file = os.path.join(args.data_dir, 'date')

        all_entities = identify_entities(tok_file, ner_file, mle_map)
        all_dates = dateMap(date_file)

        tokseq_result = os.path.join(args.data_dir, 'linearized_tokseq')
        dev_map_file = os.path.join(args.data_dir, 'cate_map')
        tokseq_wf = open(tokseq_result, 'w')
        dev_map_wf = open(dev_map_file, 'w')

        for (sent_index, (tok_seq, pos_seq, entities_in_sent)) in enumerate(zip(toks, poss, all_entities)):
            print 'snt: %d' % sent_index
            n_toks = len(tok_seq)
            aligned_set = set()

            all_spans = []
            date_spans = all_dates[sent_index]
            date_set = set()

            #Align dates
            for (start, end) in date_spans:
                if end - start > 1:
                    new_aligned = set(xrange(start, end))
                    aligned_set |= new_aligned
                    entity_name = ' '.join(tok_seq[start:end])
                    if entity_name in mle_map:
                        entity_typ = mle_map[entity_name]
                    else:
                        entity_typ = ('DATE', "date-entity", "NONE")
                    all_spans.append((start, end, entity_typ))
                    print 'Date:', start, end
                else:
                    date_set.add(start)

            #First align multi tokens
            for (start, end, entity_typ) in entities_in_sent:
                if end - start > 1:
                    new_aligned = set(xrange(start, end))
                    if len(aligned_set & new_aligned) != 0:
                        continue
                    aligned_set |= new_aligned
                    entity_name = ' '.join(tok_seq[start:end])
                    if entity_name in mle_map:
                        entity_typ = mle_map[entity_name]
                    else:
                        entity_typ = ('NE_person', "person", '-')
                    all_spans.append((start, end, entity_typ))

            #Single token
            for (index, curr_tok) in enumerate(tok_seq):
                if index in aligned_set:
                    continue
                curr_pos = pos_seq[index]
                aligned_set.add(index)

                if curr_tok in mle_map:
                    (category, node_repr, wiki_label) = mle_map[curr_tok]
                    if category.lower() == 'none':
                        all_spans.append((index, index + 1, (curr_tok, "NONE", "NONE")))
                    else:
                        all_spans.append((index, index + 1, mle_map[curr_tok]))
                else:
                    if curr_tok[0] in '\"\'.':
                        print 'weird token: %s, %s' % (curr_tok, curr_pos)
                        continue
                    if index in date_set:
                        entity_typ = ('DATE', "date-entity", "NONE")
                        all_spans.append((index, index + 1, entity_typ))
                    elif curr_tok in VERB_LIST:
                        node_repr = VERB_LIST[curr_tok][0].keys()[0]
                        entity_typ = ('VERBAL', node_repr, "NONE")
                        all_spans.append((index, index + 1, entity_typ))
                    elif curr_pos[0] == 'V':
                        node_repr = '%s-01' % curr_tok
                        all_spans.append((index, index + 1, ('-VERB-', node_repr, "NONE")))
                    else:
                        node_repr = curr_tok
                        all_spans.append((index, index + 1, ('-SURF-', curr_tok, "NONE")))

            all_spans = sorted(all_spans, key=lambda span: (span[0], span[1]))
            print all_spans
            linearized_tokseq, map_repr_seq = getIndexedForm(all_spans)

            print >> tokseq_wf, ' '.join(linearized_tokseq)
            print >> dev_map_wf, '##'.join(map_repr_seq)

        tokseq_wf.close()
        dev_map_wf.close()