import sys
import json

from amr import JAMR_CorpusReader

# Note: the create_*_rules helpers and the shared entity_rules_json dict
# are defined elsewhere in this module.


def main():
    cr = JAMR_CorpusReader()
    cr.load_amrs(sys.argv[1], verbose=False)

    all_entities = []
    for amr in cr.amrs:
        for node_id in amr.alignments:
            # Get entity info: the tokens this node is aligned to and all
            # nodes aligned to the same span.
            token_ids = amr.alignments[node_id]
            if not token_ids:
                continue
            nodes = amr.alignmentsToken2Node(token_ids[0])
            if len(nodes) <= 1:
                continue
            entity_sg = amr.findSubGraph(nodes)
            root = entity_sg.root
            # Only process each entity once, from its root node.
            if node_id != root:
                continue
            edges = entity_sg.edges
            if not edges:
                continue
            # Skip subgraphs that are just a node plus :polarity or :mode.
            if len(edges) == 1 and edges[0][1] in [':polarity', ':mode']:
                continue
            # Token ids are 1-based; guard against out-of-range ids.
            tokens = [amr.tokens[t - 1] for t in token_ids
                      if 0 < t <= len(amr.tokens)]
            # Leaf nodes (no outgoing edges) carry the entity's values;
            # the remaining nodes define the entity type.
            final_nodes = [n for n in nodes
                           if not any(e[0] == n for e in edges)]
            entity_type = ','.join(amr.nodes[n] for n in nodes
                                   if n not in final_nodes)
            nodes = {n: amr.nodes[n] for n in nodes}
            all_entities.append((amr, entity_type, tokens, root, nodes, edges))

    create_fixed_rules(all_entities)
    create_var_rules(all_entities)
    create_name_rules(all_entities)
    create_date_entity_rules(all_entities)
    create_normalization_rules()

    print('[entity rules] Writing rules')
    with open('../entity_rules.json', 'w+', encoding='utf8') as f:
        json.dump(entity_rules_json, f, sort_keys=True)
    print('[entity rules] Fixed:', len(entity_rules_json['fixed']))
    print('[entity rules] Variable:', len(entity_rules_json['var']))
    print('[entity rules] Date-entity:', len(entity_rules_json['date-entity']))
    print('[entity rules] Named entity:', len(entity_rules_json['names']))
    print('[entity rules] Normalize:',
          sum(len(x) for x in entity_rules_json['normalize'].values()))
    print('[entity rules] Done')
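# Usage sketch for the rule-extraction script above (the file name is
# hypothetical; the only required argument is a JAMR-aligned AMR corpus):
#
#     python extract_entity_rules.py ../data/train.txt
#
# The rules are written to ../entity_rules.json relative to the working
# directory.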
import sys
from collections import Counter

from amr import JAMR_CorpusReader


def main():
    cr = JAMR_CorpusReader()
    cr.load_amrs(sys.argv[1], verbose=False)

    # Count node labels that have no token alignment at all.
    special_alignments = Counter()
    for amr in cr.amrs:
        for node_id in amr.nodes:
            if node_id not in amr.alignments or not amr.alignments[node_id]:
                special_alignments[amr.nodes[node_id]] += 1

    # Print labels from most to least frequent.
    for special in sorted(special_alignments,
                          reverse=True,
                          key=lambda x: special_alignments[x]):
        print(special.strip(), special_alignments[special])
import re
import sys
from collections import Counter

from amr import JAMR_CorpusReader


def main():
    cr = JAMR_CorpusReader()
    cr.load_amrs(sys.argv[1], verbose=False)

    special_alignments = {}
    for amr in cr.amrs:
        for node_id in amr.alignments:
            aligned_token_ids = amr.alignments[node_id]
            aligned_node_ids = amr.alignmentsToken2Node(aligned_token_ids[0])
            # Drop string literals (e.g. name operands).
            aligned_node_ids = [
                n for n in aligned_node_ids if '"' not in amr.nodes[n]
            ]
            if len(aligned_node_ids) <= 1:
                continue
            subgraph = amr.findSubGraph(aligned_node_ids)
            # Normalize named entities.
            if len(subgraph.edges) == 1 and subgraph.edges[0][1] == ':name':
                subgraph.nodes[subgraph.root] = '[entity]'
            # Normalize numbers and quantities.
            for n in subgraph.nodes:
                if re.match('[0-9]+', subgraph.nodes[n]):
                    subgraph.nodes[n] = '[NUM]'
                if subgraph.nodes[n].endswith('quantity'):
                    subgraph.nodes[n] = '[quantity]'
                # if subgraph.nodes[n].endswith('entity'):
                #     subgraph.nodes[n] = '[value]'
            aligned_subgraph = str(subgraph)
            # Token ids are 1-based, as in the other scripts.
            aligned_tokens = ' '.join(amr.tokens[x - 1]
                                      for x in aligned_token_ids
                                      if 0 < x <= len(amr.tokens))
            if aligned_subgraph not in special_alignments:
                special_alignments[aligned_subgraph] = Counter()
            special_alignments[aligned_subgraph][aligned_tokens] += 1

    # Print subgraph patterns from most to least frequent, with the ten
    # most common token spans for each.
    for special in sorted(special_alignments,
                          key=lambda x: sum(special_alignments[x].values()),
                          reverse=True):
        print(special, sum(special_alignments[special].values()))
        print(special_alignments[special].most_common(10))
        print('\n')
import sys
from collections import Counter

from amr import JAMR_CorpusReader


def get_token(gold_amr, t):
    # Token ids are 1-based; return 'NA' for out-of-range ids.
    if 0 <= t - 1 < len(gold_amr.tokens):
        return gold_amr.tokens[t - 1]
    else:
        return 'NA'


if __name__ == '__main__':
    file = sys.argv[1]

    cr = JAMR_CorpusReader()
    cr.load_amrs(file)
    gold_amrs = cr.amrs

    count = 0
    sentences = set()
    rels = Counter()
    for sent_idx, gold_amr in enumerate(gold_amrs):
        for i, tok in enumerate(gold_amr.tokens):
            align = gold_amr.alignmentsToken2Node(i + 1)
            # merge alignments
            root = gold_amr.findSubGraph(align).root
            for n in gold_amr.nodes:
                if n in align:
                    continue
                edges = [(s, r, t) for s, r, t in gold_amr.edges
                         if s in align and t in align]
        if isHead:
            # Rearrange latent if necessary.
            transitions.latent.append(transitions.latent.pop(idx))
            return True
        idx -= 1
    return False


if __name__ == '__main__':
    input_file = sys.argv[1]
    gfile = sys.argv[2] if len(sys.argv) > 2 else 'oracle_amrs.txt'
    afile = sys.argv[3] if len(sys.argv) > 3 else 'oracle_actions.txt'

    cr = JAMR_CorpusReader()
    cr.load_amrs(input_file)

    oracle = AMR_Oracle(verbose=True)
    print_log("amr", "Processing oracle")
    oracle.runOracle(cr.amrs, action_file=afile, graph_file=gfile,
                     add_unaligned=0)
    for stat in oracle.stats:
        print_log("amr", stat)
        print_log("amr", oracle.stats[stat].most_common(100))
        print_log("amr", "")

    if use_addnode_rules:
        # Report how often each entity rule fired relative to how often it
        # was applicable.
        for x in transitions.entity_rule_totals:
            perc = (transitions.entity_rule_stats[x]
                    / transitions.entity_rule_totals[x])
            print(x, transitions.entity_rule_stats[x], '/',
                  transitions.entity_rule_totals[x], '=', f'{perc:.2f}')
        perc = (sum(transitions.entity_rule_stats.values())
                / sum(transitions.entity_rule_totals.values()))
        print('Totals:', f'{perc:.2f}')
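# Usage sketch (the script name is hypothetical; the second and third
# arguments are optional and default to the file names above):
#
#     python amr_oracle.py ../data/train.txt oracle_amrs.txt oracle_actions.txt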
from amr import JAMR_CorpusReader

amr_file = '../data/train.txt'
new_amr_file = '../data/train.no_wiki.txt'

cr = JAMR_CorpusReader()
cr.load_amrs(amr_file, verbose=False)
amrs = cr.amrs

sent_idx = 0
for amr in amrs:
    # Collect :wiki edges and their target nodes.
    wiki_edges = []
    wiki_nodes = []
    for s, r, t in amr.edges:
        if r == ':wiki':
            wiki_edges.append((s, r, t))
            wiki_nodes.append(t)
    # Remove them, along with any alignments to the deleted nodes.
    for e in wiki_edges:
        amr.edges.remove(e)
    for n in wiki_nodes:
        del amr.nodes[n]
        if n in amr.alignments:
            del amr.alignments[n]
        print('deleting wiki:', sent_idx)
    sent_idx += 1

with open(new_amr_file, 'w+', encoding='utf8') as f:
    for amr in amrs:
        f.write(amr.toJAMRString())
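# The input and output paths above are hardcoded; to strip :wiki from a
# different split, point amr_file and new_amr_file at, e.g., ../data/dev.txt
# and ../data/dev.no_wiki.txt before running.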
import sys

from amr import JAMR_CorpusReader

if __name__ == '__main__':
    args = sys.argv
    infile = args[1]

    cr = JAMR_CorpusReader()
    cr.load_amrs(infile)
    gold_amrs = cr.amrs

    # Print one alignment per line: sentence index, node id, and the
    # comma-separated token ids the node is aligned to; sentences are
    # separated by a blank line.
    for sentidx, amr in enumerate(gold_amrs):
        for n in amr.alignments:
            print(str(sentidx) + '\t' + n + '\t'
                  + ','.join(str(s) for s in amr.alignments[n]))
        print()
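# Illustrative output (the node ids and token ids are made up):
#
#     0	1	2,3
#     0	1.1	4
#
# i.e. tab-separated sentence index, node id, and 1-based token ids, with a
# blank line between sentences. The comparison script below reads this format.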
import sys

from amr import JAMR_CorpusReader

# Input paths: the JAMR-aligned corpus and the alignments file to compare
# against (taken from the command line here; an assumption, since the
# original script header is not shown).
amr_file = sys.argv[1]
alignments_file = sys.argv[2]

# Read alignments: one dict per sentence, mapping node id to token ids.
align_per_sent = []
with open(alignments_file, 'r') as f:
    align = dict()
    for line in f:
        if not line.strip():
            align_per_sent.append(align)
            align = dict()
            continue
        if line.startswith('[amr]'):
            continue
        sent_idx, node, tokens = line.split()
        tokens = [int(t) for t in tokens.split(',')]
        sent_idx = int(sent_idx)
        assert sent_idx == len(align_per_sent)
        align[node] = tokens

cr = JAMR_CorpusReader()
cr.load_amrs(amr_file)
amrs = cr.amrs

# Report nodes whose JAMR alignments are not contained in the new ones.
sent_idx = 0
for amr, align in zip(amrs, align_per_sent):
    print(' '.join(amr.tokens))
    for n in amr.nodes:
        jamr_align = amr.alignments[n] if n in amr.alignments else []
        all_align = align[n] if n in align else []
        if not set(all_align).issuperset(jamr_align):
            print('JAMR:', sent_idx, n, amr.nodes[n], jamr_align)
            print('All:', sent_idx, n, amr.nodes[n], all_align)
    sent_idx += 1
import sys
import json as J
from collections import Counter

from amr import JAMR_CorpusReader

# Note: fix_alignments is defined elsewhere in this module.


def main():
    cr = JAMR_CorpusReader()
    cr.load_amrs(sys.argv[1], verbose=False)

    json = {
        'size': {},
        'unaligned': {},
        'unconnected': {},
        'unrooted': {},
        'repeats': {},
        'stats': {}
    }

    all_entities = []
    unaligned_nodes = []
    unrooted_entities = []
    changes = 0
    amrs_changed = 0
    for amr in cr.amrs:
        change = fix_alignments(amr)
        changes += change
        if change > 0:
            amrs_changed += 1
        for node_id in amr.nodes:
            # Get entity info; record nodes with no token alignment.
            if node_id not in amr.alignments:
                unaligned_nodes.append(amr.nodes[node_id])
                continue
            token_ids = amr.alignments[node_id]
            if not token_ids:
                unaligned_nodes.append(amr.nodes[node_id])
                continue
            nodes = amr.alignmentsToken2Node(token_ids[0])
            if len(nodes) <= 1:
                continue
            entity_sg = amr.findSubGraph(nodes)
            root = entity_sg.root
            # Only process each entity once, from its root node.
            if node_id != root:
                continue
            edges = entity_sg.edges
            # Token ids are 1-based; guard against out-of-range ids.
            tokens = [amr.tokens[t - 1] for t in token_ids
                      if 0 < t <= len(amr.tokens)]
            # Numbers and string literals do not count towards the type.
            special_nodes = [
                n for n in nodes
                if amr.nodes[n].isdigit() or amr.nodes[n].startswith('"')
            ]
            entity_type = ','.join(sorted(
                amr.nodes[n] for n in nodes if n not in special_nodes))
            nodes = {n: amr.nodes[n] for n in nodes}
            all_entities.append(
                (amr, entity_type, tokens, root, nodes, edges, str(amr)))
            # Find edges that attach the entity to the rest of the graph
            # through a node other than its root.
            for s, r, t in amr.edges:
                if (s, r, t) in edges:
                    continue
                if len(edges) == 0:
                    continue
                if s in nodes and s != root:
                    if t not in amr.alignments or not amr.alignments[t]:
                        continue
                    label = f'{amr.nodes[root]} {amr.nodes[s]}'
                    unrooted_entities.append(
                        (entity_type, tokens, label, str(amr)))
                if t in nodes and t != root:
                    if s not in amr.alignments or not amr.alignments[s]:
                        continue
                    label = f'{amr.nodes[root]} {amr.nodes[t]}'
                    unrooted_entities.append(
                        (entity_type, tokens, label, str(amr)))

    size_counters = dict()
    unconnected_counter = Counter()
    unaligned_counter = Counter()
    unrooted_counter = Counter()
    repeated_counter = Counter()
    attachment_counter = Counter()

    for node in unaligned_nodes:
        unaligned_counter[node] += 1
    for entity_type, tokens, label, string in unrooted_entities:
        unrooted_counter[entity_type] += 1
        attachment_counter[label] += 1

    json['stats']['unrooted-attachments'] = {}
    for node in sorted(attachment_counter, reverse=True,
                       key=lambda x: attachment_counter[x]):
        json['stats']['unrooted-attachments'][node] = attachment_counter[node]

    for amr, entity_type, tokens, root, nodes, edges, string in all_entities:
        label = str(entity_type.count(',') + 1)
        if label not in size_counters:
            size_counters[label] = Counter()
        size_counters[label][entity_type] += 1
        if entity_type.count(',') + 1 > 1 and len(edges) == 0:
            unconnected_counter[entity_type] += 1
        nodes = entity_type.split(',')
        if any(nodes.count(n) > 1 for n in nodes):
            repeated_counter[entity_type] += 1

    print('Changes:', changes, 'AMRs changed:', amrs_changed)

    for label in sorted(size_counters.keys(), key=lambda x: int(x)):
        print('size', label)
        print(f'({len(size_counters[label])} types, '
              f'{sum(size_counters[label].values())} items)')
        json['stats']['size ' + label] = {
            'types': len(size_counters[label]),
            'items': sum(size_counters[label].values())
        }
        print(size_counters[label])
        json['size'][label] = {}
        for type in sorted(size_counters[label], reverse=True,
                           key=lambda x: size_counters[label][x]):
            d = {
                'count': size_counters[label][type],
                'tokens': [],
                'graphs': [],
            }
            json['size'][label][type] = d

    print('unconnected')
    print(f'({len(unconnected_counter)} types, '
          f'{sum(unconnected_counter.values())} items)')
    json['stats']['unconnected'] = {
        'types': len(unconnected_counter),
        'items': sum(unconnected_counter.values())
    }
    print(unconnected_counter)
    json['unconnected'] = {}
    for type in sorted(unconnected_counter, reverse=True,
                       key=lambda x: unconnected_counter[x]):
        d = {
            'count': unconnected_counter[type],
            'tokens': [],
            'graphs': [],
        }
        json['unconnected'][type] = d

    print('unaligned')
    print(f'({len(unaligned_counter)} types, '
          f'{sum(unaligned_counter.values())} items)')
    json['stats']['unaligned'] = {
        'types': len(unaligned_counter),
        'items': sum(unaligned_counter.values())
    }
    print(unaligned_counter)
    json['unaligned'] = {}
    for type in sorted(unaligned_counter, reverse=True,
                       key=lambda x: unaligned_counter[x]):
        d = {
            'count': unaligned_counter[type],
        }
        # Prefix numeric types so they are not mistaken for numbers.
        if type.isdigit():
            type = '<NUM>' + type
        json['unaligned'][type] = d

    print('unrooted')
    print(f'({len(unrooted_counter)} types, '
          f'{sum(unrooted_counter.values())} items)')
    json['stats']['unrooted'] = {
        'types': len(unrooted_counter),
        'items': sum(unrooted_counter.values())
    }
    print(unrooted_counter)
    json['unrooted'] = {}
    for type in sorted(unrooted_counter, reverse=True,
                       key=lambda x: unrooted_counter[x]):
        d = {
            'count': unrooted_counter[type],
            'tokens': [],
            'graphs': [],
            'attachments': []
        }
        json['unrooted'][type] = d

    print('repeats')
    print(f'({len(repeated_counter)} types, '
          f'{sum(repeated_counter.values())} items)')
    json['stats']['repeats'] = {
        'types': len(repeated_counter),
        'items': sum(repeated_counter.values())
    }
    print(repeated_counter)
    json['repeats'] = {}
    for type in sorted(repeated_counter, reverse=True,
                       key=lambda x: repeated_counter[x]):
        d = {
            'count': repeated_counter[type],
            'tokens': [],
            'graphs': [],
        }
        json['repeats'][type] = d
    print()

    # Attach example token spans (capped at 100 per type), one example
    # graph, and attachment labels to each unrooted entity type.
    for entity_type, tokens, label, string in unrooted_entities:
        tokens = ' '.join(tokens)
        if (tokens not in json['unrooted'][entity_type]['tokens']
                and len(json['unrooted'][entity_type]['tokens']) < 100):
            json['unrooted'][entity_type]['tokens'].append(tokens)
        if len(json['unrooted'][entity_type]['graphs']) < 1:
            json['unrooted'][entity_type]['graphs'].append(string)
        if label not in json['unrooted'][entity_type]['attachments']:
            json['unrooted'][entity_type]['attachments'].append(label)

    for amr, entity_type, tokens, root, nodes, edges, string in all_entities:
        tokens = ' '.join(tokens)
        size = str(entity_type.count(',') + 1)
        if (tokens not in json['size'][size][entity_type]['tokens']
                and len(json['size'][size][entity_type]['tokens']) < 100):
            json['size'][size][entity_type]['tokens'].append(tokens)
        if len(json['size'][size][entity_type]['graphs']) < 1:
            json['size'][size][entity_type]['graphs'].append(string)
        if entity_type.count(',') + 1 > 1 and len(edges) == 0:
            if (tokens not in json['unconnected'][entity_type]['tokens']
                    and len(json['unconnected'][entity_type]['tokens']) < 100):
                json['unconnected'][entity_type]['tokens'].append(tokens)
            if len(json['unconnected'][entity_type]['graphs']) < 1:
                json['unconnected'][entity_type]['graphs'].append(string)
        nodes = entity_type.split(',')
        if any(nodes.count(n) > 1 for n in nodes):
            if (tokens not in json['repeats'][entity_type]['tokens']
                    and len(json['repeats'][entity_type]['tokens']) < 100):
                json['repeats'][entity_type]['tokens'].append(tokens)
            if len(json['repeats'][entity_type]['graphs']) < 1:
                json['repeats'][entity_type]['graphs'].append(string)

    with open('alignment_analysis.json', 'w+', encoding='utf8') as f:
        J.dump(json, f)
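# A quick way to inspect the resulting file (a sketch; the top-level keys
# match those populated above):
#
#     import json
#     with open('alignment_analysis.json', encoding='utf8') as f:
#         stats = json.load(f)['stats']
#     print(stats['unconnected'], stats['unrooted'], stats['repeats'])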