def xlang_main(args):
    """Disagreement graphs for aligned cross-language AMR pairs.

    Reads parallel source/target AMR files (``args.src_amr`` /
    ``args.tgt_amr``), aligns each pair, highlights their disagreements,
    and emits optional JSON, text-alignment, and PNG outputs.

    NOTE(review): this definition is shadowed by a later ``xlang_main``
    in this file; at import time the second definition wins.
    """
    src_amr_fh = codecs.open(args.src_amr, encoding='utf8')
    tgt_amr_fh = codecs.open(args.tgt_amr, encoding='utf8')
    gold_aligned_fh = None
    if args.align_in:
        gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
    (json_fh, align_fh) = open_output_files(args)
    amrs_same_sent = []
    aligner = Amr2AmrAligner(num_best=args.num_align_read,
                             num_best_in_file=args.num_aligned_in_file)
    while True:
        # One AMR per iteration from each file; empty source line == EOF.
        (src_amr_line, src_comments) = amr_metadata.get_amr_line(src_amr_fh)
        if src_amr_line == "":
            break
        (tgt_amr_line, tgt_comments) = amr_metadata.get_amr_line(tgt_amr_fh)
        src_amr = amr_metadata.AmrMeta.from_parse(src_amr_line, src_comments,
                                                  xlang=True)
        tgt_amr = amr_metadata.AmrMeta.from_parse(tgt_amr_line, tgt_comments,
                                                  xlang=True)
        (cur_id, src_sent) = get_sent_info(src_amr.metadata)
        (tgt_id, tgt_sent) = get_sent_info(tgt_amr.metadata, dflt_id=cur_id)
        # Source and target must describe the same sentence id.
        assert cur_id == tgt_id

        (amr_graphs, smatchgraphs) = hilight_disagreement(
            [tgt_amr], src_amr, args.num_restarts,
            aligner=aligner, gold_aligned_fh=gold_aligned_fh)
        if json_fh:
            json_fh.write(json.dumps(amr_graphs[0]) + '\n')
        if align_fh:
            align_fh.write("""# ::id %s\n# ::src_snt %s\n# ::tgt_snt %s\n""" %
                           (cur_id, src_sent, tgt_sent))
            align_fh.write('\n'.join(smatchgraphs[0].get_text_alignments()) + '\n\n')
        if (args.verbose):
            print("ID: %s\n Sentence: %s\n Sentence: %s\n Score: %f" %
                  (cur_id, src_sent, tgt_sent, amr_graphs[0][1]))
        #raw_input("Press enter to continue: ")

        # Render the disagreement graph via graphviz.
        # NOTE(review): args.outdir is used unconditionally here, unlike the
        # later xlang_main which guards on `args.outdir != None` — confirm
        # callers of this version always supply an output directory.
        ag = nx.to_agraph(amr_graphs[0][0])
        ag.graph_attr['label'] = "%s\n%s" % (src_sent, tgt_sent)
        ag.layout(prog=args.layout)
        ag.draw('%s/%s.png' % (args.outdir, cur_id))

    src_amr_fh.close()
    tgt_amr_fh.close()
    gold_aligned_fh and gold_aligned_fh.close()
    close_output_files(json_fh, align_fh)
def xlang_main(args):
    """Disagreement graphs for aligned cross-language AMR pairs.

    Reads parallel source/target AMR files plus bidirectional word-alignment
    files (``args.align_src2tgt`` / ``args.align_tgt2src``), highlights each
    pair's disagreements, and emits optional JSON, text-alignment, and PNG
    outputs.

    NOTE(review): this redefines an earlier ``xlang_main`` in the same file;
    this later definition is the one that takes effect.
    """
    src_amr_fh = codecs.open(args.src_amr, encoding='utf8')
    tgt_amr_fh = codecs.open(args.tgt_amr, encoding='utf8')
    src2tgt_fh = codecs.open(args.align_src2tgt, encoding='utf8')
    tgt2src_fh = codecs.open(args.align_tgt2src, encoding='utf8')
    gold_aligned_fh = None
    if args.align_in:
        gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
    (json_fh, align_fh) = open_output_files(args)
    amrs_same_sent = []
    aligner = Amr2AmrAligner(num_best=args.num_align_read,
                             num_best_in_file=args.num_aligned_in_file,
                             src2tgt_fh=src2tgt_fh, tgt2src_fh=tgt2src_fh)
    while True:
        # One AMR per iteration from each file; empty source line == EOF.
        (src_amr_line, src_comments) = amr_metadata.get_amr_line(src_amr_fh)
        if src_amr_line == "":
            break
        (tgt_amr_line, tgt_comments) = amr_metadata.get_amr_line(tgt_amr_fh)
        src_amr = amr_metadata.AmrMeta.from_parse(src_amr_line, src_comments,
                                                  consts_to_vars=True)
        tgt_amr = amr_metadata.AmrMeta.from_parse(tgt_amr_line, tgt_comments,
                                                  consts_to_vars=True)
        (cur_id, src_sent) = get_sent_info(src_amr.metadata)
        (tgt_id, tgt_sent) = get_sent_info(tgt_amr.metadata, dflt_id=cur_id)
        # Source and target must describe the same sentence id.
        assert cur_id == tgt_id

        smatchgraphs = hilight_disagreement(
            [tgt_amr], src_amr, args.num_restarts,
            aligner=aligner, gold_aligned_fh=gold_aligned_fh)
        amr_graphs = get_disagreement_graphs(
            smatchgraphs, aligner=aligner,
            unmatch_dead_nodes=(gold_aligned_fh == None))
        if json_fh:
            json_fh.write(json_graph.dumps(amr_graphs[0]) + '\n')
        if align_fh:
            align_fh.write("""# ::id %s\n# ::src_snt %s\n# ::tgt_snt %s\n""" %
                           (cur_id, src_sent, tgt_sent))
            align_fh.write('\n'.join(smatchgraphs[0].get_text_alignments()) + '\n\n')
        if (args.verbose):
            print("ID: %s\n Sentence: %s\n Sentence: %s\n Score: %f" %
                  (cur_id, src_sent, tgt_sent, amr_graphs[0][1]))

        # PNG rendering is optional; only done when an outdir was given.
        if args.outdir != None:
            ag = nx.to_agraph(amr_graphs[0][0])
            ag.graph_attr['label'] = "%s\n%s" % (src_sent, tgt_sent)
            ag.layout(prog=args.layout)
            ag.draw('%s/%s.png' % (args.outdir, cur_id))

    src_amr_fh.close()
    tgt_amr_fh.close()
    src2tgt_fh.close()
    tgt2src_fh.close()
    gold_aligned_fh and gold_aligned_fh.close()
    close_output_files(json_fh, align_fh)
def monolingual_main(args):
    """Disagreement graphs for different annotations of a single sentence.

    Reads AMRs from ``args.infile``, grouping consecutive entries that share
    the same ``::id``. Within each group, the first AMR is treated as gold
    and the rest as test annotations; disagreements between gold and each
    test AMR are highlighted and emitted as JSON, text alignments, and PNGs.

    NOTE(review): a second, identical ``monolingual_main`` appears later in
    this file and shadows this one at import time.
    """
    infile = codecs.open(args.infile, encoding='utf8')
    gold_aligned_fh = None
    if args.align_in:
        gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
    (json_fh, align_fh) = open_output_files(args)
    amrs_same_sent = []
    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None
        if amr_line:
            # consts_to_vars only matters when alignments will be produced.
            cur_amr = amr_metadata.AmrMeta.from_parse(
                amr_line, comments,
                consts_to_vars=(gold_aligned_fh != None or align_fh != None))
            get_sent_info(cur_amr.metadata)
            if 'annotator' not in cur_amr.metadata:
                cur_amr.metadata['annotator'] = ''
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        # Flush the accumulated group when the id changes (or at EOF).
        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            gold_amr = amrs_same_sent[0]
            test_amrs = amrs_same_sent[1:]
            if len(test_amrs) == 0:
                test_amrs = [gold_amr]  # single AMR view case
                args.num_restarts = 1  # TODO make single AMR view more efficient
            smatchgraphs = hilight_disagreement(
                test_amrs, gold_amr, args.num_restarts,
                gold_aligned_fh=gold_aligned_fh)
            amr_graphs = get_disagreement_graphs(
                smatchgraphs, unmatch_dead_nodes=(gold_aligned_fh == None))
            gold_anno = gold_amr.metadata['annotator']
            sent = gold_amr.metadata['tok']

            if (args.verbose):
                print("ID: %s\n Sentence: %s\n gold anno: %s" %
                      (cur_id, sent, gold_anno))

            for (ind, a) in enumerate(test_amrs):
                (g, score) = amr_graphs[ind]
                test_anno = a.metadata['annotator']
                if json_fh:
                    json_fh.write(json_graph.dumps(g) + '\n')
                if align_fh:
                    sg = smatchgraphs[ind][0]
                    align_fh.write("""# ::id %s\n# ::tok %s\n# ::gold_anno %s\n# ::test_anno %s\n""" % \
                        (cur_id, sent, gold_anno, test_anno))
                    align_fh.write('\n'.join(sg.get_text_alignments()) + '\n\n')
                if (args.verbose):
                    print(" annotator %s score: %d" % (test_anno, score))
                # NOTE(review): args.outdir is used unconditionally here;
                # confirm callers always supply an output directory.
                ag = nx.to_agraph(g)
                ag.graph_attr['label'] = sent
                ag.layout(prog=args.layout)
                ag.draw('%s/%s_annotated_%s_%s.png' %
                        (args.outdir, cur_id, gold_anno, test_anno))

            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)

    infile.close()
    gold_aligned_fh and gold_aligned_fh.close()
    close_output_files(json_fh, align_fh)
def monolingual_main(args):
    """Disagreement graphs for different annotations of a single sentence.

    Groups consecutive AMRs from ``args.infile`` by shared ``::id``; the
    first AMR of each group is gold, the remainder are test annotations.
    Gold/test disagreements are highlighted and written as JSON, text
    alignments, and graphviz PNGs.

    NOTE(review): this is a duplicate of an earlier ``monolingual_main`` in
    the same file; this later definition is the effective one.
    """
    infile = codecs.open(args.infile, encoding='utf8')
    gold_aligned_fh = None
    if args.align_in:
        gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
    (json_fh, align_fh) = open_output_files(args)
    amrs_same_sent = []
    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None
        if amr_line:
            # consts_to_vars only matters when alignments will be produced.
            cur_amr = amr_metadata.AmrMeta.from_parse(
                amr_line, comments,
                consts_to_vars=(gold_aligned_fh != None or align_fh != None))
            get_sent_info(cur_amr.metadata)
            if 'annotator' not in cur_amr.metadata:
                cur_amr.metadata['annotator'] = ''
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        # Flush the accumulated group when the id changes (or at EOF).
        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            gold_amr = amrs_same_sent[0]
            test_amrs = amrs_same_sent[1:]
            if len(test_amrs) == 0:
                test_amrs = [gold_amr]  # single AMR view case
                args.num_restarts = 1  # TODO make single AMR view more efficient
            smatchgraphs = hilight_disagreement(
                test_amrs, gold_amr, args.num_restarts,
                gold_aligned_fh=gold_aligned_fh)
            amr_graphs = get_disagreement_graphs(
                smatchgraphs, unmatch_dead_nodes=(gold_aligned_fh == None))
            gold_anno = gold_amr.metadata['annotator']
            sent = gold_amr.metadata['tok']

            if (args.verbose):
                print("ID: %s\n Sentence: %s\n gold anno: %s" %
                      (cur_id, sent, gold_anno))

            for (ind, a) in enumerate(test_amrs):
                (g, score) = amr_graphs[ind]
                test_anno = a.metadata['annotator']
                if json_fh:
                    json_fh.write(json_graph.dumps(g) + '\n')
                if align_fh:
                    sg = smatchgraphs[ind][0]
                    align_fh.write("""# ::id %s\n# ::tok %s\n# ::gold_anno %s\n# ::test_anno %s\n""" % \
                        (cur_id, sent, gold_anno, test_anno))
                    align_fh.write('\n'.join(sg.get_text_alignments()) + '\n\n')
                if (args.verbose):
                    print(" annotator %s score: %d" % (test_anno, score))
                # NOTE(review): args.outdir is used unconditionally here;
                # confirm callers always supply an output directory.
                ag = nx.to_agraph(g)
                ag.graph_attr['label'] = sent
                ag.layout(prog=args.layout)
                ag.draw('%s/%s_annotated_%s_%s.png' %
                        (args.outdir, cur_id, gold_anno, test_anno))

            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)

    infile.close()
    gold_aligned_fh and gold_aligned_fh.close()
    close_output_files(json_fh, align_fh)
def run_main_on_file(args):
    """Convert a file of AMR parses into an RDF graph and serialize it.

    Reads AMRs from ``args.inPath``, builds an rdflib ``Graph`` describing
    each AMR's instance / relation / attribute triples under the ISI AMR
    namespaces, and writes the serialized graph (format ``args.format``)
    to ``args.outPath``.

    Side inputs read from the working directory:
      * ``xref_namespaces.txt``    — tab-separated prefix/namespace-URI table
      * ``amr-ne.txt``             — comma-separated named-entity concepts
      * ``amr-core.txt``           — comma-separated core AMR concepts
      * ``amr-core-patterns.txt``  — core concept patterns (into pattlist)

    Fixes vs. the previous revision:
      * ``pattlist`` now collects each line ``l`` of amr-core-patterns.txt
        (it previously appended the stale loop variable ``w`` repeatedly).
      * ``pattfile`` is now explicitly closed.

    NOTE(review): a second, near-identical ``run_main_on_file`` appears
    later in this file and shadows this one at import time.
    """
    try:
        import rdflib
    except ImportError:
        raise ImportError('requires rdflib')

    infile = codecs.open(args.inPath, encoding='utf8')
    outfile = open(args.outPath, 'w')

    # PropBank roles are frame-qualified ("frame.ARGn") unless disabled.
    pBankRoles = True
    if (not (args.pbankRoles == u'1')):
        pBankRoles = False

    # prefix -> namespace URI (values replaced by Namespace objects below)
    xref_namespace_lookup = {}
    with open('xref_namespaces.txt') as f:
        xref_lines = f.readlines()
    for l in xref_lines:
        line = re.split("\t", l)
        xref_namespace_lookup[line[0]] = line[1].rstrip('\r\n')

    # create the basic RDF data structure
    g = rdflib.Graph()

    # namespaces
    amr_ns = rdflib.Namespace("http://amr.isi.edu/rdf/core-amr#")
    amr_terms_ns = rdflib.Namespace("http://amr.isi.edu/rdf/amr-terms#")
    amr_data = rdflib.Namespace("http://amr.isi.edu/amr_data#")
    pb_ns = rdflib.Namespace("http://amr.isi.edu/frames/ld/v1.2.2/")
    amr_ne_ns = rdflib.Namespace("http://amr.isi.edu/entity-types#")
    up_ns = rdflib.Namespace("http://www.uniprot.org/uniprot/")
    pfam_ns = rdflib.Namespace("http://pfam.xfam.org/family/")
    ontonotes_ns = rdflib.Namespace(
        "https://catalog.ldc.upenn.edu/LDC2013T19#")

    g.namespace_manager.bind('propbank', pb_ns, replace=True)
    g.namespace_manager.bind('amr-core', amr_ns, replace=True)
    g.namespace_manager.bind('amr-terms', amr_terms_ns, replace=True)
    g.namespace_manager.bind('entity-types', amr_ne_ns, replace=True)
    g.namespace_manager.bind('amr-data', amr_data, replace=True)
    for k in xref_namespace_lookup.keys():
        temp_ns = rdflib.Namespace(xref_namespace_lookup[k])
        g.namespace_manager.bind(k, temp_ns, replace=True)
        xref_namespace_lookup[k] = temp_ns

    # Basic AMR Ontology consisting of
    #   1. concepts
    #   2. roles
    #   3. strings (which are actually going to be Literal(string)s)
    conceptClass = amr_ns.Concept
    neClass = amr_ns.NamedEntity
    frameClass = amr_ns.Frame
    roleClass = amr_ns.Role
    frameRoleClass = pb_ns.FrameRole

    g.add((conceptClass, rdflib.RDF.type, rdflib.RDFS.Class))
    g.add((conceptClass, RDFS.label, rdflib.Literal("AMR-Concept")))
    g.add((neClass, rdflib.RDF.type, conceptClass))
    g.add((neClass, RDFS.label, rdflib.Literal("AMR-EntityType")))
    # NOTE(review): the next two triples re-assert neClass with a second
    # label ("AMR-Term"); a separate "term" class may have been intended.
    g.add((neClass, rdflib.RDF.type, conceptClass))
    g.add((neClass, RDFS.label, rdflib.Literal("AMR-Term")))
    g.add((roleClass, rdflib.RDF.type, rdflib.RDFS.Class))
    g.add((roleClass, RDFS.label, rdflib.Literal("AMR-Role")))
    g.add((frameRoleClass, rdflib.RDF.type, roleClass))
    g.add((frameRoleClass, RDFS.label, rdflib.Literal("AMR-PropBank-Role")))
    g.add((frameClass, rdflib.RDF.type, conceptClass))
    g.add((frameClass, RDFS.label, rdflib.Literal("AMR-PropBank-Frame")))

    amr_count = 0
    ns_lookup = {}     # concept name -> namespace to mint it under
    class_lookup = {}  # concept name -> ontology class for rdf:type
    nelist = []
    corelist = []
    pattlist = []

    pmid_patt = re.compile('.*pmid_(\d+)_(\d+).*')
    word_align_patt = re.compile('(.*)\~e\.(.+)')
    propbank_patt = re.compile('^(.*)\-\d+$')
    opN_patt = re.compile('op(\d+)')
    arg_patt = re.compile('ARG\d+')

    # Named-entity concepts.
    with open('amr-ne.txt') as f:
        ne_lines = f.readlines()
    for l in ne_lines:
        for w in re.split(",\s*", l):
            w = w.rstrip('\r\n')
            nelist.append(w)
    for ne in nelist:
        ns_lookup[ne] = amr_ne_ns
        class_lookup[ne] = neClass

    # Core AMR concepts.
    with open('amr-core.txt') as f:
        core_lines = f.readlines()
    for l in core_lines:
        for w in re.split(",\s*", l):
            w = w.rstrip('\r\n')
            corelist.append(w)
    for c in corelist:
        ns_lookup[c] = amr_ns
        class_lookup[c] = conceptClass

    # Core concept patterns (fix: append each line, not the stale `w`).
    pattfile = codecs.open("amr-core-patterns.txt", encoding='utf8')
    for l in pattfile:
        pattlist.append(l)
    pattfile.close()

    amrs_same_sent = []
    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None
        vb_lookup = {}           # variable/role -> resolved RDF term
        label_lookup_table = {}  # name-variable -> variable it names
        xref_variables = {}      # xref variable -> owning variable
        if amr_line:
            cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments)
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        # Flush the previous sentence group when the id changes (or at EOF).
        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            amr = amrs_same_sent[0]
            (inst, rel1, rel2) = amr.get_triples2()

            temp_ns = rdflib.Namespace("http://amr.isi.edu/amr_data/" +
                                       amr.metadata['id'] + "#")
            a1 = temp_ns.root01  # reserve term root01

            # :a1 rdf:type amr:AMR .
            g.add((a1, rdflib.RDF.type, amr_ns.AMR))

            # :a1 amr:has-id "pmid_1177_7939.53"
            amr_id = amr.metadata['id']
            g.add((a1, amr_ns['has-id'], rdflib.Literal(amr_id)))
            match = pmid_patt.match(amr_id)
            if match:
                pmid = match.group(1) + match.group(2)
                g.add((a1, amr_ns['has-pmid'], rdflib.Literal(pmid)))

            # Optional per-AMR metadata triples.
            if (amr.metadata.get('snt', None) is not None):
                g.add((a1, amr_ns['has-sentence'],
                       rdflib.Literal(amr.metadata['snt'])))
            if (amr.metadata.get('date', None) is not None):
                g.add((a1, amr_ns['has-date'],
                       rdflib.Literal(amr.metadata['date'])))
            if (amr.metadata.get('amr-annotator', None) is not None):
                g.add((a1, amr_ns['has-annotator'],
                       rdflib.Literal(amr.metadata['amr-annotator'])))
            if (amr.metadata.get('tok', None) is not None):
                g.add((a1, amr_ns['has-tokens'],
                       rdflib.Literal(amr.metadata['tok'])))
            if (amr.metadata.get('alignments', None) is not None):
                g.add((a1, amr_ns['has-alignments'],
                       rdflib.Literal(amr.metadata['alignments'])))

            g.add((a1, amr_ns.root, temp_ns[amr.root]))

            # Add triples for setting types pointing to other resources.
            frames = {}
            for (p, s, o) in inst:
                o = strip_word_alignments(o, word_align_patt)
                if (ns_lookup.get(o, None) is not None):
                    # Known NE / core concept: use its registered namespace.
                    resolved_ns = ns_lookup.get(o, None)
                    o_resolved = resolved_ns[o]
                    if (class_lookup.get(o, None) is not None):
                        g.add((o_resolved, rdflib.RDF.type,
                               class_lookup.get(o, None)))
                    else:
                        raise ValueError(o_resolved +
                                         ' does not have a class assigned.')
                elif (re.search('\-\d+$', o) is not None):
                    # Sense-tagged concept (e.g. want-01) -> PropBank frame.
                    o_resolved = pb_ns[o]
                    g.add((o_resolved, rdflib.RDF.type, frameClass))
                elif (o == 'xref' and args.fixXref):
                    # xref variables are resolved later; keep them out of
                    # the graph.
                    continue
                elif (not (o == 'name')):
                    # ignore 'name' objects but add all others.
                    o_resolved = amr_terms_ns[o]
                    g.add((o_resolved, rdflib.RDF.type, conceptClass))
                else:
                    continue
                frames[s] = o
                g.add((temp_ns[s], RDF.type, o_resolved))

            # Add object properties for local links in the current AMR.
            for (p, s, o) in rel2:
                if (p == "TOP"):
                    continue
                # Do not include word positions for predicates (they are
                # more general and need not be linked to everything).
                p = strip_word_alignments(p, word_align_patt)
                o = strip_word_alignments(o, word_align_patt)
                if (p == 'name'):
                    # remember which objects have name objects
                    label_lookup_table[o] = s
                elif (p == 'xref' and args.fixXref):
                    xref_variables[o] = s
                elif (re.search('^ARG\d+$', p) is not None):
                    frameRole = frames[s] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p
                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[frameRole] = pb_ns[frameRole]
                    vb_lookup[o] = temp_ns[o]
                elif (re.search('^ARG\d+\-of$', p) is not None):
                    # Inverse role: qualified by the frame of the object.
                    frameRole = frames[o] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p
                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[frameRole] = pb_ns[frameRole]
                    vb_lookup[o] = temp_ns[o]
                else:
                    g.add((amr_terms_ns[p], rdflib.RDF.type, roleClass))
                    g.add((temp_ns[s], amr_terms_ns[p], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[p] = amr_terms_ns[p]
                    vb_lookup[o] = temp_ns[o]

            # Add data properties in the current AMR.
            labels = {}
            for (p, s, l) in rel1:
                p = strip_word_alignments(p, word_align_patt)
                l = strip_word_alignments(l, word_align_patt)
                # Build labels across multiple 'op1, op2, ... opN' links.
                opN_match = re.match(opN_patt, p)
                if (opN_match is not None and
                        label_lookup_table.get(s, None) is not None):
                    opN = int(opN_match.group(1))
                    ss = label_lookup_table[s]
                    if (labels.get(ss, None) is None):
                        labels[ss] = []
                    labels[ss].append((opN, l))
                elif (xref_variables.get(s, None) is not None and
                        p == 'value' and args.fixXref):
                    # Resolve xref values against the configured namespaces.
                    for k in xref_namespace_lookup.keys():
                        if (l.startswith(k)):
                            l2 = l[-len(l) + len(k):]  # strip prefix k
                            xref_vb = xref_variables.get(s, None)
                            resolved_xref_vb = vb_lookup.get(xref_vb, None)
                            g.add((resolved_xref_vb, amr_ns['xref'],
                                   xref_namespace_lookup[k][l2]))
                # Special treatment for propbank roles.
                elif (re.search('ARG\d+$', p) is not None):
                    frameRole = frames[s] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p
                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], rdflib.Literal(l)))
                # Otherwise, it's just a literal.
                else:
                    g.add((temp_ns[s], amr_terms_ns[p], rdflib.Literal(l)))

            # Attach accumulated name labels, ordered by opN.
            for key in labels.keys():
                labelArray = [i[1] for i in sorted(labels[key])]
                label = " ".join(labelArray)
                g.add((temp_ns[key], RDFS.label, rdflib.Literal(label)))

            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)
        amr_count = amr_count + 1

    print("%d AMRs converted" % amr_count)
    outfile.write(g.serialize(format=args.format))
    outfile.close()
    infile.close()
def main(args):
    """Main function of the smatch calculation program.

    Compares AMR pairs from the two file handles in ``args.f``, computes
    smatch match counts per pair, and either prints a document-level
    F-score (default) or writes per-sentence rows to ``args.outfile``
    (``args.ms`` mode).

    NOTE(review): this function uses Python 2 `print >>` statement syntax
    and will not parse under Python 3, while other functions in this file
    use print() calls — the file as a whole is Python-2-only.
    """
    global verbose
    global iter_num
    global single_score
    global pr_flag
    global match_num_dict
    # set the restart number
    iter_num = args.r + 1
    verbose = False
    if args.ms:
        single_score = False
    if args.v:
        verbose = True
    if args.pr:
        pr_flag = True
    # Running totals across all sentence pairs.
    total_match_num = 0
    total_test_num = 0
    total_gold_num = 0
    sent_num = 1
    prev_amr1 = ""
    outfile = open(args.outfile, 'w')
    if not single_score:
        # Header row for the per-sentence TSV output.
        outfile.write("Sentence\tText")
        if pr_flag:
            outfile.write("\tPrecision\tRecall")
        outfile.write("\tSmatch\n")
    while True:
        cur_amr1 = smatch.get_amr_line(args.f[0])
        (cur_amr2, comments) = amr_metadata.get_amr_line(args.f[1])
        if cur_amr1 == "" and cur_amr2 == "":
            break
        if(cur_amr1 == ""):
            # GULLY CHANGED THIS.
            # IF WE RUN OUT OF AVAILABLE AMRS FROM FILE 1,
            # REUSE THE LAST AVAILABLE AMR
            cur_amr1 = prev_amr1
            #print >> sys.stderr, "Error: File 1 has less AMRs than file 2"
            #print >> sys.stderr, "Ignoring remaining AMRs"
            #break
            # print >> sys.stderr, "AMR 1 is empty"
            # continue
        if(cur_amr2 == ""):
            print >> sys.stderr, "Error: File 2 has less AMRs than file 1"
            print >> sys.stderr, "Ignoring remaining AMRs"
            break
            # print >> sys.stderr, "AMR 2 is empty"
            # continue
        prev_amr1 = cur_amr1
        amr1 = amr.AMR.parse_AMR_line(cur_amr1)
        amr2 = amr.AMR.parse_AMR_line(cur_amr2)
        # We were getting screwy SMATCH scores from
        # using the amr_metadata construct
        meta_enabled_amr = amr_metadata.AmrMeta.from_parse(cur_amr2, comments)
        # Rename variables so the two AMRs cannot collide.
        test_label = "a"
        gold_label = "b"
        amr1.rename_node(test_label)
        amr2.rename_node(gold_label)
        (test_inst, test_rel1, test_rel2) = amr1.get_triples2()
        (gold_inst, gold_rel1, gold_rel2) = amr2.get_triples2()
        if verbose:
            print "AMR pair", sent_num
            print >> sys.stderr, "Instance triples of AMR 1:", len(test_inst)
            print >> sys.stderr, test_inst
            # print >> sys.stderr,"Relation triples of AMR 1:",len(test_rel)
            print >> sys.stderr, "Relation triples of AMR 1:", len(test_rel1) + len(test_rel2)
            print >>sys.stderr, test_rel1
            print >> sys.stderr, test_rel2
            # print >> sys.stderr, test_rel
            print >> sys.stderr, "Instance triples of AMR 2:", len(gold_inst)
            print >> sys.stderr, gold_inst
            # print >> sys.stderr,"Relation triples of file 2:",len(gold_rel)
            print >> sys.stderr, "Relation triples of AMR 2:", len(
                gold_rel1) + len(gold_rel2)
            #print >> sys.stderr,"Relation triples of file 2:",len(gold_rel1)+len(gold_rel2)
            print >> sys.stderr, gold_rel1
            print >> sys.stderr, gold_rel2
            # print >> sys.stderr, gold_rel
        # Pass the smaller AMR first — presumably a hill-climbing
        # optimization in smatch.get_fh; confirm against smatch docs.
        if len(test_inst) < len(gold_inst):
            (best_match, best_match_num) = smatch.get_fh(test_inst, test_rel1, test_rel2,
                                                         gold_inst, gold_rel1, gold_rel2,
                                                         test_label, gold_label)
            if verbose:
                print >> sys.stderr, "AMR pair ", sent_num
                print >> sys.stderr, "best match number", best_match_num
                print >> sys.stderr, "best match", best_match
        else:
            (best_match, best_match_num) = smatch.get_fh(gold_inst, gold_rel1, gold_rel2,
                                                         test_inst, test_rel1, test_rel2,
                                                         gold_label, test_label)
            if verbose:
                print >> sys.stderr, "Sent ", sent_num
                print >> sys.stderr, "best match number", best_match_num
                print >> sys.stderr, "best match", best_match
        if not single_score:
            # Per-sentence row: currently only the tokenized sentence is
            # written; the per-sentence P/R/F computation is commented out.
            #(precision,
            # recall,
            # best_f_score) = smatch.compute_f(best_match_num,
            #                                  len(test_rel1) + len(test_inst) + len(test_rel2),
            #                                  len(gold_rel1) + len(gold_inst) + len(gold_rel2))
            outfile.write( str(meta_enabled_amr.metadata.get("tok", None)) )
            #if pr_flag:
            #    outfile.write( "\t%.2f" % precision )
            #    outfile.write( "\t%.2f" % recall )
            #outfile.write( "\t%.2f" % best_f_score )
            print sent_num
            outfile.write( "\n" )
        total_match_num += best_match_num
        total_test_num += len(test_rel1) + len(test_rel2) + len(test_inst)
        total_gold_num += len(gold_rel1) + len(gold_rel2) + len(gold_inst)
        # match_num_dict is smatch's memo table; cleared between pairs.
        match_num_dict.clear()
        sent_num += 1
        # print "F-score:",best_f_score
    if verbose:
        print >> sys.stderr, "Total match num"
        print >> sys.stderr, total_match_num, total_test_num, total_gold_num
    if single_score:
        (precision, recall, best_f_score) = smatch.compute_f(
            total_match_num, total_test_num, total_gold_num)
        if pr_flag:
            print "Precision: %.2f" % precision
            print "Recall: %.2f" % recall
        print "Document F-score: %.2f" % best_f_score
    args.f[0].close()
    args.f[1].close()
    outfile.close()
def run_main(args):
    """Convert a file of AMR parses to JSON-LD-style objects and dump them.

    Reads AMRs from ``args.infile`` (grouped by ``::id``; only the first
    AMR of each group is converted), builds one JSON object per AMR with a
    ``@context`` mapping concepts to ISI AMR / PropBank / entity-type
    namespace IRIs, and writes the resulting list to ``args.outfile``.

    Side input read from the working directory:
      * ``ne.txt`` — comma-separated named-entity concept names.

    Fixes vs. the previous revision:
      * removed the stale ``o_obj = lookup[o]`` in the attribute (rel1)
        loop — ``o`` there was left over from the preceding rel2 loop
        (NameError if rel2 was empty) and the value was never used;
      * ``ne.txt`` handle is now explicitly closed.
    """
    try:
        import rdflib
    except ImportError:
        raise ImportError('requires rdflib')

    infile = codecs.open(args.infile, encoding='utf8')
    outfile = open(args.outfile, 'w')
    json_obj = []

    # namespaces
    amr_ns = rdflib.Namespace("http://amr.isi.edu/rdf/core-amr#")
    pb_ns = rdflib.Namespace("https://verbs.colorado.edu/propbank#")
    ontonotes_ns = rdflib.Namespace(
        "https://catalog.ldc.upenn.edu/LDC2013T19#")
    amr_ne_ns = rdflib.Namespace("http://amr.isi.edu/entity-types#")
    up_ns = rdflib.Namespace("http://www.uniprot.org/uniprot/")
    pfam_ns = rdflib.Namespace("http://pfam.xfam.org/family/")

    # Named-entity concepts -> entity-type namespace.
    ns_lookup = {}
    nelist = []
    nefile = codecs.open("ne.txt", encoding='utf8')
    for l in nefile:
        for w in re.split(",\s*", l):
            nelist.append(w)
    nefile.close()
    for ne in nelist:
        ns_lookup[ne] = amr_ne_ns

    amrs_same_sent = []
    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None
        if amr_line:
            cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments)
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        # Flush the previous sentence group when the id changes (or at EOF).
        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            amr = amrs_same_sent[0]
            (inst, rel1, rel2) = amr.get_triples2()

            # lookup: AMR variable -> the dict object representing it.
            lookup = {}
            context = {}
            default = "http://amr.isi.edu/amr_data/" + amr.metadata['id'] + "#"
            temp_ns = rdflib.Namespace(default)

            a1 = {}
            a1["@type"] = amr_ns.AMR.toPython()
            json_obj.append(a1)
            # NOTE(review): 'snt' and 'date' are read unconditionally and
            # raise KeyError when absent from the metadata — confirm the
            # input corpus always carries them.
            a1['has-sentence'] = amr.metadata['snt']
            a1['@id'] = amr.metadata['id']
            a1['has-date'] = amr.metadata['date']

            amr_root = {}
            lookup[amr.root] = amr_root
            a1['root'] = amr_root
            context['root'] = amr_ns.root.toPython()
            context['@base'] = default

            # Instance triples: record each variable's concept and map the
            # concept name to its namespace IRI in the @context.
            for (p, s, o) in inst:
                if (ns_lookup.get(o, None) is not None):
                    context[o] = amr_ne_ns[o].toPython()
                elif (re.search('\-\d+$', o) is not None):
                    # Sense-tagged concept (e.g. want-01) -> PropBank.
                    context[o] = pb_ns[o].toPython()
                else:
                    context[o] = amr_ns[o].toPython()
                if (lookup.get(s, None) is None):
                    lookup[s] = {}
                s_obj = lookup[s]
                s_obj["@id"] = s
                s_obj["@type"] = o

            # Variable-to-variable relations: nest the object dicts.
            for (p, s, o) in rel2:
                if (lookup.get(s, None) is None):
                    lookup[s] = {}
                if (lookup.get(o, None) is None):
                    lookup[o] = {}
                s_obj = lookup[s]
                o_obj = lookup[o]
                if (s != o):  # skip self-loops
                    s_obj[p] = o_obj

            # Attribute (literal-valued) relations.
            for (p, s, l) in rel1:
                if (lookup.get(s, None) is None):
                    lookup[s] = {}
                s_obj = lookup[s]
                s_obj[p] = l

            a1['@context'] = context
            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)

    json.dump(json_obj, outfile, indent=2)
    outfile.close()
    infile.close()
def run_main_on_file(args):
    """Convert a file of AMR parses into an RDF graph and serialize it.

    Reads AMRs from ``args.inPath``, builds an rdflib ``Graph`` describing
    each AMR's instance / relation / attribute triples under the ISI AMR
    namespaces, and writes the serialized graph (format ``args.format``)
    to ``args.outPath``.

    Side inputs read from the working directory:
      * ``xref_namespaces.txt``    — tab-separated prefix/namespace-URI table
      * ``amr-ne.txt``             — comma-separated named-entity concepts
      * ``amr-core.txt``           — comma-separated core AMR concepts
      * ``amr-core-patterns.txt``  — core concept patterns (into pattlist)

    Fixes vs. the previous revision:
      * ``pattlist`` now collects each line ``l`` of amr-core-patterns.txt
        (it previously appended the stale loop variable ``w`` repeatedly).
      * ``pattfile`` is now explicitly closed.

    NOTE(review): this redefines an earlier ``run_main_on_file`` in the
    same file; this later definition is the effective one.
    """
    try:
        import rdflib
    except ImportError:
        raise ImportError('requires rdflib')

    infile = codecs.open(args.inPath, encoding='utf8')
    outfile = open(args.outPath, 'w')

    # PropBank roles are frame-qualified ("frame.ARGn") unless disabled.
    pBankRoles = True
    if (not (args.pbankRoles == u'1')):
        pBankRoles = False

    # prefix -> namespace URI (values replaced by Namespace objects below)
    xref_namespace_lookup = {}
    with open('xref_namespaces.txt') as f:
        xref_lines = f.readlines()
    for l in xref_lines:
        line = re.split("\t", l)
        xref_namespace_lookup[line[0]] = line[1].rstrip('\r\n')

    # create the basic RDF data structure
    g = rdflib.Graph()

    # namespaces
    amr_ns = rdflib.Namespace("http://amr.isi.edu/rdf/core-amr#")
    amr_terms_ns = rdflib.Namespace("http://amr.isi.edu/rdf/amr-terms#")
    amr_data = rdflib.Namespace("http://amr.isi.edu/amr_data#")
    pb_ns = rdflib.Namespace("http://amr.isi.edu/frames/ld/v1.2.2/")
    amr_ne_ns = rdflib.Namespace("http://amr.isi.edu/entity-types#")
    up_ns = rdflib.Namespace("http://www.uniprot.org/uniprot/")
    pfam_ns = rdflib.Namespace("http://pfam.xfam.org/family/")
    ontonotes_ns = rdflib.Namespace(
        "https://catalog.ldc.upenn.edu/LDC2013T19#")

    g.namespace_manager.bind('propbank', pb_ns, replace=True)
    g.namespace_manager.bind('amr-core', amr_ns, replace=True)
    g.namespace_manager.bind('amr-terms', amr_terms_ns, replace=True)
    g.namespace_manager.bind('entity-types', amr_ne_ns, replace=True)
    g.namespace_manager.bind('amr-data', amr_data, replace=True)
    for k in xref_namespace_lookup.keys():
        temp_ns = rdflib.Namespace(xref_namespace_lookup[k])
        g.namespace_manager.bind(k, temp_ns, replace=True)
        xref_namespace_lookup[k] = temp_ns

    # Basic AMR Ontology consisting of
    #   1. concepts
    #   2. roles
    #   3. strings (which are actually going to be Literal(string)s)
    conceptClass = amr_ns.Concept
    neClass = amr_ns.NamedEntity
    frameClass = amr_ns.Frame
    roleClass = amr_ns.Role
    frameRoleClass = pb_ns.FrameRole

    g.add((conceptClass, rdflib.RDF.type, rdflib.RDFS.Class))
    g.add((conceptClass, RDFS.label, rdflib.Literal("AMR-Concept")))
    g.add((neClass, rdflib.RDF.type, conceptClass))
    g.add((neClass, RDFS.label, rdflib.Literal("AMR-EntityType")))
    # NOTE(review): the next two triples re-assert neClass with a second
    # label ("AMR-Term"); a separate "term" class may have been intended.
    g.add((neClass, rdflib.RDF.type, conceptClass))
    g.add((neClass, RDFS.label, rdflib.Literal("AMR-Term")))
    g.add((roleClass, rdflib.RDF.type, rdflib.RDFS.Class))
    g.add((roleClass, RDFS.label, rdflib.Literal("AMR-Role")))
    g.add((frameRoleClass, rdflib.RDF.type, roleClass))
    g.add((frameRoleClass, RDFS.label, rdflib.Literal("AMR-PropBank-Role")))
    g.add((frameClass, rdflib.RDF.type, conceptClass))
    g.add((frameClass, RDFS.label, rdflib.Literal("AMR-PropBank-Frame")))

    amr_count = 0
    ns_lookup = {}     # concept name -> namespace to mint it under
    class_lookup = {}  # concept name -> ontology class for rdf:type
    nelist = []
    corelist = []
    pattlist = []

    pmid_patt = re.compile('.*pmid_(\d+)_(\d+).*')
    word_align_patt = re.compile('(.*)\~e\.(.+)')
    propbank_patt = re.compile('^(.*)\-\d+$')
    opN_patt = re.compile('op(\d+)')
    arg_patt = re.compile('ARG\d+')

    # Named-entity concepts.
    with open('amr-ne.txt') as f:
        ne_lines = f.readlines()
    for l in ne_lines:
        for w in re.split(",\s*", l):
            w = w.rstrip('\r\n')
            nelist.append(w)
    for ne in nelist:
        ns_lookup[ne] = amr_ne_ns
        class_lookup[ne] = neClass

    # Core AMR concepts.
    with open('amr-core.txt') as f:
        core_lines = f.readlines()
    for l in core_lines:
        for w in re.split(",\s*", l):
            w = w.rstrip('\r\n')
            corelist.append(w)
    for c in corelist:
        ns_lookup[c] = amr_ns
        class_lookup[c] = conceptClass

    # Core concept patterns (fix: append each line, not the stale `w`).
    pattfile = codecs.open("amr-core-patterns.txt", encoding='utf8')
    for l in pattfile:
        pattlist.append(l)
    pattfile.close()

    amrs_same_sent = []
    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None
        vb_lookup = {}           # variable/role -> resolved RDF term
        label_lookup_table = {}  # name-variable -> variable it names
        xref_variables = {}      # xref variable -> owning variable
        if amr_line:
            cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments)
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        # Flush the previous sentence group when the id changes (or at EOF).
        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            amr = amrs_same_sent[0]
            (inst, rel1, rel2) = amr.get_triples2()

            temp_ns = rdflib.Namespace("http://amr.isi.edu/amr_data/" +
                                       amr.metadata['id'] + "#")
            a1 = temp_ns.root01  # reserve term root01

            # :a1 rdf:type amr:AMR .
            g.add((a1, rdflib.RDF.type, amr_ns.AMR))

            # :a1 amr:has-id "pmid_1177_7939.53"
            amr_id = amr.metadata['id']
            g.add((a1, amr_ns['has-id'], rdflib.Literal(amr_id)))
            match = pmid_patt.match(amr_id)
            if match:
                pmid = match.group(1) + match.group(2)
                g.add((a1, amr_ns['has-pmid'], rdflib.Literal(pmid)))

            # Optional per-AMR metadata triples.
            if (amr.metadata.get('snt', None) is not None):
                g.add((a1, amr_ns['has-sentence'],
                       rdflib.Literal(amr.metadata['snt'])))
            if (amr.metadata.get('date', None) is not None):
                g.add((a1, amr_ns['has-date'],
                       rdflib.Literal(amr.metadata['date'])))
            if (amr.metadata.get('amr-annotator', None) is not None):
                g.add((a1, amr_ns['has-annotator'],
                       rdflib.Literal(amr.metadata['amr-annotator'])))
            if (amr.metadata.get('tok', None) is not None):
                g.add((a1, amr_ns['has-tokens'],
                       rdflib.Literal(amr.metadata['tok'])))
            if (amr.metadata.get('alignments', None) is not None):
                g.add((a1, amr_ns['has-alignments'],
                       rdflib.Literal(amr.metadata['alignments'])))

            g.add((a1, amr_ns.root, temp_ns[amr.root]))

            # Add triples for setting types pointing to other resources.
            frames = {}
            for (p, s, o) in inst:
                o = strip_word_alignments(o, word_align_patt)
                if (ns_lookup.get(o, None) is not None):
                    # Known NE / core concept: use its registered namespace.
                    resolved_ns = ns_lookup.get(o, None)
                    o_resolved = resolved_ns[o]
                    if (class_lookup.get(o, None) is not None):
                        g.add((o_resolved, rdflib.RDF.type,
                               class_lookup.get(o, None)))
                    else:
                        raise ValueError(o_resolved +
                                         ' does not have a class assigned.')
                elif (re.search('\-\d+$', o) is not None):
                    # Sense-tagged concept (e.g. want-01) -> PropBank frame.
                    o_resolved = pb_ns[o]
                    g.add((o_resolved, rdflib.RDF.type, frameClass))
                elif (o == 'xref' and args.fixXref):
                    # xref variables are resolved later; keep them out of
                    # the graph.
                    continue
                elif (not (o == 'name')):
                    # ignore 'name' objects but add all others.
                    o_resolved = amr_terms_ns[o]
                    g.add((o_resolved, rdflib.RDF.type, conceptClass))
                else:
                    continue
                frames[s] = o
                g.add((temp_ns[s], RDF.type, o_resolved))

            # Add object properties for local links in the current AMR.
            for (p, s, o) in rel2:
                if (p == "TOP"):
                    continue
                # Do not include word positions for predicates (they are
                # more general and need not be linked to everything).
                p = strip_word_alignments(p, word_align_patt)
                o = strip_word_alignments(o, word_align_patt)
                if (p == 'name'):
                    # remember which objects have name objects
                    label_lookup_table[o] = s
                elif (p == 'xref' and args.fixXref):
                    xref_variables[o] = s
                elif (re.search('^ARG\d+$', p) is not None):
                    frameRole = frames[s] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p
                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[frameRole] = pb_ns[frameRole]
                    vb_lookup[o] = temp_ns[o]
                elif (re.search('^ARG\d+\-of$', p) is not None):
                    # Inverse role: qualified by the frame of the object.
                    frameRole = frames[o] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p
                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[frameRole] = pb_ns[frameRole]
                    vb_lookup[o] = temp_ns[o]
                else:
                    g.add((amr_terms_ns[p], rdflib.RDF.type, roleClass))
                    g.add((temp_ns[s], amr_terms_ns[p], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[p] = amr_terms_ns[p]
                    vb_lookup[o] = temp_ns[o]

            # Add data properties in the current AMR.
            labels = {}
            for (p, s, l) in rel1:
                p = strip_word_alignments(p, word_align_patt)
                l = strip_word_alignments(l, word_align_patt)
                # Build labels across multiple 'op1, op2, ... opN' links.
                opN_match = re.match(opN_patt, p)
                if (opN_match is not None and
                        label_lookup_table.get(s, None) is not None):
                    opN = int(opN_match.group(1))
                    ss = label_lookup_table[s]
                    if (labels.get(ss, None) is None):
                        labels[ss] = []
                    labels[ss].append((opN, l))
                elif (xref_variables.get(s, None) is not None and
                        p == 'value' and args.fixXref):
                    # Resolve xref values against the configured namespaces.
                    for k in xref_namespace_lookup.keys():
                        if (l.startswith(k)):
                            l2 = l[-len(l) + len(k):]  # strip prefix k
                            xref_vb = xref_variables.get(s, None)
                            resolved_xref_vb = vb_lookup.get(xref_vb, None)
                            g.add((resolved_xref_vb, amr_ns['xref'],
                                   xref_namespace_lookup[k][l2]))
                # Special treatment for propbank roles.
                elif (re.search('ARG\d+$', p) is not None):
                    frameRole = frames[s] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p
                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], rdflib.Literal(l)))
                # Otherwise, it's just a literal.
                else:
                    g.add((temp_ns[s], amr_terms_ns[p], rdflib.Literal(l)))

            # Attach accumulated name labels, ordered by opN.
            for key in labels.keys():
                labelArray = [i[1] for i in sorted(labels[key])]
                label = " ".join(labelArray)
                g.add((temp_ns[key], RDFS.label, rdflib.Literal(label)))

            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)
        amr_count = amr_count + 1

    print("%d AMRs converted" % amr_count)
    outfile.write(g.serialize(format=args.format))
    outfile.close()
    infile.close()