Example #1
def xlang_main(args):
    """ Disagreement graphs for aligned cross-language language. """
    src_amr_fh = codecs.open(args.src_amr, encoding='utf8')
    tgt_amr_fh = codecs.open(args.tgt_amr, encoding='utf8')
    gold_aligned_fh = None
    if args.align_in:
        gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
    (json_fh, align_fh) = open_output_files(args)

    amrs_same_sent = []
    aligner = Amr2AmrAligner(num_best=args.num_align_read,
                             num_best_in_file=args.num_aligned_in_file)
    while True:
        (src_amr_line, src_comments) = amr_metadata.get_amr_line(src_amr_fh)
        if src_amr_line == "":
            break
        (tgt_amr_line, tgt_comments) = amr_metadata.get_amr_line(tgt_amr_fh)
        src_amr = amr_metadata.AmrMeta.from_parse(src_amr_line,
                                                  src_comments,
                                                  xlang=True)
        tgt_amr = amr_metadata.AmrMeta.from_parse(tgt_amr_line,
                                                  tgt_comments,
                                                  xlang=True)
        (cur_id, src_sent) = get_sent_info(src_amr.metadata)
        (tgt_id, tgt_sent) = get_sent_info(tgt_amr.metadata, dflt_id=cur_id)
        assert cur_id == tgt_id

        (amr_graphs,
         smatchgraphs) = hilight_disagreement([tgt_amr],
                                              src_amr,
                                              args.num_restarts,
                                              aligner=aligner,
                                              gold_aligned_fh=gold_aligned_fh)
        if json_fh:
            json_fh.write(json.dumps(amr_graphs[0]) + '\n')
        if align_fh:
            align_fh.write("""# ::id %s\n# ::src_snt %s\n# ::tgt_snt %s\n""" %
                           (cur_id, src_sent, tgt_sent))
            align_fh.write('\n'.join(smatchgraphs[0].get_text_alignments()) +
                           '\n\n')
        if (args.verbose):
            print("ID: %s\n Sentence: %s\n Sentence: %s\n Score: %f" %
                  (cur_id, src_sent, tgt_sent, amr_graphs[0][1]))
        #raw_input("Press enter to continue: ")

        ag = nx.to_agraph(amr_graphs[0][0])
        ag.graph_attr['label'] = "%s\n%s" % (src_sent, tgt_sent)
        ag.layout(prog=args.layout)
        ag.draw('%s/%s.png' % (args.outdir, cur_id))

    src_amr_fh.close()
    tgt_amr_fh.close()
    gold_aligned_fh and gold_aligned_fh.close()
    close_output_files(json_fh, align_fh)
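
A minimal driver sketch for the function above. The flag names are assumptions inferred from the attributes xlang_main reads (src_amr, tgt_amr, align_in, num_restarts, ...); the real command-line interface, and any extra options consumed by open_output_files, may differ.

import argparse

def build_arg_parser():
    # Hypothetical flags mirroring the attributes read by xlang_main.
    p = argparse.ArgumentParser(description="Cross-language AMR disagreement graphs")
    p.add_argument('--src_amr', required=True, help='source-language AMR file')
    p.add_argument('--tgt_amr', required=True, help='target-language AMR file')
    p.add_argument('--align_in', default=None, help='optional gold alignment file')
    p.add_argument('--num_align_read', type=int, default=1)
    p.add_argument('--num_aligned_in_file', type=int, default=1)
    p.add_argument('--num_restarts', type=int, default=5)
    p.add_argument('--layout', default='dot', help='graphviz layout program')
    p.add_argument('--outdir', default='.', help='directory for rendered .png graphs')
    p.add_argument('--verbose', action='store_true')
    return p

if __name__ == '__main__':
    # Assumes xlang_main (and its helpers) are importable in this scope.
    xlang_main(build_arg_parser().parse_args())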
Example #2
def xlang_main(args):
  """ Disagreement graphs for aligned cross-language language. """
  src_amr_fh = codecs.open(args.src_amr, encoding='utf8')
  tgt_amr_fh = codecs.open(args.tgt_amr, encoding='utf8')
  src2tgt_fh = codecs.open(args.align_src2tgt, encoding='utf8')
  tgt2src_fh = codecs.open(args.align_tgt2src, encoding='utf8')
  gold_aligned_fh = None
  if args.align_in:
    gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
  (json_fh, align_fh) = open_output_files(args)

  amrs_same_sent = []
  aligner = Amr2AmrAligner(num_best=args.num_align_read, num_best_in_file=args.num_aligned_in_file, src2tgt_fh=src2tgt_fh, tgt2src_fh=tgt2src_fh)
  while True:
    (src_amr_line, src_comments) = amr_metadata.get_amr_line(src_amr_fh)
    if src_amr_line == "":
      break
    (tgt_amr_line, tgt_comments) = amr_metadata.get_amr_line(tgt_amr_fh)
    src_amr = amr_metadata.AmrMeta.from_parse(src_amr_line, src_comments, consts_to_vars=True)
    tgt_amr = amr_metadata.AmrMeta.from_parse(tgt_amr_line, tgt_comments, consts_to_vars=True)
    (cur_id, src_sent) = get_sent_info(src_amr.metadata)
    (tgt_id, tgt_sent) = get_sent_info(tgt_amr.metadata, dflt_id=cur_id)
    assert cur_id == tgt_id

    smatchgraphs = hilight_disagreement([tgt_amr], src_amr, args.num_restarts, aligner=aligner, gold_aligned_fh=gold_aligned_fh)
    amr_graphs = get_disagreement_graphs(smatchgraphs, aligner=aligner,
      unmatch_dead_nodes=(gold_aligned_fh == None))

    if json_fh:
      json_fh.write(json_graph.dumps(amr_graphs[0]) + '\n')
    if align_fh:
      align_fh.write("""# ::id %s\n# ::src_snt %s\n# ::tgt_snt %s\n""" % (cur_id, src_sent, tgt_sent))
      align_fh.write('\n'.join(smatchgraphs[0].get_text_alignments()) + '\n\n')
    if (args.verbose):
      print("ID: %s\n Sentence: %s\n Sentence: %s\n Score: %f" % (cur_id, src_sent, tgt_sent, amr_graphs[0][1]))

    if args.outdir != None:
      ag = nx.to_agraph(amr_graphs[0][0])
      ag.graph_attr['label'] = "%s\n%s" % (src_sent, tgt_sent)
      ag.layout(prog=args.layout)
      ag.draw('%s/%s.png' % (args.outdir, cur_id))

  src_amr_fh.close()
  tgt_amr_fh.close()
  src2tgt_fh.close()
  tgt2src_fh.close()
  gold_aligned_fh and gold_aligned_fh.close()
  close_output_files(json_fh, align_fh)
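
Both variants above lean on amr_metadata.get_amr_line to pull one blank-line-delimited AMR block (plus its "# ::" comment lines) per call and to return an empty string at end of file. Below is a hedged, assumed re-implementation of that contract for illustration only; the real module likely also parses the comments into the metadata used by AmrMeta.from_parse.

def get_amr_line(fh):
    """Return (amr_string, comment_lines) for the next AMR block, or ("", []) at EOF."""
    comments, amr_lines = [], []
    for line in fh:
        line = line.rstrip('\n')
        if not line.strip():
            if amr_lines:          # a blank line ends a started block
                break
            continue               # skip leading blank lines
        if line.startswith('#'):
            comments.append(line)
        else:
            amr_lines.append(line)
    return ' '.join(amr_lines), comments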
Example #3
def monolingual_main(args):
  """ Disagreement graphs for different annotations of a single sentence. """
  infile = codecs.open(args.infile, encoding='utf8')
  gold_aligned_fh = None
  if args.align_in:
    gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
  (json_fh, align_fh) = open_output_files(args)

  amrs_same_sent = []
  cur_id = ""
  while True:
    (amr_line, comments) = amr_metadata.get_amr_line(infile)
    cur_amr = None
    if amr_line:
      cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments,
      consts_to_vars=(gold_aligned_fh != None or align_fh != None))
      get_sent_info(cur_amr.metadata)
      if 'annotator' not in cur_amr.metadata:
        cur_amr.metadata['annotator'] = ''
      if not cur_id:
        cur_id = cur_amr.metadata['id']

    if cur_amr is None or cur_id != cur_amr.metadata['id']:
      gold_amr = amrs_same_sent[0]
      test_amrs = amrs_same_sent[1:]
      if len(test_amrs) == 0:
        test_amrs = [gold_amr] # single AMR view case
        args.num_restarts = 1 # TODO make single AMR view more efficient
      smatchgraphs = hilight_disagreement(test_amrs, gold_amr,
        args.num_restarts, gold_aligned_fh=gold_aligned_fh)
      amr_graphs = get_disagreement_graphs(smatchgraphs, unmatch_dead_nodes=(gold_aligned_fh == None))
      gold_anno = gold_amr.metadata['annotator']
      sent = gold_amr.metadata['tok']

      if (args.verbose):
        print("ID: %s\n Sentence: %s\n gold anno: %s" % (cur_id, sent, gold_anno))

      for (ind, a) in enumerate(test_amrs):
        (g, score) = amr_graphs[ind]
        test_anno = a.metadata['annotator']
        if json_fh:
          json_fh.write(json_graph.dumps(g) + '\n')
        if align_fh:
          sg = smatchgraphs[ind][0]
          align_fh.write("""# ::id %s\n# ::tok %s\n# ::gold_anno %s\n# ::test_anno %s\n""" % \
            (cur_id, sent, gold_anno, test_anno))
          align_fh.write('\n'.join(sg.get_text_alignments()) + '\n\n')
        if (args.verbose):
          print("  annotator %s score: %d" % (test_anno, score))

        ag = nx.to_agraph(g)
        ag.graph_attr['label'] = sent
        ag.layout(prog=args.layout)
        ag.draw('%s/%s_annotated_%s_%s.png' % (args.outdir, cur_id, gold_anno, test_anno))

      amrs_same_sent = []
      if cur_amr is not None:
        cur_id = cur_amr.metadata['id']
      else:
        break

    amrs_same_sent.append(cur_amr)

  infile.close()
  gold_aligned_fh and gold_aligned_fh.close()
  close_output_files(json_fh, align_fh)
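
The read loop above batches consecutive annotations that share a sentence id and flushes the batch when the id changes or the file ends, treating the first annotation as gold and the rest as test. The same pattern in isolation, over illustrative dictionaries rather than AmrMeta objects:

def group_by_id(records):
    """Yield lists of consecutive records sharing the same 'id' field."""
    batch, cur_id = [], None
    for rec in records:
        if cur_id is not None and rec['id'] != cur_id:
            yield batch
            batch = []
        cur_id = rec['id']
        batch.append(rec)
    if batch:
        yield batch

if __name__ == '__main__':
    demo = [{'id': 's1', 'annotator': 'A'},
            {'id': 's1', 'annotator': 'B'},
            {'id': 's2', 'annotator': 'A'}]
    for group in group_by_id(demo):
        gold, tests = group[0], (group[1:] or [group[0]])  # mirrors the single-AMR view case
        print(gold['id'], len(tests), 'test annotation(s)')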
Example #4
def monolingual_main(args):
    """ Disagreement graphs for different annotations of a single sentence. """
    infile = codecs.open(args.infile, encoding='utf8')
    gold_aligned_fh = None
    if args.align_in:
        gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
    (json_fh, align_fh) = open_output_files(args)

    amrs_same_sent = []
    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None
        if amr_line:
            cur_amr = amr_metadata.AmrMeta.from_parse(
                amr_line,
                comments,
                consts_to_vars=(gold_aligned_fh != None or align_fh != None))
            get_sent_info(cur_amr.metadata)
            if 'annotator' not in cur_amr.metadata:
                cur_amr.metadata['annotator'] = ''
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            gold_amr = amrs_same_sent[0]
            test_amrs = amrs_same_sent[1:]
            if len(test_amrs) == 0:
                test_amrs = [gold_amr]  # single AMR view case
                args.num_restarts = 1  # TODO make single AMR view more efficient
            smatchgraphs = hilight_disagreement(
                test_amrs,
                gold_amr,
                args.num_restarts,
                gold_aligned_fh=gold_aligned_fh)
            amr_graphs = get_disagreement_graphs(
                smatchgraphs, unmatch_dead_nodes=(gold_aligned_fh == None))
            gold_anno = gold_amr.metadata['annotator']
            sent = gold_amr.metadata['tok']

            if (args.verbose):
                print("ID: %s\n Sentence: %s\n gold anno: %s" %
                      (cur_id, sent, gold_anno))

            for (ind, a) in enumerate(test_amrs):
                (g, score) = amr_graphs[ind]
                test_anno = a.metadata['annotator']
                if json_fh:
                    json_fh.write(json_graph.dumps(g) + '\n')
                if align_fh:
                    sg = smatchgraphs[ind][0]
                    align_fh.write("""# ::id %s\n# ::tok %s\n# ::gold_anno %s\n# ::test_anno %s\n""" % \
                      (cur_id, sent, gold_anno, test_anno))
                    align_fh.write('\n'.join(sg.get_text_alignments()) +
                                   '\n\n')
                if (args.verbose):
                    print("  annotator %s score: %d" % (test_anno, score))

                ag = nx.to_agraph(g)
                ag.graph_attr['label'] = sent
                ag.layout(prog=args.layout)
                ag.draw('%s/%s_annotated_%s_%s.png' %
                        (args.outdir, cur_id, gold_anno, test_anno))

            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)

    infile.close()
    gold_aligned_fh and gold_aligned_fh.close()
    close_output_files(json_fh, align_fh)
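
The rendering step in both monolingual variants converts the disagreement graph to a pygraphviz AGraph (nx.to_agraph; nx.nx_agraph.to_agraph in newer networkx releases), lays it out, and writes a PNG. A self-contained sketch of that step, assuming Graphviz and pygraphviz are installed; the nodes and labels are illustrative only:

import pygraphviz as pgv

ag = pgv.AGraph(directed=True)
ag.add_edge('want-01', 'boy', label='ARG0')
ag.add_edge('want-01', 'go-01', label='ARG1')
ag.graph_attr['label'] = 'The boy wants to go.'
ag.layout(prog='dot')        # same call the examples make with args.layout
ag.draw('example.png')       # analogous to '%s/%s_annotated_%s_%s.png'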
Example #5
def run_main_on_file(args):

    try:
        import rdflib
    except ImportError:
        raise ImportError('requires rdflib')

    infile = codecs.open(args.inPath, encoding='utf8')
    outfile = open(args.outPath, 'w')

    pBankRoles = True
    if (not (args.pbankRoles == u'1')):
        pBankRoles = False

    xref_namespace_lookup = {}
    with open('xref_namespaces.txt') as f:
        xref_lines = f.readlines()
    for l in xref_lines:
        line = re.split("\t", l)
        xref_namespace_lookup[line[0]] = line[1].rstrip('\r\n')

    # create the basic RDF data structure
    g = rdflib.Graph()

    # namespaces
    amr_ns = rdflib.Namespace("http://amr.isi.edu/rdf/core-amr#")
    amr_terms_ns = rdflib.Namespace("http://amr.isi.edu/rdf/amr-terms#")
    amr_data = rdflib.Namespace("http://amr.isi.edu/amr_data#")
    pb_ns = rdflib.Namespace("http://amr.isi.edu/frames/ld/v1.2.2/")
    amr_ne_ns = rdflib.Namespace("http://amr.isi.edu/entity-types#")

    up_ns = rdflib.Namespace("http://www.uniprot.org/uniprot/")
    pfam_ns = rdflib.Namespace("http://pfam.xfam.org/family/")
    ontonotes_ns = rdflib.Namespace(
        "https://catalog.ldc.upenn.edu/LDC2013T19#")

    g.namespace_manager.bind('propbank', pb_ns, replace=True)
    g.namespace_manager.bind('amr-core', amr_ns, replace=True)
    g.namespace_manager.bind('amr-terms', amr_terms_ns, replace=True)
    g.namespace_manager.bind('entity-types', amr_ne_ns, replace=True)
    g.namespace_manager.bind('amr-data', amr_data, replace=True)

    for k in xref_namespace_lookup.keys():
        temp_ns = rdflib.Namespace(xref_namespace_lookup[k])
        g.namespace_manager.bind(k, temp_ns, replace=True)
        xref_namespace_lookup[k] = temp_ns

    # Basic AMR Ontology consisting of
    #   1. concepts
    #   2. roles
    #   3. strings (which are actually going to be Literal(string)s)
    conceptClass = amr_ns.Concept
    neClass = amr_ns.NamedEntity
    frameClass = amr_ns.Frame
    roleClass = amr_ns.Role
    frameRoleClass = pb_ns.FrameRole

    g.add((conceptClass, rdflib.RDF.type, rdflib.RDFS.Class))
    g.add((conceptClass, RDFS.label, rdflib.Literal("AMR-Concept")))
    #g.add( (conceptClass, RDFS.comment, rdflib.Literal("Class of all concepts expressed in AMRs") ) )

    g.add((neClass, rdflib.RDF.type, conceptClass))
    g.add((neClass, RDFS.label, rdflib.Literal("AMR-EntityType")))
    #g.add( (neClass, RDFS.comment, rdflib.Literal("Class of all named entities expressed in AMRs") ) )

    g.add((neClass, rdflib.RDF.type, conceptClass))
    g.add((neClass, RDFS.label, rdflib.Literal("AMR-Term")))
    #g.add( (neClass, RDFS.comment, rdflib.Literal("Class of all named entities expressed in AMRs") ) )

    g.add((roleClass, rdflib.RDF.type, rdflib.RDFS.Class))
    g.add((roleClass, RDFS.label, rdflib.Literal("AMR-Role")))
    #g.add( (roleClass, RDFS.comment, rdflib.Literal("Class of all roles expressed in AMRs") ) )

    g.add((frameRoleClass, rdflib.RDF.type, roleClass))
    g.add((frameRoleClass, RDFS.label, rdflib.Literal("AMR-PropBank-Role")))
    #g.add( (frameRoleClass, RDFS.comment, rdflib.Literal("Class of all roles of PropBank frames") ) )

    g.add((frameClass, rdflib.RDF.type, conceptClass))
    g.add((frameClass, RDFS.label, rdflib.Literal("AMR-PropBank-Frame")))
    #g.add( (frameClass, RDFS.comment, rdflib.Literal("Class of all frames expressed in AMRs") ) )

    amr_count = 0
    ns_lookup = {}
    class_lookup = {}
    nelist = []
    corelist = []
    pattlist = []
    pmid_patt = re.compile('.*pmid_(\d+)_(\d+).*')
    word_align_patt = re.compile('(.*)\~e\.(.+)')
    propbank_patt = re.compile('^(.*)\-\d+$')
    opN_patt = re.compile('op(\d+)')
    arg_patt = re.compile('ARG\d+')

    with open('amr-ne.txt') as f:
        ne_lines = f.readlines()
    for l in ne_lines:
        for w in re.split(",\s*", l):
            w = w.rstrip('\r\n')
            nelist.append(w)
    for ne in nelist:
        ns_lookup[ne] = amr_ne_ns
        class_lookup[ne] = neClass

    with open('amr-core.txt') as f:
        core_lines = f.readlines()
    for l in core_lines:
        for w in re.split(",\s*", l):
            w = w.rstrip('\r\n')
            corelist.append(w)
    for c in corelist:
        ns_lookup[c] = amr_ns
        class_lookup[c] = conceptClass

    pattfile = codecs.open("amr-core-patterns.txt", encoding='utf8')
    for l in pattfile:
        pattlist.append(l)

    amrs_same_sent = []

    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None

        vb_lookup = {}
        label_lookup_table = {}
        xref_variables = {}

        if amr_line:
            cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments)
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            amr = amrs_same_sent[0]

            (inst, rel1, rel2) = amr.get_triples2()

            temp_ns = rdflib.Namespace("http://amr.isi.edu/amr_data/" +
                                       amr.metadata['id'] + "#")
            a1 = temp_ns.root01  # reserve term root01

            # :a1 rdf:type amr:AMR .
            g.add((a1, rdflib.RDF.type, amr_ns.AMR))

            #:a1 amr:has-id "pmid_1177_7939.53"
            amr_id = amr.metadata['id']
            g.add((a1, amr_ns['has-id'], rdflib.Literal(amr_id)))

            match = pmid_patt.match(amr_id)
            if match:
                pmid = match.group(1) + match.group(2)
                g.add((a1, amr_ns['has-pmid'], rdflib.Literal(pmid)))

            #:a1 amr:has-sentence "Sos-1 has been shown to be part of a signaling complex with Grb2, which mediates the activation of Ras upon RTK stimulation." .
            if (amr.metadata.get('snt', None) is not None):
                g.add((a1, amr_ns['has-sentence'],
                       rdflib.Literal(amr.metadata['snt'])))

            #:a1 amr:has-date "2015-03-07T10:57:15
            if (amr.metadata.get('date', None) is not None):
                g.add((a1, amr_ns['has-date'],
                       rdflib.Literal(amr.metadata['date'])))

            #:a1 amr:amr-annotator SDL-AMR-09
            if (amr.metadata.get('amr-annotator', None) is not None):
                g.add((a1, amr_ns['has-annotator'],
                       rdflib.Literal(amr.metadata['amr-annotator'])))

            #:a1 amr:tok
            if (amr.metadata.get('tok', None) is not None):
                g.add((a1, amr_ns['has-tokens'],
                       rdflib.Literal(amr.metadata['tok'])))

            #:a1 amr:alignments
            if (amr.metadata.get('alignments', None) is not None):
                g.add((a1, amr_ns['has-alignments'],
                       rdflib.Literal(amr.metadata['alignments'])))

            g.add((a1, amr_ns.root, temp_ns[amr.root]))

            # Add triples for setting types pointing to other resources
            frames = {}
            for (p, s, o) in inst:

                o = strip_word_alignments(o, word_align_patt)
                #if word_pos is not None:
                #        g.add( (temp_ns[s],
                #                        amr_ns['has-word-pos'],
                #                        rdflib.Literal(word_pos)) )

                if (ns_lookup.get(o, None) is not None):
                    resolved_ns = ns_lookup.get(o, None)
                    o_resolved = resolved_ns[o]
                    if (class_lookup.get(o, None) is not None):
                        g.add((o_resolved, rdflib.RDF.type,
                               class_lookup.get(o, None)))
                    else:
                        raise ValueError(o_resolved +
                                         ' does not have a class assigned.')
                elif (re.search('\-\d+$', o) is not None):
                    #match = propbank_patt.match(o)
                    #str = ""
                    #if match:
                    #    str = match.group(1)
                    #o_resolved = pb_ns[str + ".html#" +o ]
                    o_resolved = pb_ns[o]
                    g.add((o_resolved, rdflib.RDF.type, frameClass))
                elif (o == 'xref' and args.fixXref):
                    continue
                elif (not (o == 'name')
                      ):  # ignore 'name' objects but add all others.
                    o_resolved = amr_terms_ns[o]
                    g.add((o_resolved, rdflib.RDF.type, conceptClass))
                # identify xref variables in the AMR; don't retain them as part of the graph.
                else:
                    continue

                frames[s] = o
                g.add((temp_ns[s], RDF.type, o_resolved))

            # Add object properties for local links in the current AMR
            for (p, s, o) in rel2:

                if (p == "TOP"):
                    continue

                # Do not include word positions for predicates
                # (since they are more general and do not need to be linked to everything).
                p = strip_word_alignments(p, word_align_patt)
                o = strip_word_alignments(o, word_align_patt)

                # remember which objects have name objects
                if (p == 'name'):
                    label_lookup_table[o] = s

                # remember xref variables so their 'value' literals can be resolved below
                elif (p == 'xref' and args.fixXref):
                    xref_variables[o] = s

                elif (re.search('^ARG\d+$', p) is not None):

                    frameRole = frames[s] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p

                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[frameRole] = pb_ns[frameRole]
                    vb_lookup[o] = temp_ns[o]

                elif (re.search('^ARG\d+\-of$', p) is not None):

                    frameRole = frames[o] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p

                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[frameRole] = pb_ns[frameRole]
                    vb_lookup[o] = temp_ns[o]

                else:

                    g.add((amr_terms_ns[p], rdflib.RDF.type, roleClass))
                    g.add((temp_ns[s], amr_terms_ns[p], temp_ns[o]))
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[p] = amr_terms_ns[p]
                    vb_lookup[o] = temp_ns[o]

            # Add data properties in the current AMR
            labels = {}
            for (p, s, l) in rel1:

                p = strip_word_alignments(p, word_align_patt)
                l = strip_word_alignments(l, word_align_patt)

                #
                # Build labels across multiple 'op1, op2, ... opN' links,
                #
                opN_match = re.match(opN_patt, p)
                if (opN_match is not None
                        and label_lookup_table.get(s, None) is not None):
                    opN = int(opN_match.group(1))
                    ss = label_lookup_table[s]
                    if (labels.get(ss, None) is None):
                        labels[ss] = []

                    labels[ss].append((opN, l))

                elif (xref_variables.get(s, None) is not None and p == 'value'
                      and args.fixXref):
                    for k in xref_namespace_lookup.keys():
                        if (l.startswith(k)):
                            l2 = l[-len(l) + len(k):]
                            xref_vb = xref_variables.get(s, None)
                            resolved_xref_vb = vb_lookup.get(xref_vb, None)
                            g.add((resolved_xref_vb, amr_ns['xref'],
                                   xref_namespace_lookup[k][l2]))

                # Special treatment for propbank roles.
                elif (re.search('ARG\d+$', p) is not None):

                    frameRole = frames[s] + "." + p
                    if (not (pBankRoles)):
                        frameRole = p

                    g.add((pb_ns[frameRole], rdflib.RDF.type, frameRoleClass))
                    g.add((temp_ns[s], pb_ns[frameRole], rdflib.Literal(l)))

                # Otherwise, it's just a literal
                else:
                    g.add((temp_ns[s], amr_terms_ns[p], rdflib.Literal(l)))

            # Add labels here
            # ["\n".join([i.split(' ')[j] for j in range(5)]) for i in g.vs["id"]]
            for key in labels.keys():
                labelArray = [i[1] for i in sorted(labels[key])]

                label = " ".join(labelArray)
                g.add((temp_ns[key], RDFS.label, rdflib.Literal(label)))

            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)
        amr_count = amr_count + 1

    # Additional processing to clean up.
    # 1. Add labels to AMR objects
    #q = sparql.prepareQuery("select distinct ?s ?label " +
    #                                                "where { " +
    #                                                "?s <http://amr.isi.edu/rdf/core-amr#name> ?n . " +
    #                                                "?n <http://amr.isi.edu/rdf/core-amr#op1> ?label " +
    #                                                "}")
    #qres = g.query(q)

    #for row in qres:
    #    print("%s type %s" % row)
    print("%d AMRs converted" % amr_count)
    outfile.write(g.serialize(format=args.format))
    outfile.close()

    infile.close()
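
The converter above is built on a small set of rdflib operations: declare namespaces, bind prefixes on the graph's namespace manager, add typed resources and literals, and serialize. A minimal self-contained sketch of that pattern, with illustrative triples only:

import rdflib
from rdflib.namespace import RDF, RDFS

g = rdflib.Graph()
amr_ns = rdflib.Namespace("http://amr.isi.edu/rdf/core-amr#")
data_ns = rdflib.Namespace("http://amr.isi.edu/amr_data/example#")
g.namespace_manager.bind('amr-core', amr_ns, replace=True)

conceptClass = amr_ns.Concept
g.add((conceptClass, RDF.type, RDFS.Class))
g.add((conceptClass, RDFS.label, rdflib.Literal("AMR-Concept")))

a1 = data_ns.root01
g.add((a1, RDF.type, amr_ns.AMR))
g.add((a1, amr_ns['has-id'], rdflib.Literal("example.1")))
g.add((a1, amr_ns['has-sentence'], rdflib.Literal("The boy wants to go.")))

print(g.serialize(format='turtle'))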
Example #6
def main(args):
  """Main function of the smatch calculation program"""
  global verbose
  global iter_num
  global single_score
  global pr_flag
  global match_num_dict
  # set the restart number
  iter_num = args.r + 1
  verbose = False
  if args.ms:
    single_score = False
  if args.v:
    verbose = True
  if args.pr:
    pr_flag = True
  total_match_num = 0
  total_test_num = 0
  total_gold_num = 0
  sent_num = 1
  prev_amr1 = ""
  outfile = open(args.outfile, 'w')
  if not single_score:     
    outfile.write("Sentence\tText")
    if pr_flag:
      outfile.write("\tPrecision\tRecall")
    outfile.write("\tSmatch\n")
    
  while True:
    cur_amr1 = smatch.get_amr_line(args.f[0])
    (cur_amr2, comments) = amr_metadata.get_amr_line(args.f[1])
    if cur_amr1 == "" and cur_amr2 == "":
      break
    if(cur_amr1 == ""):        
      # GULLY CHANGED THIS. 
      # IF WE RUN OUT OF AVAILABLE AMRS FROM FILE 1, 
      # REUSE THE LAST AVAILABLE AMR
      cur_amr1 = prev_amr1  
      #print >> sys.stderr, "Error: File 1 has less AMRs than file 2"
      #print >> sys.stderr, "Ignoring remaining AMRs"
      #break
      # print >> sys.stderr, "AMR 1 is empty"
      # continue
    if cur_amr2 == "":
      print("Error: File 2 has fewer AMRs than file 1", file=sys.stderr)
      print("Ignoring remaining AMRs", file=sys.stderr)
      break
    # print >> sys.stderr, "AMR 2 is empty"
    # continue
    prev_amr1 = cur_amr1
    
    amr1 = amr.AMR.parse_AMR_line(cur_amr1)
    amr2 = amr.AMR.parse_AMR_line(cur_amr2)
    
    # We were getting screwy SMATCH scores from 
    # using the amr_metadata construct
    meta_enabled_amr = amr_metadata.AmrMeta.from_parse(cur_amr2, comments)
    
    test_label = "a"
    gold_label = "b"
    amr1.rename_node(test_label)
    amr2.rename_node(gold_label)
    (test_inst, test_rel1, test_rel2) = amr1.get_triples2()
    (gold_inst, gold_rel1, gold_rel2) = amr2.get_triples2()
    if verbose:
      print "AMR pair", sent_num
      print >> sys.stderr, "Instance triples of AMR 1:", len(test_inst)
      print >> sys.stderr, test_inst
  #   print >> sys.stderr,"Relation triples of AMR 1:",len(test_rel)
      print >> sys.stderr, "Relation triples of AMR 1:", len(test_rel1) + len(test_rel2)
      print >>sys.stderr, test_rel1
      print >> sys.stderr, test_rel2
  #   print >> sys.stderr, test_rel
      print >> sys.stderr, "Instance triples of AMR 2:", len(gold_inst)
      print >> sys.stderr, gold_inst
  #   print >> sys.stderr,"Relation triples of file 2:",len(gold_rel)
      print >> sys.stderr, "Relation triples of AMR 2:", len(
          gold_rel1) + len(gold_rel2)
      #print >> sys.stderr,"Relation triples of file 2:",len(gold_rel1)+len(gold_rel2)
      print >> sys.stderr, gold_rel1
      print >> sys.stderr, gold_rel2
  #    print >> sys.stderr, gold_rel
    if len(test_inst) < len(gold_inst):
      (best_match,
       best_match_num) = smatch.get_fh(test_inst,
                                test_rel1,
                                test_rel2,
                                gold_inst,
                                gold_rel1,
                                gold_rel2,
                                test_label,
                                gold_label)
      if verbose:
        print("AMR pair ", sent_num, file=sys.stderr)
        print("best match number", best_match_num, file=sys.stderr)
        print("best match", best_match, file=sys.stderr)
    else:
      (best_match,
       best_match_num) = smatch.get_fh(gold_inst,
                                gold_rel1,
                                gold_rel2,
                                test_inst,
                                test_rel1,
                                test_rel2,
                                gold_label,
                                test_label)
      if verbose:
        print("Sent ", sent_num, file=sys.stderr)
        print("best match number", best_match_num, file=sys.stderr)
        print("best match", best_match, file=sys.stderr)
    if not single_score:
      #(precision,
      # recall,
      # best_f_score) = smatch.compute_f(best_match_num,
      #                           len(test_rel1) + len(test_inst) + len(test_rel2),
      #                           len(gold_rel1) + len(gold_inst) + len(gold_rel2))
      outfile.write( str(meta_enabled_amr.metadata.get("tok", None)) )
      #if pr_flag:
      #  outfile.write( "\t%.2f" % precision )
      #  outfile.write( "\t%.2f" % recall )
      #outfile.write( "\t%.2f" % best_f_score )
      print(sent_num)
      outfile.write( "\n" )
    total_match_num += best_match_num
    total_test_num += len(test_rel1) + len(test_rel2) + len(test_inst)
    total_gold_num += len(gold_rel1) + len(gold_rel2) + len(gold_inst)
    match_num_dict.clear()
    sent_num += 1  # print "F-score:",best_f_score
  if verbose:
    print("Total match num", file=sys.stderr)
    print(total_match_num, total_test_num, total_gold_num, file=sys.stderr)
  if single_score:
    (precision, recall, best_f_score) = smatch.compute_f(
        total_match_num, total_test_num, total_gold_num)
    if pr_flag:
      print "Precision: %.2f" % precision
      print "Recall: %.2f" % recall
    print "Document F-score: %.2f" % best_f_score
  args.f[0].close()
  args.f[1].close()
  outfile.close()
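
The document-level score printed at the end comes from smatch.compute_f over the accumulated triple counts. A standalone re-implementation of that standard precision/recall/F computation, for illustration (the real library function may handle edge cases differently):

def compute_f(match_num, test_num, gold_num):
    """Precision = matches/test triples, recall = matches/gold triples, F = harmonic mean."""
    if test_num == 0 or gold_num == 0:
        return 0.0, 0.0, 0.0
    precision = match_num / float(test_num)
    recall = match_num / float(gold_num)
    if precision + recall == 0:
        return precision, recall, 0.0
    return precision, recall, 2 * precision * recall / (precision + recall)

if __name__ == '__main__':
    p, r, f = compute_f(7, 10, 9)
    print("Precision: %.2f\nRecall: %.2f\nDocument F-score: %.2f" % (p, r, f))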
Example #7
def run_main(args):
    try:
        import rdflib
    except ImportError:
        raise ImportError('requires rdflib')

    infile = codecs.open(args.infile, encoding='utf8')
    outfile = open(args.outfile, 'w')

    json_obj = []

    # namespaces
    amr_ns = rdflib.Namespace("http://amr.isi.edu/rdf/core-amr#")
    pb_ns = rdflib.Namespace("https://verbs.colorado.edu/propbank#")
    ontonotes_ns = rdflib.Namespace(
        "https://catalog.ldc.upenn.edu/LDC2013T19#")
    amr_ne_ns = rdflib.Namespace("http://amr.isi.edu/entity-types#")
    up_ns = rdflib.Namespace("http://www.uniprot.org/uniprot/")
    pfam_ns = rdflib.Namespace("http://pfam.xfam.org/family/")

    ns_lookup = {}
    nelist = []
    nefile = codecs.open("ne.txt", encoding='utf8')
    for l in nefile:
        for w in re.split(",\s*", l):
            nelist.append(w.rstrip('\r\n'))
    for ne in nelist:
        ns_lookup[ne] = amr_ne_ns

    amrs_same_sent = []
    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None
        if amr_line:
            cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments)
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            amr = amrs_same_sent[0]

            (inst, rel1, rel2) = amr.get_triples2()

            # lookup from original amr objects and simple python objects
            lookup = {}
            context = {}

            default = "http://amr.isi.edu/amr_data/" + amr.metadata['id'] + "#"
            temp_ns = rdflib.Namespace(default)

            a1 = {}
            a1["@type"] = amr_ns.AMR.toPython()
            json_obj.append(a1)

            #:a1 amr:has-sentence "Sos-1 has been shown to be part of a signaling complex with Grb2, which mediates the activation of Ras upon RTK stimulation." .
            a1['has-sentence'] = amr.metadata['snt']

            #:a1 amr:has-id "pmid_1177_7939.53"
            a1['@id'] = amr.metadata['id']

            #:a1 amr:has-date "2015-03-07T10:57:15
            a1['has-date'] = amr.metadata['date']

            #:a1 amr:has-annotator SDL-AMR-09
            #:a1 amr:is-preferred "true"^^xsd:boolean
            #:a1 amr:has-file "pmid_1177_7939_53.txt"

            amr_root = {}
            lookup[amr.root] = amr_root
            a1['root'] = amr_root
            context['root'] = amr_ns.root.toPython()
            context['@base'] = default

            for (p, s, o) in inst:

                if (ns_lookup.get(o, None) is not None):
                    context[o] = amr_ne_ns[o].toPython()
                elif (re.search('\-\d+$', o) is not None):
                    context[o] = pb_ns[o].toPython()
                else:
                    context[o] = amr_ns[o].toPython()

                if (lookup.get(s, None) is None):
                    lookup[s] = {}

                s_obj = lookup[s]
                s_obj["@id"] = s
                s_obj["@type"] = o

            for (p, s, o) in rel2:

                if (lookup.get(s, None) is None):
                    lookup[s] = {}

                if (lookup.get(o, None) is None):
                    lookup[o] = {}

                s_obj = lookup[s]
                o_obj = lookup[o]

                if (s != o):
                    s_obj[p] = o_obj

            for (p, s, l) in rel1:

                if (lookup.get(s, None) is None):
                    lookup[s] = {}

                s_obj = lookup[s]
                s_obj[p] = l

            a1['@context'] = context

            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)

    json.dump(json_obj, outfile, indent=2)
    outfile.close()

    infile.close()
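
The JSON produced above is JSON-LD-shaped: each AMR becomes a node object with @id and @type, variables link to one another through role keys, and a @context maps those keys and concept names to IRIs. A tiny hand-built instance of that shape (the values are illustrative):

import json

context = {
    '@base': "http://amr.isi.edu/amr_data/example#",
    'root': "http://amr.isi.edu/rdf/core-amr#root",
    'want-01': "https://verbs.colorado.edu/propbank#want-01",
}
want = {'@id': 'w', '@type': 'want-01'}
node = {
    '@id': "example.1",
    '@type': "http://amr.isi.edu/rdf/core-amr#AMR",
    'has-sentence': "The boy wants to go.",
    'root': want,
    '@context': context,
}
print(json.dumps([node], indent=2))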
Example #8
def run_main_on_file(args):
    
    try:
        import rdflib
    except ImportError:
        raise ImportError('requires rdflib')
                
    infile = codecs.open(args.inPath, encoding='utf8')
    outfile = open(args.outPath, 'w')
    
    pBankRoles = True
    if( not(args.pbankRoles == u'1') ):
        pBankRoles = False
                                
    xref_namespace_lookup = {}
    with open('xref_namespaces.txt') as f:
        xref_lines = f.readlines()
    for l in xref_lines:
        line = re.split("\t", l)
        xref_namespace_lookup[line[0]] = line[1].rstrip('\r\n')
                                
    # create the basic RDF data structure
    g = rdflib.Graph()
    
    # namespaces
    amr_ns = rdflib.Namespace("http://amr.isi.edu/rdf/core-amr#")
    amr_terms_ns = rdflib.Namespace("http://amr.isi.edu/rdf/amr-terms#")
    amr_data = rdflib.Namespace("http://amr.isi.edu/amr_data#")
    pb_ns = rdflib.Namespace("http://amr.isi.edu/frames/ld/v1.2.2/")
    amr_ne_ns = rdflib.Namespace("http://amr.isi.edu/entity-types#")

    up_ns = rdflib.Namespace("http://www.uniprot.org/uniprot/")
    pfam_ns = rdflib.Namespace("http://pfam.xfam.org/family/")
    ontonotes_ns = rdflib.Namespace("https://catalog.ldc.upenn.edu/LDC2013T19#")

    g.namespace_manager.bind('propbank', pb_ns, replace=True)
    g.namespace_manager.bind('amr-core', amr_ns, replace=True)
    g.namespace_manager.bind('amr-terms', amr_terms_ns, replace=True)
    g.namespace_manager.bind('entity-types', amr_ne_ns, replace=True)    
    g.namespace_manager.bind('amr-data', amr_data, replace=True)    
    
    for k in xref_namespace_lookup.keys():
        temp_ns = rdflib.Namespace(xref_namespace_lookup[k])
        g.namespace_manager.bind(k, temp_ns, replace=True)    
        xref_namespace_lookup[k] = temp_ns
    
    # Basic AMR Ontology consisting of 
    #   1. concepts
    #   2. roles 
    #   3. strings (which are actually going to be Literal(string)s)
    conceptClass = amr_ns.Concept
    neClass = amr_ns.NamedEntity
    frameClass = amr_ns.Frame
    roleClass = amr_ns.Role
    frameRoleClass = pb_ns.FrameRole
    
    g.add( (conceptClass, rdflib.RDF.type, rdflib.RDFS.Class) )
    g.add( (conceptClass, RDFS.label, rdflib.Literal("AMR-Concept") ) )
    #g.add( (conceptClass, RDFS.comment, rdflib.Literal("Class of all concepts expressed in AMRs") ) )

    g.add( (neClass, rdflib.RDF.type, conceptClass) )
    g.add( (neClass, RDFS.label, rdflib.Literal("AMR-EntityType") ) )
    #g.add( (neClass, RDFS.comment, rdflib.Literal("Class of all named entities expressed in AMRs") ) )

    g.add( (neClass, rdflib.RDF.type, conceptClass) )
    g.add( (neClass, RDFS.label, rdflib.Literal("AMR-Term") ) )
    #g.add( (neClass, RDFS.comment, rdflib.Literal("Class of all named entities expressed in AMRs") ) )

    g.add( (roleClass, rdflib.RDF.type, rdflib.RDFS.Class) )
    g.add( (roleClass, RDFS.label, rdflib.Literal("AMR-Role") ) )
    #g.add( (roleClass, RDFS.comment, rdflib.Literal("Class of all roles expressed in AMRs") ) )

    g.add( (frameRoleClass, rdflib.RDF.type, roleClass) )
    g.add( (frameRoleClass, RDFS.label, rdflib.Literal("AMR-PropBank-Role") ) )
    #g.add( (frameRoleClass, RDFS.comment, rdflib.Literal("Class of all roles of PropBank frames") ) )

    g.add( (frameClass, rdflib.RDF.type, conceptClass) )
    g.add( (frameClass, RDFS.label, rdflib.Literal("AMR-PropBank-Frame") ) )
    #g.add( (frameClass, RDFS.comment, rdflib.Literal("Class of all frames expressed in AMRs") ) )
    
    amr_count = 0
    ns_lookup = {}
    class_lookup = {}
    nelist = []
    corelist = []
    pattlist = []
    pmid_patt = re.compile('.*pmid_(\d+)_(\d+).*')
    word_align_patt = re.compile('(.*)\~e\.(.+)')
    propbank_patt = re.compile('^(.*)\-\d+$')
    opN_patt = re.compile('op(\d+)')
    arg_patt = re.compile('ARG\d+')

    with open('amr-ne.txt') as f:
        ne_lines = f.readlines()
    for l in ne_lines:
        for w in re.split(",\s*", l):
            w = w.rstrip('\r\n')
            nelist.append( w )
    for ne in nelist:
            ns_lookup[ne] = amr_ne_ns
            class_lookup[ne] = neClass

    with open('amr-core.txt') as f:
        core_lines = f.readlines()
    for l in core_lines:
        for w in re.split(",\s*", l):
            w = w.rstrip('\r\n')
            corelist.append( w )
    for c in corelist:
            ns_lookup[c] = amr_ns    
            class_lookup[c] = conceptClass
            
    pattfile = codecs.open("amr-core-patterns.txt", encoding='utf8')
    for l in pattfile:
        pattlist.append( l )
    
    amrs_same_sent = []
    
    cur_id = ""
    while True:
        (amr_line, comments) = amr_metadata.get_amr_line(infile)
        cur_amr = None

        vb_lookup = {}
        label_lookup_table = {}
        xref_variables = {}
    
        if amr_line:
            cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments)
            if not cur_id:
                cur_id = cur_amr.metadata['id']

        if cur_amr is None or cur_id != cur_amr.metadata['id']:
            amr = amrs_same_sent[0]

            (inst, rel1, rel2) = amr.get_triples2()
        
            temp_ns = rdflib.Namespace("http://amr.isi.edu/amr_data/" + amr.metadata['id'] + "#")    
            a1 = temp_ns.root01 # reserve term root01 
            
            # :a1 rdf:type amr:AMR .
            g.add( (a1, 
                    rdflib.RDF.type, 
                    amr_ns.AMR) )

            #:a1 amr:has-id "pmid_1177_7939.53"
            amr_id = amr.metadata['id']
            g.add( (a1, 
                    amr_ns['has-id'], 
                    rdflib.Literal(amr_id)))
            
            match = pmid_patt.match(amr_id)
            if match:
                    pmid = match.group(1) + match.group(2)
                    g.add( (a1, 
                            amr_ns['has-pmid'], 
                            rdflib.Literal(pmid)))

            #:a1 amr:has-sentence "Sos-1 has been shown to be part of a signaling complex with Grb2, which mediates the activation of Ras upon RTK stimulation." .
            if( amr.metadata.get('snt', None) is not None):
                    g.add( (a1, 
                            amr_ns['has-sentence'], 
                            rdflib.Literal(amr.metadata['snt']) )
                          )

            #:a1 amr:has-date "2015-03-07T10:57:15
            if( amr.metadata.get('date', None) is not None):
                    g.add( (a1, 
                            amr_ns['has-date'], 
                            rdflib.Literal(amr.metadata['date'])))

            #:a1 amr:amr-annotator SDL-AMR-09
            if( amr.metadata.get('amr-annotator', None) is not None):
                    g.add( (a1,
                            amr_ns['has-annotator'], 
                            rdflib.Literal(amr.metadata['amr-annotator'])))
                    
            #:a1 amr:tok 
            if( amr.metadata.get('tok', None) is not None):
                    g.add( (a1, 
                            amr_ns['has-tokens'], 
                            rdflib.Literal(amr.metadata['tok'])))

            #:a1 amr:alignments
            if( amr.metadata.get('alignments', None) is not None):
                    g.add( (a1, 
                            amr_ns['has-alignments'], 
                            rdflib.Literal(amr.metadata['alignments'])))
            
            g.add( (a1, amr_ns.root, temp_ns[amr.root]) )

            # Add triples for setting types pointing to other resources
            frames = {}
            for (p, s, o) in inst:
                    
                o = strip_word_alignments(o,word_align_patt)
                #if word_pos is not None:
                #        g.add( (temp_ns[s], 
                #                        amr_ns['has-word-pos'], 
                #                        rdflib.Literal(word_pos)) )            
                  
                if( ns_lookup.get(o,None) is not None ):
                    resolved_ns = ns_lookup.get(o,None)
                    o_resolved = resolved_ns[o]
                    if( class_lookup.get(o,None) is not None): 
                        g.add( (o_resolved, rdflib.RDF.type, class_lookup.get(o,None)) )
                    else:
                        raise ValueError(o_resolved + ' does not have a class assigned.')
                elif( re.search('\-\d+$', o) is not None ):
                    #match = propbank_patt.match(o)
                    #str = ""
                    #if match:
                    #    str = match.group(1)
                    #o_resolved = pb_ns[str + ".html#" +o ]
                    o_resolved = pb_ns[ o ]
                    g.add( (o_resolved, rdflib.RDF.type, frameClass) ) 
                elif( o == 'xref' and args.fixXref): 
                    continue
                elif( not(o == 'name') ): # ignore 'name' objects but add all others.
                    o_resolved = amr_terms_ns[o]
                    g.add( (o_resolved, rdflib.RDF.type, conceptClass) )
                # identify xref variables in the AMR; don't retain them as part of the graph.
                else: 
                    continue
                 
                frames[s] = o
                g.add( (temp_ns[s], RDF.type, o_resolved) )

            # Add object properties for local links in the current AMR
            for (p, s, o) in rel2:
                
                if( p == "TOP" ):
                    continue
                 
                # Do not include word positions for predicates 
                # (since they are more general and do not need to be linked to everything).
                p = strip_word_alignments(p,word_align_patt)                
                o = strip_word_alignments(o,word_align_patt)
                                
                # remember which objects have name objects 
                if( p == 'name' ):
                    label_lookup_table[o] = s 
                    
                # remember xref variables so their 'value' literals can be resolved below
                elif( p == 'xref' and args.fixXref):
                    xref_variables[o] = s   
               
                elif( re.search('^ARG\d+$', p) is not None ):

                    frameRole = frames[s] + "." + p
                    if( not(pBankRoles) ): 
                        frameRole = p

                    g.add( (pb_ns[frameRole], rdflib.RDF.type, frameRoleClass) )
                    g.add( (temp_ns[s], pb_ns[frameRole], temp_ns[o] ) )                    
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[frameRole] = pb_ns[frameRole]
                    vb_lookup[o] = temp_ns[o]

                elif( re.search('^ARG\d+\-of$', p) is not None ):
                    
                    frameRole = frames[o] + "." + p
                    if( not(pBankRoles) ): 
                        frameRole = p
                        
                    g.add( (pb_ns[frameRole], rdflib.RDF.type, frameRoleClass) )
                    g.add( (temp_ns[s], pb_ns[frameRole], temp_ns[o] ) )        
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[frameRole] = pb_ns[frameRole]
                    vb_lookup[o] = temp_ns[o]
                
                else:
                
                    g.add( (amr_terms_ns[p], rdflib.RDF.type, roleClass) )
                    g.add( (temp_ns[s], amr_terms_ns[p], temp_ns[o]) )
                    vb_lookup[s] = temp_ns[s]
                    vb_lookup[p] = amr_terms_ns[p]
                    vb_lookup[o] = temp_ns[o]
    

            # Add data properties in the current AMR
            labels = {}
            for (p, s, l) in rel1:

                p = strip_word_alignments(p, word_align_patt)
                l = strip_word_alignments(l, word_align_patt)
                
                #
                # Build labels across multiple 'op1, op2, ... opN' links, 
                #
                opN_match = re.match(opN_patt, p)
                if( opN_match is not None and
                        label_lookup_table.get(s,None) is not None):
                    opN = int(opN_match.group(1))
                    ss = label_lookup_table[s]
                    if( labels.get(ss, None) is None ):
                        labels[ss] = []
                    
                    labels[ss].append( (opN, l) )

                elif( xref_variables.get(s,None) is not None 
                      and p == 'value'
                      and args.fixXref):
                    for k in xref_namespace_lookup.keys():
                        if( l.startswith(k) ):
                            l2 = l[-len(l)+len(k):]
                            xref_vb = xref_variables.get(s,None)
                            resolved_xref_vb = vb_lookup.get(xref_vb,None)
                            g.add( (resolved_xref_vb, 
                                    amr_ns['xref'], 
                                    xref_namespace_lookup[k][l2]) )
                            
                # Special treatment for propbank roles.                 
                elif( re.search('ARG\d+$', p) is not None ):
                    
                    frameRole = frames[s] + "." + p
                    if( not(pBankRoles) ): 
                        frameRole = p
                    
                    g.add( (pb_ns[frameRole], rdflib.RDF.type, frameRoleClass) )
                    g.add( (temp_ns[s], pb_ns[frameRole], rdflib.Literal(l) ) )                    
                
                # Otherwise, it's just a literal 
                else:
                    g.add( (temp_ns[s], amr_terms_ns[p], rdflib.Literal(l) ) )
            
            # Add labels here
            # ["\n".join([i.split(' ')[j] for j in range(5)]) for i in g.vs["id"]]
            for key in labels.keys():
                labelArray = [i[1] for i in sorted(labels[key])]
                
                label = " ".join( labelArray )
                g.add( (temp_ns[key], 
                        RDFS.label, 
                        rdflib.Literal(label) ) )
            
            amrs_same_sent = []
            if cur_amr is not None:
                cur_id = cur_amr.metadata['id']
            else:
                break

        amrs_same_sent.append(cur_amr)
        amr_count = amr_count+1

    # Additional processing to clean up. 
    # 1. Add labels to AMR objects
    #q = sparql.prepareQuery("select distinct ?s ?label " +
    #                                                "where { " +
    #                                                "?s <http://amr.isi.edu/rdf/core-amr#name> ?n . " +
    #                                                "?n <http://amr.isi.edu/rdf/core-amr#op1> ?label " +
    #                                                "}")
    #qres = g.query(q)

    #for row in qres:
    #    print("%s type %s" % row)
    print ("%d AMRs converted" % amr_count)
    outfile.write( g.serialize(format=args.format) )
    outfile.close()

    infile.close()
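
Both RDF converters repeatedly call strip_word_alignments with word_align_patt to drop the "~e.N" token-alignment suffix that ISI-style alignments attach to concepts and roles (e.g. "want-01~e.2"). A hedged, assumed re-implementation of that helper based on the regex, for illustration only:

import re

word_align_patt = re.compile(r'(.*)~e\.(.+)')

def strip_word_alignments(token, patt=word_align_patt):
    # Return the token without its "~e.N" alignment suffix, if one is present.
    match = patt.match(token)
    return match.group(1) if match else token

if __name__ == '__main__':
    print(strip_word_alignments('want-01~e.2'))   # -> want-01
    print(strip_word_alignments('boy'))           # -> boy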