コード例 #1
0
 for xann in xdoc.findall("ANNOTATION"):
   annid = xann.get("id")
   annset[annid] = xann
   anntask = xann.get("task")
   if xann.find("EXTENT") is None:
     if anntask != "FE":
       sys.stderr.write("Warning: extent-free non-full annotation in "+annfile+": "+ET.tostring(xann)+"\n")
       continue
     continue
   xextent = xann.find("EXTENT")
   tup = [anntask, docid, xextent.get("start_char") or "None", xextent.get("end_char") or "None", annid or "None", xextent.text or "None"]
   if anntask == "NE": # simple ne annotation
     tup.append(xann.get("type"))
   else: # everything else has category/tag style
     try:
       tup.append(funornone(xann.find("CATEGORY"), lambda x: x.text))
       tup.append(funornone(xann.find("TAG"), lambda x: x.text))
       if (anntask == "FE"):
         if xann.find("ENTITY") is not None:
           eid = xann.find("ENTITY").get("entity_id")
           tup.append(eid)
           tup.append(annset[eid].find("TAG").text if annset[eid].find("TAG") is not None else "NONE")
         elif xann.find("PHRASE") is not None:
           tup.append(xann.find("PHRASE").get("phrase_id"))
         else:
           sys.stderr("Expected ENTITY or PHRASE at "+annfile+"; "+ET.tostring(xann))
           continue
       elif (anntask == "SSA"):
         if xann.find("PREDICATE") is not None:
           tup.append(xann.find("PREDICATE").get("predicate_id"))
       else:
コード例 #2
0
def main():
  """Extract LAF annotations from an LRLP tree and write them as TSV rows.

  Walks ``<rootdir>/data/annotation`` (via ``recursive_file_gen``) for files
  ending in ``laf.xml`` whose basename does not start with ``.``.  For every
  ``ANNOTATION`` element that has an ``EXTENT`` child, one tab-separated line
  is written with the fields

      task, docid, start_char, end_char, annotation id, extent text

  followed by task-specific fields:

  * ``NE`` / ``NPC`` (``NPchunk`` is mapped to ``NPC``): the ``type``
    attribute if present, otherwise the ``TAG`` text.
  * ``FE``: ``CATEGORY`` text, ``TAG`` text, then either the referenced
    entity id plus that entity's ``TAG`` text, or the referenced phrase id.
  * ``SSA``: ``CATEGORY`` text, ``TAG`` text, and the predicate id if a
    ``PREDICATE`` child exists.

  Documents whose id starts with ``SN_TWT_`` take their extent text from
  ``<extwtdir>/<docid>.rsd.txt`` (sliced by the extent offsets); such
  documents are skipped when that file does not exist.

  Command-line options: ``--rootdir/-r``, ``--outfile/-o`` (standard output
  is used when omitted), ``--extwtdir/-et``.

  Exits with status 0 after a message on stderr when the annotation
  directory is missing.  Unexpected errors while building the task-specific
  fields are reported on stderr and re-raised.
  """
  parser = argparse.ArgumentParser(
    description="Extract and print laf annotation data from LRLP in a form "
                "that is amenable to insertion into future xml",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--rootdir", "-r", default=".", help="root lrlp dir")
  parser.add_argument("--outfile", "-o", help="where to write")
  parser.add_argument("--extwtdir", "-et", help="extracted tweet rsd files dir")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  def xml_text(elem):
    # ET.tostring() yields bytes on Python 3; decode so the result can be
    # concatenated into str diagnostics without a TypeError.
    raw = ET.tostring(elem)
    return raw.decode("utf-8", "replace") if isinstance(raw, bytes) else raw

  def type_or_tag(xann):
    # Old style keeps the type in an attribute; new style in a TAG child.
    if "type" in xann.keys():
      return xann.get("type")
    return funornone(xann.find("TAG"), lambda x: x.text)

  twtdir = args.extwtdir
  # Fall back to stdout instead of crashing in open(None) when -o is omitted.
  outfile = sys.stdout if args.outfile is None else open(args.outfile, 'w')
  try:
    anndir = os.path.join(args.rootdir, 'data', 'annotation')
    if not os.path.exists(anndir):
      sys.stderr.write("No annotation directory found\n")
      sys.exit(0)

    for annfile in recursive_file_gen(anndir):
      if not annfile.endswith("laf.xml") or \
         os.path.basename(annfile).startswith("."):
        continue
      try:
        xobj = ET.parse(annfile)
      except Exception:
        # Best effort: report and move on, but let Ctrl-C / SystemExit through.
        sys.stderr.write("Problem parsing "+annfile+"\n")
        continue

      for xdoc in xobj.findall("DOC"):
        docid = xdoc.get("id")
        if docid.startswith('doc-'):
          # In NPC annotation, LDC uses "doc-n" instead of original docid
          docid = os.path.basename(annfile).replace('.laf.xml', '')

        # No string head for TWT: the text must come from the rsd file.
        # Read it once per document rather than once per annotation.
        tweet = None
        if docid.startswith('SN_TWT_'):
          twtpath = '%s/%s.rsd.txt' % (twtdir, docid)
          if not os.path.isfile(twtpath):
            continue
          with open(twtpath) as twtfile:
            tweet = twtfile.read()

        # Store all annotations by id. If they have an extent, spit them out.
        # Extent-free annotations are only expected for FE and SSA.
        # PREDICATE, ENTITY, and PHRASE are cross references to ids; for
        # ENTITY the core type is copied, for everything else just the cross
        # reference.
        annset = {}
        for xann in xdoc.findall("ANNOTATION"):
          annid = xann.get("id")
          if annid is not None and annid.startswith('doc-'):
            annid = re.sub(r'doc-\d+', docid, annid)
          annset[annid] = xann
          anntask = xann.get("task")
          xextent = xann.find("EXTENT")
          if xextent is None:
            if anntask not in ("FE", "SSA"):
              sys.stderr.write("Warning: extent-free non-full annotation in "
                               + annfile + ": " + xml_text(xann) + "\n")
            continue
          # map aberrant type
          if anntask == "NPchunk":
            anntask = "NPC"

          strhead = xextent.text
          if tweet is not None:
            beg = int(xextent.get("start_char"))
            end = int(xextent.get("end_char"))
            # but don't go negative
            if beg < 0 or end > len(tweet):
              sys.stderr.write(annfile+" Bad offsets: can't do %d, %d on %s\n" % (beg, end, docid))
              continue
            strhead = tweet[beg:end+1]
          tup = [anntask, docid, xextent.get("start_char") or "None",
                 xextent.get("end_char") or "None", annid or "None",
                 strhead or "None"]

          if anntask in ("NE", "NPC"): # Simple ne annotation / NP chunking
            tup.append(type_or_tag(xann))
          else: # Everything else has category/tag style
            try:
              tup.append(funornone(xann.find("CATEGORY"), lambda x: x.text))
              tup.append(funornone(xann.find("TAG"), lambda x: x.text))
              if anntask == "FE":
                xentity = xann.find("ENTITY")
                xphrase = xann.find("PHRASE")
                if xentity is not None:
                  eid = xentity.get("entity_id")
                  tup.append(eid)
                  etag = annset[eid].find("TAG")
                  tup.append(etag.text if etag is not None else "NONE")
                elif xphrase is not None:
                  tup.append(xphrase.get("phrase_id"))
                else:
                  sys.stderr.write("Expected ENTITY or PHRASE at "
                                   + annfile + "; " + xml_text(xann) + "\n")
                  continue
              elif anntask == "SSA":
                xpred = xann.find("PREDICATE")
                if xpred is not None:
                  tup.append(xpred.get("predicate_id"))
              else:
                sys.stderr.write(annfile+": Don't know how to process "
                                 + str(anntask) + "\n")
                continue
            except Exception:
              # Identify the offending annotation, then let the error surface.
              sys.stderr.write(annfile + "\n")
              sys.stderr.write(xml_text(xann) + "\n")
              raise

          try:
            outfile.write("\t".join(map(str,tup))+"\n")
          except UnicodeDecodeError:
            sys.stderr.write("Warning: Unknown encoding %s:%s-%s\n" % \
                             (tup[4], tup[2], tup[3]))
  finally:
    if outfile is not sys.stdout:
      outfile.close()
コード例 #3
0
def main():
    """Extract LAF annotations from an LRLP tree and write them as TSV rows.

    Walks ``<rootdir>/data/annotation`` (via ``recursive_file_gen``) for files
    ending in ``laf.xml`` whose basename does not start with ``.``.  For every
    ``ANNOTATION`` element that has an ``EXTENT`` child, one tab-separated
    line is written with the fields

        task, docid, start_char, end_char, annotation id, extent text

    followed by task-specific fields:

    * ``NE`` / ``NPC`` (``NPchunk`` is mapped to ``NPC``): the ``type``
      attribute if present, otherwise the ``TAG`` text.
    * ``FE``: ``CATEGORY`` text, ``TAG`` text, then either the referenced
      entity id plus that entity's ``TAG`` text, or the referenced phrase id.
    * ``SSA``: ``CATEGORY`` text, ``TAG`` text, and the predicate id if a
      ``PREDICATE`` child exists.

    When ``--extwtdir`` names an existing directory, documents for which
    ``is_sn(docid)`` is true take their extent text from
    ``<extwtdir>/<docid>.rsd.txt`` (sliced by the extent offsets) and are
    skipped if that file is missing.  Without a usable tweet directory the
    extent text from the XML is used for all documents.

    Command-line options: ``--rootdir/-r``, ``--outfile/-o`` (standard output
    is used when omitted), ``--extwtdir/-et``.

    Exits with status 0 after a message on stderr when the annotation
    directory is missing.  Unexpected errors while building the task-specific
    fields are reported on stderr and re-raised.
    """
    parser = argparse.ArgumentParser(
        description="Extract and print laf annotation data from LRLP in a "
                    "form that is amenable to insertion into future xml",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--rootdir", "-r", default=".", help="root lrlp dir")
    parser.add_argument("--outfile", "-o", help="where to write")
    parser.add_argument("--extwtdir",
                        "-et",
                        default=None,
                        help="extracted tweet rsd files dir")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    def xml_text(elem):
        # ET.tostring() yields bytes on Python 3; decode so the result can be
        # concatenated into str diagnostics without a TypeError.
        raw = ET.tostring(elem)
        if isinstance(raw, bytes):
            return raw.decode("utf-8", "replace")
        return raw

    def type_or_tag(xann):
        # Old style keeps the type in an attribute; new style in a TAG child.
        if "type" in xann.keys():
            return xann.get("type")
        return funornone(xann.find("TAG"), lambda x: x.text)

    twtdir = args.extwtdir
    # Fall back to stdout instead of crashing in open(None) when -o is omitted.
    outfile = sys.stdout if args.outfile is None else open(args.outfile, 'w')
    try:
        anndir = os.path.join(args.rootdir, 'data', 'annotation')
        if not os.path.exists(anndir):
            sys.stderr.write("No annotation directory found\n")
            sys.exit(0)
        if twtdir is not None and not os.path.exists(twtdir):
            sys.stderr.write("Warning: no {}\n".format(twtdir))
            twtdir = None

        for annfile in recursive_file_gen(anndir):
            if not annfile.endswith("laf.xml") or \
               os.path.basename(annfile).startswith("."):
                continue
            try:
                xobj = ET.parse(annfile)
            except Exception:
                # Best effort: report and move on, but let Ctrl-C through.
                sys.stderr.write("Problem parsing " + annfile + "\n")
                continue

            for xdoc in xobj.findall("DOC"):
                docid = xdoc.get("id")
                if docid.startswith('doc-'):
                    # In NPC annotation, LDC uses "doc-n" instead of the
                    # original docid
                    docid = os.path.basename(annfile).replace('.laf.xml', '')

                # No string head for TWT: the text must come from the rsd
                # file, which is only consulted when a tweet dir is usable.
                use_tweet = is_sn(docid) and twtdir is not None
                twtpath = None
                if use_tweet:
                    twtpath = '%s/%s.rsd.txt' % (twtdir, docid)
                    if not os.path.isfile(twtpath):
                        continue
                # Loaded lazily (inside the guarded section below) and cached
                # so the file is read once per document, not per annotation.
                tweet = None

                # Store all annotations by id. If they have an extent, spit
                # them out. Extent-free annotations are only expected for FE
                # and SSA. PREDICATE, ENTITY, and PHRASE are cross references
                # to ids; for ENTITY the core type is copied, for everything
                # else just the cross reference.
                annset = {}
                for xann in xdoc.findall("ANNOTATION"):
                    annid = xann.get("id")
                    if annid is not None and annid.startswith('doc-'):
                        annid = re.sub(r'doc-\d+', docid, annid)
                    annset[annid] = xann
                    anntask = xann.get("task")
                    xextent = xann.find("EXTENT")
                    if xextent is None:
                        if anntask not in ("FE", "SSA"):
                            sys.stderr.write(
                                "Warning: extent-free non-full annotation in "
                                + annfile + ": " + xml_text(xann) + "\n")
                        continue
                    # map aberrant type
                    if anntask == "NPchunk":
                        anntask = "NPC"

                    try:
                        # int() round-trip normalises the offset and rejects
                        # missing/non-numeric values.
                        start_char = str(int(xextent.get("start_char")))
                        strhead = xextent.text
                        if use_tweet:
                            if tweet is None:
                                with open(twtpath) as twtfile:
                                    tweet = twtfile.read()
                            beg = int(xextent.get("start_char"))
                            end = int(xextent.get("end_char"))
                            # but don't go negative
                            if beg < 0 or end > len(tweet):
                                sys.stderr.write(
                                    annfile +
                                    " Bad offsets: can't do %d, %d on %s\n" %
                                    (beg, end, docid))
                                continue
                            strhead = tweet[beg:end + 1]
                    except Exception:
                        sys.stderr.write(
                            "Trouble at %s in %s should be investigated\n" %
                            (docid, annfile))
                        continue
                    tup = [
                        anntask, docid, start_char,
                        xextent.get("end_char") or "None",
                        annid or "None", strhead or "None",
                    ]

                    if anntask in ("NE", "NPC"):  # NE annotation / NP chunking
                        tup.append(type_or_tag(xann))
                    else:  # Everything else has category/tag style
                        try:
                            tup.append(
                                funornone(xann.find("CATEGORY"),
                                          lambda x: x.text))
                            tup.append(
                                funornone(xann.find("TAG"), lambda x: x.text))
                            if anntask == "FE":
                                xentity = xann.find("ENTITY")
                                xphrase = xann.find("PHRASE")
                                if xentity is not None:
                                    eid = xentity.get("entity_id")
                                    tup.append(eid)
                                    etag = annset[eid].find("TAG")
                                    tup.append(etag.text
                                               if etag is not None else "NONE")
                                elif xphrase is not None:
                                    tup.append(xphrase.get("phrase_id"))
                                else:
                                    sys.stderr.write(
                                        "Expected ENTITY or PHRASE at "
                                        + annfile + "; " + xml_text(xann)
                                        + "\n")
                                    continue
                            elif anntask == "SSA":
                                xpred = xann.find("PREDICATE")
                                if xpred is not None:
                                    tup.append(xpred.get("predicate_id"))
                            else:
                                sys.stderr.write(
                                    annfile + ": Don't know how to process "
                                    + str(anntask) + "\n")
                                continue
                        except Exception:
                            # Identify the offending annotation, then let the
                            # error surface.
                            sys.stderr.write(annfile + "\n")
                            sys.stderr.write(xml_text(xann) + "\n")
                            raise

                    try:
                        outfile.write("\t".join(map(str, tup)) + "\n")
                    except UnicodeDecodeError:
                        sys.stderr.write("Warning: Unknown encoding %s:%s-%s\n" % \
                                         (tup[4], tup[2], tup[3]))
    finally:
        if outfile is not sys.stdout:
            outfile.close()