Example #1
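# Module-level context assumed by this snippet (inferred from usage; the original
# listing starts at main()):
import argparse
import os
import shlex
import subprocess
import sys
import xml.etree.ElementTree as ET
from itertools import compress

scriptdir = os.path.dirname(os.path.abspath(__file__))  # assumed definition
# getgarbagemask() and morph_tok() are project-specific helpers defined elsewhere.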
def main():
    parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                                 " data, tokenized, morph, pos tag and " \
                                                 "original, with manifests from extracted files")
    parser.add_argument("--rootdir", "-r", default=".",
                        help="root lrlp dir")
    parser.add_argument("--datadirs", nargs='+', default=[],
                        help="elements in path from root to ltf files")
    parser.add_argument("--src", "-s", default='uzb',
                        help="source language 3 letter code")
    parser.add_argument("--trg", "-t", default='eng',
                        help="target language 3 letter code")
    parser.add_argument("--outdir", "-o",
                        help="where to write extracted files")
    parser.add_argument("--nogarbage", action='store_true', default=False,
                        help="turn off garbage filtering")
    parser.add_argument("--toksubdir", default="tokenized",
                        help="subdirectory for tokenized files")
    parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                        help="subdirectory for cdec-tokenized files")
    parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                        help="subdirectory for tokenized files based on " \
                             "morphological segmentation")
    parser.add_argument("--morphsubdir", default="morph",
                        help="subdirectory for morphological information")
    parser.add_argument("--origsubdir", default="original",
                        help="subdirectory for untokenized files")
    parser.add_argument("--garbagesubdir", default="garbage",
                        help="subdirectory for garbage files (under orig)")
    parser.add_argument("--possubdir", default="pos",
                        help="subdirectory for pos tag files")
    parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                                "cdectok.sh"),
                        help="cdec tokenizer program wrapper")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    tokoutdir = os.path.join(args.outdir, args.toksubdir)
    origoutdir = os.path.join(args.outdir, args.origsubdir)
    cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
    morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
    morphoutdir = os.path.join(args.outdir, args.morphsubdir)
    posoutdir = os.path.join(args.outdir, args.possubdir)

    dirs = [args.outdir,
            tokoutdir,
            cdectokoutdir,
            origoutdir,
            morphtokoutdir,
            morphoutdir,
            posoutdir]
    if args.nogarbage:
        garbageoutdir = None
    else:
        garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
        dirs.append(garbageoutdir)
    for d in dirs:  # avoid shadowing the builtin dir()
        if not os.path.exists(d):
            os.makedirs(d)

    datadirs = [args.rootdir, ] + args.datadirs
    indir = os.path.join(*datadirs)
    man_fh = open(os.path.join(args.outdir, "mono.manifest"), 'w')
    orig_fh = open(os.path.join(origoutdir, "mono.flat"), 'w')
    if args.nogarbage:
        garbage_fh = None
        garbage_man_fh = None
    else:
        garbage_fh = open(os.path.join(garbageoutdir, "mono.flat"), 'w')
        garbage_man_fh = open(os.path.join(garbageoutdir, "mono.manifest"), 'w')
    tok_fh = open(os.path.join(tokoutdir, "mono.flat"), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir,
                                    "mono.flat"), 'w')
    morph_fh = open(os.path.join(morphoutdir, "mono.flat"), 'w')
    pos_fh = open(os.path.join(posoutdir, "mono.flat"), 'w')

    for srcfile in os.listdir(indir):
        if srcfile.startswith(".") or not srcfile.endswith("ltf.xml"):
            continue
        srcfile = os.path.join(indir, srcfile)
        with open(srcfile, 'r') as ifh:
            try:
                xobj = ET.parse(ifh)
                docid = xobj.findall(".//DOC")[0].get('id')
                origlines = [(x.text or "") + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]  # guard against empty ORIGINAL_TEXT (text is None)
                # despite its name, garbagemask is True for segments to KEEP;
                # goodmask selects the rejected ones
                garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
                goodmask = [not x for x in garbagemask]
                seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                           for x in xobj.findall(".//SEG")]
                for line in compress(origlines, garbagemask):
                    orig_fh.write(line)
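                # manifest columns: source file path, doc id, seg id, start_char, end_char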
                for tup in compress(seginfo, garbagemask):
                    man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
                if not args.nogarbage:
                    for line in compress(origlines, goodmask):
                        garbage_fh.write(line)
                    for tup in compress(seginfo, goodmask):
                        garbage_man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
                for x in compress(xobj.findall(".//SEG"), garbagemask):
                    tokens = x.findall(".//TOKEN")
                    toktext = []
                    morphtoktext = []
                    morphtext = []
                    postext = []
                    for y in tokens:
                        if y.text is None:
                            continue
                        toktext.append(y.text)
                        postext.append(y.get("pos") or "none")
                        for mt, mtt in morph_tok(y):
                            morphtext.append(mt)
                            morphtoktext.append(mtt)
                    tok_fh.write(' '.join(toktext) + "\n")
                    morphtok_fh.write(' '.join(morphtoktext) + "\n")
                    morph_fh.write(' '.join(morphtext) + "\n")
                    pos_fh.write(' '.join(postext) + "\n")
            except ET.ParseError:
                sys.stderr.write("Parse error on " + ifh.name + "\n")
                continue
    # close all outputs so downstream consumers see fully flushed data
    for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
               garbage_fh, garbage_man_fh):
        if fh is not None:
            fh.close()
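    # hand the flat original text to the external cdec wrapper; judging by the
    # file names, -o receives the lowercased output and -t the tokenized output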
    cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                         orig_fh.name,
                                         os.path.join(cdectokoutdir,
                                                      "mono.flat.lc"),
                                         os.path.join(cdectokoutdir,
                                                      "mono.flat"))
    p = subprocess.Popen(shlex.split(cdec_cmd))
    p.wait()
Example #2
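# Assumes the same module-level context as Example #1, plus the project helper
# getclusters(), defined elsewhere.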
def main():
  parser = argparse.ArgumentParser(description="Extract and print comparable corpus " \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--agiletokenizer", default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))


  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  agiletokoutdir = os.path.join(args.outdir, args.agiletoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)

  dirs = [args.outdir,
          tokoutdir,
          cdectokoutdir,
          agiletokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for d in dirs:  # avoid shadowing the builtin dir()
    if not os.path.exists(d):
      os.makedirs(d)

  rootdir = os.path.join(args.rootdir, 'data', 'translation', 'comparable')
  clusters = getclusters(os.path.join(rootdir, 'clusters'))
  srcindir = os.path.join(rootdir, args.src, 'ltf')
  trgindir = os.path.join(rootdir, args.trg, 'ltf')
  datasets = [(args.src, srcindir, args.cdectokenizer, cdectokoutdir),
              (args.trg, trgindir, args.agiletokenizer, agiletokoutdir)]

  for lang, indir, exttokenizer, exttokoutdir in datasets:
    inbase = lang
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for filename in os.listdir(indir):
      # assume ltf filename
      if not filename.endswith("ltf.xml"):
        continue
      # avoid mac meta stuff
      if filename.startswith("."):
        continue
      with open(os.path.join(indir, filename), 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          if len(clusters[lang][docid]) < 1:
            sys.stderr.write("Warning: no clusters for %s\n" % docid)
            clusid="NONE"
          else:
            clset = clusters[lang][docid]
            if len(clset) > 1:
              sys.stderr.write("Warning: multiple clusters for %s\n" % docid)
            clusid = '_'.join(clset)
          origlines = [(x.text or "") + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]  # guard against empty ORIGINAL_TEXT (text is None)
          # despite its name, garbagemask is True for segments to KEEP;
          # goodmask selects the rejected ones
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                      for x in xobj.findall(".//SEG") ]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            # manifest columns: filename, doc id, seg id, start_char, end_char, cluster id
            man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid])) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [filename,docid]+tup+[clusid,]))+"\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext)+"\n")
            morphtok_fh.write(' '.join(morphtoktext)+"\n")
            morph_fh.write(' '.join(morphtext)+"\n")
            pos_fh.write(' '.join(postext)+"\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on "+ifh.name+"\n")
          continue
    # close all outputs so the external tokenizer sees fully flushed data
    for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
               garbage_fh, garbage_man_fh):
      if fh is not None:
        fh.close()
    ext_cmd = "%s -i %s -o %s -t %s" % (
        exttokenizer,
        orig_fh.name,
        os.path.join(exttokoutdir, "%s.flat.lc" % inbase),
        os.path.join(exttokoutdir, "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(ext_cmd))
    p.wait()
Example #3
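# Assumes the same module-level context as Example #1, plus a ZipFile alias,
# e.g. (an assumption inferred from the infolist()/open() usage below):
from zipfile import ZipFile as zf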
def main():
  parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests")
  parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                      default=[sys.stdin,],
                      help="input zip file(s) (each contains a multi file)")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))


  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)

  dirs = [args.outdir,
          tokoutdir,
          cdectokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for d in dirs:  # avoid shadowing the builtin dir()
    if not os.path.exists(d):
      os.makedirs(d)

  defaultcount = 0
  for infile in args.infile:
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    if len(inbase) == 0:
      inbase = "default.%d" % defaultcount
      defaultcount += 1
    archive = zf(infile)
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for info in archive.infolist():
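      # skip near-empty archive members; 20 bytes cannot hold a real LTF document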
      if info.file_size < 20:
        continue
      # assume ltf filename
      if not info.filename.endswith("ltf.xml"):
        continue
      with archive.open(info, 'r') as ifh:  # 'rU' mode was removed in Python 3.9; ET.parse accepts the binary stream
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          origlines = [(x.text or "") + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]  # guard against empty ORIGINAL_TEXT (text is None)
          # despite its name, garbagemask is True for segments to KEEP;
          # goodmask selects the rejected ones
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                      for x in xobj.findall(".//SEG") ]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [info.filename,docid]+tup))+"\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [info.filename,docid]+tup))+"\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext)+"\n")
            morphtok_fh.write(' '.join(morphtoktext)+"\n")
            morph_fh.write(' '.join(morphtext)+"\n")
            pos_fh.write(' '.join(postext)+"\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on "+ifh.name+"\n")
          continue
    # close all outputs so the cdec tokenizer sees fully flushed data
    for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
               garbage_fh, garbage_man_fh):
      if fh is not None:
        fh.close()
    cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                         orig_fh.name,
                                         os.path.join(cdectokoutdir,
                                                      "%s.flat.lc" % inbase),
                                         os.path.join(cdectokoutdir,
                                                      "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(cdec_cmd))
    p.wait()
Example #4
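# Assumes the same module-level context as Example #3, plus:
from io import TextIOWrapper
from subprocess import CalledProcessError, check_call
# addonoffarg() is a project helper (defined elsewhere) that adds a paired
# --X/--noX boolean flag to the parser.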
def main():
    parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                                 " data, tokenized, morph, pos tag and " \
                                                 "original, with manifests")
    parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                        default=[sys.stdin, ],
                        help="input zip file(s) (each contains a multi file)")
    parser.add_argument("--outdir", "-o",
                        help="where to write extracted files")
    parser.add_argument("--nogarbage", action='store_true', default=False,
                        help="turn off garbage filtering")
    parser.add_argument("--toksubdir", default="raw.tokenized",
                        help="subdirectory for ldc-tokenized files")
    parser.add_argument("--cleantoksubdir", default="tokenized",
                        help="subdirectory for cleaned ldc-tokenized files")
    parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                        help="subdirectory for cdec-tokenized files")
    parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                        help="subdirectory for tokenized files based on " \
                             "morphological segmentation")
    parser.add_argument("--morphsubdir", default="morph",
                        help="subdirectory for morphological information")
    parser.add_argument("--origsubdir", default="raw.original",
                        help="subdirectory for untokenized files")
    parser.add_argument("--cleanorigsubdir", default="original",
                        help="subdirectory for cleaned raw original")

    parser.add_argument("--garbagesubdir", default="garbage",
                        help="subdirectory for garbage files (under orig)")
    parser.add_argument("--possubdir", default="pos",
                        help="subdirectory for pos tag files")
    parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                        help="path to cleaning script")
    parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                                "cdectok.sh"),
                        help="cdec tokenizer program wrapper")
    addonoffarg(parser, 'cdec', help="do cdec tokenization", default=True)
    addonoffarg(parser, 'removesn', help="remove SN from mono zip (to avoid underscore tweets)", default=False)

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    tokoutdir = os.path.join(args.outdir, args.toksubdir)
    origoutdir = os.path.join(args.outdir, args.origsubdir)
    cleantokoutdir = os.path.join(args.outdir, args.cleantoksubdir)
    cleanorigoutdir = os.path.join(args.outdir, args.cleanorigsubdir)
    cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
    morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
    morphoutdir = os.path.join(args.outdir, args.morphsubdir)
    posoutdir = os.path.join(args.outdir, args.possubdir)
    cleanpath = args.cleanpath
    dirs = [args.outdir,
            tokoutdir,
            origoutdir,
            cleantokoutdir,
            cleanorigoutdir,
            morphtokoutdir,
            morphoutdir,
            posoutdir]
    if args.cdec:
        dirs.append(cdectokoutdir)
    if args.nogarbage:
        garbageoutdir = None
    else:
        garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
        dirs.append(garbageoutdir)
    for d in dirs:  # avoid shadowing the builtin dir()
        if not os.path.exists(d):
            os.makedirs(d)

    defaultcount = 0
    for infile in args.infile:
        inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
        if len(inbase) == 0:
            inbase = "default.%d" % defaultcount
            defaultcount += 1
        archive = zf(infile)
        man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
        orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
        if args.nogarbage:
            garbage_fh = None
            garbage_man_fh = None
        else:
            garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
            garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
        tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
        morphtok_fh = open(os.path.join(morphtokoutdir,
                                        "%s.flat" % inbase), 'w')
        morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
        pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
        for info in archive.infolist():
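            # skip near-empty archive members; 20 bytes cannot hold a real LTF document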
            if info.file_size < 20:
                continue
            # assume ltf filename
            if not info.filename.endswith("ltf.xml"):
                continue
            with TextIOWrapper(archive.open(info, 'r')) as ifh:
                try:
                    xobj = ET.parse(ifh)
                    docid = xobj.findall(".//DOC")[0].get('id')
                    # skip anonymized tweets that ship inside packages; relocated,
                    # separately downloaded mono tweets ("tweets" in the archive name) are kept
                    if "tweets" not in inbase and args.removesn and "_SN_" in docid:
                        sys.stderr.write("SN skip: not extracting {}\n".format(docid))
                        continue
                    origlines = [(x.text or "") + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]  # guard against empty ORIGINAL_TEXT (text is None)
                    # despite its name, garbagemask is True for segments to KEEP;
                    # goodmask selects the rejected ones
                    garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
                    goodmask = [not x for x in garbagemask]
                    seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                               for x in xobj.findall(".//SEG")]
                    for line in compress(origlines, garbagemask):
                        orig_fh.write(line)
                    for tup in compress(seginfo, garbagemask):
                        man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
                    if not args.nogarbage:
                        for line in compress(origlines, goodmask):
                            garbage_fh.write(line)
                        for tup in compress(seginfo, goodmask):
                            garbage_man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
                    for x in compress(xobj.findall(".//SEG"), garbagemask):
                        tokens = x.findall(".//TOKEN")
                        toktext = []
                        morphtoktext = []
                        morphtext = []
                        postext = []
                        for y in tokens:
                            if y.text is None:
                                continue
                            toktext.append(y.text)
                            postext.append(y.get("pos") or "none")
                            for mt, mtt in morph_tok(y):
                                morphtext.append(mt)
                                morphtoktext.append(mtt)
                        tok_fh.write(' '.join(toktext) + "\n")
                        morphtok_fh.write(' '.join(morphtoktext) + "\n")
                        morph_fh.write(' '.join(morphtext) + "\n")
                        pos_fh.write(' '.join(postext) + "\n")
                except ET.ParseError:
                    sys.stderr.write("Parse error on " + ifh.name + "\n")
                    continue
        # close all outputs so the cleaner and tokenizer see fully flushed data
        for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
                   garbage_fh, garbage_man_fh):
            if fh is not None:
                fh.close()
        # raw orig->clean orig
        # raw tok->clean tok
        clean_orig = os.path.join(cleanorigoutdir, "%s.flat" % inbase)
        clean_tok = os.path.join(cleantokoutdir, "%s.flat" % inbase)
        for inclean, outclean in zip((orig_fh.name, tok_fh.name), (clean_orig, clean_tok)):
            cleancmd = "{cmd} {inclean} {outclean}".format(cmd=cleanpath, inclean=inclean, outclean=outclean)
            sys.stderr.write(cleancmd + "\n")
            try:
                check_call(shlex.split(cleancmd))
            except CalledProcessError as e:
                sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
                sys.exit(1)

        if args.cdec:
            cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                                 orig_fh.name,
                                                 os.path.join(cdectokoutdir,
                                                              "%s.flat.lc" % inbase),
                                                 os.path.join(cdectokoutdir,
                                                              "%s.flat" % inbase))
            p = subprocess.Popen(shlex.split(cdec_cmd))
            p.wait()
Example #5
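# Assumes the same module-level context as Example #1.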
def main():
  parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests from extracted files")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--datadirs", nargs='+', default=[],
                      help="elements in path from root to ltf files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))


  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)

  dirs = [args.outdir,
          tokoutdir,
          cdectokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for d in dirs:  # avoid shadowing the builtin dir()
    if not os.path.exists(d):
      os.makedirs(d)


  datadirs = [args.rootdir] + args.datadirs
  indir = os.path.join(*datadirs)
  man_fh = open(os.path.join(args.outdir, "mono.manifest"), 'w')
  orig_fh = open(os.path.join(origoutdir, "mono.flat"), 'w')
  if args.nogarbage:
    garbage_fh = None
    garbage_man_fh = None
  else:
    garbage_fh = open(os.path.join(garbageoutdir, "mono.flat"), 'w')
    garbage_man_fh = open(os.path.join(garbageoutdir, "mono.manifest"), 'w')
  tok_fh = open(os.path.join(tokoutdir, "mono.flat"), 'w')
  morphtok_fh = open(os.path.join(morphtokoutdir, "mono.flat"), 'w')
  morph_fh = open(os.path.join(morphoutdir, "mono.flat"), 'w')
  pos_fh = open(os.path.join(posoutdir, "mono.flat"), 'w')

  for srcfile in os.listdir(indir):
    if srcfile.startswith(".") or not srcfile.endswith("ltf.xml"):
      continue
    srcfile = os.path.join(indir, srcfile)
    with open(srcfile, 'r') as ifh:
      try:
        xobj = ET.parse(ifh)
        docid = xobj.findall(".//DOC")[0].get('id')
        origlines = [(x.text or "") + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]  # guard against empty ORIGINAL_TEXT (text is None)
        # despite its name, garbagemask is True for segments to KEEP;
        # goodmask selects the rejected ones
        garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
        goodmask = [not x for x in garbagemask]
        seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                    for x in xobj.findall(".//SEG") ]
        for line in compress(origlines, garbagemask):
          orig_fh.write(line)
        for tup in compress(seginfo, garbagemask):
          man_fh.write("\t".join(map(str, [srcfile,docid]+tup))+"\n")
        if not args.nogarbage:
          for line in compress(origlines, goodmask):
            garbage_fh.write(line)
          for tup in compress(seginfo, goodmask):
            garbage_man_fh.write("\t".join(map(str, [srcfile,docid]+tup))+"\n")
        for x in compress(xobj.findall(".//SEG"), garbagemask):
          tokens = x.findall(".//TOKEN")
          toktext = []
          morphtoktext = []
          morphtext = []
          postext = []
          for y in tokens:
            if y.text is None:
              continue
            toktext.append(y.text)
            postext.append(y.get("pos") or "none")
            for mt, mtt in morph_tok(y):
              morphtext.append(mt)
              morphtoktext.append(mtt)
          tok_fh.write(' '.join(toktext)+"\n")
          morphtok_fh.write(' '.join(morphtoktext)+"\n")
          morph_fh.write(' '.join(morphtext)+"\n")
          pos_fh.write(' '.join(postext)+"\n")
      except ET.ParseError:
        sys.stderr.write("Parse error on "+ifh.name+"\n")
        continue
  # close all outputs so the cdec tokenizer sees fully flushed data
  for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
             garbage_fh, garbage_man_fh):
    if fh is not None:
      fh.close()
  cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                       orig_fh.name,
                                       os.path.join(cdectokoutdir,
                                                    "mono.flat.lc"),
                                       os.path.join(cdectokoutdir,
                                                    "mono.flat"))
  p = subprocess.Popen(shlex.split(cdec_cmd))
  p.wait()
Example #6
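# Assumes the same module-level context as Example #2, plus:
from subprocess import CalledProcessError, check_call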
def main():
  parser = argparse.ArgumentParser(description="Extract and print comparable corpus" \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="raw.tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cleantoksubdir", default="tokenized",
                      help="subdirectory for cleaned ldc-tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--cleanorigsubdir", default="original",
                      help="subdirectory for cleaned raw original")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="raw.original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                      help="path to cleaning script")
  parser.add_argument("--agiletokenizer", default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cleantokoutdir = os.path.join(args.outdir, args.cleantoksubdir)
  cleanorigoutdir = os.path.join(args.outdir, args.cleanorigsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  agiletokoutdir = os.path.join(args.outdir, args.agiletoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  cleanpath = args.cleanpath
  dirs = [args.outdir,
          tokoutdir,
          cleantokoutdir,
          cleanorigoutdir,
          cdectokoutdir,
          agiletokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for d in dirs:  # avoid shadowing the builtin dir()
    if not os.path.exists(d):
      os.makedirs(d)

  rootdir = os.path.join(args.rootdir, 'data', 'translation', 'comparable')
  clusters = getclusters(os.path.join(rootdir, 'clusters'))
  srcindir = os.path.join(rootdir, args.src, 'ltf')
  trgindir = os.path.join(rootdir, args.trg, 'ltf')
  datasets = [(args.src, srcindir, args.cdectokenizer, cdectokoutdir),
              (args.trg, trgindir, args.agiletokenizer, agiletokoutdir)]

  for lang, indir, exttokenizer, exttokoutdir in datasets:
    inbase = lang
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for filename in os.listdir(indir):
      # assume ltf filename
      if not filename.endswith("ltf.xml"):
        continue
      # avoid mac meta stuff
      if filename.startswith("."):
        continue
      with open(os.path.join(indir, filename), 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          if len(clusters[lang][docid]) < 1:
            sys.stderr.write("Warning: no clusters for %s\n" % docid)
            clusid="NONE"
          else:
            clset = clusters[lang][docid]
            if len(clset) > 1:
              sys.stderr.write("Warning: multiple clusters for %s\n" % docid)
            clusid = '_'.join(clset)
          origlines = [(x.text or "") + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]  # guard against empty ORIGINAL_TEXT (text is None)
          # despite its name, garbagemask is True for segments to KEEP;
          # goodmask selects the rejected ones
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                      for x in xobj.findall(".//SEG") ]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            # manifest columns: filename, doc id, seg id, start_char, end_char, cluster id
            man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid])) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [filename,docid]+tup+[clusid,]))+"\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext)+"\n")
            morphtok_fh.write(' '.join(morphtoktext)+"\n")
            morph_fh.write(' '.join(morphtext)+"\n")
            pos_fh.write(' '.join(postext)+"\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on "+ifh.name+"\n")
          continue
    # close all outputs so the cleaner and tokenizer see fully flushed data
    for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
               garbage_fh, garbage_man_fh):
      if fh is not None:
        fh.close()
    clean_orig = os.path.join(cleanorigoutdir, "%s.flat" % inbase)
    clean_tok = os.path.join(cleantokoutdir, "%s.flat" % inbase)
    for inclean, outclean in zip((orig_fh.name, tok_fh.name), (clean_orig, clean_tok)):
      cleancmd = "{cmd} {inclean} {outclean}".format(cmd=cleanpath, inclean=inclean, outclean=outclean)
      sys.stderr.write(cleancmd+"\n")
      try:
        check_call(shlex.split(cleancmd))
      except CalledProcessError as e:
        sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
        sys.exit(1)

    ext_cmd = "%s -i %s -o %s -t %s" % (
        exttokenizer,
        orig_fh.name,
        os.path.join(exttokoutdir, "%s.flat.lc" % inbase),
        os.path.join(exttokoutdir, "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(ext_cmd))
    p.wait()