Example 1
def main():
    parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                                 " data, tokenized, morph, pos tag and " \
                                                 "original, with manifests from extracted files")
    parser.add_argument("--rootdir", "-r", default=".",
                        help="root lrlp dir")
    parser.add_argument("--datadirs", nargs='+', default=[],
                        help="elements in path from root to ltf files")
    parser.add_argument("--src", "-s", default='uzb',
                        help="source language 3 letter code")
    parser.add_argument("--trg", "-t", default='eng',
                        help="target language 3 letter code")
    parser.add_argument("--outdir", "-o",
                        help="where to write extracted files")
    parser.add_argument("--nogarbage", action='store_true', default=False,
                        help="turn off garbage filtering")
    parser.add_argument("--toksubdir", default="tokenized",
                        help="subdirectory for tokenized files")
    parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                        help="subdirectory for cdec-tokenized files")
    parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                        help="subdirectory for tokenized files based on " \
                             "morphological segmentation")
    parser.add_argument("--morphsubdir", default="morph",
                        help="subdirectory for morphological information")
    parser.add_argument("--origsubdir", default="original",
                        help="subdirectory for untokenized files")
    parser.add_argument("--garbagesubdir", default="garbage",
                        help="subdirectory for garbage files (under orig)")
    parser.add_argument("--possubdir", default="pos",
                        help="subdirectory for pos tag files")
    parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                                "cdectok.sh"),
                        help="cdec tokenizer program wrapper")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    tokoutdir = os.path.join(args.outdir, args.toksubdir)
    origoutdir = os.path.join(args.outdir, args.origsubdir)
    cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
    morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
    morphoutdir = os.path.join(args.outdir, args.morphsubdir)
    posoutdir = os.path.join(args.outdir, args.possubdir)

    dirs = [args.outdir,
            tokoutdir,
            cdectokoutdir,
            origoutdir,
            morphtokoutdir,
            morphoutdir,
            posoutdir]
    if args.nogarbage:
        garbageoutdir = None
    else:
        garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
        dirs.append(garbageoutdir)
    for dir in dirs:
        if not os.path.exists(dir):
            os.makedirs(dir)

    datadirs = [args.rootdir, ] + args.datadirs
    indir = os.path.join(*datadirs)
    man_fh = open(os.path.join(args.outdir, "mono.manifest"), 'w')
    orig_fh = open(os.path.join(origoutdir, "mono.flat"), 'w')
    if args.nogarbage:
        garbage_fh = None
        garbage_man_fh = None
    else:
        garbage_fh = open(os.path.join(garbageoutdir, "mono.flat"), 'w')
        garbage_man_fh = open(os.path.join(garbageoutdir, "mono.manifest"), 'w')
    tok_fh = open(os.path.join(tokoutdir, "mono.flat"), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir,
                                    "mono.flat"), 'w')
    morph_fh = open(os.path.join(morphoutdir, "mono.flat"), 'w')
    pos_fh = open(os.path.join(posoutdir, "mono.flat"), 'w')

    for srcfile in os.listdir(indir):
        if srcfile.startswith(".") or not srcfile.endswith("ltf.xml"):
            continue
        srcfile = os.path.join(indir, srcfile)
        with open(srcfile, 'r') as ifh:
            try:
                xobj = ET.parse(ifh)
                docid = xobj.findall(".//DOC")[0].get('id')
                origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
                garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
                goodmask = [not x for x in garbagemask]
                seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                           for x in xobj.findall(".//SEG")]
                for line in compress(origlines, garbagemask):
                    orig_fh.write(line)
                for tup in compress(seginfo, garbagemask):
                    man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
                if not args.nogarbage:
                    for line in compress(origlines, goodmask):
                        garbage_fh.write(line)
                    for tup in compress(seginfo, goodmask):
                        garbage_man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
                for x in compress(xobj.findall(".//SEG"), garbagemask):
                    tokens = x.findall(".//TOKEN")
                    toktext = []
                    morphtoktext = []
                    morphtext = []
                    postext = []
                    for y in tokens:
                        if y.text is None:
                            continue
                        toktext.append(y.text)
                        postext.append(y.get("pos") or "none")
                        for mt, mtt in morph_tok(y):
                            morphtext.append(mt)
                            morphtoktext.append(mtt)
                    tok_fh.write(' '.join(toktext) + "\n")
                    morphtok_fh.write(' '.join(morphtoktext) + "\n")
                    morph_fh.write(' '.join(morphtext) + "\n")
                    pos_fh.write(' '.join(postext) + "\n")
            except ET.ParseError:
                sys.stderr.write("Parse error on " + ifh.name + "\n")
                continue
    orig_fh.close()
    cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                         orig_fh.name,
                                         os.path.join(cdectokoutdir,
                                                      "mono.flat.lc"),
                                         os.path.join(cdectokoutdir,
                                                      "mono.flat"))
    p = subprocess.Popen(shlex.split(cdec_cmd))
    p.wait()
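These functions are excerpted from larger scripts, so the listing never shows the shared module preamble they rely on. Below is a minimal sketch of what the calls above appear to assume; every name here is inferred from usage, not taken from the source, so treat it as a reconstruction.

# Hypothetical preamble inferred from usage; the real scripts may differ.
import argparse
import os
import re
import shlex
import subprocess
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict as dd     # used as dd(dict) in Example 7
from io import TextIOWrapper                  # used in Example 5
from itertools import compress                # mask-based line selection
from subprocess import CalledProcessError, check_call
from zipfile import ZipFile as zf             # used as zf(infile) in Examples 4-5

# directory of the running script, used to locate the tokenizer wrappers
scriptdir = os.path.dirname(os.path.abspath(__file__))

The project-specific helpers (getgarbagemask, morph_tok, getclusters, addonoffarg, and the lputil module) are sketched after the examples that use them.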
Example 2
def printout(prefix, path, src, trg, outdir, origoutdir, garbageoutdir,
             tokoutdir, morphtokoutdir, cdectokoutdir, cdectoklcoutdir,
             agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
             agiletokpath, cdectokpath, 
             stp=lputil.selected_translation_pairs, el=lputil.extract_lines,
             tweet=False):
  ''' Find files and print them out '''
  src_man_fh=open(os.path.join(outdir, "%s.%s.manifest" % (prefix, src)), 'w')
  trg_man_fh=open(os.path.join(outdir, "%s.%s.manifest" % (prefix, trg)), 'w')
  src_orig_fname=os.path.join(outdir, origoutdir, "%s.%s.%s.flat" % \
                                (prefix,origoutdir,src))
  src_orig_fh=open(src_orig_fname, 'w')
  trg_orig_fname=os.path.join(outdir, origoutdir, "%s.%s.%s.flat" % \
                              (prefix,origoutdir,trg))
  trg_orig_fh=open(trg_orig_fname, 'w')

  garbagefhs = {}
  garbagedisabled=True
  if garbageoutdir is not None:
    garbagedisabled=False
    src_orig_garbage_fh=open(os.path.join(outdir, garbageoutdir, "%s.%s.flat" % \
                                          (prefix,src)), 'w')
    garbagefhs[src_orig_fh]=src_orig_garbage_fh
    trg_orig_garbage_fh=open(os.path.join(outdir, garbageoutdir, "%s.%s.flat" % \
                                          (prefix,trg)), 'w')
    garbagefhs[trg_orig_fh]=trg_orig_garbage_fh
    src_garbage_man_fh=open(os.path.join(outdir, garbageoutdir, "%s.%s.manifest" % (prefix, src)), 'w')
    garbagefhs[src_man_fh]=src_garbage_man_fh
    trg_garbage_man_fh=open(os.path.join(outdir, garbageoutdir, "%s.%s.manifest" % (prefix, trg)), 'w')
    garbagefhs[trg_man_fh]=trg_garbage_man_fh
  src_tok_fh=open(os.path.join(outdir, tokoutdir, "%s.%s.%s.flat" % \
                               (prefix,tokoutdir,src)), 'w')
  trg_tok_fh=open(os.path.join(outdir, tokoutdir, "%s.%s.%s.flat" % \
                               (prefix,tokoutdir,trg)), 'w')
  src_morphtok_fh=open(os.path.join(outdir, morphtokoutdir, "%s.%s.%s.flat" % \
                                    (prefix,morphtokoutdir,src)),'w')
  trg_morphtok_fh=open(os.path.join(outdir, morphtokoutdir, "%s.%s.%s.flat" % \
                                    (prefix,morphtokoutdir,trg)),'w')
  src_morph_fh=open(os.path.join(outdir, morphoutdir, "%s.%s.%s.flat" % \
                                 (prefix,morphoutdir,src)),'w')
  trg_morph_fh=open(os.path.join(outdir, morphoutdir, "%s.%s.%s.flat" % \
                                 (prefix,morphoutdir,trg)),'w')
  src_pos_fh=open(os.path.join(outdir, posoutdir, "%s.%s.%s.flat" % \
                               (prefix,posoutdir,src)),'w')
  trg_pos_fh=open(os.path.join(outdir, posoutdir, "%s.%s.%s.flat" % \
                               (prefix,posoutdir,trg)),'w')
  src_cdectok_fname=os.path.join(outdir, cdectokoutdir, "%s.%s.%s.flat" % \
                                    (prefix,cdectokoutdir,src))
  trg_agiletok_fname=os.path.join(outdir, agiletokoutdir, "%s.%s.%s.flat" % \
                                    (prefix,agiletokoutdir,trg))
  src_cdectoklc_fname=os.path.join(outdir, cdectoklcoutdir, "%s.%s.%s.flat" % \
                                    (prefix,cdectoklcoutdir,src))
  trg_agiletoklc_fname=os.path.join(outdir, agiletoklcoutdir, "%s.%s.%s.flat" % \
                                    (prefix,agiletoklcoutdir,trg))

  for m in stp(path, src=src, trg=trg, xml=True, tweet=tweet):

    if not tweet:
      sdata, tdata = el(*m)
    else:
      sdata, tdata = el(*m, sxml=False, txml=True)

    if sdata is None or tdata is None:
      sys.stderr.write("Warning: empty files:\n%s or %s\n" % (m[0], m[1]))
      continue
    # Strict rejection of files with different line counts. If those are
    # desired, do Gale & Church or Brown et al. alignment or something
    # similar here
    slen = len(sdata["ORIG"])
    tlen = len(tdata["ORIG"])
    #print(slen,tlen)
    if slen != tlen:
      sys.stderr.write("Warning: different number of lines in files:\n" \
                       "%s %d\n%s %d\n" % (m[0], slen, m[1], tlen))
      continue

    # filter out control code-bearing lines here. mask out the data from all fields
    garbagemask = lputil.getgarbagemask(sdata["ORIG"], tdata["ORIG"], disabled=garbagedisabled)

    goodmask = [not x for x in garbagemask]
    ### Write original
    for fh, data in zip((src_orig_fh, trg_orig_fh), (sdata["ORIG"], tdata["ORIG"])):
      for line in compress(data, garbagemask):
        fh.write(line)
      ### Write garbage original
      if not garbagedisabled:
        for line in compress(data, goodmask):
          garbagefhs[fh].write(line)
    
    ### Write manifest
    if not tweet:
      try:
        for fh, fname, tupgen in zip((src_man_fh, trg_man_fh), (m[0], m[1]),
                                     (list(zip(sdata["DOCID"], sdata["SEGID"],
                                          sdata["START"], sdata["END"])),
                                      list(zip(tdata["DOCID"], tdata["SEGID"],
                                          tdata["START"], tdata["END"])))):
          for tup in compress(tupgen, garbagemask):
            fh.write("\t".join(map(str, (fname,)+tup))+"\n")
          if not garbagedisabled:
            for tup in compress(tupgen, goodmask):
              garbagefhs[fh].write("\t".join(map(str, (fname,)+tup))+"\n")
      except:
        sys.stderr.write(src_man_fh.name)
        #sys.stderr.write(fname)
        raise
    else:
      # Source
      fh = src_man_fh
      field = sdata["DOCID"]
      for line in compress(field, garbagemask):
        line = line.strip()
        # manifest row: full DOCID path, then its basename (the text between
        # the last '/' and the first '.' after it)
        fh.write('%s\t%s\n' % (line,
                               re.search(r'.+/(\S*?)\.', line).group(1)))
      if not garbagedisabled:
        for line in compress(field, goodmask):
          line = line.strip()
          garbagefhs[fh].write('%s\t%s\n' % (line,
                                             re.search(r'.+/(\S*?)\.', line).group(1)))

      # Target
      try:
        fh = trg_man_fh
        fname = m[1]
        for tup in compress(list(zip(tdata["DOCID"], tdata["SEGID"],
                                     tdata["START"], tdata["END"])), garbagemask):
            fh.write("\t".join(map(str, (fname,)+tup))+"\n")
        if not garbagedisabled:
          for tup in compress(list(zip(tdata["DOCID"], tdata["SEGID"],
                                       tdata["START"], tdata["END"])), goodmask):
              garbagefhs[fh].write("\t".join(map(str, (fname,)+tup))+"\n")
      except:
        sys.stderr.write(fname)
        raise

    ### Write tokenized, morph tokenized, pos tag
    if not tweet:
      zipset = zip(((src_tok_fh, src_morphtok_fh, src_morph_fh, src_pos_fh),
                    (trg_tok_fh, trg_morphtok_fh, trg_morph_fh, trg_pos_fh)),
                   (sdata, tdata))
    else:
      # no source tok/morph info in tweets
      zipset = zip(((trg_tok_fh, trg_morphtok_fh, trg_morph_fh, trg_pos_fh),),
                   (tdata,))

    for fhset, data in zipset:
      for fh, field in zip(fhset, ("TOK", "MORPHTOK", "MORPH", "POS")):
        for line in compress(data[field], garbagemask):
          fh.write(line)

  # run agile tokenizer on target orig
  # TODO: lowercase
  trg_orig_fh.close()
  agiletok_cmd = "%s -i %s -o %s -t %s " % (agiletokpath, trg_orig_fname, trg_agiletoklc_fname, trg_agiletok_fname)
  sys.stderr.write(agiletok_cmd+"\n")
  try:
    check_call(agiletok_cmd, shell=True)
  except CalledProcessError as e:
    sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
    sys.exit(1)
  # run cdec tokenizer on source orig
  src_orig_fh.close()
  cdectok_cmd = "%s -i %s -o %s -t %s " % (cdectokpath, src_orig_fname, src_cdectoklc_fname, src_cdectok_fname)
  sys.stderr.write(cdectok_cmd+"\n")
  try:
    check_call(cdectok_cmd, shell=True)
  except CalledProcessError as e:
    sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
    sys.exit(1)
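For reference, printout() leans on two lputil helpers that the listing does not show: stp (default lputil.selected_translation_pairs) yields pairs m of source/target file paths, and el (default lputil.extract_lines) returns one dict per side. The shape implied by the field accesses above is illustrative only; the real lputil lives in the source project.

# Per-side dict returned by lputil.extract_lines, as implied by usage:
sdata = {
    "ORIG": [],      # one untokenized line per segment (with trailing \n)
    "TOK": [],       # LDC-tokenized lines
    "MORPHTOK": [],  # tokenized lines based on morphological segmentation
    "MORPH": [],     # morphological analyses
    "POS": [],       # POS-tag lines
    "DOCID": [],     # manifest fields, one entry per segment
    "SEGID": [],
    "START": [],
    "END": [],
}

Note that the manifest tuple generators are materialized with list(zip(...)) so they can be traversed twice, once under garbagemask and once under goodmask; a bare zip() generator would be exhausted after the first pass.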
Example 3
def main():
  parser = argparse.ArgumentParser(description="Extract and print comparable corpus" \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--agiletokenizer", default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))


  tokoutdir=os.path.join(args.outdir, args.toksubdir)
  origoutdir=os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir=os.path.join(args.outdir, args.cdectoksubdir)
  agiletokoutdir=os.path.join(args.outdir, args.agiletoksubdir)
  morphtokoutdir=os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir=os.path.join(args.outdir, args.morphsubdir)
  posoutdir=os.path.join(args.outdir, args.possubdir)

  dirs = [args.outdir,
          tokoutdir,
          cdectokoutdir,
          agiletokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir=os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  rootdir = os.path.join(args.rootdir, 'data', 'translation', 'comparable')
  clusters = getclusters(os.path.join(rootdir, 'clusters'))
  srcindir = os.path.join(rootdir, args.src, 'ltf')
  trgindir = os.path.join(rootdir, args.trg, 'ltf')
  datasets = [(args.src, srcindir, args.cdectokenizer, cdectokoutdir),
              (args.trg, trgindir, args.agiletokenizer, agiletokoutdir)]

  for lang, indir, exttokenizer, exttokoutdir in datasets:
    inbase = lang
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase),'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase),'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for filename in os.listdir(indir):
      # assume ltf filename
      if not filename.endswith("ltf.xml"):
        continue
      # avoid mac meta stuff
      if filename.startswith("."):
        continue
      # print(filename)
      with open(os.path.join(indir, filename), 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          if len(clusters[lang][docid]) < 1:
            sys.stderr.write("Warning: no clusters for %s\n" % docid)
            clusid="NONE"
          else:
            clset = clusters[lang][docid]
            if len(clset) > 1:
              sys.stderr.write("Warning: multiple clusters for %s\n" % docid)
            clusid = '_'.join(clset)
          origlines = [ x.text+"\n" for x in xobj.findall(".//ORIGINAL_TEXT") ]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                      for x in xobj.findall(".//SEG") ]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [filename,docid]+tup+[clusid,]))+"\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [filename,docid]+tup+[clusid,]))+"\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext)+"\n")
            morphtok_fh.write(' '.join(morphtoktext)+"\n")
            morph_fh.write(' '.join(morphtext)+"\n")
            pos_fh.write(' '.join(postext)+"\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on "+ifh.name+"\n")
          continue
    orig_fh.close()
    ext_cmd = "%s -i %s -o %s -t %s" % (exttokenizer,
                                         orig_fh.name,
                                         os.path.join(exttokoutdir,
                                                      "%s.flat.lc" % inbase),
                                         os.path.join(exttokoutdir,
                                                      "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(ext_cmd))
    p.wait()
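Examples 3 and 8 call a getclusters helper that is not shown. From the way its result is used (clusters[lang][docid] is a joinable collection of cluster ids, empty when a document is unclustered), a plausible sketch looks like this; the cluster-file format below is an outright assumption, not confirmed by the source.

import os
from collections import defaultdict

def getclusters(clusterdir):
    # Hypothetical format: one tab-separated "clusterid lang docid" triple
    # per line, across all files under clusterdir.
    clusters = defaultdict(lambda: defaultdict(set))
    for fname in os.listdir(clusterdir):
        with open(os.path.join(clusterdir, fname)) as fh:
            for line in fh:
                clusid, lang, docid = line.strip().split('\t')
                clusters[lang][docid].add(clusid)
    return clusters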
Example 4
def main():
  parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests")
  parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                      default=[sys.stdin.buffer],  # binary stream, matching FileType('rb')
                      help="input zip file(s) (each contains a multi file)")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))


  tokoutdir=os.path.join(args.outdir, args.toksubdir)
  origoutdir=os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir=os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir=os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir=os.path.join(args.outdir, args.morphsubdir)
  posoutdir=os.path.join(args.outdir, args.possubdir)

  dirs = [args.outdir,
          tokoutdir,
          cdectokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir=os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  defaultcount=0
  for infile in args.infile:
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    if len(inbase) == 0:
      inbase="default.%d" % defaultcount
      defaultcount+=1
    archive = zf(infile)
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase),'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase),'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for info in archive.infolist():
      if info.file_size < 20:
        continue
      # assume ltf filename
      if not info.filename.endswith("ltf.xml"):
        continue
      # print info.filename
      # ZipFile.open only supports mode 'r' in Python 3 ('rU' was removed);
      # ET.parse is happy with the binary stream
      with archive.open(info, 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          origlines = [ x.text+"\n" for x in xobj.findall(".//ORIGINAL_TEXT") ]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                      for x in xobj.findall(".//SEG") ]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [info.filename,docid]+tup))+"\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [info.filename,docid]+tup))+"\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext)+"\n")
            morphtok_fh.write(' '.join(morphtoktext)+"\n")
            morph_fh.write(' '.join(morphtext)+"\n")
            pos_fh.write(' '.join(postext)+"\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on "+ifh.name+"\n")
          continue
    orig_fh.close()
    cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                         orig_fh.name,
                                         os.path.join(cdectokoutdir,
                                                      "%s.flat.lc" % inbase),
                                         os.path.join(cdectokoutdir,
                                                      "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(cdec_cmd))
    p.wait()
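getgarbagemask is another project helper. The comment in Example 2 ("filter out control code-bearing lines") pins down its intent: it returns one boolean per line, True meaning keep. Here is a hedged sketch consistent with every call site above (a single list for monolingual data, two parallel lists for bitext, and disabled=True short-circuiting to all-True); the project's real criteria may well be stricter.

import unicodedata

def getgarbagemask(*linelists, disabled=False):
    # A line counts as garbage if it carries control characters other than
    # ordinary whitespace (an assumption based on the source's comment).
    def isclean(line):
        return not any(unicodedata.category(ch) == 'Cc' and ch not in '\t\n\r'
                       for ch in line)
    if disabled:
        return [True] * len(linelists[0])
    # for bitext, keep a line pair only if both sides are clean
    return [all(isclean(line) for line in lines) for lines in zip(*linelists)]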
Example 5
def main():
    parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                                 " data, tokenized, morph, pos tag and " \
                                                 "original, with manifests")
    parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                        default=[sys.stdin.buffer],  # binary stream, matching FileType('rb')
                        help="input zip file(s) (each contains a multi file)")
    parser.add_argument("--outdir", "-o",
                        help="where to write extracted files")
    parser.add_argument("--nogarbage", action='store_true', default=False,
                        help="turn off garbage filtering")
    parser.add_argument("--toksubdir", default="raw.tokenized",
                        help="subdirectory for ldc-tokenized files")
    parser.add_argument("--cleantoksubdir", default="tokenized",
                        help="subdirectory for cleaned ldc-tokenized files")
    parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                        help="subdirectory for cdec-tokenized files")
    parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                        help="subdirectory for tokenized files based on " \
                             "morphological segmentation")
    parser.add_argument("--morphsubdir", default="morph",
                        help="subdirectory for morphological information")
    parser.add_argument("--origsubdir", default="raw.original",
                        help="subdirectory for untokenized files")
    parser.add_argument("--cleanorigsubdir", default="original",
                        help="subdirectory for cleaned raw original")

    parser.add_argument("--garbagesubdir", default="garbage",
                        help="subdirectory for garbage files (under orig)")
    parser.add_argument("--possubdir", default="pos",
                        help="subdirectory for pos tag files")
    parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                        help="path to cleaning script")
    parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                                "cdectok.sh"),
                        help="cdec tokenizer program wrapper")
    addonoffarg(parser, 'cdec', help="do cdec tokenization", default=True)
    addonoffarg(parser, 'removesn', help="remove SN from mono zip (to avoid underscore tweets)", default=False)

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    tokoutdir = os.path.join(args.outdir, args.toksubdir)
    origoutdir = os.path.join(args.outdir, args.origsubdir)
    cleantokoutdir = os.path.join(args.outdir, args.cleantoksubdir)
    cleanorigoutdir = os.path.join(args.outdir, args.cleanorigsubdir)
    cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
    morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
    morphoutdir = os.path.join(args.outdir, args.morphsubdir)
    posoutdir = os.path.join(args.outdir, args.possubdir)
    cleanpath = args.cleanpath
    dirs = [args.outdir,
            tokoutdir,
            origoutdir,
            cleantokoutdir,
            cleanorigoutdir,
            morphtokoutdir,
            morphoutdir,
            posoutdir]
    if args.cdec:
        dirs.append(cdectokoutdir)
    if args.nogarbage:
        garbageoutdir = None
    else:
        garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
        dirs.append(garbageoutdir)
    for dir in dirs:
        if not os.path.exists(dir):
            os.makedirs(dir)

    defaultcount = 0
    for infile in args.infile:
        inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
        if len(inbase) == 0:
            inbase = "default.%d" % defaultcount
            defaultcount += 1
        archive = zf(infile)
        man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
        orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
        if args.nogarbage:
            garbage_fh = None
            garbage_man_fh = None
        else:
            garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
            garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
        tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
        morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
        morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
        pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
        for info in archive.infolist():
            if info.file_size < 20:
                continue
            # assume ltf filename
            if not info.filename.endswith("ltf.xml"):
                continue
            # print info.filename
            with TextIOWrapper(archive.open(info, 'r')) as ifh:
                try:
                    xobj = ET.parse(ifh)
                    docid = xobj.findall(".//DOC")[0].get('id')
                    # avoid anonymized tweets in packages but not relocated downloaded mono tweets
                    if "tweets" not in inbase and args.removesn and "_SN_" in docid:
                        sys.stderr.write("SN skip: not extracting {}\n".format(docid))
                        continue
                    origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
                    garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
                    goodmask = [not x for x in garbagemask]
                    seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                               for x in xobj.findall(".//SEG")]
                    for line in compress(origlines, garbagemask):
                        orig_fh.write(line)
                    for tup in compress(seginfo, garbagemask):
                        man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
                    if not args.nogarbage:
                        for line in compress(origlines, goodmask):
                            garbage_fh.write(line)
                        for tup in compress(seginfo, goodmask):
                            garbage_man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
                    for x in compress(xobj.findall(".//SEG"), garbagemask):
                        tokens = x.findall(".//TOKEN")
                        toktext = []
                        morphtoktext = []
                        morphtext = []
                        postext = []
                        for y in tokens:
                            if y.text is None:
                                continue
                            toktext.append(y.text)
                            postext.append(y.get("pos") or "none")
                            for mt, mtt in morph_tok(y):
                                morphtext.append(mt)
                                morphtoktext.append(mtt)
                        tok_fh.write(' '.join(toktext) + "\n")
                        morphtok_fh.write(' '.join(morphtoktext) + "\n")
                        morph_fh.write(' '.join(morphtext) + "\n")
                        pos_fh.write(' '.join(postext) + "\n")
                except ET.ParseError:
                    sys.stderr.write("Parse error on " + ifh.name + "\n")
                    continue
        orig_fh.close()
        tok_fh.close()
        # raw orig->clean orig
        # raw tok->clean tok
        clean_orig = os.path.join(cleanorigoutdir, "%s.flat" % inbase)
        clean_tok = os.path.join(cleantokoutdir, "%s.flat" % inbase)
        for inclean, outclean in zip((orig_fh.name, tok_fh.name), (clean_orig, clean_tok)):
            cleancmd = "{cmd} {inclean} {outclean}".format(cmd=cleanpath, inclean=inclean, outclean=outclean)
            sys.stderr.write(cleancmd + "\n")
            try:
                check_call(shlex.split(cleancmd))
            except CalledProcessError as e:
                sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
                sys.exit(1)

        if args.cdec:
            cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                                 orig_fh.name,
                                                 os.path.join(cdectokoutdir,
                                                              "%s.flat.lc" % inbase),
                                                 os.path.join(cdectokoutdir,
                                                              "%s.flat" % inbase))
            p = subprocess.Popen(shlex.split(cdec_cmd))
            p.wait()
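Example 5 registers boolean flags through addonoffarg, which is not shown. A common shape for such a helper, consistent with the calls addonoffarg(parser, 'cdec', help=..., default=True) above, is sketched here; the --no-... spelling and the mutually exclusive group are assumptions.

def addonoffarg(parser, arg, dest=None, default=True, help="switch"):
    # Hypothetical sketch: paired --ARG / --no-ARG flags writing one boolean.
    dest = arg if dest is None else dest
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--%s" % arg, dest=dest, action='store_true',
                       default=default, help=help)
    group.add_argument("--no-%s" % arg, dest=dest, action='store_false',
                       default=default, help="turn off %s" % arg)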
Example 6
def main():
  parser = argparse.ArgumentParser(description="Extract and print monolingual" \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests from extracted files")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--datadirs", nargs='+', default=[],
                      help="elements in path from root to ltf files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))


  tokoutdir=os.path.join(args.outdir, args.toksubdir)
  origoutdir=os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir=os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir=os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir=os.path.join(args.outdir, args.morphsubdir)
  posoutdir=os.path.join(args.outdir, args.possubdir)

  dirs = [args.outdir,
          tokoutdir,
          cdectokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir=os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)


  datadirs=[args.rootdir,]+args.datadirs
  indir = os.path.join(*datadirs)
  man_fh = open(os.path.join(args.outdir, "mono.manifest"),'w')
  orig_fh = open(os.path.join(origoutdir, "mono.flat"), 'w')
  if args.nogarbage:
    garbage_fh = None
    garbage_man_fh = None
  else:
    garbage_fh = open(os.path.join(garbageoutdir, "mono.flat"), 'w')
    garbage_man_fh = open(os.path.join(garbageoutdir, "mono.manifest"),'w')
  tok_fh = open(os.path.join(tokoutdir, "mono.flat"), 'w')
  morphtok_fh = open(os.path.join(morphtokoutdir, "mono.flat"), 'w')
  morph_fh = open(os.path.join(morphoutdir, "mono.flat"), 'w')
  pos_fh = open(os.path.join(posoutdir, "mono.flat"), 'w')

  for srcfile in os.listdir(indir):
    if srcfile.startswith(".") or not srcfile.endswith("ltf.xml"):
      continue
    srcfile = os.path.join(indir, srcfile)
    with open(srcfile, 'r') as ifh:
      try:
        xobj = ET.parse(ifh)
        docid = xobj.findall(".//DOC")[0].get('id')
        origlines = [ x.text+"\n" for x in xobj.findall(".//ORIGINAL_TEXT") ]
        garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
        goodmask = [not x for x in garbagemask]
        seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                    for x in xobj.findall(".//SEG") ]
        for line in compress(origlines, garbagemask):
          orig_fh.write(line)
        for tup in compress(seginfo, garbagemask):
          man_fh.write("\t".join(map(str, [srcfile,docid]+tup))+"\n")
        if not args.nogarbage:
          for line in compress(origlines, goodmask):
            garbage_fh.write(line)
          for tup in compress(seginfo, goodmask):
            garbage_man_fh.write("\t".join(map(str, [srcfile,docid]+tup))+"\n")
        for x in compress(xobj.findall(".//SEG"), garbagemask):
          tokens = x.findall(".//TOKEN")
          toktext = []
          morphtoktext = []
          morphtext = []
          postext = []
          for y in tokens:
            if y.text is None:
              continue
            toktext.append(y.text)
            postext.append(y.get("pos") or "none")
            for mt, mtt in morph_tok(y):
              morphtext.append(mt)
              morphtoktext.append(mtt)
          tok_fh.write(' '.join(toktext)+"\n")
          morphtok_fh.write(' '.join(morphtoktext)+"\n")
          morph_fh.write(' '.join(morphtext)+"\n")
          pos_fh.write(' '.join(postext)+"\n")
      except ET.ParseError:
        sys.stderr.write("Parse error on "+ifh.name+"\n")
        continue
  orig_fh.close()
  cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                       orig_fh.name,
                                       os.path.join(cdectokoutdir,
                                                    "mono.flat.lc"),
                                       os.path.join(cdectokoutdir,
                                                    "mono.flat"))
  p = subprocess.Popen(shlex.split(cdec_cmd))
  p.wait()
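All of the LTF-walking examples consume (morph, morph_token) pairs from morph_tok(token). The sketch below is only a guess at its behavior, since the LTF "morph" attribute conventions are not shown in the source, but it has the right shape: a generator over one TOKEN element yielding aligned analysis/segment pairs.

def morph_tok(token):
    # Hypothetical: fall back to the surface form when no analysis exists;
    # otherwise yield one (analysis, segment) pair per whitespace-separated
    # piece of the morph annotation.
    morph = token.get("morph")
    if morph is None:
        yield "none", token.text
    else:
        for seg in morph.split():
            yield seg, seg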
Example 7
def printout(prefix, path, src, trg, outdir, origoutdir, cleanorigoutdir, garbageoutdir,
             tokoutdir, cleantokoutdir, morphtokoutdir, cdectokoutdir, cdectoklcoutdir,
             agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
             agiletokpath, cdectokpath, cleanpath, docdec,
             stp=lputil.selected_translation_pairs, el=lputil.extract_lines,
             tweet=False, swap=False):
    ''' Find files and print them out '''
    src_man_fh = open(os.path.join(outdir, "%s.%s.manifest" % (prefix, src)), 'w')
    trg_man_fh = open(os.path.join(outdir, "%s.%s.manifest" % (prefix, trg)), 'w')

    # open a bunch of file handles
    # the third element of each tuple says whether to actually open the file
    # or just construct its name
    namedirpairs = [('orig', origoutdir, True),
                    ('cleanorig', cleanorigoutdir, False),
                    ('tok', tokoutdir, True),
                    ('cleantok', cleantokoutdir, False),
                    ('morphtok', morphtokoutdir, True),
                    ('cdectok', cdectokoutdir, False),
                    ('cdectoklc', cdectoklcoutdir, False),
                    ('agiletok', agiletokoutdir, False),
                    ('agiletoklc', agiletoklcoutdir, False),
                    ('morph', morphoutdir, True),
                    ('pos', posoutdir, True),
                    ]
    outfiles = dd(dict)
    for sidename, side in (('src', src),
                           ('trg', trg)):
        for dirname, dirval, doopen in namedirpairs:
            entry = os.path.join(outdir, dirval, "{}.{}.{}.flat".format(prefix, dirval, side))
            if doopen:
                entry = open(entry, 'w')
            outfiles[sidename][dirname] = entry

    garbagefhs = {}
    garbagedisabled = True
    if garbageoutdir is not None:
        garbagedisabled = False
        src_orig_garbage_fh = open(os.path.join(outdir, garbageoutdir, "%s.%s.flat" % \
                                                (prefix, src)), 'w')
        garbagefhs[outfiles['src']['orig']] = src_orig_garbage_fh
        trg_orig_garbage_fh = open(os.path.join(outdir, garbageoutdir, "%s.%s.flat" % \
                                                (prefix, trg)), 'w')
        garbagefhs[outfiles['trg']['orig']] = trg_orig_garbage_fh
        src_garbage_man_fh = open(os.path.join(outdir, garbageoutdir, "%s.%s.manifest" % (prefix, src)), 'w')
        garbagefhs[src_man_fh] = src_garbage_man_fh
        trg_garbage_man_fh = open(os.path.join(outdir, garbageoutdir, "%s.%s.manifest" % (prefix, trg)), 'w')
        garbagefhs[trg_man_fh] = trg_garbage_man_fh

    (stpsrc, stptrg) = (trg, src) if swap else (src, trg)
    for m in stp(path, src=stpsrc, trg=stptrg, xml=True, tweet=tweet):
        sdata, tdata = el(*m)

        # found data sometimes seems to require swap behavior
        if swap:
            sdata, tdata = tdata, sdata

        if sdata is None or tdata is None:
            sys.stderr.write("Warning: empty files:\n%s or %s\n" % (m[0], m[1]))
            continue
        # Strict rejection of files with different line counts. If those are
        # desired, do Gale & Church or Brown et al. alignment or something
        # similar here
        slen = len(sdata["ORIG"])
        tlen = len(tdata["ORIG"])
        # print(slen,tlen)
        if slen != tlen:
            sys.stderr.write("Warning: different number of lines in files:\n" \
                             "%s %d\n%s %d\n" % (m[0], slen, m[1], tlen))
            continue

        # filter out control code-bearing lines here. mask out the data from all fields
        garbagemask = lputil.getgarbagemask(sdata["ORIG"], tdata["ORIG"], disabled=garbagedisabled)

        goodmask = [not x for x in garbagemask]
        ### Write original
        for fh, data in zip((outfiles['src']['orig'], outfiles['trg']['orig']), (sdata["ORIG"], tdata["ORIG"])):
            for line in compress(data, garbagemask):
                fh.write(line)
            ### Write garbage original
            if not garbagedisabled:
                for line in compress(data, goodmask):
                    garbagefhs[fh].write(line)

        ### Write manifest

        try:
            for fh, fname, tupgen in zip((src_man_fh, trg_man_fh), (m[0], m[1]),
                                         (list(zip(sdata["DOCID"], sdata["SEGID"],
                                                   sdata["START"], sdata["END"])),
                                          list(zip(tdata["DOCID"], tdata["SEGID"],
                                                   tdata["START"], tdata["END"])))):
                for tup in compress(tupgen, garbagemask):
                    fh.write("\t".join(map(str, (fname,) + tup)) + "\n")
                if not garbagedisabled:
                    for tup in compress(tupgen, goodmask):
                        garbagefhs[fh].write("\t".join(map(str, (fname,) + tup)) + "\n")
        except:
            sys.stderr.write(src_man_fh.name)
            # sys.stderr.write(fname)
            raise

        ### Write tokenized, morph tokenized, pos tag

        zipset = zip(
            ((outfiles["src"]["tok"], outfiles["src"]["morphtok"], outfiles["src"]["morph"], outfiles["src"]["pos"]),
             (outfiles["trg"]["tok"], outfiles["trg"]["morphtok"], outfiles["trg"]["morph"], outfiles["trg"]["pos"])),
            (sdata, tdata))

        for fhset, data in zipset:
            for fh, field in zip(fhset, ("TOK", "MORPHTOK", "MORPH", "POS")):
                for line in compress(data[field], garbagemask):
                    fh.write(line)

    # raw orig -> clean orig, raw tok -> clean tok; then run the agile
    # tokenizer on the cleaned target orig
    # TODO: lowercase

    for side in ('src', 'trg'):
        for contents in ('orig', 'tok'):
            outfiles[side][contents].close()
            cleancmd = "{cmd} {infile} {outfile}".format(cmd=cleanpath, infile=outfiles[side][contents].name,
                                                         outfile=outfiles[side]["clean{}".format(contents)])
            sys.stderr.write(cleancmd + "\n")
            try:
                check_call(shlex.split(cleancmd))
            except CalledProcessError as e:
                sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
                sys.exit(1)
    agiletok_cmd = "%s -i %s -o %s -t %s " % (
        agiletokpath, outfiles['trg']['cleanorig'],
        outfiles["trg"]["agiletoklc"], outfiles["trg"]["agiletok"])
    sys.stderr.write(agiletok_cmd + "\n")
    try:
        check_call(shlex.split(agiletok_cmd))
    except CalledProcessError as e:
        sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
        sys.exit(1)
    # run cdec tokenizer on source orig

    if docdec:
        cdectok_cmd = "%s -i %s -o %s -t %s " % (
            cdectokpath, outfiles['src']['cleanorig'],
            outfiles["src"]["cdectoklc"], outfiles["src"]["cdectok"])
        sys.stderr.write(cdectok_cmd + "\n")
        try:
            check_call(shlex.split(cdectok_cmd))
        except CalledProcessError as e:
            sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
            sys.exit(1)
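For orientation: every manifest row written in these examples is tab-separated with the columns source file, DOC id, SEG id, start_char, end_char. Examples 3 and 8 append the comparable-corpus cluster id as a sixth column, and the tweet branch of Example 2 instead writes each DOCID line followed by its extracted basename.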
Example 8
def main():
  parser = argparse.ArgumentParser(description="Extract and print comparable corpus" \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="raw.tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cleantoksubdir", default="tokenized",
                      help="subdirectory for cleaned ldc-tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--cleanorigsubdir", default="original",
                      help="subdirectory for cleaned raw original")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="raw.original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                      help="path to cleaning script")
  parser.add_argument("--agiletokenizer", default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir,
                                                              "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir=os.path.join(args.outdir, args.toksubdir)
  origoutdir=os.path.join(args.outdir, args.origsubdir)
  cleantokoutdir=os.path.join(args.outdir, args.cleantoksubdir)
  cleanorigoutdir=os.path.join(args.outdir, args.cleanorigsubdir)
  cdectokoutdir=os.path.join(args.outdir, args.cdectoksubdir)
  agiletokoutdir=os.path.join(args.outdir, args.agiletoksubdir)
  morphtokoutdir=os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir=os.path.join(args.outdir, args.morphsubdir)
  posoutdir=os.path.join(args.outdir, args.possubdir)
  cleanpath = args.cleanpath
  dirs = [args.outdir,
          tokoutdir,
          cleantokoutdir,
          cleanorigoutdir,
          cdectokoutdir,
          agiletokoutdir,
          origoutdir,
          morphtokoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir=os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  rootdir = os.path.join(args.rootdir, 'data', 'translation', 'comparable')
  clusters = getclusters(os.path.join(rootdir, 'clusters'))
  srcindir = os.path.join(rootdir, args.src, 'ltf')
  trgindir = os.path.join(rootdir, args.trg, 'ltf')
  datasets = [(args.src, srcindir, args.cdectokenizer, cdectokoutdir),
              (args.trg, trgindir, args.agiletokenizer, agiletokoutdir)]

  for lang, indir, exttokenizer, exttokoutdir in datasets:
    inbase = lang
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase),'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase),'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for filename in os.listdir(indir):
      # assume ltf filename
      if not filename.endswith("ltf.xml"):
        continue
      # avoid mac meta stuff
      if filename.startswith("."):
        continue
      # print(filename)
      with open(os.path.join(indir, filename), 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          if len(clusters[lang][docid]) < 1:
            sys.stderr.write("Warning: no clusters for %s\n" % docid)
            clusid="NONE"
          else:
            clset = clusters[lang][docid]
            if len(clset) > 1:
              sys.stderr.write("Warning: multiple clusters for %s\n" % docid)
            clusid = '_'.join(clset)
          origlines = [ x.text+"\n" for x in xobj.findall(".//ORIGINAL_TEXT") ]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [ [ x.get(y) for y in ('id', 'start_char', 'end_char') ]
                      for x in xobj.findall(".//SEG") ]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [filename,docid]+tup+[clusid,]))+"\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [filename,docid]+tup+[clusid,]))+"\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext)+"\n")
            morphtok_fh.write(' '.join(morphtoktext)+"\n")
            morph_fh.write(' '.join(morphtext)+"\n")
            pos_fh.write(' '.join(postext)+"\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on "+ifh.name+"\n")
          continue
    orig_fh.close()
    tok_fh.close()
    clean_orig = os.path.join(cleanorigoutdir, "%s.flat" % inbase)
    clean_tok = os.path.join(cleantokoutdir, "%s.flat" % inbase)
    for inclean, outclean in zip((orig_fh.name, tok_fh.name), (clean_orig, clean_tok)):
      cleancmd = "{cmd} {inclean} {outclean}".format(cmd=cleanpath, inclean=inclean, outclean=outclean)
      sys.stderr.write(cleancmd+"\n")
      try:
        check_call(shlex.split(cleancmd))
      except CalledProcessError as e:
        sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
        sys.exit(1)

    ext_cmd = "%s -i %s -o %s -t %s" % (exttokenizer,
                                         orig_fh.name,
                                         os.path.join(exttokoutdir,
                                                      "%s.flat.lc" % inbase),
                                         os.path.join(exttokoutdir,
                                                      "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(ext_cmd))
    p.wait()
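A final note on the external tools: cdectok.sh, agiletok.sh, and the generic exttokenizer are all driven through the same wrapper contract, visible in the command strings above rather than documented separately: -i takes the flat original text, -t receives the tokenized output path, and -o receives the tokenized-and-lowercased output (the .flat.lc file). clean.sh, by contrast, takes plain input and output paths as positional arguments.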