def main():
  """Extract monolingual LTF data from an already-extracted LRLP.

  Walks the ltf directory named by --rootdir/--datadirs and, for every
  *.ltf.xml file, writes parallel flat views of the text (original,
  tokenized, morph-tokenized, morph, pos) plus a manifest, diverting
  garbage lines to a separate sink unless --nogarbage is set.  Finally
  runs the cdec tokenizer wrapper over the flat original file.
  """
  parser = argparse.ArgumentParser(description="Extract and print monolingual"
                                   " data, tokenized, morph, pos tag and "
                                   "original, with manifests from extracted files")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--datadirs", nargs='+', default=[],
                      help="elements in path from root to ltf files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")
  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  dirs = [args.outdir, tokoutdir, cdectokoutdir, origoutdir,
          morphtokoutdir, morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for outpath in dirs:
    # exist_ok avoids the check-then-create race of os.path.exists
    os.makedirs(outpath, exist_ok=True)

  datadirs = [args.rootdir, ] + args.datadirs
  indir = os.path.join(*datadirs)
  man_fh = open(os.path.join(args.outdir, "mono.manifest"), 'w')
  orig_fh = open(os.path.join(origoutdir, "mono.flat"), 'w')
  if args.nogarbage:
    garbage_fh = None
    garbage_man_fh = None
  else:
    garbage_fh = open(os.path.join(garbageoutdir, "mono.flat"), 'w')
    garbage_man_fh = open(os.path.join(garbageoutdir, "mono.manifest"), 'w')
  tok_fh = open(os.path.join(tokoutdir, "mono.flat"), 'w')
  morphtok_fh = open(os.path.join(morphtokoutdir, "mono.flat"), 'w')
  morph_fh = open(os.path.join(morphoutdir, "mono.flat"), 'w')
  pos_fh = open(os.path.join(posoutdir, "mono.flat"), 'w')

  for srcfile in os.listdir(indir):
    # skip mac metadata and non-ltf files
    if srcfile.startswith(".") or not srcfile.endswith("ltf.xml"):
      continue
    srcfile = os.path.join(indir, srcfile)
    with open(srcfile, 'r') as ifh:
      try:
        xobj = ET.parse(ifh)
        docid = xobj.findall(".//DOC")[0].get('id')
        origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
        # garbagemask selects lines to keep; goodmask is its complement
        # (i.e. the rejected lines, which go to the garbage sinks)
        garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
        goodmask = [not x for x in garbagemask]
        seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                   for x in xobj.findall(".//SEG")]
        for line in compress(origlines, garbagemask):
          orig_fh.write(line)
        for tup in compress(seginfo, garbagemask):
          man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
        if not args.nogarbage:
          for line in compress(origlines, goodmask):
            garbage_fh.write(line)
          for tup in compress(seginfo, goodmask):
            garbage_man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
        for x in compress(xobj.findall(".//SEG"), garbagemask):
          tokens = x.findall(".//TOKEN")
          toktext = []
          morphtoktext = []
          morphtext = []
          postext = []
          for y in tokens:
            if y.text is None:
              continue
            toktext.append(y.text)
            postext.append(y.get("pos") or "none")
            for mt, mtt in morph_tok(y):
              morphtext.append(mt)
              morphtoktext.append(mtt)
          tok_fh.write(' '.join(toktext) + "\n")
          morphtok_fh.write(' '.join(morphtoktext) + "\n")
          morph_fh.write(' '.join(morphtext) + "\n")
          pos_fh.write(' '.join(postext) + "\n")
      except ET.ParseError:
        sys.stderr.write("Parse error on " + ifh.name + "\n")
        continue

  # close (and flush) every sink; the cdec tokenizer reads orig_fh's file
  for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
             garbage_fh, garbage_man_fh):
    if fh is not None:
      fh.close()
  cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                       orig_fh.name,
                                       os.path.join(cdectokoutdir, "mono.flat.lc"),
                                       os.path.join(cdectokoutdir, "mono.flat"))
  p = subprocess.Popen(shlex.split(cdec_cmd))
  p.wait()
def printout(prefix, path, src, trg, outdir, origoutdir, garbageoutdir,
             tokoutdir, morphtokoutdir, cdectokoutdir, cdectoklcoutdir,
             agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
             agiletokpath, cdectokpath, stp=lputil.selected_translation_pairs,
             el=lputil.extract_lines, tweet=False):
  '''Find parallel files and print them out.

  Selects src/trg translation pairs under `path` (via `stp`), extracts
  their lines (via `el`), and writes per-side original / tokenized /
  morph-tokenized / morph / pos flat files plus manifests under `outdir`.
  Garbage lines go to separate sinks unless garbageoutdir is None.
  Finally runs the agile tokenizer on the target originals and the cdec
  tokenizer on the source originals.
  '''
  src_man_fh = open(os.path.join(outdir, "%s.%s.manifest" % (prefix, src)), 'w')
  trg_man_fh = open(os.path.join(outdir, "%s.%s.manifest" % (prefix, trg)), 'w')
  src_orig_fname = os.path.join(outdir, origoutdir,
                                "%s.%s.%s.flat" % (prefix, origoutdir, src))
  src_orig_fh = open(src_orig_fname, 'w')
  trg_orig_fname = os.path.join(outdir, origoutdir,
                                "%s.%s.%s.flat" % (prefix, origoutdir, trg))
  trg_orig_fh = open(trg_orig_fname, 'w')
  # garbagefhs maps a regular sink to its corresponding garbage sink
  garbagefhs = {}
  garbagedisabled = True
  if garbageoutdir is not None:
    garbagedisabled = False
    src_orig_garbage_fh = open(os.path.join(outdir, garbageoutdir,
                                            "%s.%s.flat" % (prefix, src)), 'w')
    garbagefhs[src_orig_fh] = src_orig_garbage_fh
    trg_orig_garbage_fh = open(os.path.join(outdir, garbageoutdir,
                                            "%s.%s.flat" % (prefix, trg)), 'w')
    garbagefhs[trg_orig_fh] = trg_orig_garbage_fh
    src_garbage_man_fh = open(os.path.join(outdir, garbageoutdir,
                                           "%s.%s.manifest" % (prefix, src)), 'w')
    garbagefhs[src_man_fh] = src_garbage_man_fh
    trg_garbage_man_fh = open(os.path.join(outdir, garbageoutdir,
                                           "%s.%s.manifest" % (prefix, trg)), 'w')
    garbagefhs[trg_man_fh] = trg_garbage_man_fh
  src_tok_fh = open(os.path.join(outdir, tokoutdir,
                                 "%s.%s.%s.flat" % (prefix, tokoutdir, src)), 'w')
  trg_tok_fh = open(os.path.join(outdir, tokoutdir,
                                 "%s.%s.%s.flat" % (prefix, tokoutdir, trg)), 'w')
  src_morphtok_fh = open(os.path.join(outdir, morphtokoutdir,
                                      "%s.%s.%s.flat" % (prefix, morphtokoutdir, src)), 'w')
  trg_morphtok_fh = open(os.path.join(outdir, morphtokoutdir,
                                      "%s.%s.%s.flat" % (prefix, morphtokoutdir, trg)), 'w')
  src_morph_fh = open(os.path.join(outdir, morphoutdir,
                                   "%s.%s.%s.flat" % (prefix, morphoutdir, src)), 'w')
  trg_morph_fh = open(os.path.join(outdir, morphoutdir,
                                   "%s.%s.%s.flat" % (prefix, morphoutdir, trg)), 'w')
  src_pos_fh = open(os.path.join(outdir, posoutdir,
                                 "%s.%s.%s.flat" % (prefix, posoutdir, src)), 'w')
  trg_pos_fh = open(os.path.join(outdir, posoutdir,
                                 "%s.%s.%s.flat" % (prefix, posoutdir, trg)), 'w')
  src_cdectok_fname = os.path.join(outdir, cdectokoutdir,
                                   "%s.%s.%s.flat" % (prefix, cdectokoutdir, src))
  trg_agiletok_fname = os.path.join(outdir, agiletokoutdir,
                                    "%s.%s.%s.flat" % (prefix, agiletokoutdir, trg))
  src_cdectoklc_fname = os.path.join(outdir, cdectoklcoutdir,
                                     "%s.%s.%s.flat" % (prefix, cdectoklcoutdir, src))
  trg_agiletoklc_fname = os.path.join(outdir, agiletoklcoutdir,
                                      "%s.%s.%s.flat" % (prefix, agiletoklcoutdir, trg))
  for m in stp(path, src=src, trg=trg, xml=True, tweet=tweet):
    if not tweet:
      sdata, tdata = el(*m)
    else:
      # tweets: only the target side carries xml
      sdata, tdata = el(*m, sxml=False, txml=True)
    if sdata is None or tdata is None:
      sys.stderr.write("Warning: empty files:\n%s or %s\n" % (m[0], m[1]))
      continue
    # Strict rejection of different length lines. If these are desired,
    # do gale & church or brown et al or something similar here
    slen = len(sdata["ORIG"])
    tlen = len(tdata["ORIG"])
    if slen != tlen:
      sys.stderr.write("Warning: different number of lines in files:\n"
                       "%s %d\n%s %d\n" % (m[0], slen, m[1], tlen))
      continue
    # filter out control code-bearing lines here; mask out the data
    # from all fields
    garbagemask = lputil.getgarbagemask(sdata["ORIG"], tdata["ORIG"],
                                        disabled=garbagedisabled)
    goodmask = [not x for x in garbagemask]
    ### Write original
    for fh, data in zip((src_orig_fh, trg_orig_fh),
                        (sdata["ORIG"], tdata["ORIG"])):
      for line in compress(data, garbagemask):
        fh.write(line)
      ### Write garbage original
      if not garbagedisabled:
        for line in compress(data, goodmask):
          garbagefhs[fh].write(line)
    ### Write manifest
    if not tweet:
      try:
        for fh, fname, tupgen in zip(
            (src_man_fh, trg_man_fh), (m[0], m[1]),
            (list(zip(sdata["DOCID"], sdata["SEGID"],
                      sdata["START"], sdata["END"])),
             list(zip(tdata["DOCID"], tdata["SEGID"],
                      tdata["START"], tdata["END"])))):
          for tup in compress(tupgen, garbagemask):
            fh.write("\t".join(map(str, (fname,) + tup)) + "\n")
          if not garbagedisabled:
            for tup in compress(tupgen, goodmask):
              garbagefhs[fh].write("\t".join(map(str, (fname,) + tup)) + "\n")
      except Exception:
        # report which manifest we were writing, then let the error surface
        sys.stderr.write(src_man_fh.name)
        # sys.stderr.write(fname)
        raise
    else:
      # Source: tweet manifests record the file path and its basename stem
      fh = src_man_fh
      field = sdata["DOCID"]
      for line in compress(field, garbagemask):
        line = line.strip()
        fh.write('%s\t%s\n' % (line,
                               # line))
                               re.search(r'.+/(\S*?)\.', line).group(1)))
      if not garbagedisabled:
        for line in compress(field, goodmask):
          line = line.strip()
          garbagefhs[fh].write('%s\t%s\n' % (line,
                                             # line))
                                             re.search(r'.+/(\S*?)\.', line).group(1)))
      # Target
      try:
        fh = trg_man_fh
        fname = m[1]
        for tup in compress(list(zip(tdata["DOCID"], tdata["SEGID"],
                                     tdata["START"], tdata["END"])),
                            garbagemask):
          fh.write("\t".join(map(str, (fname,) + tup)) + "\n")
        if not garbagedisabled:
          for tup in compress(list(zip(tdata["DOCID"], tdata["SEGID"],
                                       tdata["START"], tdata["END"])),
                              goodmask):
            garbagefhs[fh].write("\t".join(map(str, (fname,) + tup)) + "\n")
      except Exception:
        sys.stderr.write(fname)
        raise
    ### Write tokenized, morph tokenized, pos tag
    if not tweet:
      zipset = zip(((src_tok_fh, src_morphtok_fh, src_morph_fh, src_pos_fh),
                    (trg_tok_fh, trg_morphtok_fh, trg_morph_fh, trg_pos_fh)),
                   (sdata, tdata))
    else:
      # no source tok/morph info in tweets
      zipset = zip(((trg_tok_fh, trg_morphtok_fh, trg_morph_fh, trg_pos_fh),),
                   (tdata,))
    for fhset, data in zipset:
      for fh, field in zip(fhset, ("TOK", "MORPHTOK", "MORPH", "POS")):
        for line in compress(data[field], garbagemask):
          fh.write(line)
  # run agile tokenizer on target orig
  # TODO: lowercase
  trg_orig_fh.close()
  agiletok_cmd = "%s -i %s -o %s -t %s " % (agiletokpath, trg_orig_fname,
                                            trg_agiletoklc_fname,
                                            trg_agiletok_fname)
  sys.stderr.write(agiletok_cmd + "\n")
  try:
    # shell=True preserved: the wrapper scripts expect shell invocation
    check_call(agiletok_cmd, shell=True)
  except CalledProcessError as e:
    sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
    sys.exit(1)
  # run cdec tokenizer on source orig
  src_orig_fh.close()
  cdectok_cmd = "%s -i %s -o %s -t %s " % (cdectokpath, src_orig_fname,
                                           src_cdectoklc_fname,
                                           src_cdectok_fname)
  sys.stderr.write(cdectok_cmd + "\n")
  try:
    check_call(cdectok_cmd, shell=True)
  except CalledProcessError as e:
    sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
    sys.exit(1)
  # flush and close the remaining sinks (orig handles were closed above)
  for fh in [src_man_fh, trg_man_fh, src_tok_fh, trg_tok_fh,
             src_morphtok_fh, trg_morphtok_fh, src_morph_fh, trg_morph_fh,
             src_pos_fh, trg_pos_fh] + list(garbagefhs.values()):
    fh.close()
def main():
  """Extract comparable-corpus LTF data for both sides of an LRLP.

  For each language (src, trg) walks data/translation/comparable/<lang>/ltf,
  writes original/tokenized/morph/pos flat files and a manifest that also
  carries the document's cluster id, then runs that language's external
  tokenizer (cdec for src, agile for trg) over the flat original file.
  """
  parser = argparse.ArgumentParser(description="Extract and print comparable corpus "
                                   " data, tokenized, morph, pos tag and "
                                   "original, with manifests")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--agiletokenizer",
                      default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")
  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  agiletokoutdir = os.path.join(args.outdir, args.agiletoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  dirs = [args.outdir, tokoutdir, cdectokoutdir, agiletokoutdir,
          origoutdir, morphtokoutdir, morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for outpath in dirs:
    # exist_ok avoids the check-then-create race of os.path.exists
    os.makedirs(outpath, exist_ok=True)

  rootdir = os.path.join(args.rootdir, 'data', 'translation', 'comparable')
  clusters = getclusters(os.path.join(rootdir, 'clusters'))
  srcindir = os.path.join(rootdir, args.src, 'ltf')
  trgindir = os.path.join(rootdir, args.trg, 'ltf')
  # (language, ltf dir, external tokenizer to run, its output dir)
  datasets = [(args.src, srcindir, args.cdectokenizer, cdectokoutdir),
              (args.trg, trgindir, args.agiletokenizer, agiletokoutdir)]
  for lang, indir, exttokenizer, exttokoutdir in datasets:
    inbase = lang
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for filename in os.listdir(indir):
      # assume ltf filename
      if not filename.endswith("ltf.xml"):
        continue
      # avoid mac meta stuff
      if filename.startswith("."):
        continue
      with open(os.path.join(indir, filename), 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          if len(clusters[lang][docid]) < 1:
            sys.stderr.write("Warning: no clusters for %s\n" % docid)
            clusid = "NONE"
          else:
            clset = clusters[lang][docid]
            if len(clset) > 1:
              sys.stderr.write("Warning: multiple clusters for %s\n" % docid)
            clusid = '_'.join(clset)
          origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
          # garbagemask selects lines to keep; goodmask the rejected ones
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                     for x in xobj.findall(".//SEG")]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid, ])) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid, ])) + "\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext) + "\n")
            morphtok_fh.write(' '.join(morphtoktext) + "\n")
            morph_fh.write(' '.join(morphtext) + "\n")
            pos_fh.write(' '.join(postext) + "\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on " + ifh.name + "\n")
          continue
    # close (and flush) this language's sinks before running the
    # external tokenizer, which reads orig_fh's file
    for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
               garbage_fh, garbage_man_fh):
      if fh is not None:
        fh.close()
    ext_cmd = "%s -i %s -o %s -t %s" % (exttokenizer,
                                        orig_fh.name,
                                        os.path.join(exttokoutdir, "%s.flat.lc" % inbase),
                                        os.path.join(exttokoutdir, "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(ext_cmd))
    p.wait()
def main():
  """Extract monolingual LTF data from zip archives of ltf.xml files.

  For each input zip, writes original/tokenized/morph-tokenized/morph/pos
  flat files plus a manifest (garbage lines diverted unless --nogarbage),
  then runs the cdec tokenizer wrapper over the flat original file.
  """
  parser = argparse.ArgumentParser(description="Extract and print monolingual"
                                   " data, tokenized, morph, pos tag and "
                                   "original, with manifests")
  parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                      default=[sys.stdin, ],
                      help="input zip file(s) (each contains a multi file)")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")
  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  dirs = [args.outdir, tokoutdir, cdectokoutdir, origoutdir,
          morphtokoutdir, morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for outpath in dirs:
    # exist_ok avoids the check-then-create race of os.path.exists
    os.makedirs(outpath, exist_ok=True)

  defaultcount = 0
  for infile in args.infile:
    # base name = filename minus its last two extensions (e.g. ".ltf.zip")
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    if len(inbase) == 0:
      inbase = "default.%d" % defaultcount
      defaultcount += 1
    archive = zf(infile)
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for info in archive.infolist():
      # skip trivially small (effectively empty) members
      if info.file_size < 20:
        continue
      # assume ltf filename
      if not info.filename.endswith("ltf.xml"):
        continue
      # mode 'rU' was removed from ZipFile.open in Python 3.9; ET.parse
      # reads the binary stream directly and handles encoding itself
      with archive.open(info, 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
          # garbagemask selects lines to keep; goodmask the rejected ones
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                     for x in xobj.findall(".//SEG")]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext) + "\n")
            morphtok_fh.write(' '.join(morphtoktext) + "\n")
            morph_fh.write(' '.join(morphtext) + "\n")
            pos_fh.write(' '.join(postext) + "\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on " + ifh.name + "\n")
          continue
    # close (and flush) all sinks; the cdec tokenizer reads orig_fh's file
    for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
               garbage_fh, garbage_man_fh):
      if fh is not None:
        fh.close()
    cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                         orig_fh.name,
                                         os.path.join(cdectokoutdir, "%s.flat.lc" % inbase),
                                         os.path.join(cdectokoutdir, "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(cdec_cmd))
    p.wait()
def main():
  """Extract monolingual LTF data from zips, with post-hoc cleaning.

  Like the plain zip extractor, but the raw original and raw tokenized
  outputs are passed through the clean.sh script into 'clean' variants,
  SN (anonymized tweet) documents can be skipped via --removesn, and
  cdec tokenization is optional (--cdec/--nocdec).
  """
  parser = argparse.ArgumentParser(description="Extract and print monolingual"
                                   " data, tokenized, morph, pos tag and "
                                   "original, with manifests")
  parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                      default=[sys.stdin, ],
                      help="input zip file(s) (each contains a multi file)")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="raw.tokenized",
                      help="subdirectory for ldc-tokenized files")
  parser.add_argument("--cleantoksubdir", default="tokenized",
                      help="subdirectory for cleaned ldc-tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="raw.original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--cleanorigsubdir", default="original",
                      help="subdirectory for cleaned raw original")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                      help="path to cleaning script")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")
  addonoffarg(parser, 'cdec', help="do cdec tokenization", default=True)
  addonoffarg(parser, 'removesn',
              help="remove SN from mono zip (to avoid underscore tweets)",
              default=False)
  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cleantokoutdir = os.path.join(args.outdir, args.cleantoksubdir)
  cleanorigoutdir = os.path.join(args.outdir, args.cleanorigsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  cleanpath = args.cleanpath
  dirs = [args.outdir, tokoutdir, origoutdir, cleantokoutdir,
          cleanorigoutdir, morphtokoutdir, morphoutdir, posoutdir]
  if args.cdec:
    dirs.append(cdectokoutdir)
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for outpath in dirs:
    # exist_ok avoids the check-then-create race of os.path.exists
    os.makedirs(outpath, exist_ok=True)

  defaultcount = 0
  for infile in args.infile:
    # base name = filename minus its last two extensions (e.g. ".ltf.zip")
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    if len(inbase) == 0:
      inbase = "default.%d" % defaultcount
      defaultcount += 1
    archive = zf(infile)
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for info in archive.infolist():
      # skip trivially small (effectively empty) members
      if info.file_size < 20:
        continue
      # assume ltf filename
      if not info.filename.endswith("ltf.xml"):
        continue
      with TextIOWrapper(archive.open(info, 'r')) as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          # avoid anonymized tweets in packages but not relocated
          # downloaded mono tweets
          if "tweets" not in inbase and args.removesn and "_SN_" in docid:
            sys.stderr.write("SN skip: not extracting {}\n".format(docid))
            continue
          origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
          # garbagemask selects lines to keep; goodmask the rejected ones
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                     for x in xobj.findall(".//SEG")]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext) + "\n")
            morphtok_fh.write(' '.join(morphtoktext) + "\n")
            morph_fh.write(' '.join(morphtext) + "\n")
            pos_fh.write(' '.join(postext) + "\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on " + ifh.name + "\n")
          continue
    # close (and flush) all sinks; the cleaners and tokenizer read the
    # raw orig/tok files just written
    for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
               garbage_fh, garbage_man_fh):
      if fh is not None:
        fh.close()
    # raw orig->clean orig
    # raw tok->clean tok
    clean_orig = os.path.join(cleanorigoutdir, "%s.flat" % inbase)
    clean_tok = os.path.join(cleantokoutdir, "%s.flat" % inbase)
    for inclean, outclean in zip((orig_fh.name, tok_fh.name),
                                 (clean_orig, clean_tok)):
      cleancmd = "{cmd} {inclean} {outclean}".format(cmd=cleanpath,
                                                     inclean=inclean,
                                                     outclean=outclean)
      sys.stderr.write(cleancmd + "\n")
      try:
        check_call(shlex.split(cleancmd))
      except CalledProcessError as e:
        sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
        sys.exit(1)
    if args.cdec:
      cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                           orig_fh.name,
                                           os.path.join(cdectokoutdir, "%s.flat.lc" % inbase),
                                           os.path.join(cdectokoutdir, "%s.flat" % inbase))
      p = subprocess.Popen(shlex.split(cdec_cmd))
      p.wait()
def main():
  """Extract monolingual LTF data from an already-extracted LRLP.

  Walks the ltf directory named by --rootdir/--datadirs and, for every
  *.ltf.xml file, writes parallel flat views of the text (original,
  tokenized, morph-tokenized, morph, pos) plus a manifest, diverting
  garbage lines to a separate sink unless --nogarbage is set.  Finally
  runs the cdec tokenizer wrapper over the flat original file.
  """
  parser = argparse.ArgumentParser(description="Extract and print monolingual"
                                   " data, tokenized, morph, pos tag and "
                                   "original, with manifests from extracted files")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--datadirs", nargs='+', default=[],
                      help="elements in path from root to ltf files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")
  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  dirs = [args.outdir, tokoutdir, cdectokoutdir, origoutdir,
          morphtokoutdir, morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for outpath in dirs:
    # exist_ok avoids the check-then-create race of os.path.exists
    os.makedirs(outpath, exist_ok=True)

  datadirs = [args.rootdir, ] + args.datadirs
  indir = os.path.join(*datadirs)
  man_fh = open(os.path.join(args.outdir, "mono.manifest"), 'w')
  orig_fh = open(os.path.join(origoutdir, "mono.flat"), 'w')
  if args.nogarbage:
    garbage_fh = None
    garbage_man_fh = None
  else:
    garbage_fh = open(os.path.join(garbageoutdir, "mono.flat"), 'w')
    garbage_man_fh = open(os.path.join(garbageoutdir, "mono.manifest"), 'w')
  tok_fh = open(os.path.join(tokoutdir, "mono.flat"), 'w')
  morphtok_fh = open(os.path.join(morphtokoutdir, "mono.flat"), 'w')
  morph_fh = open(os.path.join(morphoutdir, "mono.flat"), 'w')
  pos_fh = open(os.path.join(posoutdir, "mono.flat"), 'w')

  for srcfile in os.listdir(indir):
    # skip mac metadata and non-ltf files
    if srcfile.startswith(".") or not srcfile.endswith("ltf.xml"):
      continue
    srcfile = os.path.join(indir, srcfile)
    with open(srcfile, 'r') as ifh:
      try:
        xobj = ET.parse(ifh)
        docid = xobj.findall(".//DOC")[0].get('id')
        origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
        # garbagemask selects lines to keep; goodmask is its complement
        # (i.e. the rejected lines, which go to the garbage sinks)
        garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
        goodmask = [not x for x in garbagemask]
        seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                   for x in xobj.findall(".//SEG")]
        for line in compress(origlines, garbagemask):
          orig_fh.write(line)
        for tup in compress(seginfo, garbagemask):
          man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
        if not args.nogarbage:
          for line in compress(origlines, goodmask):
            garbage_fh.write(line)
          for tup in compress(seginfo, goodmask):
            garbage_man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
        for x in compress(xobj.findall(".//SEG"), garbagemask):
          tokens = x.findall(".//TOKEN")
          toktext = []
          morphtoktext = []
          morphtext = []
          postext = []
          for y in tokens:
            if y.text is None:
              continue
            toktext.append(y.text)
            postext.append(y.get("pos") or "none")
            for mt, mtt in morph_tok(y):
              morphtext.append(mt)
              morphtoktext.append(mtt)
          tok_fh.write(' '.join(toktext) + "\n")
          morphtok_fh.write(' '.join(morphtoktext) + "\n")
          morph_fh.write(' '.join(morphtext) + "\n")
          pos_fh.write(' '.join(postext) + "\n")
      except ET.ParseError:
        sys.stderr.write("Parse error on " + ifh.name + "\n")
        continue

  # close (and flush) every sink; the cdec tokenizer reads orig_fh's file
  for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
             garbage_fh, garbage_man_fh):
    if fh is not None:
      fh.close()
  cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                       orig_fh.name,
                                       os.path.join(cdectokoutdir, "mono.flat.lc"),
                                       os.path.join(cdectokoutdir, "mono.flat"))
  p = subprocess.Popen(shlex.split(cdec_cmd))
  p.wait()
def printout(prefix, path, src, trg, outdir, origoutdir, cleanorigoutdir,
             garbageoutdir, tokoutdir, cleantokoutdir, morphtokoutdir,
             cdectokoutdir, cdectoklcoutdir, agiletokoutdir, agiletoklcoutdir,
             morphoutdir, posoutdir, agiletokpath, cdectokpath, cleanpath,
             docdec, stp=lputil.selected_translation_pairs,
             el=lputil.extract_lines, tweet=False, swap=False):
  '''
  Find translation file pairs and print their contents out as parallel flat
  files plus manifests.

  `stp` selects the (src, trg) file pairs under `path`; `el` extracts the
  per-segment fields (ORIG/TOK/MORPHTOK/MORPH/POS plus manifest columns) from
  each pair.  `prefix`, `src`, `trg` name the output files and the `*outdir`
  arguments give the per-representation output directories.  When
  `garbageoutdir` is None, garbage filtering is disabled; otherwise lines the
  garbage mask rejects are diverted to files under it.  When `swap` is true,
  the roles of the pair found on disk are reversed.  After extraction, the
  external clean script (`cleanpath`), the agile tokenizer (`agiletokpath`)
  and — if `docdec` — the cdec tokenizer (`cdectokpath`) are run on the
  written files; any nonzero exit status terminates the process via
  sys.exit(1).
  '''
  # Manifest columns per kept line: filename, docid, segid, start, end.
  src_man_fh = open(os.path.join(outdir, "%s.%s.manifest" % (prefix, src)), 'w')
  trg_man_fh = open(os.path.join(outdir, "%s.%s.manifest" % (prefix, trg)), 'w')
  # open a bunch of file handles
  # third element indicates whether it should actually be opened or if the
  # file should be simply named (the path is handed to an external tool that
  # will create it)
  namedirpairs = [('orig', origoutdir, True),
                  ('cleanorig', cleanorigoutdir, False),
                  ('tok', tokoutdir, True),
                  ('cleantok', cleantokoutdir, False),
                  ('morphtok', morphtokoutdir, True),
                  ('cdectok', cdectokoutdir, False),
                  ('cdectoklc', cdectoklcoutdir, False),
                  ('agiletok', agiletokoutdir, False),
                  ('agiletoklc', agiletoklcoutdir, False),
                  ('morph', morphoutdir, True),
                  ('pos', posoutdir, True),
                  ]
  # outfiles['src'|'trg'][name] -> open handle (doopen=True) or bare path.
  outfiles = dd(dict)
  for sidename, side in (('src', src), ('trg', trg)):
    for dirname, dirval, doopen in namedirpairs:
      entry = os.path.join(outdir, dirval,
                           "{}.{}.{}.flat".format(prefix, dirval, side))
      if doopen:
        entry = open(entry, 'w')
      outfiles[sidename][dirname] = entry
  # garbagefhs maps a primary handle to the handle that receives the rejected
  # (garbage) counterpart of whatever the primary receives.
  garbagefhs = {}
  garbagedisabled = True
  if garbageoutdir is not None:
    garbagedisabled = False
    src_orig_garbage_fh = open(os.path.join(outdir, garbageoutdir,
                                            "%s.%s.flat" % (prefix, src)), 'w')
    garbagefhs[outfiles['src']['orig']] = src_orig_garbage_fh
    trg_orig_garbage_fh = open(os.path.join(outdir, garbageoutdir,
                                            "%s.%s.flat" % (prefix, trg)), 'w')
    garbagefhs[outfiles['trg']['orig']] = trg_orig_garbage_fh
    src_garbage_man_fh = open(os.path.join(outdir, garbageoutdir,
                                           "%s.%s.manifest" % (prefix, src)), 'w')
    garbagefhs[src_man_fh] = src_garbage_man_fh
    trg_garbage_man_fh = open(os.path.join(outdir, garbageoutdir,
                                           "%s.%s.manifest" % (prefix, trg)), 'w')
    garbagefhs[trg_man_fh] = trg_garbage_man_fh
  (stpsrc, stptrg) = (trg, src) if swap else (src, trg)
  for m in stp(path, src=stpsrc, trg=stptrg, xml=True, tweet=tweet):
    sdata, tdata = el(*m)
    # found data sometimes seems to require swap behavior
    if swap:
      sdata, tdata = tdata, sdata
    if sdata is None or tdata is None:
      sys.stderr.write("Warning: empty files:\n%s or %s\n" % (m[0], m[1]))
      continue
    # Strict rejection of different length lines. If these are desired,
    # do gale & church or brown et al or something similar here
    slen = len(sdata["ORIG"])
    tlen = len(tdata["ORIG"])
    # print(slen,tlen)
    if slen != tlen:
      sys.stderr.write("Warning: different number of lines in files:\n" \
                       "%s %d\n%s %d\n" % (m[0], slen, m[1], tlen))
      continue
    # filter out control code-bearing lines here. mask out the data from all fields
    garbagemask = lputil.getgarbagemask(sdata["ORIG"], tdata["ORIG"],
                                        disabled=garbagedisabled)
    goodmask = [not x for x in garbagemask]
    ### Write original
    for fh, data in zip((outfiles['src']['orig'], outfiles['trg']['orig']),
                        (sdata["ORIG"], tdata["ORIG"])):
      for line in compress(data, garbagemask):
        fh.write(line)
      ### Write garbage original
      if not garbagedisabled:
        for line in compress(data, goodmask):
          garbagefhs[fh].write(line)
    ### Write manifest
    try:
      for fh, fname, tupgen in zip((src_man_fh, trg_man_fh), (m[0], m[1]),
                                   (list(zip(sdata["DOCID"], sdata["SEGID"],
                                             sdata["START"], sdata["END"])),
                                    list(zip(tdata["DOCID"], tdata["SEGID"],
                                             tdata["START"], tdata["END"])))):
        for tup in compress(tupgen, garbagemask):
          fh.write("\t".join(map(str, (fname,) + tup)) + "\n")
        if not garbagedisabled:
          for tup in compress(tupgen, goodmask):
            garbagefhs[fh].write("\t".join(map(str, (fname,) + tup)) + "\n")
    except:
      # NOTE(review): bare except, but it re-raises after naming the manifest
      # file, so it only adds context to the traceback.
      sys.stderr.write(src_man_fh.name)
      # sys.stderr.write(fname)
      raise
    ### Write tokenized, morph tokenized, pos tag
    zipset = zip(((outfiles["src"]["tok"], outfiles["src"]["morphtok"],
                   outfiles["src"]["morph"], outfiles["src"]["pos"]),
                  (outfiles["trg"]["tok"], outfiles["trg"]["morphtok"],
                   outfiles["trg"]["morph"], outfiles["trg"]["pos"])),
                 (sdata, tdata))
    for fhset, data in zipset:
      for fh, field in zip(fhset, ("TOK", "MORPHTOK", "MORPH", "POS")):
        for line in compress(data[field], garbagemask):
          fh.write(line)
  # raw orig->clean orig
  # raw tok->clean tok
  # run agile tokenizer on target orig
  # TODO: lowercase
  # NOTE(review): this close is redundant — the loop below closes it again
  # (double close is harmless in Python).
  outfiles['src']['orig'].close()
  # Close orig/tok so their buffers are flushed, then shell out to the clean
  # script for each of the four files.
  for side in ('src', 'trg'):
    for contents in ('orig', 'tok'):
      outfiles[side][contents].close()
      cleancmd = "{cmd} {infile} {outfile}".format(cmd=cleanpath,
                                                   infile=outfiles[side][contents].name,
                                                   outfile=outfiles[side]["clean{}".format(contents)])
      sys.stderr.write(cleancmd + "\n")
      try:
        check_call(shlex.split(cleancmd))
      except CalledProcessError as e:
        sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
        sys.exit(1)
  # Run the agile tokenizer on the cleaned target original
  # (-o -> agiletoklc path, -t -> agiletok path).
  agiletok_cmd = "%s -i %s -o %s -t %s " % (agiletokpath,
                                            outfiles['trg']['cleanorig'],
                                            outfiles["trg"]["agiletoklc"],
                                            outfiles["trg"]["agiletok"])
  sys.stderr.write(agiletok_cmd + "\n")
  try:
    check_call(shlex.split(agiletok_cmd))
  except CalledProcessError as e:
    sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
    sys.exit(1)
  # run cdec tokenizer on source orig
  if docdec:
    cdectok_cmd = "%s -i %s -o %s -t %s " % (cdectokpath,
                                             outfiles['src']['cleanorig'],
                                             outfiles["src"]["cdectoklc"],
                                             outfiles["src"]["cdectok"])
    sys.stderr.write(cdectok_cmd + "\n")
    try:
      check_call(shlex.split(cdectok_cmd))
    except CalledProcessError as e:
      sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
      sys.exit(1)
def main():
  '''
  Extract comparable-corpus data from an LRLP tree and write, per language,
  flat files (original, tokenized, morph-tokenized, morph, pos) plus a
  manifest whose lines carry filename, docid, segment id/offsets and cluster
  id.  Afterwards runs the external clean script on the original and
  tokenized files and the language-appropriate external tokenizer (cdec for
  src, agile for trg) on the original file.  Exits via sys.exit(1) if the
  clean script fails.
  '''
  parser = argparse.ArgumentParser(description="Extract and print comparable corpus " \
                                   " data, tokenized, morph, pos tag and " \
                                   "original, with manifests")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--outdir", "-o",
                      help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="raw.tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cleantoksubdir", default="tokenized",
                      help="subdirectory for cleaned ldc-tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--cleanorigsubdir", default="original",
                      help="subdirectory for cleaned raw original")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="raw.original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                      help="path to cleaning script")
  parser.add_argument("--agiletokenizer", default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokenizer", default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")
  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))
  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cleantokoutdir = os.path.join(args.outdir, args.cleantoksubdir)
  cleanorigoutdir = os.path.join(args.outdir, args.cleanorigsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  agiletokoutdir = os.path.join(args.outdir, args.agiletoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  cleanpath = args.cleanpath
  dirs = [args.outdir, tokoutdir, cleantokoutdir, cleanorigoutdir,
          cdectokoutdir, agiletokoutdir, origoutdir, morphtokoutdir,
          morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  # 'outpath' rather than 'dir' to avoid shadowing the builtin.
  for outpath in dirs:
    if not os.path.exists(outpath):
      os.makedirs(outpath)
  rootdir = os.path.join(args.rootdir, 'data', 'translation', 'comparable')
  clusters = getclusters(os.path.join(rootdir, 'clusters'))
  srcindir = os.path.join(rootdir, args.src, 'ltf')
  trgindir = os.path.join(rootdir, args.trg, 'ltf')
  # Each side gets its own external tokenizer: cdec for src, agile for trg.
  datasets = [(args.src, srcindir, args.cdectokenizer, cdectokoutdir),
              (args.trg, trgindir, args.agiletokenizer, agiletokoutdir)]
  for lang, indir, exttokenizer, exttokoutdir in datasets:
    inbase = lang
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for filename in os.listdir(indir):
      # assume ltf filename
      if not filename.endswith("ltf.xml"):
        continue
      # avoid mac meta stuff
      if filename.startswith("."):
        continue
      with open(os.path.join(indir, filename), 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          # Resolve the document's cluster id; "NONE" when unclustered,
          # underscore-joined when (unexpectedly) in several clusters.
          if len(clusters[lang][docid]) < 1:
            sys.stderr.write("Warning: no clusters for %s\n" % docid)
            clusid = "NONE"
          else:
            clset = clusters[lang][docid]
            if len(clset) > 1:
              sys.stderr.write("Warning: multiple clusters for %s\n" % docid)
            clusid = '_'.join(clset)
          origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                     for x in xobj.findall(".//SEG")]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid, ])) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid, ])) + "\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext) + "\n")
            morphtok_fh.write(' '.join(morphtoktext) + "\n")
            morph_fh.write(' '.join(morphtext) + "\n")
            pos_fh.write(' '.join(postext) + "\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on " + ifh.name + "\n")
          continue
    # Close every handle (previously only orig/tok were closed): flushes
    # buffered data before the external clean/tokenize tools read the files
    # and avoids leaking handles across the two dataset iterations.
    for fh in (man_fh, orig_fh, tok_fh, morphtok_fh, morph_fh, pos_fh,
               garbage_fh, garbage_man_fh):
      if fh is not None:
        fh.close()
    clean_orig = os.path.join(cleanorigoutdir, "%s.flat" % inbase)
    clean_tok = os.path.join(cleantokoutdir, "%s.flat" % inbase)
    # Run the external clean script: raw orig -> clean orig, raw tok -> clean tok.
    for inclean, outclean in zip((orig_fh.name, tok_fh.name),
                                 (clean_orig, clean_tok)):
      cleancmd = "{cmd} {inclean} {outclean}".format(cmd=cleanpath,
                                                     inclean=inclean,
                                                     outclean=outclean)
      sys.stderr.write(cleancmd + "\n")
      try:
        check_call(shlex.split(cleancmd))
      except CalledProcessError as e:
        sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
        sys.exit(1)
    # Run the language's external tokenizer on the raw original
    # (-o -> lowercased output path, -t -> tokenized output path).
    ext_cmd = "%s -i %s -o %s -t %s" % (exttokenizer, orig_fh.name,
                                        os.path.join(exttokoutdir, "%s.flat.lc" % inbase),
                                        os.path.join(exttokoutdir, "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(ext_cmd))
    p.wait()