#!/usr/bin/env python3
# extract_mono (directory variant): walk a directory of extracted LTF XML
# files and write original, tokenized, morph-tokenized, morph, and POS views
# of the text, plus tab-separated manifests locating every kept segment.
# Imports are reconstructed for runnability; morph_tok and getgarbagemask are
# assumed to come from a co-located helper module (here called lputil).
import argparse
import os
import shlex
import subprocess
import sys
import xml.etree.ElementTree as ET
from itertools import compress

from lputil import morph_tok, getgarbagemask  # assumption: local helpers

scriptdir = os.path.dirname(os.path.abspath(__file__))


def main():
  parser = argparse.ArgumentParser(description="Extract and print monolingual "
                                   "data, tokenized, morph, pos tag and "
                                   "original, with manifests from extracted files")
  parser.add_argument("--rootdir", "-r", default=".", help="root lrlp dir")
  parser.add_argument("--datadirs", nargs='+', default=[],
                      help="elements in path from root to ltf files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--outdir", "-o", help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  # Lay out the output directory tree.
  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  dirs = [args.outdir, tokoutdir, cdectokoutdir, origoutdir,
          morphtokoutdir, morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  datadirs = [args.rootdir, ] + args.datadirs
  indir = os.path.join(*datadirs)
  man_fh = open(os.path.join(args.outdir, "mono.manifest"), 'w')
  orig_fh = open(os.path.join(origoutdir, "mono.flat"), 'w')
  if args.nogarbage:
    garbage_fh = None
    garbage_man_fh = None
  else:
    garbage_fh = open(os.path.join(garbageoutdir, "mono.flat"), 'w')
    garbage_man_fh = open(os.path.join(garbageoutdir, "mono.manifest"), 'w')
  tok_fh = open(os.path.join(tokoutdir, "mono.flat"), 'w')
  morphtok_fh = open(os.path.join(morphtokoutdir, "mono.flat"), 'w')
  morph_fh = open(os.path.join(morphoutdir, "mono.flat"), 'w')
  pos_fh = open(os.path.join(posoutdir, "mono.flat"), 'w')

  for srcfile in os.listdir(indir):
    # skip mac metadata and non-ltf files
    if srcfile.startswith(".") or not srcfile.endswith("ltf.xml"):
      continue
    srcfile = os.path.join(indir, srcfile)
    with open(srcfile, 'r') as ifh:
      try:
        xobj = ET.parse(ifh)
        docid = xobj.findall(".//DOC")[0].get('id')
        origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
        # garbagemask is True for lines judged clean; its complement selects
        # the lines routed to the garbage outputs
        garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
        goodmask = [not x for x in garbagemask]
        seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                   for x in xobj.findall(".//SEG")]
        for line in compress(origlines, garbagemask):
          orig_fh.write(line)
        for tup in compress(seginfo, garbagemask):
          man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
        if not args.nogarbage:
          for line in compress(origlines, goodmask):
            garbage_fh.write(line)
          for tup in compress(seginfo, goodmask):
            garbage_man_fh.write("\t".join(map(str, [srcfile, docid] + tup)) + "\n")
        for x in compress(xobj.findall(".//SEG"), garbagemask):
          tokens = x.findall(".//TOKEN")
          toktext = []
          morphtoktext = []
          morphtext = []
          postext = []
          for y in tokens:
            if y.text is None:
              continue
            toktext.append(y.text)
            postext.append(y.get("pos") or "none")
            for mt, mtt in morph_tok(y):
              morphtext.append(mt)
              morphtoktext.append(mtt)
          tok_fh.write(' '.join(toktext) + "\n")
          morphtok_fh.write(' '.join(morphtoktext) + "\n")
          morph_fh.write(' '.join(morphtext) + "\n")
          pos_fh.write(' '.join(postext) + "\n")
      except ET.ParseError:
        sys.stderr.write("Parse error on " + ifh.name + "\n")
        continue

  orig_fh.close()
  # lowercase + tokenize the collected original text with the cdec wrapper
  cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                       orig_fh.name,
                                       os.path.join(cdectokoutdir, "mono.flat.lc"),
                                       os.path.join(cdectokoutdir, "mono.flat"))
  p = subprocess.Popen(shlex.split(cdec_cmd))
  p.wait()


if __name__ == '__main__':
  main()
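# A note on the filtering above: getgarbagemask returns one boolean per line,
# True for lines worth keeping, and itertools.compress routes each line to the
# main or garbage outputs accordingly. Below is a minimal sketch of such a
# mask function, assuming a simple "mostly alphabetic" heuristic; the real
# helper's heuristics are not shown in this file and may differ.
def getgarbagemask_sketch(lines, disabled=False):
  """Return a list of booleans, True for lines that look like real text."""
  if disabled:
    return [True] * len(lines)
  mask = []
  for line in lines:
    stripped = line.strip()
    alpha = sum(1 for c in stripped if c.isalpha())
    # keep non-empty lines at least half of whose characters are alphabetic
    mask.append(len(stripped) > 0 and alpha * 2 >= len(stripped))
  return mask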
#!/usr/bin/env python3
# extract_comparable: the same per-segment extraction, applied to both sides
# of the comparable corpus, with document cluster ids appended to the
# manifests. getclusters is also assumed to come from the shared helper module.
import argparse
import os
import shlex
import subprocess
import sys
import xml.etree.ElementTree as ET
from itertools import compress

from lputil import morph_tok, getgarbagemask, getclusters  # assumption

scriptdir = os.path.dirname(os.path.abspath(__file__))


def main():
  parser = argparse.ArgumentParser(description="Extract and print comparable "
                                   "corpus data, tokenized, morph, pos tag and "
                                   "original, with manifests")
  parser.add_argument("--rootdir", "-r", default=".", help="root lrlp dir")
  parser.add_argument("--outdir", "-o", help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--agiletokenizer",
                      default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  agiletokoutdir = os.path.join(args.outdir, args.agiletoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  dirs = [args.outdir, tokoutdir, cdectokoutdir, agiletokoutdir, origoutdir,
          morphtokoutdir, morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  rootdir = os.path.join(args.rootdir, 'data', 'translation', 'comparable')
  clusters = getclusters(os.path.join(rootdir, 'clusters'))
  srcindir = os.path.join(rootdir, args.src, 'ltf')
  trgindir = os.path.join(rootdir, args.trg, 'ltf')
  # the source side gets cdec tokenization; the (English) target side agile
  datasets = [(args.src, srcindir, args.cdectokenizer, cdectokoutdir),
              (args.trg, trgindir, args.agiletokenizer, agiletokoutdir)]
  for lang, indir, exttokenizer, exttokoutdir in datasets:
    inbase = lang
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for filename in os.listdir(indir):
      # assume ltf filename
      if not filename.endswith("ltf.xml"):
        continue
      # avoid mac meta stuff
      if filename.startswith("."):
        continue
      with open(os.path.join(indir, filename), 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          if len(clusters[lang][docid]) < 1:
            sys.stderr.write("Warning: no clusters for %s\n" % docid)
            clusid = "NONE"
          else:
            clset = clusters[lang][docid]
            if len(clset) > 1:
              sys.stderr.write("Warning: multiple clusters for %s\n" % docid)
            clusid = '_'.join(clset)
          origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                     for x in xobj.findall(".//SEG")]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid, ])) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid, ])) + "\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext) + "\n")
            morphtok_fh.write(' '.join(morphtoktext) + "\n")
            morph_fh.write(' '.join(morphtext) + "\n")
            pos_fh.write(' '.join(postext) + "\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on " + ifh.name + "\n")
          continue
    orig_fh.close()
    # run the side-appropriate external tokenizer over the collected text
    ext_cmd = "%s -i %s -o %s -t %s" % (exttokenizer,
                                        orig_fh.name,
                                        os.path.join(exttokoutdir, "%s.flat.lc" % inbase),
                                        os.path.join(exttokoutdir, "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(ext_cmd))
    p.wait()


if __name__ == '__main__':
  main()
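# The cluster lookup above indexes clusters[lang][docid] to a list of cluster
# ids, so getclusters presumably builds a two-level mapping from the files in
# the 'clusters' directory. A hypothetical sketch of that shape; the real
# on-disk format is not shown in this file, and this assumes one
# "clusterid TAB lang TAB docid" record per line.
import os
from collections import defaultdict

def getclusters_sketch(clusterdir):
  """Map language -> document id -> list of cluster ids."""
  clusters = defaultdict(lambda: defaultdict(list))
  for fname in os.listdir(clusterdir):
    with open(os.path.join(clusterdir, fname)) as fh:
      for line in fh:
        clusid, lang, docid = line.rstrip("\n").split("\t")
        clusters[lang][docid].append(clusid)
  return clusters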
#!/usr/bin/env python3
# extract_mono (zip variant): as above, but reads LTF XML members directly
# out of one or more zip archives instead of a directory tree.
import argparse
import os
import shlex
import subprocess
import sys
import xml.etree.ElementTree as ET
from itertools import compress
from zipfile import ZipFile as zf

from lputil import morph_tok, getgarbagemask  # assumption: local helpers

scriptdir = os.path.dirname(os.path.abspath(__file__))


def main():
  parser = argparse.ArgumentParser(description="Extract and print monolingual "
                                   "data, tokenized, morph, pos tag and "
                                   "original, with manifests")
  parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                      default=[sys.stdin, ],
                      help="input zip file(s) (each contains a multi file)")
  parser.add_argument("--outdir", "-o", help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  dirs = [args.outdir, tokoutdir, cdectokoutdir, origoutdir,
          morphtokoutdir, morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  defaultcount = 0
  for infile in args.infile:
    # strip the trailing ".ltf.zip"-style double extension to name the outputs
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    if len(inbase) == 0:
      inbase = "default.%d" % defaultcount
      defaultcount += 1
    archive = zf(infile)
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for info in archive.infolist():
      # skip trivially small members
      if info.file_size < 20:
        continue
      # assume ltf filename
      if not info.filename.endswith("ltf.xml"):
        continue
      # 'rU' (universal newlines) is Python 2-era; the later revision of this
      # script wraps archive.open in TextIOWrapper instead
      with archive.open(info, 'rU') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                     for x in xobj.findall(".//SEG")]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext) + "\n")
            morphtok_fh.write(' '.join(morphtoktext) + "\n")
            morph_fh.write(' '.join(morphtext) + "\n")
            pos_fh.write(' '.join(postext) + "\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on " + ifh.name + "\n")
          continue
    orig_fh.close()
    cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                         orig_fh.name,
                                         os.path.join(cdectokoutdir, "%s.flat.lc" % inbase),
                                         os.path.join(cdectokoutdir, "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(cdec_cmd))
    p.wait()


if __name__ == '__main__':
  main()
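# The inbase computation above strips the final two dotted extensions from the
# zip name; when nothing remains (e.g. reading from stdin), a "default.N" name
# is used instead. A quick illustration of that behavior (file names here are
# hypothetical):
import os

def inbase_of(path):
  """Mirror of the script's basename logic, for illustration only."""
  return '.'.join(os.path.basename(path).split('.')[:-2])

assert inbase_of("/data/mono/uzb-mono.ltf.zip") == "uzb-mono"
assert inbase_of("uzb.ltf.zip") == "uzb"
assert inbase_of("stdin") == ""  # falls back to default.N in the script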
#!/usr/bin/env python3
# extract_mono (zip variant with cleaning): adds a raw->clean post-processing
# pass, optional cdec tokenization, and optional removal of anonymized
# social-network (SN) documents. addonoffarg (paired --X/--noX flags) is
# assumed to come from the shared helpers; a sketch follows this script.
import argparse
import os
import shlex
import subprocess
import sys
import xml.etree.ElementTree as ET
from io import TextIOWrapper
from itertools import compress
from subprocess import check_call, CalledProcessError
from zipfile import ZipFile as zf

from lputil import morph_tok, getgarbagemask, addonoffarg  # assumption

scriptdir = os.path.dirname(os.path.abspath(__file__))


def main():
  parser = argparse.ArgumentParser(description="Extract and print monolingual "
                                   "data, tokenized, morph, pos tag and "
                                   "original, with manifests")
  parser.add_argument("--infile", "-i", nargs='+', type=argparse.FileType('rb'),
                      default=[sys.stdin, ],
                      help="input zip file(s) (each contains a multi file)")
  parser.add_argument("--outdir", "-o", help="where to write extracted files")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="raw.tokenized",
                      help="subdirectory for ldc-tokenized files")
  parser.add_argument("--cleantoksubdir", default="tokenized",
                      help="subdirectory for cleaned ldc-tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="raw.original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--cleanorigsubdir", default="original",
                      help="subdirectory for cleaned raw original")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                      help="path to cleaning script")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")
  addonoffarg(parser, 'cdec', help="do cdec tokenization", default=True)
  addonoffarg(parser, 'removesn',
              help="remove SN from mono zip (to avoid underscore tweets)",
              default=False)

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cleantokoutdir = os.path.join(args.outdir, args.cleantoksubdir)
  cleanorigoutdir = os.path.join(args.outdir, args.cleanorigsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  cleanpath = args.cleanpath
  dirs = [args.outdir, tokoutdir, origoutdir, cleantokoutdir, cleanorigoutdir,
          morphtokoutdir, morphoutdir, posoutdir]
  if args.cdec:
    dirs.append(cdectokoutdir)
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  defaultcount = 0
  for infile in args.infile:
    inbase = '.'.join(os.path.basename(infile.name).split('.')[:-2])
    if len(inbase) == 0:
      inbase = "default.%d" % defaultcount
      defaultcount += 1
    archive = zf(infile)
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for info in archive.infolist():
      if info.file_size < 20:
        continue
      # assume ltf filename
      if not info.filename.endswith("ltf.xml"):
        continue
      with TextIOWrapper(archive.open(info, 'r')) as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          # avoid anonymized tweets in packages but not relocated downloaded
          # mono tweets
          if "tweets" not in inbase and args.removesn and "_SN_" in docid:
            sys.stderr.write("SN skip: not extracting {}\n".format(docid))
            continue
          origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                     for x in xobj.findall(".//SEG")]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [info.filename, docid] + tup)) + "\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext) + "\n")
            morphtok_fh.write(' '.join(morphtoktext) + "\n")
            morph_fh.write(' '.join(morphtext) + "\n")
            pos_fh.write(' '.join(postext) + "\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on " + ifh.name + "\n")
          continue
    orig_fh.close()
    tok_fh.close()
    # raw orig -> clean orig; raw tok -> clean tok
    clean_orig = os.path.join(cleanorigoutdir, "%s.flat" % inbase)
    clean_tok = os.path.join(cleantokoutdir, "%s.flat" % inbase)
    for inclean, outclean in zip((orig_fh.name, tok_fh.name),
                                 (clean_orig, clean_tok)):
      cleancmd = "{cmd} {inclean} {outclean}".format(cmd=cleanpath,
                                                     inclean=inclean,
                                                     outclean=outclean)
      sys.stderr.write(cleancmd + "\n")
      try:
        check_call(shlex.split(cleancmd))
      except CalledProcessError as e:
        sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
        sys.exit(1)
    if args.cdec:
      cdec_cmd = "%s -i %s -o %s -t %s" % (args.cdectokenizer,
                                           orig_fh.name,
                                           os.path.join(cdectokoutdir, "%s.flat.lc" % inbase),
                                           os.path.join(cdectokoutdir, "%s.flat" % inbase))
      p = subprocess.Popen(shlex.split(cdec_cmd))
      p.wait()


if __name__ == '__main__':
  main()
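# A minimal sketch of the addonoffarg helper used above; assumption: the real
# helper lives in the shared utility module and may differ in detail. It
# registers paired boolean flags (--foo / --nofoo) writing to one destination,
# which is how --cdec/--nocdec and --removesn/--noremovesn behave here.
import argparse

def addonoffarg_sketch(parser, arg, dest=None, default=True, help="TODO"):
  """Add paired --arg/--noarg boolean flags to an argparse parser."""
  group = parser.add_mutually_exclusive_group()
  dest = arg if dest is None else dest
  group.add_argument("--%s" % arg, dest=dest, action='store_true',
                     default=default, help=help)
  group.add_argument("--no%s" % arg, dest=dest, action='store_false',
                     default=default, help="see --%s" % arg)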
#!/usr/bin/env python3
# extract_comparable (cleaning variant): the comparable-corpus extractor with
# the same raw->clean post-processing step as the zip variant above.
import argparse
import os
import shlex
import subprocess
import sys
import xml.etree.ElementTree as ET
from itertools import compress
from subprocess import check_call, CalledProcessError

from lputil import morph_tok, getgarbagemask, getclusters  # assumption

scriptdir = os.path.dirname(os.path.abspath(__file__))


def main():
  parser = argparse.ArgumentParser(description="Extract and print comparable "
                                   "corpus data, tokenized, morph, pos tag and "
                                   "original, with manifests")
  parser.add_argument("--rootdir", "-r", default=".", help="root lrlp dir")
  parser.add_argument("--outdir", "-o", help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="raw.tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cleantoksubdir", default="tokenized",
                      help="subdirectory for cleaned ldc-tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on "
                           "morphological segmentation")
  parser.add_argument("--cleanorigsubdir", default="original",
                      help="subdirectory for cleaned raw original")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological information")
  parser.add_argument("--origsubdir", default="raw.original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                      help="path to cleaning script")
  parser.add_argument("--agiletokenizer",
                      default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokenizer",
                      default=os.path.join(scriptdir, "cdectok.sh"),
                      help="cdec tokenizer program wrapper")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  tokoutdir = os.path.join(args.outdir, args.toksubdir)
  origoutdir = os.path.join(args.outdir, args.origsubdir)
  cleantokoutdir = os.path.join(args.outdir, args.cleantoksubdir)
  cleanorigoutdir = os.path.join(args.outdir, args.cleanorigsubdir)
  cdectokoutdir = os.path.join(args.outdir, args.cdectoksubdir)
  agiletokoutdir = os.path.join(args.outdir, args.agiletoksubdir)
  morphtokoutdir = os.path.join(args.outdir, args.morphtoksubdir)
  morphoutdir = os.path.join(args.outdir, args.morphsubdir)
  posoutdir = os.path.join(args.outdir, args.possubdir)
  cleanpath = args.cleanpath
  dirs = [args.outdir, tokoutdir, cleantokoutdir, cleanorigoutdir,
          cdectokoutdir, agiletokoutdir, origoutdir, morphtokoutdir,
          morphoutdir, posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)
  for dir in dirs:
    if not os.path.exists(dir):
      os.makedirs(dir)

  rootdir = os.path.join(args.rootdir, 'data', 'translation', 'comparable')
  clusters = getclusters(os.path.join(rootdir, 'clusters'))
  srcindir = os.path.join(rootdir, args.src, 'ltf')
  trgindir = os.path.join(rootdir, args.trg, 'ltf')
  datasets = [(args.src, srcindir, args.cdectokenizer, cdectokoutdir),
              (args.trg, trgindir, args.agiletokenizer, agiletokoutdir)]
  for lang, indir, exttokenizer, exttokoutdir in datasets:
    inbase = lang
    man_fh = open(os.path.join(args.outdir, "%s.manifest" % inbase), 'w')
    orig_fh = open(os.path.join(origoutdir, "%s.flat" % inbase), 'w')
    if args.nogarbage:
      garbage_fh = None
      garbage_man_fh = None
    else:
      garbage_fh = open(os.path.join(garbageoutdir, "%s.flat" % inbase), 'w')
      garbage_man_fh = open(os.path.join(garbageoutdir, "%s.manifest" % inbase), 'w')
    tok_fh = open(os.path.join(tokoutdir, "%s.flat" % inbase), 'w')
    morphtok_fh = open(os.path.join(morphtokoutdir, "%s.flat" % inbase), 'w')
    morph_fh = open(os.path.join(morphoutdir, "%s.flat" % inbase), 'w')
    pos_fh = open(os.path.join(posoutdir, "%s.flat" % inbase), 'w')
    for filename in os.listdir(indir):
      # assume ltf filename
      if not filename.endswith("ltf.xml"):
        continue
      # avoid mac meta stuff
      if filename.startswith("."):
        continue
      with open(os.path.join(indir, filename), 'r') as ifh:
        try:
          xobj = ET.parse(ifh)
          docid = xobj.findall(".//DOC")[0].get('id')
          if len(clusters[lang][docid]) < 1:
            sys.stderr.write("Warning: no clusters for %s\n" % docid)
            clusid = "NONE"
          else:
            clset = clusters[lang][docid]
            if len(clset) > 1:
              sys.stderr.write("Warning: multiple clusters for %s\n" % docid)
            clusid = '_'.join(clset)
          origlines = [x.text + "\n" for x in xobj.findall(".//ORIGINAL_TEXT")]
          garbagemask = getgarbagemask(origlines, disabled=args.nogarbage)
          goodmask = [not x for x in garbagemask]
          seginfo = [[x.get(y) for y in ('id', 'start_char', 'end_char')]
                     for x in xobj.findall(".//SEG")]
          for line in compress(origlines, garbagemask):
            orig_fh.write(line)
          for tup in compress(seginfo, garbagemask):
            man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid, ])) + "\n")
          if not args.nogarbage:
            for line in compress(origlines, goodmask):
              garbage_fh.write(line)
            for tup in compress(seginfo, goodmask):
              garbage_man_fh.write("\t".join(map(str, [filename, docid] + tup + [clusid, ])) + "\n")
          for x in compress(xobj.findall(".//SEG"), garbagemask):
            tokens = x.findall(".//TOKEN")
            toktext = []
            morphtoktext = []
            morphtext = []
            postext = []
            for y in tokens:
              if y.text is None:
                continue
              toktext.append(y.text)
              postext.append(y.get("pos") or "none")
              for mt, mtt in morph_tok(y):
                morphtext.append(mt)
                morphtoktext.append(mtt)
            tok_fh.write(' '.join(toktext) + "\n")
            morphtok_fh.write(' '.join(morphtoktext) + "\n")
            morph_fh.write(' '.join(morphtext) + "\n")
            pos_fh.write(' '.join(postext) + "\n")
        except ET.ParseError:
          sys.stderr.write("Parse error on " + ifh.name + "\n")
          continue
    orig_fh.close()
    tok_fh.close()
    # raw orig -> clean orig; raw tok -> clean tok
    clean_orig = os.path.join(cleanorigoutdir, "%s.flat" % inbase)
    clean_tok = os.path.join(cleantokoutdir, "%s.flat" % inbase)
    for inclean, outclean in zip((orig_fh.name, tok_fh.name),
                                 (clean_orig, clean_tok)):
      cleancmd = "{cmd} {inclean} {outclean}".format(cmd=cleanpath,
                                                     inclean=inclean,
                                                     outclean=outclean)
      sys.stderr.write(cleancmd + "\n")
      try:
        check_call(shlex.split(cleancmd))
      except CalledProcessError as e:
        sys.stderr.write("Error code %d running %s\n" % (e.returncode, e.cmd))
        sys.exit(1)
    ext_cmd = "%s -i %s -o %s -t %s" % (exttokenizer,
                                        orig_fh.name,
                                        os.path.join(exttokoutdir, "%s.flat.lc" % inbase),
                                        os.path.join(exttokoutdir, "%s.flat" % inbase))
    p = subprocess.Popen(shlex.split(ext_cmd))
    p.wait()


if __name__ == '__main__':
  main()
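# Each manifest line written by the scripts above is tab-separated:
#   source file, document id, segment id, start_char, end_char
# with a trailing cluster id in the comparable-corpus variants. A minimal
# reader for those manifests, for illustration (the helper name is ours):
def read_manifest(path):
  """Yield manifest records as dicts; extra trailing fields are kept raw."""
  fields = ("file", "docid", "segid", "start_char", "end_char")
  with open(path) as fh:
    for line in fh:
      parts = line.rstrip("\n").split("\t")
      rec = dict(zip(fields, parts))
      rec["extra"] = parts[len(fields):]  # e.g. cluster id, when present
      yield rec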