def main():
    """Relocate parts of an expanded LRLP into one catch-all directory.

    Driven by the module-level ``manifest`` mapping (lrlp-relative stub ->
    target-relative stub).  Optionally also carries over a previous
    'ephemera' directory (everything not listed in the module-level
    ``transferexcluded``) into ``<target>/old``.
    """
    parser = argparse.ArgumentParser(description="relocate parts of a lrlp into a centralized location to make it easier to gather later", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--source", "-s", help='path to the expanded lrlp')
    parser.add_argument("--old", "-o", default=None, help='path to old ephemera directory')
    parser.add_argument("--target", "-t", help='path to the desired catch-all directory')
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # start from a clean target: any previous contents are destroyed
    if os.path.exists(args.target):
        shutil.rmtree(args.target)
    mkdir_p(args.target)
    for indirstub, outdirstub in manifest.items():
        indir = os.path.join(args.source, indirstub)
        # silently skip manifest entries absent from this particular lrlp
        if os.path.exists(indir):
            outdir = os.path.join(args.target, outdirstub)
            sys.stderr.write("Copying {} to {}\n".format(indir, outdir))
            # copything is a module-level helper (handles file vs dir)
            copything(indir, outdir)
    if args.old is not None:
        # traverse top level of old and move everything not in transferexcluded to target/old
        oldtarget=os.path.join(args.target, "old")
        mkdir_p(oldtarget)
        for oldfile in os.listdir(args.old):
            if oldfile not in transferexcluded:
                fullsource = os.path.join(args.old, oldfile)
                fulltarget = os.path.join(oldtarget, oldfile)
                sys.stderr.write("Transferring {} to {}\n".format(fullsource, fulltarget))
                copything(fullsource, fulltarget)
def relocate_ltf(dldir, lrlpdir, logfile):
    '''Retire existing SN ltf files in lrlpdir and install replacements.

    Replacement files are expected in an 'ltf' directory that is a sibling
    of dldir; retired files are moved aside into 'ltf.retired'.
    (logfile is currently unused.)
    '''
    # locate the replacement source next to the download directory
    parent = os.path.dirname(dldir)
    repldir = os.path.join(parent, 'ltf')
    if not os.path.exists(repldir):
        sys.stderr.write(
            "Directories not set up properly; couldn't find {}\n".format(
                repldir))
        sys.exit(1)
    bkpdir = os.path.join(parent, 'ltf.retired')
    mkdir_p(bkpdir)
    mkdir_p(lrlpdir)
    # phase 1: move the current SN ltf files out of the way
    for fname in os.listdir(lrlpdir):
        if is_sn(fname) and fname.endswith(".ltf.xml"):
            shutil.move(os.path.join(lrlpdir, fname),
                        os.path.join(bkpdir, fname))
    # phase 2: introduce the replacement files in the new location
    for fname in os.listdir(repldir):
        if fname.endswith(".ltf.xml"):
            shutil.copyfile(os.path.join(repldir, fname),
                            os.path.join(lrlpdir, fname))
def main():
    """Relocate parts of an expanded LRLP into one catch-all directory.

    Driven by the module-level ``manifest`` mapping (lrlp-relative stub ->
    target-relative stub).  The target directory is recreated from scratch
    on every run.
    """
    parser = argparse.ArgumentParser(description="relocate parts of a lrlp into a centralized location to make it easier to gather later", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--source", "-s", help='path to the expanded lrlp')
    parser.add_argument("--target", "-t", help='path to the desired catch-all directory')
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # start from a clean target: any previous contents are destroyed
    if os.path.exists(args.target):
        shutil.rmtree(args.target)
    mkdir_p(args.target)
    for indirstub, outdirstub in manifest.items():
        indir = os.path.join(args.source, indirstub)
        # silently skip manifest entries absent from this particular lrlp
        if os.path.exists(indir):
            outdir = os.path.join(args.target, outdirstub)
            if os.path.isdir(indir):
                shutil.copytree(indir, outdir)
            elif os.path.isfile(indir):
                shutil.copy(indir, outdir)
            else:
                # BUG FIX: message previously lacked its trailing newline,
                # unlike every other stderr write in this file
                sys.stderr.write("%s not directory or file; skipping\n" % indir)
def zip_and_copy(workdir, dldir, outfile, logfile):
    '''Zip up a directory tree of ltf files.

    Stages a copy under an extra 'ltf' directory level so every entry in
    the resulting zip carries the ltf prefix.  (logfile is currently
    unused.)
    '''
    repldir = os.path.join(os.path.dirname(dldir), 'ltf')
    if not os.path.exists(repldir):
        sys.stderr.write(
            "Directories not set up properly; couldn't find input {}\n".format(
                repldir))
        sys.exit(1)
    # trying to get an ltf directory underneath so that zip file has ltf prefix before everything
    staging = os.path.join(workdir, 'foo')
    shutil.copytree(repldir, os.path.join(staging, 'ltf'))
    mkdir_p(os.path.dirname(outfile))
    shutil.make_archive(outfile, 'zip', staging)
def tokrsd(dldir, ruby, exec, param, workdir):
    ''' create ltfs from rsds '''
    # NOTE(review): parameter name 'exec' shadows the builtin (and would be a
    # syntax error under python 2); renaming would break keyword callers, so
    # it is left as-is.  'exec' is the path to the tokenizer script run by
    # 'ruby'; 'param' is an optional -t argument to it.
    rsddir = dldir
    parent = os.path.dirname(rsddir)
    # output ltfs go to a sibling 'ltf' directory of the rsd directory
    ltfdir = os.path.join(parent, 'ltf')
    if not os.path.exists(rsddir):
        sys.stderr.write(
            "Directories not set up properly; couldn't find {}\n".format(
                rsddir))
        sys.exit(1)
    mkdir_p(ltfdir)
    # write a list of all rsd files for the tokenizer to consume
    listfile = os.path.join(workdir, 'list')
    lfh = prepfile(listfile, 'w')
    for l in iglob(os.path.join(rsddir, '*.rsd.txt')):
        lfh.write("{}\n".format(l))
    lfh.close()
    paramtxt = "" if param is None else "-t {}".format(param)
    cmd = "{} {} {} {}".format(ruby, exec, paramtxt, listfile)
    # returns the tokenizer's exit status (check_call raises on failure)
    return check_call(shlex.split(cmd))
def main():
    """Split a data file into per-category files given a doc->category map.

    Reads a (docid, category) table, opens one output file per category
    under <prefix>/<cat>/<postfix>/, then streams the id file and data file
    in lockstep, routing each data line to its doc's category (or to the
    remainder category when the doc is unmapped).
    """
    parser = argparse.ArgumentParser(description="Given category per doc, idfile, data file, put data in category-specific dir", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--catfile", "-c", nargs='?', type=argparse.FileType('r'), help="doc cat file (docid cat)")
    parser.add_argument("--idfile", "-d", nargs='?', type=argparse.FileType('r'), help="id file (docid per line)")
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input file")
    parser.add_argument("--prefix", "-p", default=".", help="directory prefix for categories")
    parser.add_argument("--postfix", "-P", default=".", help="directory postfix after categories")
    parser.add_argument("--remainder", "-r", default="train", help="remainder category. Should match previous remainder category")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    catfile = args.catfile
    infile = args.infile
    idfile = args.idfile
    basefile = os.path.basename(args.infile.name)
    # cats: docid -> open output handle; fhs: output path -> open handle
    # (fhs dedupes so several docs in one category share one handle)
    cats = {}
    fhs = {}
    for line in catfile:
        doc, cat = line.strip().split('\t')
        prefix = os.path.join(args.prefix, cat, args.postfix)
        innercatfile = os.path.join(prefix, basefile)
        if innercatfile not in fhs:
            mkdir_p(prefix)
            fhs[innercatfile]=open(innercatfile, 'w')
        cats[doc]=fhs[innercatfile]
    # remainder category output is always created, even if never written to
    remcatpref = os.path.join(args.prefix, args.remainder, args.postfix)
    remaindercatfile = os.path.join(remcatpref, basefile)
    if remaindercatfile not in fhs:
        mkdir_p(remcatpref)
        fhs[remaindercatfile]=open(remaindercatfile, 'w')
    # idfile and infile are parallel: line i of idfile names the doc that
    # line i of infile belongs to
    for doc, data in zip(idfile, infile):
        doc = doc.strip()
        fh = cats[doc] if doc in cats else fhs[remaindercatfile]
        fh.write(data)
def main():
    """Extract parallel data from an expanded LRLP into flat files/manifests.

    Sets up one output subdirectory per representation (original, tokenized,
    cdec/agile tokenizations and their lowercased variants, morph, pos,
    optional garbage), logs the extraction provenance to an append-only
    'source' file, then runs printout() over each known corpus location,
    over 'found' data, and optionally over extracted tweets.
    """
    parser = argparse.ArgumentParser(description="extract parallel data from " \
                                     "expanded lrlp to flat files and manifests.")
    parser.add_argument("--rootdir", "-r", default=".", help="root lrlp dir")
    parser.add_argument("--datadirs", nargs='+', default=['data', 'translation'], help="elements in path from root to ltf files")
    parser.add_argument("--outdir", "-o", default="./parallel/extracted", help="where to write extracted files")
    parser.add_argument("--src", "-s", default='uzb', help="source language 3 letter code")
    parser.add_argument("--trg", "-t", default='eng', help="target language 3 letter code")
    parser.add_argument("--origsubdir", default="original", help="subdirectory for untokenized files")
    parser.add_argument("--garbagesubdir", default="garbage", help="subdirectory for garbage files (under orig)")
    parser.add_argument("--nogarbage", action='store_true', default=False, help="turn off garbage filtering")
    parser.add_argument("--toksubdir", default="tokenized", help="subdirectory for tokenized files")
    parser.add_argument("--cdectoksubdir", default="cdec-tokenized", help="subdirectory for cdec-tokenized files")
    parser.add_argument("--agiletoksubdir", default="agile-tokenized", help="subdirectory for agile-tokenized files (target side only)")
    parser.add_argument("--morphtoksubdir", default="morph-tokenized", help="subdirectory for tokenized files based on " \
                        "morphological segmentation")
    parser.add_argument("--morphsubdir", default="morph", help="subdirectory for morphological files")
    parser.add_argument("--possubdir", default="pos", help="subdirectory for pos tag files")
    parser.add_argument("--extwtdir", "-et", default=None, help="directory of extracted tweet rsd files")
    parser.add_argument("--agiletokpath", default=os.path.join(scriptdir, 'agiletok.sh'), help="path to agile tokenizer binary")
    parser.add_argument("--cdectokpath", default=os.path.join(scriptdir, 'cdectok.sh'), help="path to cdec tokenizer binary")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    origoutdir=args.origsubdir
    tokoutdir=args.toksubdir
    morphtokoutdir=args.morphtoksubdir
    cdectokoutdir=args.cdectoksubdir
    agiletokoutdir=args.agiletoksubdir
    cdectoklcoutdir=args.cdectoksubdir+".lc"
    agiletoklcoutdir=args.agiletoksubdir+".lc"
    morphoutdir=args.morphsubdir
    posoutdir=args.possubdir
    agiletokpath = args.agiletokpath
    cdectokpath = args.cdectokpath
    dirs = [origoutdir,
            tokoutdir,
            morphtokoutdir,
            cdectokoutdir,
            agiletokoutdir,
            cdectoklcoutdir,
            agiletoklcoutdir,
            morphoutdir,
            posoutdir]
    if args.nogarbage:
        garbageoutdir = None
    else:
        garbageoutdir=os.path.join(origoutdir, args.garbagesubdir)
        dirs.append(garbageoutdir)
    # create every output representation directory up front
    for dir in dirs:
        fulldir = os.path.join(args.outdir, dir)
        lputil.mkdir_p(fulldir)
    # provenance log: appended to on every extraction run
    source_fh = open(os.path.join(args.outdir, "source"), 'a')
    source_fh.write("Extracted parallel data from %s to %s on %s\nusing %s;" \
                    " command issued from %s\n" % (args.rootdir, args.outdir,
                                                   datetime.datetime.now(),
                                                   ' '.join(sys.argv),
                                                   os.getcwd()))
    datadirs=[args.rootdir,]+args.datadirs
    '''
    from_eng/ -- manual translations from English into LRLP (elicitation,
    phrasebook, core REFLEX news text, additional news text)

    from_xxx/ -- manual translations from LRLP into English in multiple
    genres
    '''
    # name of corpus and location in lrlp (for cases that don't do anything special)
    corpustuples = [("fromsource.generic", os.path.join(*(datadirs+["from_%s" % args.src,]))),
                    ("fromtarget.news", os.path.join(*(datadirs+["from_%s" % args.trg, "news"]))),
                    ("fromtarget.phrasebook", os.path.join(*(datadirs+["from_%s" % args.trg, "phrasebook"]))),
                    ("fromtarget.elicitation", os.path.join(*(datadirs+["from_%s" % args.trg, "elicitation"])))]
    for corpustuple in corpustuples:
        printout(corpustuple[0], corpustuple[1],
                 args.src, args.trg, args.outdir,
                 origoutdir, garbageoutdir, tokoutdir, morphtokoutdir,
                 cdectokoutdir, cdectoklcoutdir,
                 agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
                 agiletokpath, cdectokpath)
    # Found data: uses special tuple/sentence-alignment hooks from lputil
    printout("found.generic", args.rootdir,
             args.src, args.trg, args.outdir,
             origoutdir, garbageoutdir, tokoutdir, morphtokoutdir,
             cdectokoutdir, cdectoklcoutdir,
             agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
             agiletokpath, cdectokpath,
             stp=lputil.all_found_tuples, el=lputil.get_aligned_sentences)
    # Tweet data: only processed when an extracted-tweet directory exists
    if args.extwtdir is not None and os.path.exists(args.extwtdir):
        move_extracted_tweet(os.path.join(*datadirs), args.src, args.extwtdir)
        printout("fromsource.tweet",
                 os.path.join(*(datadirs+["from_%s" % args.src,])),
                 args.src, args.trg, args.outdir,
                 origoutdir, garbageoutdir, tokoutdir, morphtokoutdir,
                 cdectokoutdir, cdectoklcoutdir,
                 agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
                 agiletokpath, cdectokpath, tweet=True)
def main(args):
    """Make dataset selections (train/dev/test/setE...) for experimentation.

    Word-counts each corpus prefix, apportions the requested split sizes
    proportionally (with a per-split minimum), then runs run_selection and
    categorize.py per prefix.  Doc prefixes keep whole documents together;
    nodoc prefixes select per segment via fake ids.
    """
    indir = args.indir
    origsizes = args.sizes
    termfile = args.termfile
    # TODO: find these?
    # doc = keep full docs together (can detect this by counting number of unique docs)
    docprefixes = [
        "fromsource.generic", "fromsource.tweet", "fromtarget.news",
        "found.generic"
    ]
    nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook"]
    # --allperseg: treat everything per-segment (no doc cohesion)
    if args.allperseg:
        nodocprefixes.extend(docprefixes)
        docprefixes = []
    extractpath = os.path.join(indir, args.extractpath)
    # http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
    filetypes = [subdir for subdir in next(os.walk(extractpath))[1]]
    origpath = os.path.join(extractpath, 'original')
    outpath = os.path.join(indir, args.outdir)
    mkdir_p(outpath)
    sf_ann_doc_ids_file = None
    if 'setE' in args.categories:
        # annotated SF docs goes to setE
        sf_ann_doc_ids_file = os.path.join(outpath, "sf.ann.doc.ids")
        sf_ann_dir = os.path.join(
            indir, '../expanded/lrlp/data/annotation/situation_frame/')
        scan_sf_ann_doc_ids(sf_ann_dir, sf_ann_doc_ids_file)
    # number of words in each file
    fullsizes = {}
    adjsizes = {}
    sizesum = 0.0
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing " + prefix)
                preflist.remove(prefix)
    for prefix in docprefixes + nodocprefixes:
        engfile = os.path.join(origpath, "%s.original.eng.flat" % prefix)
        # word count via wc -w on the english side
        prefsize = int(
            check_output("wc -w %s" % engfile,
                         shell=True).decode('utf8').strip().split(' ')[0])
        fullsizes[prefix] = prefsize
        sizesum += prefsize
    # adjust size split by proportion, with minimum
    for prefix in docprefixes + nodocprefixes:
        mult = fullsizes[prefix] / sizesum
        adjsizes[prefix] = [
            max(args.minimum, int(mult * x)) for x in origsizes
        ]
        print(prefix, adjsizes[prefix])
    # doc-based processing
    catlist = ' '.join(args.categories)
    for prefix in docprefixes:
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        try:
            # doc ids are column 2 of the manifest
            check_output("cut -f2 %s > %s" % (manfile, idfile),
                         stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        engfile = os.path.join(origpath, "%s.original.eng.flat" % prefix)
        sizelist = ' '.join(map(str, adjsizes[prefix]))
        catfile = run_selection(prefix, idfile, engfile, termfile, catlist,
                                args.remainder, sizelist, filetypes,
                                args.language, extractpath, outpath,
                                args.devlstfile,
                                setElstfile=sf_ann_doc_ids_file)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s" % (
                script_dir, manifest, idfile, catfile, outpath)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # nodoc-based processing: fabricate per-line ids with seq
    for prefix in nodocprefixes:
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(
                check_output(
                    "wc -l %s" % os.path.join(extractpath,
                                              "%s.eng.manifest" % prefix),
                    shell=True).decode('utf8').strip().split(' ')[0])
            check_output("seq %d > %s" % (mansize, idfile),
                         stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        engfile = os.path.join(origpath, "%s.original.eng.flat" % prefix)
        sizelist = ' '.join(map(str, adjsizes[prefix]))
        catfile = run_selection(prefix, idfile, engfile, termfile, catlist,
                                args.remainder, sizelist, filetypes,
                                args.language, extractpath, outpath)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s" % (
                script_dir, manifest, idfile, catfile, outpath)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # warning if entries not found in given dev list
    if args.devlstfile:
        devlst = set(open(args.devlstfile).read().split())
        all_docids = list()
        for prefix in docprefixes:
            all_docids += open(os.path.join(outpath,
                                            "%s.ids" % prefix)).read().split('\n')
        for i in devlst - set(all_docids):
            print("***Warning: docid not found: %s" % i)
def main():
    """Re-make dataset selections from previously generated categorizations.

    For each corpus prefix, copies (or creates empty) the previous .cats
    file into the splits directory, then runs runselection and
    categorize.py so the new extraction respects the old category
    assignments; unmapped material falls into the remainder category.
    """
    parser = argparse.ArgumentParser(
        description=
        "Make dataset selections for experimentation given previously generated categorization files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", help="location of parallel data")
    parser.add_argument("--language", "-l", help="source language three digit code")
    parser.add_argument(
        "--extractpath", "-e", default="filtered",
        help="location of extracted data (might want to use 'filtered')")
    parser.add_argument(
        "--remainder", "-r", default="train",
        help="remainder category. Should match previous remainder category")
    parser.add_argument("--previous", "-p", help="location of previous cat files")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # reader = codecs.getreader('utf8')
    # writer = codecs.getwriter('utf8')
    # outfile = writer(args.outfile)
    indir = args.indir
    # TODO: find these?
    # doc = keep full docs together (can detect this by counting number of unique docs)
    # TODO: re-add found.generic to docprefixes
    docprefixes = [
        "fromsource.generic", "fromsource.tweet", "fromtarget.news",
        "found.generic"
    ]
    nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook"]
    extractpath = os.path.join(indir, args.extractpath)
    #http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
    filetypes = [subdir for subdir in next(os.walk(extractpath))[1]]
    origpath = os.path.join(extractpath, 'original')
    outpath = os.path.join(indir, 'splits')
    mkdir_p(outpath)
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing " + prefix)
                preflist.remove(prefix)
    # doc-based processing
    for prefix in docprefixes:
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath,
                               "%s.eng.manifest" % prefix)
        try:
            # doc ids are column 2 of the manifest
            check_output("cut -f2 %s > %s" % (manfile, idfile),
                         stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        # reuse the previous categorization if present, else start empty
        catfile = os.path.join(args.previous, "%s.cats" % prefix)
        newcatfile = os.path.join(outpath, os.path.basename(catfile))
        if os.path.exists(catfile):
            copy(catfile, newcatfile)
        else:
            touch(newcatfile)
        runselection(prefix, idfile, newcatfile, args.remainder, filetypes,
                     args.language, extractpath, outpath)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, newcatfile, outpath,
                args.remainder)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # nodoc-based processing: fabricate per-line ids with seq
    for prefix in nodocprefixes:
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(
                check_output(
                    "wc -l %s" % os.path.join(extractpath,
                                              "%s.eng.manifest" % prefix),
                    shell=True).decode('utf-8').strip().split(' ')[0])
            check_output("seq %d > %s" % (mansize, idfile),
                         stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        catfile = os.path.join(args.previous, "%s.cats" % prefix)
        newcatfile = os.path.join(outpath, os.path.basename(catfile))
        if os.path.exists(catfile):
            copy(catfile, newcatfile)
        else:
            touch(newcatfile)
        runselection(prefix, idfile, newcatfile, args.remainder, filetypes,
                     args.language, extractpath, outpath)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, newcatfile, outpath,
                args.remainder)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
def main():
    """Make dataset selections for experimentation (argparse entry point).

    Word-counts each corpus prefix, apportions the requested split sizes
    proportionally (with a per-split minimum), then runs runselection and
    categorize.py per prefix.  Doc prefixes keep whole documents together;
    nodoc prefixes select per segment via fake ids.
    """
    parser = argparse.ArgumentParser(description="Make dataset selections for experimentation",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", help="location of parallel data")
    parser.add_argument("--language", "-l", help="source language three digit code")
    parser.add_argument("--extractpath", "-e", default="extracted", help="location of extracted data (might want to use 'filtered')")
    parser.add_argument("--minimum", "-m", default=100, help="minimum number of words per subselection")
    parser.add_argument("--sizes", "-s", nargs='+', type=int, help="list of sizes desired in each category")
    parser.add_argument("--categories", "-c", nargs='+', help="list of categories. Must match sizes")
    parser.add_argument("--termfile", "-t", help="file of desired terms, one per line")
    parser.add_argument("--remainder", "-r", default="train", help="remainder category. Should be a new category")
    parser.add_argument("--devlstfile", "-d", default=None, help="file of desired documents for dev (subject to length constraints, must be a set called 'dev')")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # reader = codecs.getreader('utf8')
    # writer = codecs.getwriter('utf8')
    # outfile = writer(args.outfile)
    indir = args.indir
    origsizes = args.sizes
    termfile = args.termfile
    # TODO: find these?
    # doc = keep full docs together (can detect this by counting number of unique docs)
    docprefixes = ["fromsource.generic", "fromsource.tweet", "fromtarget.news", "found.generic"]
    nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook"]
    # TODO: find these
    filetypes = ["morph", "morph-tokenized", "original", "pos", "tokenized", "mttok", "mttoklc", "agile-tokenized", "cdec-tokenized", "agile-tokenized.lc", "cdec-tokenized.lc"]
    extractpath = os.path.join(indir, args.extractpath)
    origpath = os.path.join(extractpath, 'original')
    outpath = os.path.join(indir, 'splits')
    mkdir_p(outpath)
    # number of words in each file
    fullsizes = {}
    adjsizes = {}
    sizesum = 0.0
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing "+prefix)
                preflist.remove(prefix)
    for prefix in docprefixes+nodocprefixes:
        engfile=os.path.join(origpath, "%s.original.eng.flat" % prefix)
        # word count via wc -w on the english side
        prefsize = int(check_output("wc -w %s" % engfile, shell=True).decode('utf8').strip().split(' ')[0])
        fullsizes[prefix] = prefsize
        sizesum +=prefsize
    # adjust size split by proportion, with minimum
    for prefix in docprefixes+nodocprefixes:
        mult = fullsizes[prefix]/sizesum
        adjsizes[prefix] = [max(args.minimum, int(mult*x)) for x in origsizes]
        print(prefix,adjsizes[prefix])
    # doc-based processing
    catlist = ' '.join(args.categories)
    for prefix in docprefixes:
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        try:
            # doc ids are column 2 of the manifest
            check_output("cut -f2 %s > %s" % (manfile, idfile), stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        engfile=os.path.join(origpath, "%s.original.eng.flat" % prefix)
        sizelist = ' '.join(map(str, adjsizes[prefix]))
        catfile = runselection(prefix, idfile, engfile,
                               termfile, catlist, args.remainder, sizelist,
                               filetypes, args.language, extractpath, outpath,
                               args.devlstfile)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s" % (scriptdir, manifest, idfile, catfile, outpath)
            print("Running "+cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # nodoc-based processing: fabricate per-line ids with seq
    for prefix in nodocprefixes:
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(check_output("wc -l %s" % os.path.join(extractpath, "%s.eng.manifest" % prefix), shell=True).decode('utf8').strip().split(' ')[0])
            check_output("seq %d > %s" % (mansize, idfile), stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        engfile=os.path.join(origpath, "%s.original.eng.flat" % prefix)
        sizelist = ' '.join(map(str, adjsizes[prefix]))
        catfile = runselection(prefix, idfile, engfile, termfile, catlist,
                               args.remainder, sizelist, filetypes,
                               args.language, extractpath, outpath)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s" % (scriptdir, manifest, idfile, catfile, outpath)
            print("Running "+cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # warning if entries not found in given dev list
    if args.devlstfile:
        devlst = set(open(args.devlstfile).read().split())
        all_docids = list()
        for prefix in docprefixes:
            all_docids += open(os.path.join(outpath, "%s.ids" % prefix)).read().split('\n')
        for i in devlst - set(all_docids):
            print ("***Warning: docid not found: %s" % i)
def main():
    """Build an eval IL monolingual set from LDC form to elisa form.

    Declares a pipeline of Step objects (decrypt, extract mono, fetch
    tweets, tweet->mono, package), configures each step's arguments and
    log files from the command line, then runs steps[start:stop].
    """
    steps = []
    # extract_mono.py
    steps.append(Step('decrypt_sets.py', help="decode encrypted sets"))
    # extract_mono.py
    steps.append(Step('extract_mono.py', help="get flat form mono data"))
    # get_tweet_by_id.rb
    steps.append(Step('get_tweet_by_id.rb',
                      help="download tweets. must have twitter gem installed " \
                      "and full internet", abortOnFail=False))
    # extract_mono_tweet.py
    steps.append(Step('extract_mono_tweet.py', help="make twitter data look like regular mono data"))
    steps.append(Step('make_mono_release.py', help="package mono flat data"))
    # index steps by program name for per-step configuration below
    stepsbyname = {}
    for step in steps:
        stepsbyname[step.prog] = step
    parser = argparse.ArgumentParser(description="Build an eval IL monoset from LDC to elisa form",
                                     formatter_class= \
                                     argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--setdir", "-d", default='.', help='name of set directory (i.e. set1, setE, etc.)')
    parser.add_argument("--language", "-l", default='uzb', help='three letter code of IL language')
    parser.add_argument("--key", "-k", default=None, help='decryption key for encrypted il')
    parser.add_argument("--notweets", "-n", action='store_true', default=None, help='do not include tweets (for eval IL setE only)')
    parser.add_argument("--expdir", "-e", help='path to where the extraction is. If starting at ' \
                        'step 0 this is ignored')
    parser.add_argument("--root", "-r", default='/home/nlg-02/LORELEI/ELISA/data', help='path to where the extraction will take place')
    parser.add_argument("--outfile", "-o", help='name of the output file')
    parser.add_argument("--start", "-s", type=int, default=0, help='step to start at')
    parser.add_argument("--stop", "-p", type=int, default=len(steps)-1, help='step to stop at (inclusive)')
    parser.add_argument("--liststeps", "-x", nargs=0, action=make_action(steps), help='print step list and exit')
    parser.add_argument("--ruby", default="/opt/local/bin/ruby2.2", help='path to ruby (2.1 or higher)')
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    rootdir = args.root
    language = args.language
    setdir = args.setdir
    outdir = os.path.join(rootdir, language, setdir)
    outfile = os.path.join(outdir, args.outfile)
    start = args.start
    # --stop is inclusive; convert to python slice bound
    stop = args.stop + 1
    if args.expdir is None:
        expdir = os.path.join(rootdir, language, 'expanded', 'lrlp')
    else:
        expdir = args.expdir
    mkdir_p(outdir)
    # DECRYPT: runs eagerly here (not in the slice loop) when a key is given
    if args.key is None:
        stepsbyname["decrypt_sets.py"].disable()
    else:
        stepsbyname["decrypt_sets.py"].stderr=os.path.join(outdir, 'decrypt_sets.err')
        stepsbyname["decrypt_sets.py"].argstring="-r %s -k %s -s %s" % (expdir, args.key, setdir)
        stepsbyname["decrypt_sets.py"].run()
        start+=1
    # TWEET
    if args.notweets:
        stepsbyname["get_tweet_by_id.rb"].disable()
        stepsbyname["extract_mono_tweet.py"].disable()
    else:
        tweetprogpath = os.path.join(expdir, 'set0', 'tools', 'twitter-processing', 'bin')
        stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
        tweetdir = os.path.join(outdir, 'tweet')
        stepsbyname["get_tweet_by_id.rb"].argstring = tweetdir+" -l "+language
        tweetintab = os.path.join(expdir, setdir, 'docs', 'twitter_info.tab')
        if os.path.exists(tweetintab):
            stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
        else:
            # no twitter input table -> nothing to fetch
            stepsbyname["get_tweet_by_id.rb"].disable()
        tweeterr = os.path.join(outdir, 'extract_tweet.err')
        stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
        stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby
        # # TODO: log tweets!
    # MONO
    monoindirs = dirfind(os.path.join(expdir, setdir, 'data', 'monolingual_text'), "%s.ltf.zip" % setdir)
    monooutdir = os.path.join(outdir, 'mono', 'extracted')
    monoerr = os.path.join(outdir, 'extract_mono.err')
    stepsbyname["extract_mono.py"].argstring = "--nogarbage -i %s -o %s" % \
        (' '.join(monoindirs), monooutdir)
    stepsbyname["extract_mono.py"].stderr = monoerr
    # since we package and extract all at once, use the ltf structure to declare the manifest names
    manfiles = [x for x in map(lambda y: '.'.join(os.path.basename(y).split('.')[:-2]), monoindirs)]
    # tweet 2 mono set here so that mono and tweet dirs are already established
    if stepsbyname["get_tweet_by_id.rb"].disabled:
        stepsbyname["extract_mono_tweet.py"].disable()
    else:
        stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i "+tweetdir+" -o "+monooutdir
        stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
        manfiles.append("tweets")
    # PACKAGE
    monoxml = outfile
    monostatsfile = outfile+".stats"
    manarg = ' '.join(manfiles)
    monoerr = os.path.join(outdir, 'make_mono_release.err')
    stepsbyname["make_mono_release.py"].argstring = "-r %s -l %s -c %s -s %s | gzip > %s" % \
        (monooutdir, language, manarg, monostatsfile, monoxml)
    stepsbyname["make_mono_release.py"].stderr = monoerr
    # run the selected (still-enabled) slice of the pipeline in order
    for step in steps[start:stop]:
        step.run()
    print("Done.\nFile is %s" % outfile)
def main():
    """Filter an extracted parallel-data directory by length statistics.

    Computes per-line eng/foreign word-count ratios and absolute deltas
    across all genres, rejects lines outside --stds standard deviations on
    BOTH measures (or blackballed lines), and writes each manifest/flat
    file split into a 'filtered' copy and a 'remainder' copy.
    """
    parser = argparse.ArgumentParser(
        description="filter extracted parallel data directory",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", default="./extracted", help="input directory")
    parser.add_argument("--lang", "-l", help="input directory")
    parser.add_argument(
        "--stds", "-s", type=int, default=1,
        help="number of standard deviations from mean to filter out")
    parser.add_argument("--filterdir", "-f", default="./filtered", help="output filter directory")
    parser.add_argument(
        "--genre", "-g", default="original",
        help=
        "genre to use when filtering (could try tokenized but not available for twitter)"
    )
    parser.add_argument("--remaindir", "-r", default="./remainder", help="output remainder directory")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # crawl indir for expected original files. cat them together, save ratios, get mean and stdev
    # for each file, including manifest, zip with ratios, determine whether it belongs in filter or remaindir
    # TODO: add deltas too!
    indir = args.indir
    filterdir = args.filterdir
    remaindir = args.remaindir
    mkdir_p(filterdir)
    mkdir_p(remaindir)
    # assumption: there are a number of *.eng.manifest files, each paired with *.<lang>.manifest, and for each i, there is original/i.eng.flat and original/i.<lang>.flat
    engmanifests = glob.glob(os.path.join(indir, "*.eng.manifest"))
    fmanifests = []
    # per-genre parallel lists: one entry per sentence pair
    ratios = dd(list)
    deltas = dd(list)
    blackballs = dd(list)
    genres = set()
    for eman in engmanifests:
        ebase = os.path.basename(eman)
        genre = '.'.join(ebase.split('.')[:-2])
        genres.add(genre)
        fman = os.path.join(os.path.dirname(eman),
                            "%s.%s.manifest" % (genre, args.lang))
        fmanifests.append(fman)
        eorig = os.path.join(args.indir, args.genre,
                             "%s.%s.eng.flat" % (genre, args.genre))
        forig = os.path.join(args.indir, args.genre,
                             "%s.%s.%s.flat" % (genre, args.genre, args.lang))
        # test existence
        for f in [eman, fman, eorig, forig]:
            if not os.path.exists(f):
                sys.stderr.write("ERROR: %s does not exist\n" % f)
                sys.exit(1)
        #slurp files, calculate ratios, store ratios
        eorig = prepfile(open(eorig, 'r'), 'r')
        forig = prepfile(open(forig, 'r'), 'r')
        for ln, (eline, fline) in enumerate(izip(eorig, forig)):
            ewords = eline.strip().split()
            fwords = fline.strip().split()
            blackballs[genre].append(blackball(eline, fline))
            deltas[genre].append(abs(len(ewords) - len(fwords)))
            try:
                ratios[genre].append((len(ewords) + 0.0) / (len(fwords) + 0.0))
            except ZeroDivisionError:
                # empty foreign line: record ratio 0 so lists stay aligned
                sys.stderr.write(
                    "0-length foreign sentence at line {} of {}\n".format(
                        ln + 1, forig.name))
                ratios[genre].append(0.)
    # pool all genres to compute global mean/std thresholds
    allratios = np.concatenate(list(map(np.array, ratios.values())), 0)
    alldeltas = np.concatenate(list(map(np.array, deltas.values())), 0)
    allblackballs = np.concatenate(list(map(np.array, blackballs.values())), 0)
    bbrejectsize = Counter(allblackballs)[True]
    ratiomean = np.mean(allratios)
    ratiostd = np.std(allratios)
    lowratio = ratiomean - (args.stds * ratiostd)
    highratio = ratiomean + (args.stds * ratiostd)
    rejectratiosize = len(
        list(filter(lambda x: x < lowratio or x > highratio, allratios)))
    deltamean = np.mean(alldeltas)
    deltastd = np.std(alldeltas)
    lowdelta = deltamean - (args.stds * deltastd)
    highdelta = deltamean + (args.stds * deltastd)
    rejectdeltasize = len(
        list(filter(lambda x: x < lowdelta or x > highdelta, alldeltas)))
    sys.stderr.write(
        "Could be rejecting %d of %d lines (%f %%) with ratio below %f or above %f\n"
        % (rejectratiosize, len(allratios),
           100.0 * rejectratiosize / len(allratios), lowratio, highratio))
    sys.stderr.write(
        "Could be rejecting %d of %d lines (%f %%) with delta below %f or above %f\n"
        % (rejectdeltasize, len(alldeltas),
           100.0 * rejectdeltasize / len(alldeltas), lowdelta, highdelta))
    # a line is rejected only if BOTH ratio and delta are out of bounds
    reject_ratio_delta_size = len(
        list(
            filter(
                lambda x: (x[0] < lowratio or x[0] > highratio) and
                (x[1] < lowdelta or x[1] > highdelta),
                zip(allratios, alldeltas))))
    sys.stderr.write(
        "Actually rejecting %d of %d lines (%f %%) meeting both delta and ratio criteria\n"
        % (reject_ratio_delta_size, len(alldeltas),
           100.0 * reject_ratio_delta_size / len(alldeltas)))
    sys.stderr.write(
        "Also rejecting %d of %d lines (%f %%) for blackball criteria\n" %
        (bbrejectsize, len(allblackballs),
         100.0 * bbrejectsize / len(allblackballs)))
    # iterate through manifests and all files and filter per ratio and delta
    for manset in (engmanifests, fmanifests):
        for man in manset:
            sys.stderr.write("filtering %s\n" % man)
            base = os.path.basename(man)
            genre = '.'.join(base.split('.')[:-2])
            sys.stderr.write("genre %s\n" % genre)
            rats = ratios[genre]
            delts = deltas[genre]
            bbs = blackballs[genre]
            reject_ratio_delta_size = len(
                list(
                    filter(
                        lambda x: (x[0] < lowratio or x[0] > highratio) and
                        (x[1] < lowdelta or x[1] > highdelta),
                        zip(rats, delts))))
            #rejectratiosize = len(list(filter(lambda x: x<lowratio or x > highratio, rats)))
            sys.stderr.write("rejecting %d of %d\n" %
                             (reject_ratio_delta_size, len(rats)))
            infile = prepfile(open(man, 'r'), 'r')
            filterfile = prepfile(open(os.path.join(filterdir, base), 'w'), 'w')
            remainfile = prepfile(open(os.path.join(remaindir, base), 'w'), 'w')
            filterlines(infile, bbs, (rats, delts), (lowratio, lowdelta),
                        (highratio, highdelta), filterfile, remainfile)
    # for directories in extracted
    #http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
    for subdir in next(os.walk(indir))[1]:
        # make parallel directories
        # for genres in genre set
        # for languages
        # filter lines
        insubdir = os.path.join(indir, subdir)
        filtersubdir = os.path.join(filterdir, subdir)
        mkdir_p(filtersubdir)
        remainsubdir = os.path.join(remaindir, subdir)
        mkdir_p(remainsubdir)
        for genre in genres:
            for lang in (args.lang, 'eng'):
                base = "%s.%s.%s.flat" % (genre, subdir, lang)
                infilename = os.path.join(insubdir, base)
                if os.path.exists(infilename):
                    infile = prepfile(open(infilename, 'r'), 'r')
                    filterfile = prepfile(
                        open(os.path.join(filtersubdir, base), 'w'), 'w')
                    remainfile = prepfile(
                        open(os.path.join(remainsubdir, base), 'w'), 'w')
                    filterlines(infile, blackballs[genre],
                                (ratios[genre], deltas[genre]),
                                (lowratio, lowdelta), (highratio, highdelta),
                                filterfile, remainfile)
                else:
                    sys.stderr.write("%s does not exist\n" % infilename)
    # count files in each of the directories; should be the same
    for dir in (indir, filterdir, remaindir):
        sys.stderr.write("%d files in %s\n" % (countfiles(dir), dir))
def main():
    """Extract parallel data from an expanded LRLP into flat files and manifests.

    Builds the output directory tree (one subdirectory per processing flavor:
    raw/clean original, ldc/cdec/agile/morph tokenizations, pos, optional
    garbage), appends a provenance record to <outdir>/source, then invokes
    the project-level `printout` helper once per corpus, plus the special-cased
    "found" and "tweet" corpora.

    Fix vs. previous revision: the provenance log handle was opened and
    written but never closed (resource leak); it is now managed with `with`.
    """
    parser = argparse.ArgumentParser(description="extract parallel data from "
                                     "expanded lrlp to flat files and manifests.")
    parser.add_argument("--rootdir", "-r", default=".", help="root lrlp dir")
    parser.add_argument("--datadirs", nargs='+', default=['data', 'translation'],
                        help="elements in path from root to ltf files")
    parser.add_argument("--outdir", "-o", default="./parallel/extracted",
                        help="where to write extracted files")
    parser.add_argument("--src", "-s", default='uzb',
                        help="source language 3 letter code")
    parser.add_argument("--trg", "-t", default='eng',
                        help="target language 3 letter code")
    parser.add_argument("--origsubdir", default="raw.original",
                        help="subdirectory for untokenized files")
    parser.add_argument("--cleanorigsubdir", default="original",
                        help="subdirectory for cleaned raw original")
    parser.add_argument("--garbagesubdir", default="garbage",
                        help="subdirectory for garbage files (under orig)")
    parser.add_argument("--nogarbage", action='store_true', default=False,
                        help="turn off garbage filtering")
    parser.add_argument("--toksubdir", default="raw.tokenized",
                        help="subdirectory for ldc-tokenized but raw files")
    parser.add_argument("--cleantoksubdir", default="tokenized",
                        help="subdirectory for cleaned ldc-tokenized files")
    parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                        help="subdirectory for cdec-tokenized files")
    parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                        help="subdirectory for agile-tokenized files (target side only)")
    parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                        help="subdirectory for tokenized files based on "
                        "morphological segmentation")
    parser.add_argument("--morphsubdir", default="morph",
                        help="subdirectory for morphological files")
    parser.add_argument("--possubdir", default="pos",
                        help="subdirectory for pos tag files")
    # parser.add_argument("--extwtdir", "-et", default=None,
    #                     help="directory of extracted tweet rsd files")
    parser.add_argument("--agiletokpath",
                        default=os.path.join(scriptdir, 'agiletok.sh'),
                        help="path to agile tokenizer binary")
    parser.add_argument("--cdectokpath",
                        default=os.path.join(scriptdir, 'cdectok.sh'),
                        help="path to cdec tokenizer binary")
    parser.add_argument("--cleanpath",
                        default=os.path.join(scriptdir, 'clean.sh'),
                        help="path to cleaning script")
    addonoffarg(parser, 'cdec', help="do cdec tokenization", default=True)
    addonoffarg(parser, 'swap',
                help="swap source/translation in found file (il3=true, cmn=false)",
                default=False)
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    origoutdir = args.origsubdir
    cleanorigoutdir = args.cleanorigsubdir
    tokoutdir = args.toksubdir
    cleantokoutdir = args.cleantoksubdir
    morphtokoutdir = args.morphtoksubdir
    cdectokoutdir = args.cdectoksubdir
    agiletokoutdir = args.agiletoksubdir
    # lowercased variants live next to their cased counterparts
    cdectoklcoutdir = args.cdectoksubdir + ".lc"
    agiletoklcoutdir = args.agiletoksubdir + ".lc"
    morphoutdir = args.morphsubdir
    posoutdir = args.possubdir
    agiletokpath = args.agiletokpath
    cdectokpath = args.cdectokpath
    cleanpath = args.cleanpath
    dirs = [origoutdir, cleanorigoutdir, tokoutdir, cleantokoutdir,
            morphtokoutdir, cdectokoutdir, agiletokoutdir, cdectoklcoutdir,
            agiletoklcoutdir, morphoutdir, posoutdir]
    if args.nogarbage:
        # garbageoutdir stays None and is passed through to printout as-is
        garbageoutdir = None
    else:
        garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
        dirs.append(garbageoutdir)
    for dir in dirs:
        fulldir = os.path.join(args.outdir, dir)
        lputil.mkdir_p(fulldir)

    # Append a provenance record; 'with' guarantees the handle is flushed and
    # closed even if a later step raises (previously this handle leaked).
    with open(os.path.join(args.outdir, "source"), 'a') as source_fh:
        source_fh.write("Extracted parallel data from %s to %s on %s\nusing %s;"
                        " command issued from %s\n" % (args.rootdir, args.outdir,
                                                       datetime.datetime.now(),
                                                       ' '.join(sys.argv),
                                                       os.getcwd()))
    datadirs = [args.rootdir, ] + args.datadirs
    '''
    from_eng/ -- manual translations from English into LRLP (elicitation,
    phrasebook, core REFLEX news text, additional news text)
    from_xxx/ -- manual translations from LRLP into English in multiple genres
    '''
    # name of corpus and location in lrlp (for cases that don't do anything special)
    corpustuples = [("fromsource.generic",
                     os.path.join(*(datadirs + ["from_%s" % args.src, ]))),
                    ("fromtarget.news",
                     os.path.join(*(datadirs + ["from_%s" % args.trg, "news"]))),
                    ("fromtarget.phrasebook",
                     os.path.join(*(datadirs + ["from_%s" % args.trg, "phrasebook"]))),
                    ("fromtarget.elicitation",
                     os.path.join(*(datadirs + ["from_%s" % args.trg, "elicitation"])))
                    ]
    # shared positional arguments for every printout invocation below
    commonargs = [args.src, args.trg, args.outdir, origoutdir, cleanorigoutdir,
                  garbageoutdir, tokoutdir, cleantokoutdir, morphtokoutdir,
                  cdectokoutdir, cdectoklcoutdir, agiletokoutdir,
                  agiletoklcoutdir, morphoutdir, posoutdir, agiletokpath,
                  cdectokpath, cleanpath, args.cdec]
    for corpustuple in corpustuples:
        printout(corpustuple[0], corpustuple[1], *commonargs)
    # Found data
    printout("found.generic", args.rootdir, *commonargs,
             stp=lputil.all_found_tuples, el=lputil.get_aligned_sentences,
             swap=args.swap)
    # # Tweet data
    printout("fromsource.tweet",
             os.path.join(*(datadirs + ["from_%s" % args.src, ])),
             *commonargs, tweet=True)
def main():
    """Build an eval IL mono release: decrypt sets, fetch and tokenize tweets,
    extract flat mono data, and package it into an elisa-form release.

    Drives a list of Step objects (project-level wrappers around external
    scripts); --start/--stop select which slice of the pipeline actually runs.
    With --engset, the extract/package steps are duplicated per language
    flavor (IL and eng).
    """
    steps = []
    # pipeline steps, in execution order
    steps.append(Step('decrypt_sets.py', help="decode encrypted sets"))
    # get_tweet_by_id.rb
    steps.append(Step('get_tweet_by_id.rb',
                      help="download tweets. must have twitter gem installed " \
                      "and full internet", abortOnFail=False))
    steps.append(
        Step('ldc_tok.py', help="run ldc tokenizer on tweets ",
             abortOnFail=False))
    # extract_mono.py
    steps.append(Step('extract_mono.py', help="get flat form mono data"))
    steps.append(Step('make_mono_release.py', help="package mono flat data"))
    # index steps for per-step configuration below
    # NOTE(review): keyed on step.prog here but popped/re-added via step.name
    # later; presumably Step.name defaults to prog -- confirm in Step class.
    stepsbyname = {}
    for step in steps:
        stepsbyname[step.prog] = step

    parser = argparse.ArgumentParser(description="Build an eval IL monoset from LDC to elisa form",
                                     formatter_class= \
                                     argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--setdir", "-d", default='.',
                        help='name of set directory (i.e. set1, setE, etc.)')
    parser.add_argument("--language", "-l", default='uzb',
                        help='three letter code of IL language')
    parser.add_argument("--key", "-k", default=None,
                        help='decryption key for encrypted il')
    parser.add_argument("--notweets", "-n", action='store_true', default=None,
                        help='do not include tweets (for eval IL setE only)')
    parser.add_argument("--engset", "-E", action='store_true', default=None,
                        help='assume engset and ilset (for eval IL setE only)')
    parser.add_argument("--expdir", "-e",
                        help='path to where the extraction is. If starting at ' \
                        'step 0 this is ignored')
    parser.add_argument("--root", "-r", default='/home/nlg-02/LORELEI/ELISA/data',
                        help='path to where the extraction will take place')
    parser.add_argument("--outfile", "-o", help='name of the output file')
    parser.add_argument("--start", "-s", type=int, default=0,
                        help='step to start at')
    parser.add_argument("--stop", "-p", type=int, default=len(steps) - 1,
                        help='step to stop at (inclusive)')
    parser.add_argument("--liststeps", "-x", nargs=0, action=make_action(steps),
                        help='print step list and exit')
    parser.add_argument("--ruby", default="/opt/local/bin/ruby2.2",
                        help='path to ruby (2.1 or higher)')
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    rootdir = args.root
    language = args.language
    setdir = args.setdir
    outdir = os.path.join(rootdir, language, setdir)
    outfile = os.path.join(outdir, args.outfile)
    start = args.start
    # slice bound: make --stop inclusive
    stop = args.stop + 1

    # setE-style sets have separate il and eng variants: swap the single
    # extract/package steps for one pair per flavor
    if args.engset:
        emstep = steps.pop(-1)
        stepsbyname.pop(emstep.name)
        mmrstep = steps.pop(-1)
        stepsbyname.pop(mmrstep.name)
        for flavor in (language, "eng"):
            newem = Step('extract_mono.py',
                         name='extract_mono_%s' % flavor,
                         help="get flat form mono data in %s" % flavor)
            steps.append(newem)
            stepsbyname[newem.name] = newem
            newmmr = Step('make_mono_release.py',
                          name='make_mono_release_%s' % flavor,
                          help="package mono flat data in %s" % flavor)
            steps.append(newmmr)
            stepsbyname[newmmr.name] = newmmr
        # two steps removed, four appended -> net two extra in the slice
        stop += 2
    if args.expdir is None:
        expdir = os.path.join(rootdir, language, 'expanded', 'lrlp')
    else:
        expdir = args.expdir
    mkdir_p(outdir)

    # DECRYPT: runs immediately (outside the steps loop) when a key is given
    if args.key is None:
        stepsbyname["decrypt_sets.py"].disable()
    else:
        stepsbyname["decrypt_sets.py"].stderr = os.path.join(
            outdir, 'decrypt_sets.err')
        stepsbyname["decrypt_sets.py"].argstring = "-r %s -k %s -s %s" % (
            expdir, args.key, setdir)
        stepsbyname["decrypt_sets.py"].run()
        start += 1
    # from 2018 on, setE has il and eng variants
    monoindirs = []
    # TWEET
    # hack for tweets; early set of monodir
    monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text')
    tweetintab = os.path.join(expdir, setdir, 'docs', 'twitter_info.tab')
    notweetsinmono = True
    if args.notweets or not os.path.exists(tweetintab):
        print("disabling twitter stuff; tweets in regular mono ok")
        notweetsinmono = False
        stepsbyname["get_tweet_by_id.rb"].disable()
        stepsbyname["ldc_tok.py"].disable()
    else:
        print(
            "not disabling twitter stuff; look at {}; avoiding tweets in regular mono"
            .format(tweetintab))
        stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
        tweetprogpaths = []
        # for toolroot in (os.path.join(expdir, 'set0'), scriptdir): # bad ldc tools for eval
        for toolroot in (scriptdir, ):
            tweetprogpaths = dirfind(os.path.join(toolroot, 'tools'),
                                     'get_tweet_by_id.rb')
            if len(tweetprogpaths) > 0:
                break
        if len(tweetprogpaths) == 0:
            sys.stderr.write("Can't find get_tweet_by_id.rb\n")
            sys.exit(1)
        else:
            tweetprogpath = os.path.dirname(tweetprogpaths[0])
        tweetdir = os.path.join(outdir, 'tweet', 'rsd')
        stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
        mkdir_p(tweetdir)
        stepsbyname[
            "get_tweet_by_id.rb"].argstring = tweetdir + " -l " + language
        tweeterr = os.path.join(outdir, 'extract_tweet.err')
        stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
        stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby
        # TOKENIZE AND RELOCATE TWEETS
        # find rb location, params file
        # NOTE(review): 'toxexecpaths' looks like a typo for 'tokexecpaths';
        # the variable is never read again.
        toxexecpaths = []
        thetoolroot = None
        for toolroot in (expdir, scriptdir):
            tokexecpaths = dirfind(os.path.join(toolroot, 'tools'),
                                   'token_parse.rb')
            if len(tokexecpaths) > 0:
                thetoolroot = toolroot
                break
        if len(tokexecpaths) == 0:
            sys.stderr.write("Can't find token_parse.rb\n")
            sys.exit(1)
        tokexec = tokexecpaths[0]
        # optional tokenizer parameter file (any 'yaml' match under tools/)
        tokparamopts = dirfind(os.path.join(thetoolroot, 'tools'), 'yaml')
        tokparam = "--param {}".format(
            tokparamopts[0]) if len(tokparamopts) > 0 else ""
        # ugly: the base of the file monodir/mononame.zip; need to add it to monoindirs and just pass that base so it gets constructed
        mononame = "tweets.ltf"
        monoindirs.append(os.path.join(monodir, mononame + ".zip"))
        stepsbyname[
            "ldc_tok.py"].argstring = "--mononame {mononame} -m {monodir} --ruby {ruby} --dldir {tweetdir} --exec {tokexec} {tokparam} --outfile {outfile}".format(
                mononame=mononame,
                monodir=monodir,
                ruby=args.ruby,
                tweetdir=tweetdir,
                tokexec=tokexec,
                tokparam=tokparam,
                outfile=os.path.join(rootdir, language, 'ldc_tok.stats'))
        stepsbyname["ldc_tok.py"].stderr = os.path.join(
            rootdir, language, 'ldc_tok.err')
    # # TODO: log tweets!
    # MONO
    if args.engset:
        # one extract/package pass per language flavor
        for flavor in (args.language, "eng"):
            localmonoindirs = copy.deepcopy(monoindirs)
            monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text',
                                   flavor)
            localmonoindirs.extend(
                dirfind(monodir, "%s_%s.ltf.zip" % (setdir, flavor)))
            print(localmonoindirs)
            # JM: TODO: ugly copy. refactor!!!
            monooutdir = os.path.join(outdir, 'mono', 'extracted_%s' % flavor)
            monoerr = os.path.join(outdir, 'extract_mono_%s.err' % flavor)
            stepsbyname["extract_mono_%s" % flavor].argstring = "--no-cdec --nogarbage -i %s -o %s" % \
                (' '.join(localmonoindirs), monooutdir)
            if notweetsinmono:
                stepsbyname["extract_mono_%s" % flavor].argstring += " --removesn"
            stepsbyname["extract_mono_%s" % flavor].stderr = monoerr
            # since we package and extract all at once, use the ltf structure to declare the manifest names
            manfiles = [
                x for x in map(
                    lambda y: '.'.join(os.path.basename(y).split('.')[:-2]),
                    localmonoindirs)
            ]
            # tweet 2 mono set here so that mono and tweet dirs are already established
            # if stepsbyname["get_tweet_by_id.rb"].disabled:
            #   stepsbyname["extract_mono_tweet.py"].disable()
            # else:
            #   stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i "+tweetdir+" -o "+monooutdir
            #   stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
            #   manfiles.append("tweets")
            # splice ".<flavor>." in front of the output file's extension
            ofcomponents = outfile.split('.')
            localoutfile = '.'.join(
                ofcomponents[:-1]) + (".%s." % flavor) + ofcomponents[-1]
            print(localoutfile)
            # PACKAGE
            monoxml = localoutfile
            monostatsfile = localoutfile + ".stats"
            manarg = ' '.join(manfiles)
            monoerr = os.path.join(outdir, 'make_mono_release_%s.err' % flavor)
            stepsbyname["make_mono_release_%s" % flavor].argstring = "--no-ext -r %s -l %s -c %s -s %s | gzip > %s" % \
                (monooutdir, flavor, manarg, monostatsfile, monoxml)
            stepsbyname["make_mono_release_%s" % flavor].stderr = monoerr
    else:
        monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text')
        monoindirs.extend(dirfind(monodir, "%s.ltf.zip" % setdir))
        monooutdir = os.path.join(outdir, 'mono', 'extracted')
        monoerr = os.path.join(outdir, 'extract_mono.err')
        stepsbyname["extract_mono.py"].argstring = "--no-cdec --nogarbage -i %s -o %s" % \
            (' '.join(monoindirs), monooutdir)
        if notweetsinmono:
            stepsbyname["extract_mono.py"].argstring += " --removesn"
        stepsbyname["extract_mono.py"].stderr = monoerr
        # since we package and extract all at once, use the ltf structure to declare the manifest names
        manfiles = [
            x for x in map(
                lambda y: '.'.join(os.path.basename(y).split('.')[:-2]),
                monoindirs)
        ]
        # tweet 2 mono set here so that mono and tweet dirs are already established
        # if stepsbyname["get_tweet_by_id.rb"].disabled:
        #   stepsbyname["extract_mono_tweet.py"].disable()
        # else:
        #   stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i "+tweetdir+" -o "+monooutdir
        #   stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
        #   manfiles.append("tweets")
        # PACKAGE
        monoxml = outfile
        monostatsfile = outfile + ".stats"
        manarg = ' '.join(manfiles)
        monoerr = os.path.join(outdir, 'make_mono_release.err')
        stepsbyname["make_mono_release.py"].argstring = "--no-ext -r %s -l %s -c %s -s %s | gzip > %s" % \
            (monooutdir, language, manarg, monostatsfile, monoxml)
        stepsbyname["make_mono_release.py"].stderr = monoerr
    # run the selected slice of the pipeline
    for step in steps[start:stop]:
        step.run()
    print("Done.\nLast file is %s" % outfile)
def main():
    """Deterministically split extracted parallel data into categories
    (e.g. dev/test/train) by size, per-document where possible.

    Doc-based corpora are split on document ids pulled from the manifests;
    doc-less corpora (phrasebook, elicitation, found) get synthetic line
    ids. The actual assignment is delegated to the project-level
    `runselection` helper and the external categorize.py script.
    """
    parser = argparse.ArgumentParser(
        description="Deterministic subselect designed for nov 2016 uyghur evaluation: per-doc, from end",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--indir", "-i", help="location of parallel data")
    parser.add_argument("--language", "-l", help="source language three digit code")
    parser.add_argument(
        "--extractpath", "-e", default="extracted",
        help="location of extracted data (might want to use 'filtered')"
    )
    parser.add_argument("--sizes", "-s", nargs="+", type=int,
                        help="list of sizes desired in each category")
    parser.add_argument("--categories", "-c", nargs="+",
                        help="list of categories. Must match sizes")
    parser.add_argument("--remainder", "-r", default="train",
                        help="remainder category. Should be a new category")
    parser.add_argument(
        "--devlstfile", "-d", default=None,
        help="file of desired documents for dev (subject to length constraints, must be a set called 'dev')",
    )
    addonoffarg(parser, "fromFront", default=False,
                help="do doc assignment from the beginning (instead of the end)")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    # reader = codecs.getreader('utf8')
    # writer = codecs.getwriter('utf8')
    # outfile = writer(args.outfile)

    indir = args.indir
    origsizes = args.sizes
    # TODO: find these?
    # doc = keep full docs together (can detect this by counting number of unique docs)
    # TODO: re-add found.generic to docprefixes
    docprefixes = ["fromsource.generic", "fromsource.tweet", "fromtarget.news"]
    # IL3: moving found.generic!!
    nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook", "found.generic"]
    # TODO: find these
    # every per-flavor subdirectory that categorize.py should split alongside
    filetypes = [
        "morph", "morph-tokenized", "original", "pos", "tokenized", "mttok",
        "mttoklc", "agile-tokenized", "cdec-tokenized", "agile-tokenized.lc",
        "cdec-tokenized.lc",
    ]
    extractpath = os.path.join(indir, args.extractpath)
    # NOTE(review): origpath is computed but never used below.
    origpath = os.path.join(extractpath, "original")
    outpath = os.path.join(indir, "splits")
    mkdir_p(outpath)
    # drop corpora whose manifest is missing or empty (iterate over a copy
    # of each list so in-place removal is safe)
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing " + prefix)
                preflist.remove(prefix)
    # doc-based processing: ids come from field 2 of the eng manifest
    for prefix in docprefixes:
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        try:
            check_output("cut -f2 %s > %s" % (manfile, idfile),
                         stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        catfile = runselection(
            prefix, idfile, args.categories, args.remainder, origsizes,
            filetypes, args.language, extractpath, outpath, args.devlstfile,
            fromFront=args.fromFront,
        )
        for i in (args.language, "eng"):
            manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, catfile, outpath, args.remainder,
            )
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # nodoc-based processing: no document structure, so synthesize one
    # sequential fake id per manifest line
    for prefix in nodocprefixes:
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(
                check_output("wc -l %s" % os.path.join(extractpath, "%s.eng.manifest" % prefix),
                             shell=True)
                .decode("utf8")
                .strip()
                .split(" ")[0]
            )
            check_output("seq %d > %s" % (mansize, idfile),
                         stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        # note: no devlstfile here -- dev doc preferences only apply to
        # doc-based corpora
        catfile = runselection(
            prefix, idfile, args.categories, args.remainder, origsizes,
            filetypes, args.language, extractpath, outpath,
            fromFront=args.fromFront,
        )
        for i in (args.language, "eng"):
            manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, catfile, outpath, args.remainder,
            )
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # warning if entries not found in given dev list
    if args.devlstfile:
        devlst = set(open(args.devlstfile).read().split())
        all_docids = list()
        for prefix in docprefixes:
            all_docids += open(os.path.join(outpath, "%s.ids" % prefix)).read().split("\n")
        for i in devlst - set(all_docids):
            print("***Warning: docid not found: %s" % i)
def main():
    """Process an LRLP tarball end-to-end into flat format.

    Builds a pipeline of Step objects (unpack, ephemera, lexicon
    extract/clean/normalize/relocate, tweets, psm/entity annotations,
    parallel, mono, comparable), configures each step's arguments and stderr
    log based on command-line options, then runs the --start/--stop slice.
    """
    steps = []
    # Put additional steps in here. Arguments, stdin/stdout, etc. get set below
    # unpack_lrlp.sh
    steps.append(
        Step('unpack_lrlp.sh', call=check_output,
             help="untars lrlp into position for further processing"))
    # gather_ephemera.py
    steps.append(
        Step('gather_ephemera.py', help="relocates assorted bits from lrlp"))
    # extract_lexicon.py
    steps.append(
        Step('extract_lexicon.py', help="get flat form of bilingual lexicon",
             abortOnFail=False))
    # clean_lexicon
    steps.append(
        Step('clean.sh', name="clean_lexicon",
             help="wildeclean/nfkc lexicon file", abortOnFail=False))
    # normalize_lexicon.py
    steps.append(
        Step(
            'normalize_lexicon_tg.py', name="normalize_lexicon.py",
            help=
            "heuristically convert lexicon into something more machine readable",
            abortOnFail=False))
    # relocate lexicon
    steps.append(
        Step('cp', progpath='/bin', name="relocate_lexicon",
             help="move the lexicon stuff into ephemera", abortOnFail=False))
    # get_tweet_by_id.rb
    steps.append(Step('get_tweet_by_id.rb',
                      help="download tweets. must have twitter gem installed " \
                      "and full internet", abortOnFail=False))
    steps.append(
        Step('ldc_tok.py', help="run ldc tokenizer on tweets ",
             abortOnFail=False))
    # extract_psm_annotation.py
    steps.append(
        Step('extract_psm_annotation.py',
             help="get annotations from psm files into psm.ann",
             abortOnFail=False))
    # extract_entity_annotation.py
    steps.append(
        Step('extract_entity_annotation.py',
             help="get entity and other annotations into entity.ann",
             abortOnFail=False))
    # extract_parallel.py
    steps.append(
        Step('extract_parallel.py', help="get flat form parallel data"))
    steps.append(
        Step('filter_parallel.py',
             help="filter parallel data to remove likely mismatches"))
    # extract_mono.py
    steps.append(Step('extract_mono.py', help="get flat form mono data"))
    # extract_comparable.py
    steps.append(
        Step('extract_comparable.py', help="get flat form comparable data"))
    # index steps by their (possibly overridden) name for configuration below
    stepsbyname = {}
    for step in steps:
        stepsbyname[step.name] = step

    parser = argparse.ArgumentParser(description="Process a LRLP into flat format",
                                     formatter_class= \
                                     argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--tarball", "-t", nargs='+', required=True,
        help=
        'path to gzipped tars for processing (all tars considered to be part of the same package). Ex: lrlp.tar.gz'
    )
    parser.add_argument("--language", "-l", required=True,
                        help='three letter code of language. example "uzb"')
    parser.add_argument(
        "--lexversion", "-L", default='1.5',
        help='version of lexicon to extract (may need to create a new one)')
    parser.add_argument("--key", "-k", default=None,
                        help='decryption key for encrypted il')
    parser.add_argument("--set", "-S", default=None,
                        help='decryption set for encrypted il')
    addonoffarg(parser, "mono", help="extract mono data", default=True)
    parser.add_argument(
        "--previous", default=None,
        help=
        'path to previous extraction (equivalent to one level down from root)')
    parser.add_argument("--root", "-r", default='/home/nlg-02/LORELEI/ELISA/data',
                        help='path to where the extraction will take place')
    parser.add_argument("--evalil", "-E", action='store_true', default=False,
                        help='this is an eval il. makes expdir set0 aware')
    parser.add_argument("--expdir", "-e",
                        help='path to where the extraction is (equivalent to root/lang/expanded/lrlp). If starting at ' \
                        'step 0 this is ignored')
    parser.add_argument("--start", "-s", type=int, default=0,
                        help='step to start at')
    parser.add_argument("--stop", "-p", type=int, default=len(steps) - 1,
                        help='step to stop at (inclusive)')
    parser.add_argument("--liststeps", "-x", nargs=0, action=make_action(steps),
                        help='print step list and exit')
    parser.add_argument("--ruby", default="ruby",
                        help='path to ruby (2.1 or higher)')
    addonoffarg(parser, "swap",
                help="swap source/target in found data (e.g. il3)",
                default=False)
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
        # NOTE(review): parser.error already exits, so this is unreachable
        sys.exit(2)

    if args.expdir is not None and args.start <= 0:
        sys.stderr.write \
            ("Warning: expdir is set but will be ignored and determined dynamically")
    if args.expdir is None and args.start > 0:
        sys.stderr.write \
            ("Error: must explicitly set expdir if not starting at step 0")
        sys.exit(1)

    rootdir = args.root
    language = args.language
    start = args.start
    # slice bound: make --stop inclusive
    stop = args.stop + 1

    # key and set must be supplied together (xor on their None-ness)
    if (args.key is None) ^ (args.set is None):
        sys.stderr.write("key (-k) and set (-S) must both be set or unset\n")
        sys.exit(1)

    # Patchups for step 0
    argstring = "-k %s -s %s" % (args.key, args.set) if args.key is not None else ""
    argstring += " -l %s -r %s %s" % (language, rootdir, ' '.join(
        args.tarball))
    sys.stderr.write("args for unpack lrlp are {}\n".format(argstring))
    stepsbyname["unpack_lrlp.sh"].argstring = argstring
    # step 0 prints the expansion directory on stdout; capture it here
    if start == 0:
        expdir = steps[0].run().strip().decode("utf-8")
        if args.evalil:
            expdir = os.path.join(expdir, 'set0')
        start += 1
    else:
        expdir = args.expdir
    monodir = os.path.join(expdir, 'data', 'monolingual_text')
    # what are the mono files? (needed for later)
    if args.mono and args.previous is None:
        monoindirs = dirfind(monodir, "ltf.zip")
    else:
        monoindirs = []

    # Patchups for the rest
    if stop > 0:
        # TWEET
        tweetintab = os.path.join(expdir, 'docs', 'twitter_info.tab')
        tweetdir = os.path.join(rootdir, language, 'tweet', 'rsd')
        if not os.path.exists(tweetintab):
            stepsbyname["get_tweet_by_id.rb"].disable()
            stepsbyname["ldc_tok.py"].disable()
        else:
            tweetprogpaths = []
            #for toolroot in (expdir, scriptdir): # bad ldc tools for eval
            for toolroot in (scriptdir, ):
                tweetprogpaths = dirfind(os.path.join(toolroot, 'tools'),
                                         'get_tweet_by_id.rb')
                if len(tweetprogpaths) > 0:
                    break
            if len(tweetprogpaths) == 0:
                sys.stderr.write("Can't find get_tweet_by_id.rb\n")
                sys.exit(1)
            else:
                tweetprogpath = os.path.dirname(tweetprogpaths[0])
            mkdir_p(tweetdir)
            tweeterr = os.path.join(rootdir, language, 'extract_tweet.err')
            stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
            # just copy from previous or skip if no mono
            if not args.mono:
                if args.previous is None:
                    stepsbyname["get_tweet_by_id.rb"].disable()
                else:
                    oldtweetdir = os.path.join(
                        args.previous, 'tweet', 'rsd'
                    )  #WARNING: old versions of data won't have this structure
                    # repurpose the step as a plain recursive copy
                    stepsbyname["get_tweet_by_id.rb"].progpath = "/bin"
                    stepsbyname["get_tweet_by_id.rb"].prog = "cp"
                    stepsbyname[
                        "get_tweet_by_id.rb"].argstring = "-r {} {}".format(
                            oldtweetdir, tweetdir)
            else:
                stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
                stepsbyname[
                    "get_tweet_by_id.rb"].argstring = tweetdir + " -l " + language
                stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby
                if os.path.exists(tweetintab):
                    stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
                else:
                    stepsbyname["get_tweet_by_id.rb"].disable()
            # TOKENIZE AND RELOCATE TWEETS
            # find rb location, params file
            # NOTE(review): 'toxexecpaths' looks like a typo for
            # 'tokexecpaths'; the variable is never read again.
            toxexecpaths = []
            thetoolroot = None
            for toolroot in (expdir, scriptdir):
                tokexecpaths = dirfind(os.path.join(toolroot, 'tools'),
                                       'token_parse.rb')
                if len(tokexecpaths) > 0:
                    thetoolroot = toolroot
                    break
            if len(tokexecpaths) == 0:
                sys.stderr.write("Can't find token_parse.rb\n")
                sys.exit(1)
            tokexec = tokexecpaths[0]
            # optional tokenizer parameter file (any 'yaml' match under tools/)
            tokparamopts = dirfind(os.path.join(thetoolroot, 'tools'), 'yaml')
            tokparam = "--param {}".format(
                tokparamopts[0]) if len(tokparamopts) > 0 else ""
            lrlpdir = os.path.join(expdir, 'data', 'translation',
                                   'from_{}'.format(language), language, 'ltf')
            # ugly: the base of the file monodir/mononame.zip; need to add it to monoindirs and just pass that base so it gets constructed
            mononame = "tweets.ltf"
            monoindirs.append(os.path.join(monodir, mononame + ".zip"))
            stepsbyname[
                "ldc_tok.py"].argstring = "--mononame {mononame} -m {monodir} --ruby {ruby} --dldir {tweetdir} --lrlpdir {lrlpdir} --exec {tokexec} {tokparam} --outfile {outfile}".format(
                    monodir=monodir,
                    mononame=mononame,
                    ruby=args.ruby,
                    tweetdir=tweetdir,
                    lrlpdir=lrlpdir,
                    tokexec=tokexec,
                    tokparam=tokparam,
                    outfile=os.path.join(rootdir, language, 'ldc_tok.stats'))
            stepsbyname["ldc_tok.py"].stderr = os.path.join(
                rootdir, language, 'ldc_tok.err')

        # EPHEMERA
        ephemdir = os.path.join(rootdir, language, 'ephemera')
        ephemarg = "-s {} -t {}".format(expdir, ephemdir)
        if args.previous is not None:
            ephemarg += " -o {}".format(os.path.join(args.previous, 'ephemera'))
        stepsbyname['gather_ephemera.py'].argstring = ephemarg
        ephemerr = os.path.join(rootdir, language, 'gather_ephemera.err')
        stepsbyname['gather_ephemera.py'].stderr = ephemerr

        # # LTF2RSD
        # l2rindir = os.path.join(expdir, 'data', 'translation', 'from_'+language,
        #                         'eng') # Only converts from_SRC_tweet subdir
        # stepsbyname["ltf2rsd.perl"].argstring = l2rindir
        # # l2rprogpath = os.path.join(expdir, 'tools', 'ltf2txt')
        # # stepsbyname["ltf2rsd.perl"].progpath = l2rprogpath
        # l2rerr = os.path.join(rootdir, language, 'ltf2rsd.err')
        # stepsbyname["ltf2rsd.perl"].stderr = l2rerr

        # LEXICON
        # # IL CHANGE
        # eval ILs ship the dictionary under docs/; format varies by version
        if args.evalil:
            lexiconinfile = os.path.join(expdir, 'docs',
                                         'categoryI_dictionary', '*.xml')
            if args.lexversion == "il6":
                lexiconinfile = os.path.join(expdir, 'docs',
                                             'categoryI_dictionary', '*.zip')
            elif args.lexversion == "il5":
                lexiconinfile = os.path.join(expdir, 'docs',
                                             'categoryI_dictionary', '*.txt')
        else:
            lexiconinfile = os.path.join(expdir, 'data', 'lexicon', '*.xml')
        lexiconoutdir = os.path.join(rootdir, language, 'lexicon')
        lexiconrawoutfile = os.path.join(lexiconoutdir, 'lexicon.raw')
        lexiconoutfile = os.path.join(lexiconoutdir, 'lexicon')
        lexiconnormoutfile = os.path.join(lexiconoutdir, 'lexicon.norm')
        lexiconerr = os.path.join(rootdir, language, 'extract_lexicon.err')
        lexiconcleanerr = os.path.join(rootdir, language, 'clean_lexicon.err')
        lexiconnormerr = os.path.join(rootdir, language,
                                      'normalize_lexicon.err')
        # lexicon v1.5 for y2
        stepsbyname[
            "extract_lexicon.py"].argstring = " -v {} -i {} -o {}".format(
                args.lexversion, lexiconinfile, lexiconrawoutfile)
        stepsbyname["extract_lexicon.py"].stderr = lexiconerr
        stepsbyname["clean_lexicon"].argstring = "{} {}".format(
            lexiconrawoutfile, lexiconoutfile)
        stepsbyname["clean_lexicon"].stderr = lexiconcleanerr
        stepsbyname["normalize_lexicon.py"].argstring = "-i %s -o %s" % \
            (lexiconoutfile, lexiconnormoutfile)
        stepsbyname["normalize_lexicon.py"].stderr = lexiconnormerr
        stepsbyname["relocate_lexicon"].argstring = "-r %s %s" % (
            lexiconoutdir, ephemdir)

        # PSM
        # just copy from previous or skip if no mono
        psmerr = os.path.join(rootdir, language, 'extract_psm_annotation.err')
        stepsbyname["extract_psm_annotation.py"].stderr = psmerr
        psmoutpath = os.path.join(rootdir, language, 'psm.ann')
        if not args.mono:
            if args.previous is None:
                stepsbyname["extract_psm_annotation.py"].disable()
            else:
                oldpsm = os.path.join(args.previous, 'psm.ann')
                # repurpose the step as a plain copy from the previous run
                stepsbyname["extract_psm_annotation.py"].progpath = "/bin"
                stepsbyname["extract_psm_annotation.py"].prog = "cp"
                stepsbyname[
                    "extract_psm_annotation.py"].argstring = "{} {}".format(
                        oldpsm, psmoutpath)
        else:
            psmindir = os.path.join(monodir, 'zipped', '*.psm.zip')
            stepsbyname["extract_psm_annotation.py"].argstring = "-i %s -o %s" % \
                (psmindir, psmoutpath)

        # ENTITY
        # NOTE(review): tweetdir is only guaranteed meaningful when the
        # twitter_info.tab branch above ran -- confirm -et handling downstream
        entityoutpath = os.path.join(rootdir, language, 'entity.ann')
        entityerr = os.path.join(rootdir, language,
                                 'extract_entity_annotation.err')
        stepsbyname["extract_entity_annotation.py"].argstring="-r %s -o %s -et %s" \
            % (expdir, entityoutpath, tweetdir)
        stepsbyname["extract_entity_annotation.py"].stderr = entityerr

        # PARALLEL
        paralleloutdir = os.path.join(rootdir, language, 'parallel',
                                      'extracted')
        parallelerr = os.path.join(rootdir, language, 'extract_parallel.err')
        stepsbyname["extract_parallel.py"].argstring="--no-cdec -r %s -o %s -s %s" % \
            (expdir, paralleloutdir, language)
        stepsbyname["extract_parallel.py"].stderr = parallelerr
        if args.swap:
            stepsbyname["extract_parallel.py"].argstring += " --swap"
        filteroutdir = os.path.join(rootdir, language, 'parallel', 'filtered')
        rejectoutdir = os.path.join(rootdir, language, 'parallel', 'rejected')
        filtererr = os.path.join(rootdir, language, 'filter_parallel.err')
        stepsbyname["filter_parallel.py"].argstring="-s 2 -l %s -i %s -f %s -r %s" % \
            (language, paralleloutdir, filteroutdir, rejectoutdir)
        stepsbyname["filter_parallel.py"].stderr = filtererr

        # MONO
        # just copy from previous or skip if no mono
        monoerr = os.path.join(rootdir, language, 'extract_mono.err')
        stepsbyname["extract_mono.py"].stderr = monoerr
        if not args.mono:
            if args.previous is None:
                stepsbyname["extract_mono.py"].disable()
            else:
                oldmonodir = os.path.join(args.previous, 'mono')
                monooutdir = os.path.join(rootdir, language, 'mono')
                # repurpose the step as a plain recursive copy
                stepsbyname["extract_mono.py"].progpath = "/bin"
                stepsbyname["extract_mono.py"].prog = "cp"
                stepsbyname["extract_mono.py"].argstring = "-r {} {}".format(
                    oldmonodir, monooutdir)
        else:
            monooutdir = os.path.join(rootdir, language, 'mono', 'extracted')
            stepsbyname["extract_mono.py"].argstring = "--no-cdec -i %s -o %s" % \
                (' '.join(monoindirs), monooutdir)

        # COMPARABLE
        if os.path.exists(
                os.path.join(expdir, 'data', 'translation', 'comparable')):
            compoutdir = os.path.join(rootdir, language, 'comparable',
                                      'extracted')
            comperr = os.path.join(rootdir, language, 'extract_comparable.err')
            stepsbyname["extract_comparable.py"].argstring = "-r %s -o %s -s %s" % \
                (expdir, compoutdir, language)
            stepsbyname["extract_comparable.py"].stderr = comperr
        else:
            stepsbyname["extract_comparable.py"].disable()

    # run the selected slice of the pipeline
    for step in steps[start:stop]:
        step.run()
    print("Done.\nExpdir is %s" % expdir)
def main():
    """Filter an extracted parallel-data directory by length statistics.

    Computes per-line eng/<lang> token-count ratios and absolute token-count
    deltas across all genre manifests, derives mean/stddev cutoffs
    (args.stds standard deviations from the mean), then splits every manifest
    and every flat file into --filterdir and --remaindir via filterlines()
    (defined elsewhere in this file; presumably a line rejects when it fails
    both the ratio and delta bounds, matching the combined report below --
    TODO confirm against filterlines).
    """
    parser = argparse.ArgumentParser(description="filter extracted parallel data directory",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", default="./extracted", help="input directory")
    # NOTE(review): help text "input directory" looks copy-pasted from --indir;
    # this is the source language code (cannot change the string in a doc-only edit)
    parser.add_argument("--lang", "-l", help="input directory")
    parser.add_argument("--stds", "-s", type=int, default=1,
                        help="number of standard deviations from mean to filter out")
    parser.add_argument("--filterdir", "-f", default="./filtered", help="output filter directory")
    parser.add_argument("--genre", "-g", default="original",
                        help="genre to use when filtering (could try tokenized but not available for twitter)")
    parser.add_argument("--remaindir", "-r", default="./remainder", help="output remainder directory")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    # crawl indir for expected original files. cat them together, save ratios, get mean and stdev
    # for each file, including manifest, zip with ratios, determine whether it belongs in filter or remaindir
    # TODO: add deltas too!
    indir = args.indir
    filterdir = args.filterdir
    remaindir = args.remaindir
    mkdir_p(filterdir)
    mkdir_p(remaindir)
    # assumption: there are a number of *.eng.manifest files, each paired with
    # *.<lang>.manifest, and for each i, there is original/i.eng.flat and original/i.<lang>.flat
    engmanifests = glob.glob(os.path.join(indir, "*.eng.manifest"))
    fmanifests = []
    ratios = dd(list)   # genre -> list of eng/<lang> token-count ratios, one per line
    deltas = dd(list)   # genre -> list of abs token-count differences, one per line
    genres = set()
    for eman in engmanifests:
        ebase = os.path.basename(eman)
        # manifest name is <genre>.eng.manifest; genre may itself contain dots
        genre = '.'.join(ebase.split('.')[:-2])
        genres.add(genre)
        fman = os.path.join(os.path.dirname(eman), "%s.%s.manifest" % (genre, args.lang))
        fmanifests.append(fman)
        eorig = os.path.join(args.indir, args.genre, "%s.%s.eng.flat" % (genre, args.genre))
        forig = os.path.join(args.indir, args.genre, "%s.%s.%s.flat" % (genre, args.genre, args.lang))
        # test existence: all four files must be present or we bail out entirely
        for f in [eman, fman, eorig, forig]:
            if not os.path.exists(f):
                sys.stderr.write("ERROR: %s does not exist\n" % f)
                sys.exit(1)
        # slurp files, calculate ratios, store ratios
        eorig = prepfile(open(eorig, 'r'), 'r')
        forig = prepfile(open(forig, 'r'), 'r')
        # NOTE(review): izip is not a builtin in py3 -- presumably aliased or
        # imported (itertools.izip on py2) at the top of the file; verify.
        # Pairing stops at the shorter file if line counts differ.
        for ln, (eline, fline) in enumerate(izip(eorig, forig)):
            ewords = eline.strip().split()
            fwords = fline.strip().split()
            # NOTE(review): an empty <lang> line makes len(fwords) == 0 and
            # raises ZeroDivisionError here -- assumed not to occur in practice
            ratios[genre].append((len(ewords)+0.0)/(len(fwords)+0.0))
            deltas[genre].append(abs(len(ewords)-len(fwords)))
    # pool all genres to compute global cutoffs
    allratios = np.concatenate(list(map(np.array, ratios.values())), 0)
    alldeltas = np.concatenate(list(map(np.array, deltas.values())), 0)
    ratiomean = np.mean(allratios)
    ratiostd = np.std(allratios)
    lowratio = ratiomean-(args.stds*ratiostd)
    highratio = ratiomean+(args.stds*ratiostd)
    rejectratiosize = len(list(filter(lambda x: x<lowratio or x > highratio, allratios)))
    deltamean = np.mean(alldeltas)
    deltastd = np.std(alldeltas)
    lowdelta = deltamean-(args.stds*deltastd)
    highdelta = deltamean+(args.stds*deltastd)
    rejectdeltasize = len(list(filter(lambda x: x<lowdelta or x > highdelta, alldeltas)))
    sys.stderr.write("Rejecting %d of %d lines (%f %%) with ratio below %f or above %f\n" %
                     (rejectratiosize, len(allratios), 100.0*rejectratiosize/len(allratios), lowratio, highratio))
    sys.stderr.write("Rejecting %d of %d lines (%f %%) with delta below %f or above %f\n" %
                     (rejectdeltasize, len(alldeltas), 100.0*rejectdeltasize/len(alldeltas), lowdelta, highdelta))
    # combined criterion: a line must be an outlier on BOTH ratio and delta
    reject_ratio_delta_size = len(list(filter(
        lambda x: (x[0]<lowratio or x[0]>highratio) and (x[1]<lowdelta or x[1]>highdelta),
        zip(allratios, alldeltas))))
    sys.stderr.write("Rejecting %d of %d lines (%f %%) meeting both delta and ratio criteria\n" %
                     (reject_ratio_delta_size, len(alldeltas), 100.0*reject_ratio_delta_size/len(alldeltas)))
    # iterate through manifests and all files and filter per ratio and delta
    for manset in (engmanifests, fmanifests):
        for man in manset:
            sys.stderr.write("filtering %s\n" % man)
            base = os.path.basename(man)
            genre = '.'.join(base.split('.')[:-2])
            sys.stderr.write("genre %s\n" % genre)
            rats = ratios[genre]
            delts = deltas[genre]
            reject_ratio_delta_size = len(list(filter(
                lambda x: (x[0]<lowratio or x[0]>highratio) and (x[1]<lowdelta or x[1]>highdelta),
                zip(rats, delts))))
            #rejectratiosize = len(list(filter(lambda x: x<lowratio or x > highratio, rats)))
            sys.stderr.write("rejecting %d of %d\n" % (reject_ratio_delta_size, len(rats)))
            infile = prepfile(open(man, 'r'), 'r')
            filterfile = prepfile(open(os.path.join(filterdir, base), 'w'), 'w')
            remainfile = prepfile(open(os.path.join(remaindir, base), 'w'), 'w')
            filterlines(infile, (rats, delts), (lowratio,lowdelta), (highratio,highdelta), filterfile, remainfile)
    # for directories in extracted
    # http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
    for subdir in next(os.walk(indir))[1]:
        # make parallel directories
        # for genres in genre set
        # for languages
        # filter lines
        insubdir = os.path.join(indir, subdir)
        filtersubdir = os.path.join(filterdir, subdir)
        mkdir_p(filtersubdir)
        remainsubdir = os.path.join(remaindir, subdir)
        mkdir_p(remainsubdir)
        for genre in genres:
            for lang in (args.lang, 'eng'):
                base = "%s.%s.%s.flat" % (genre, subdir, lang)
                infilename = os.path.join(insubdir, base)
                if os.path.exists(infilename):
                    infile = prepfile(open(infilename, 'r'), 'r')
                    filterfile = prepfile(open(os.path.join(filtersubdir, base), 'w'), 'w')
                    remainfile = prepfile(open(os.path.join(remainsubdir, base), 'w'), 'w')
                    filterlines(infile, (ratios[genre], deltas[genre]), (lowratio,lowdelta), (highratio,highdelta), filterfile, remainfile)
                else:
                    sys.stderr.write("%s does not exist\n" % infilename)
    # count files in each of the directories; should be the same
    # NOTE(review): 'dir' shadows the builtin; left as-is in a doc-only edit
    for dir in (indir, filterdir, remaindir):
        sys.stderr.write("%d files in %s\n" % (countfiles(dir), dir))
def main():
    """Rewrite laf annotation token references as character offsets.

    For each x.laf.xml in --indir, locate a ltf token file (local x.ltf.xml
    and/or one found under --corpusdirs), build a token-id ->
    start_char/end_char map, and write a copy of the laf into --outdir whose
    EXTENT elements carry start_char/end_char attributes.

    Cases:
      1. only local ltf exists: use its offsets directly
      2. local and remote ltf exist with identical token sequences: map local
         token ids to remote ids and use the remote offsets
      3. only remote ltf exists: use its offsets directly
      4. anything else (token mismatch, nothing found, parse error): complain
         and skip the document
    """
    parser = argparse.ArgumentParser(
        description=
        "Given a reflex lrlp laf with token ids and a ltf with token-to-start_char/end_char, create an laf with start_char/end_char. Operate per directory",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--indir", "-i",
        help="input directory. Presumed to contain x.laf.xml. Might contain x.ltf.xml for all x")
    parser.add_argument("--corpusdirs", "-c", nargs='+',
                        help="directory tree or trees to find x.ltf.xml")
    parser.add_argument(
        "--outdir", "-o",
        help="output directory. may not exist. will contain modified x.laf.xml for all x")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    writer = codecs.getwriter('utf8')
    stderr = writer(sys.stderr)
    indir = args.indir
    outdir = args.outdir
    mkdir_p(outdir)
    localcount = 0
    remotecount = 0
    bothcount = 0
    for inlaf in [x for x in os.listdir(indir) if x.endswith(".laf.xml")]:
        base = inlaf.replace(".laf.xml", "")
        outlaf = os.path.join(outdir, inlaf)
        inlaf = os.path.join(indir, inlaf)
        inltf = os.path.join(indir, base + ".ltf.xml")
        corpusltf = find(
            base + ".ltf.xml",
            args.corpusdirs) if args.corpusdirs is not None else None
        try:
            # case 2: both ltfs present -- build local-id -> remote-id map
            idmap = {}
            useidmap = False
            if os.path.exists(
                    inltf) and corpusltf is not None and os.path.exists(
                        corpusltf):
                localtoks = ET.parse(inltf).findall(".//TOKEN")
                corpustoks = ET.parse(corpusltf).findall(".//TOKEN")
                if len(localtoks) != len(corpustoks):
                    stderr.write("Token count mismatch; skipping " + inlaf + "\n")
                    continue
                ok = True
                for localtok, corpustok in zip(localtoks, corpustoks):
                    if localtok.text != corpustok.text:
                        # BUGFIX: this branch detects differing token *text*;
                        # the old "Token count mismatch" message made the two
                        # failure modes indistinguishable in the log
                        stderr.write("Token text mismatch; skipping " + inlaf + "\n")
                        ok = False
                        break
                    idmap[localtok.get("id")] = corpustok.get("id")
                if not ok:
                    continue
                useidmap = True
            # case 1: only the local ltf exists; swap so the offset pass below
            # (which always reads corpusltf) uses the local file
            if os.path.exists(inltf) and (corpusltf is None
                                          or not os.path.exists(corpusltf)):
                inltf, corpusltf = corpusltf, inltf
                # BUGFIX: this is the local-only case, so count it as local;
                # the original incremented remotecount here and localcount in
                # the else branch, inverting the final summary line
                localcount += 1
            elif useidmap:
                bothcount += 1
            else:
                # case 3: only the remote (corpus) ltf exists
                remotecount += 1
            # Final token id-to-offset map from whichever ltf we settled on
            starts = {}
            ends = {}
            root = ET.parse(corpusltf)
            for node in root.findall(".//TOKEN"):
                tokid = node.get("id")  # renamed from 'id': avoid shadowing the builtin
                starts[tokid] = node.get("start_char")
                ends[tokid] = node.get("end_char")
            # re-map the laf annotations to character offsets
            root = ET.parse(inlaf)
            for node in root.findall(".//ANNOTATION"):
                stok = node.get("start_token")
                etok = node.get("end_token")
                stok = idmap[stok] if useidmap else stok
                etok = idmap[etok] if useidmap else etok
                ext = node.find(".//EXTENT")
                ext.set('start_char', starts[stok])
                ext.set('end_char', ends[etok])
            # pretty_print implies ET is lxml.etree here
            xmlstr = ET.tostring(root, pretty_print=True, encoding='unicode')
            # BUGFIX: close the output file deterministically; the original
            # leaked the handle from writer(open(...))
            with open(outlaf, 'w') as outfh:
                writer(outfh).write(xmlstr + "\n")
        except Exception as e:
            # BUGFIX: narrowed from a bare except (which also swallowed
            # SystemExit/KeyboardInterrupt) and report the exception instance
            # instead of just its class (old code used sys.exc_info()[0])
            stderr.write("Problem with %s: %s\n" % (inltf, e))
            continue
    stderr.write("%d using local only, %d using remote only, %d using both\n" %
                 (localcount, remotecount, bothcount))
def main():
    """Split a line-aligned data file into per-category directories.

    Reads (docid, category) pairs from --catfile, then streams --idfile (one
    docid per line) in lockstep with --infile, writing each data line to
    <prefix>/<category>/<postfix>/<basename(infile)>.  Docids with no category
    fall into the --remainder category; when --backup is on, a normalized
    docid (via backup(), defined elsewhere) is tried before the remainder.
    """
    parser = argparse.ArgumentParser(
        description=
        "Given category per doc, idfile, data file, put data in category-specific dir",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--catfile", "-c", nargs='?', type=argparse.FileType('r'),
                        help="doc cat file (docid cat)")
    parser.add_argument("--idfile", "-d", nargs='?', type=argparse.FileType('r'),
                        help="id file (docid per line)")
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help="input file")
    parser.add_argument("--prefix", "-p", default=".",
                        help="directory prefix for categories")
    parser.add_argument("--postfix", "-P", default=".",
                        help="directory postfix after categories")
    parser.add_argument(
        "--remainder", "-r", default="train",
        help="remainder category. Should match previous remainder category")
    addonoffarg(
        parser, 'backup',
        help=
        "backup matches to universal docid, following strict ldc format (in May 2017)",
        default=True)
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    catfile = args.catfile
    infile = args.infile
    idfile = args.idfile
    # the unsplit resource
    # e.g. base of tmp_170502/yor/parallel/filtered/agile-tokenized/fromsource.generic.agile-tokenized.eng.flat
    basefile = os.path.basename(args.infile.name)
    cats = {}        # docid -> open output file handle
    backupcats = {}  # normalized (backup) docid -> open output file handle
    fhs = {}         # output path -> open file handle, dedups across docids
    backupcount = 0
    for line in catfile:
        # for each document, what category it belongs in
        # e.g. "YOR_WL_001975_20150409_G0021WWLJ.eng<TAB>test"
        doc, cat = line.strip().split('\t')
        # the prefix of the files that will be created
        # e.g. tmp_170502/yor/parallel/split / test / agile-tokenized
        prefix = os.path.join(args.prefix, cat, args.postfix)
        # the file that will be created, e.g.
        # tmp_170502/yor/parallel/split/test/agile-tokenized/fromsource.generic.agile-tokenized.eng.flat
        innercatfile = os.path.join(prefix, basefile)
        if innercatfile not in fhs:
            mkdir_p(prefix)
            fhs[innercatfile] = open(innercatfile, 'w')
        # doc -> file to write to
        cats[doc] = fhs[innercatfile]
        if args.backup:
            backupcats[backup(doc)] = fhs[innercatfile]
    # catchall remainder file
    remcatpref = os.path.join(args.prefix, args.remainder, args.postfix)
    remaindercatfile = os.path.join(remcatpref, basefile)
    if remaindercatfile not in fhs:
        mkdir_p(remcatpref)
        fhs[remaindercatfile] = open(remaindercatfile, 'w')
    # pairs of docids and lines
    # e.g. YOR_DF_001261_20031127_G0022DCKG.eng <-> one data line
    for doc, data in zip(idfile, infile):
        doc = doc.strip()
        if doc in cats:
            fh = cats[doc]
        elif backup(doc) in backupcats:
            fh = backupcats[backup(doc)]
            backupcount += 1
        else:
            fh = fhs[remaindercatfile]
        fh.write(data)
    # BUGFIX: close (and thereby flush) every output file explicitly; the
    # original relied on interpreter teardown to do it
    for fh in fhs.values():
        fh.close()
    if args.backup and backupcount > 0:
        sys.stderr.write(
            "{} lines written via backup retrieval\n".format(backupcount))
def main():
    """Make dataset selections (splits) from previously generated cat files.

    For each doc-based prefix: extracts docids from the eng manifest (shell
    `cut`), copies (or creates empty) the previous run's .cats file, calls
    runselection() (defined elsewhere), then shells out to categorize.py for
    both languages.  Nodoc prefixes get synthetic line-number ids via `seq`
    instead of real docids.
    """
    parser = argparse.ArgumentParser(description="Make dataset selections for experimentation given previously generated categorization files",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", help="location of parallel data")
    parser.add_argument("--language", "-l", help="source language three digit code")
    parser.add_argument("--extractpath", "-e", default="extracted",
                        help="location of extracted data (might want to use 'filtered')")
    parser.add_argument("--remainder", "-r", default="train",
                        help="remainder category. Should match previous remainder category")
    parser.add_argument("--previous", "-p", help="location of previous cat files")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # reader = codecs.getreader('utf8')
    # writer = codecs.getwriter('utf8')
    # outfile = writer(args.outfile)
    indir = args.indir
    # TODO: find these?
    # doc = keep full docs together (can detect this by counting number of unique docs)
    # TODO: re-add found.generic to docprefixes
    docprefixes = ["fromsource.generic", "fromsource.tweet", "fromtarget.news", "found.generic"]
    nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook"]
    # TODO: find these
    filetypes = ["morph", "morph-tokenized", "original", "pos", "tokenized", "mttok", "mttoklc",
                 "agile-tokenized", "cdec-tokenized", "agile-tokenized.lc", "cdec-tokenized.lc"]
    extractpath = os.path.join(indir, args.extractpath)
    origpath = os.path.join(extractpath, 'original')
    outpath = os.path.join(indir, 'splits')
    mkdir_p(outpath)
    # prune prefixes whose manifest is absent or empty; iterate over a copy
    # (list(preflist)) because we mutate preflist in the loop
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing "+prefix)
                preflist.remove(prefix)
    # doc-based processing
    for prefix in docprefixes:
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        # docid is column 2 of the manifest
        try:
            check_output("cut -f2 %s > %s" % (manfile, idfile), stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            # NOTE(review): failure is reported but processing continues with a
            # possibly missing/partial idfile -- presumably intentional best-effort
            print("Status : FAIL", exc.returncode, exc.output)
        catfile = os.path.join(args.previous, "%s.cats" % prefix)
        newcatfile = os.path.join(outpath, os.path.basename(catfile))
        if os.path.exists(catfile):
            copy(catfile, newcatfile)
        else:
            touch(newcatfile)
        runselection(prefix, idfile, newcatfile, args.remainder, filetypes, args.language, extractpath, outpath)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (scriptdir, manifest, idfile, newcatfile, outpath, args.remainder)
            print("Running "+cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # nodoc-based processing: no real docids, so fabricate 1..N line ids
    for prefix in nodocprefixes:
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(check_output("wc -l %s" % os.path.join(extractpath, "%s.eng.manifest" % prefix), shell=True).decode('utf-8').strip().split(' ')[0])
            check_output("seq %d > %s" % (mansize, idfile), stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        catfile = os.path.join(args.previous, "%s.cats" % prefix)
        newcatfile = os.path.join(outpath, os.path.basename(catfile))
        if os.path.exists(catfile):
            copy(catfile, newcatfile)
        else:
            touch(newcatfile)
        runselection(prefix, idfile, newcatfile, args.remainder, filetypes, args.language, extractpath, outpath)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (scriptdir, manifest, idfile, newcatfile, outpath, args.remainder)
            print("Running "+cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
def main():
    """Deterministic per-document dataset subselection (nov 2016 uyghur eval).

    Assigns whole documents (doc prefixes) or individual lines (nodoc
    prefixes) to the categories in --categories with the sizes in --sizes,
    taking from the end of each manifest unless --fromFront.  Selection logic
    lives in runselection() (defined elsewhere); the actual file splitting is
    delegated to categorize.py via shell.
    """
    parser = argparse.ArgumentParser(
        description=
        "Deterministic subselect designed for nov 2016 uyghur evaluation: per-doc, from end",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", help="location of parallel data")
    parser.add_argument("--language", "-l",
                        help="source language three digit code")
    parser.add_argument(
        "--extractpath", "-e", default="extracted",
        help="location of extracted data (might want to use 'filtered')")
    parser.add_argument("--sizes", "-s", nargs='+', type=int,
                        help="list of sizes desired in each category")
    parser.add_argument("--categories", "-c", nargs='+',
                        help="list of categories. Must match sizes")
    parser.add_argument("--remainder", "-r", default="train",
                        help="remainder category. Should be a new category")
    parser.add_argument(
        "--devlstfile", "-d", default=None,
        help=
        "file of desired documents for dev (subject to length constraints, must be a set called 'dev')"
    )
    addonoffarg(
        parser, 'fromFront', default=False,
        help="do doc assignment from the beginning (instead of the end)")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    # reader = codecs.getreader('utf8')
    # writer = codecs.getwriter('utf8')
    # outfile = writer(args.outfile)
    indir = args.indir
    origsizes = args.sizes
    # TODO: find these?
    # doc = keep full docs together (can detect this by counting number of unique docs)
    # TODO: re-add found.generic to docprefixes
    docprefixes = ["fromsource.generic", "fromsource.tweet", "fromtarget.news"]
    # IL3: moving found.generic!!
    nodocprefixes = [
        "fromtarget.elicitation", "fromtarget.phrasebook", "found.generic"
    ]
    # TODO: find these
    filetypes = [
        "morph", "morph-tokenized", "original", "pos", "tokenized", "mttok",
        "mttoklc", "agile-tokenized", "cdec-tokenized", "agile-tokenized.lc",
        "cdec-tokenized.lc"
    ]
    extractpath = os.path.join(indir, args.extractpath)
    origpath = os.path.join(extractpath, 'original')
    outpath = os.path.join(indir, 'splits')
    mkdir_p(outpath)
    # prune prefixes with missing/empty manifests; iterate a copy because the
    # list is mutated inside the loop
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing " + prefix)
                preflist.remove(prefix)
    # doc-based processing
    for prefix in docprefixes:
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        # docid is column 2 of the manifest
        try:
            check_output("cut -f2 %s > %s" % (manfile, idfile),
                         stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            # NOTE(review): failure reported but processing continues --
            # presumably intentional best-effort behavior
            print("Status : FAIL", exc.returncode, exc.output)
        # runselection returns the cat file to hand to categorize.py
        catfile = runselection(prefix, idfile, args.categories,
                               args.remainder, origsizes, filetypes,
                               args.language, extractpath, outpath,
                               args.devlstfile, fromFront=args.fromFront)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, catfile, outpath, args.remainder)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # nodoc-based processing: fabricate 1..N line ids via seq
    for prefix in nodocprefixes:
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(
                check_output(
                    "wc -l %s" % os.path.join(extractpath,
                                              "%s.eng.manifest" % prefix),
                    shell=True).decode('utf8').strip().split(' ')[0])
            check_output("seq %d > %s" % (mansize, idfile),
                         stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        # note: no devlstfile here -- dev doc selection only applies to doc prefixes
        catfile = runselection(prefix, idfile, args.categories,
                               args.remainder, origsizes, filetypes,
                               args.language, extractpath, outpath,
                               fromFront=args.fromFront)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, catfile, outpath, args.remainder)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
    # warning if entries not found in given dev list
    if args.devlstfile:
        devlst = set(open(args.devlstfile).read().split())
        all_docids = list()
        for prefix in docprefixes:
            all_docids += open(os.path.join(outpath,
                                            "%s.ids" % prefix)).read().split('\n')
        for i in devlst - set(all_docids):
            print("***Warning: docid not found: %s" % i)
def main():
    """Rewrite laf annotation token references as character offsets.

    For each x.laf.xml in --indir, locate a ltf token file (local x.ltf.xml
    and/or one found under --corpusdirs via find(), defined elsewhere), build
    a token-id -> start_char/end_char map, and emit a copy of the laf into
    --outdir whose EXTENT elements carry start_char/end_char attributes.
    NOTE(review): this appears to be a duplicate of an earlier main() in this
    file and shares its issues (see inline notes).
    """
    parser = argparse.ArgumentParser(description="Given a reflex lrlp laf with token ids and a ltf with token-to-start_char/end_char, create an laf with start_char/end_char. Operate per directory",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i",
                        help="input directory. Presumed to contain x.laf.xml. Might contain x.ltf.xml for all x")
    parser.add_argument("--corpusdirs", "-c", nargs='+',
                        help="directory tree or trees to find x.ltf.xml")
    parser.add_argument("--outdir", "-o",
                        help="output directory. may not exist. will contain modified x.laf.xml for all x")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    reader = codecs.getreader('utf8')  # NOTE(review): unused
    writer = codecs.getwriter('utf8')
    stderr = writer(sys.stderr)
    indir = args.indir
    outdir = args.outdir
    mkdir_p(outdir)
    localcount = 0
    remotecount = 0
    bothcount = 0
    for inlaf in [x for x in os.listdir(indir) if x.endswith(".laf.xml")]:
        base = inlaf.replace(".laf.xml", "")
        outlaf = os.path.join(outdir, inlaf)
        inlaf = os.path.join(indir, inlaf)
        inltf = os.path.join(indir, base+".ltf.xml")
        corpusltf = find(base+".ltf.xml", args.corpusdirs) if args.corpusdirs is not None else None
        # cases:
        # 1. local ltf exists with char offsets. no remote ltf. we use what local ltf gives us
        # 2. local and remote ltf exist, with the same number of tokens in the same order. we map local id to remote id and use those offsets
        # 3. remote ltf exists. local does not. We use remote ltf as in case 1
        # 4. local and remote exist with different numbers of tokens or nothing exists or something else. complain and skip this document
        try:
            # case 2: build id map
            idmap = {}
            useidmap = False
            if os.path.exists(inltf) and corpusltf is not None and os.path.exists(corpusltf):
                localroot = ET.parse(inltf)
                corpusroot = ET.parse(corpusltf)
                localtoks = localroot.findall(".//TOKEN")
                corpustoks = corpusroot.findall(".//TOKEN")
                if len(localtoks) != len(corpustoks):
                    stderr.write("Token count mismatch; skipping "+inlaf+"\n")
                    continue
                ok = True
                for localtok, corpustok in zip(localtoks, corpustoks):
                    if localtok.text != corpustok.text:
                        # NOTE(review): message says "count" but this branch is
                        # a token *text* mismatch -- misleading in logs
                        stderr.write("Token count mismatch; skipping "+inlaf+"\n")
                        ok = False
                        break
                    idmap[localtok.get("id")]=corpustok.get("id")
                if not ok:
                    continue
                useidmap = True
            # case 1: swap inltf and corpusltf (otherwise below handles case 2, 3)
            if os.path.exists(inltf) and (
                    corpusltf is None or not os.path.exists(corpusltf)):
                inltf, corpusltf = corpusltf, inltf
                # NOTE(review): this is the local-only case yet it bumps
                # remotecount (and the remote-only else bumps localcount);
                # the summary line below looks inverted -- verify intent
                remotecount+=1
            elif useidmap:
                bothcount+=1
            else:
                localcount+=1
            # Final token id-to-offset
            starts = {}
            ends = {}
            root = ET.parse(corpusltf)
            for node in root.findall(".//TOKEN"):
                id = node.get("id")  # NOTE(review): shadows builtin 'id'
                starts[id]=node.get("start_char")
                ends[id]=node.get("end_char")
            #root.clear()
            # re-map
            root = ET.parse(inlaf)
            for node in root.findall(".//ANNOTATION"):
                stok = node.get("start_token")
                etok = node.get("end_token")
                stok = idmap[stok] if useidmap else stok
                etok = idmap[etok] if useidmap else etok
                start = starts[stok]
                end = ends[etok]
                ext = node.find(".//EXTENT")
                ext.set('start_char', start)
                ext.set('end_char', end)
            # pretty_print implies ET is lxml.etree -- TODO confirm at file top
            xmlstr = ET.tostring(root, pretty_print=True, encoding='unicode')
            # NOTE(review): output handle is never closed explicitly
            writer(open(outlaf, 'w')).write(xmlstr+"\n")
        except:
            # NOTE(review): bare except also traps SystemExit/KeyboardInterrupt,
            # and sys.exc_info()[0] is the exception class, not its message
            e = sys.exc_info()[0]
            stderr.write("Problem with %s: %s\n" % (inltf, e))
            continue
    stderr.write("%d using local only, %d using remote only, %d using both\n" % (localcount, remotecount, bothcount))