def CreateProtectedCounts(work):
    """Builds {work}/protected.all from {work}/float.all by extracting the
    histories and converting them into null-counts (n-grams that must be
    kept)."""
    pipeline = ("float-counts-to-histories <{dir}/float.all | LC_ALL=C "
                "sort {sort_opt}| histories-to-null-counts "
                ">{dir}/protected.all".format(dir=work,
                                              sort_opt=sort_mem_opt))
    command = "bash -c '" + pipeline + "'"
    log_file = "{0}/log/create_protected_counts.log".format(work)
    RunCommand(command, log_file, args.verbose == 'true')
def GetCountsSingleProcess(source_int_dir, dest_count_dir, ngram_order, n,
                           max_mem, num_splits=0):
    """Accumulates n-gram counts for training set {n} in a single process.

    Reads {source_int_dir}/{n}.txt.gz; when num_splits == 0 it writes one
    int-counts file per order 2..ngram_order into dest_count_dir, otherwise
    it pipes through split-int-counts to produce num_splits split files.
    """
    if num_splits == 0:
        # order-1 counts are discarded; each higher order gets its own file.
        per_order = ["{0}/int.{1}.{2}".format(dest_count_dir, n, o)
                     for o in range(2, ngram_order + 1)]
        int_counts_output = "/dev/null " + " ".join(per_order)
    else:
        assert num_splits >= 1
        pieces = ["{0}/int.{1}.split{2}".format(dest_count_dir, n, j)
                  for j in range(1, num_splits + 1)]
        int_counts_output = ('/dev/stdout | split-int-counts ' +
                             ' '.join(pieces))
    limit_opt = ("--limit-unk-history"
                 if args.limit_unk_history == 'true' else "")
    buffer_opt = ("--buffer-size={0}".format(max_mem)
                  if max_mem != '' else '')
    command = ("bash -c 'set -o pipefail; export LC_ALL=C; "
               "gunzip -c {src}/{n}.txt.gz | "
               "get-text-counts {limit} {order} | sort {mem}| uniq -c | "
               "get-int-counts {out}'".format(
                   src=source_int_dir, n=n, order=ngram_order,
                   limit=limit_opt, mem=buffer_opt,
                   out=int_counts_output))
    log_file = "{0}/log/get_counts.{1}.log".format(dest_count_dir, n)
    RunCommand(command, log_file, args.verbose == 'true')
def EnforceMinCounts(dest_count_dir, formatted_min_counts, ngram_order,
                     num_train_sets, j):
    """Applies the min-count limits to split job {j} of the int-counts.

    Reads {dest_count_dir}/int.{n}.split{j} for each training set n and
    writes {dest_count_dir}/int.{n}.split{j}.{o} for each order o in
    [2, ngram_order].
    """
    inputs = ' '.join([
        "{0}/int.{1}.split{2}".format(dest_count_dir, n, j)
        for n in range(1, num_train_sets + 1)
    ])
    outputs = ' '.join([
        ' '.join([
            '{0}/int.{1}.split{2}.{3}'.format(dest_count_dir, n, j, o)
            for o in range(2, ngram_order + 1)
        ]) for n in range(1, num_train_sets + 1)
    ])
    # e.g. suppose j is 2 and ngram_order is 4, outputs would be as follows
    # [assuming brace expansion].:
    # outputs = dir/int.1.split2.{2,3,4} dir/int.2.split2.{2,3,4} ...
    #           dir/int.{num_train_sets}.split2.{2,3,4}
    command = "int-counts-enforce-min-counts {ngram_order} {formatted_min_counts} {inputs} "\
              "{outputs}".format(
                  ngram_order=ngram_order,
                  formatted_min_counts=formatted_min_counts,
                  inputs=inputs,
                  outputs=outputs)  # fix: dropped unused 'j=' format kwarg
    log_file = '{0}/log/enforce_min_counts.{1}.log'.format(dest_count_dir, j)
    RunCommand(command, log_file, args.verbose == 'true')
def MergeCounts(order):
    """Merges the counts of the given order (> 1) over all training sets,
    writing the result to {work_dir}/merged.{order}."""
    assert order > 1
    parts = ["merge-counts"]
    for n in range(1, num_train_sets + 1):
        parts.append("{counts}/int.{train_set}.{order},{scale}".format(
            counts=args.count_dir, train_set=n, order=order,
            scale=train_set_scale[n]))
    if args.fold_dev_into_int is not None:
        parts.append("{counts}/int.dev.{order},{scale}".format(
            counts=args.count_dir, order=order,
            scale=train_set_scale[args.fold_dev_into_int]))
    # for orders below the highest one we also include the discount counts
    # from the one-higher order; these carry no scale, so merge-counts will
    # expect general-counts rather than int-counts there.
    if order < ngram_order:
        parts.append("{work}/discount.{order}".format(work=args.work_dir,
                                                      order=order))
    # the output gets redirected to the output file.
    parts.append(">{work}/merged.{order}".format(work=args.work_dir,
                                                 order=order))
    command = " ".join(parts)
    log_file = "{0}/log/merge_counts.{1}.log".format(args.work_dir, order)
    RunCommand(command, log_file, args.verbose == 'true')
def MergeDevData(dest_count_dir, ngram_order):
    """Combines the per-order dev-data int-counts into a single file
    {dest_count_dir}/int.dev."""
    dev_inputs = ["{0}/int.dev.{1}".format(dest_count_dir, n)
                  for n in range(2, ngram_order + 1)]
    command = ("merge-int-counts " + ' '.join(dev_inputs) +
               ">{0}/int.dev".format(dest_count_dir))
    log_file = "{0}/log/merge_dev_counts.log".format(dest_count_dir)
    RunCommand(command, log_file, args.verbose == 'true')
def MergeCountsOrder1():
    """Sums the order-1 discount counts over all splits, writing the result
    to {work_dir}/discount.1."""
    split_inputs = " ".join("{0}/{1}/discount.1".format(split_work_dir, s)
                            for s in range(1, args.num_splits + 1))
    command = ("merge-counts " + split_inputs +
               " >{0}/discount.1".format(args.work_dir))
    log_file = "{0}/log/merge_counts_order1.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
def MergeAllOrders():
    """Merges the float-counts of every order into {work_dir}/float.all,
    then parses the resulting n-gram counts from the log."""
    per_order = ["{0}/float.{1}".format(args.work_dir, n)
                 for n in range(1, ngram_order + 1)]
    command = ("merge-float-counts " + " ".join(per_order) +
               ">{0}/float.all".format(args.work_dir))
    log_file = "{0}/log/merge_all_orders.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
    ParseNumNgrams(args.work_dir, log_file)
def DiscountCountsOrder1Backward():
    """Backward (derivative-propagating) pass of the order-1 discounting:
    produces float_derivs.1 and discount_derivs.1 in the work dir."""
    work = args.work_dir
    command = ("discount-counts-1gram-backward {work}/discount.1 "
               "{work}/float.1 {work}/float_derivs.1 "
               "{work}/discount_derivs.1".format(work=work))
    log_file = "{0}/log/discount_counts_order1_backward.log".format(work)
    RunCommand(command, log_file, args.verbose == 'true')
def MergeAllSplits():
    """Merges float.all across all splits into {work_dir}/float.all."""
    inputs = " ".join("{0}/{1}/float.all".format(split_work_dir, s)
                      for s in range(1, args.num_splits + 1))
    command = ("merge-float-counts " + inputs +
               ">{0}/float.all".format(args.work_dir))
    log_file = "{0}/log/merge_all_splits.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
def DiscountCounts(split_index, order):
    """Discounts the counts of the given order (> 1) within one split's
    work directory, producing float.{order} and discount.{order-1} there."""
    assert order > 1
    sdir = "{0}/{1}".format(split_work_dir, split_index)
    command = ("discount-counts {d1} {d2} {d3} {d4} {sdir}/merged.{order} "
               "{sdir}/float.{order} {sdir}/discount.{orderm1} ".format(
                   d1=d1[order], d2=d2[order], d3=d3[order], d4=d4[order],
                   sdir=sdir, order=order, orderm1=order - 1))
    log_file = "{0}/log/discount_counts.{1}.{2}.log".format(
        args.work_dir, split_index, order)
    RunCommand(command, log_file, args.verbose == 'true')
def SumFloatDerivsOrder1():
    """Sums, over all split directories, the parts of the final float-count
    derivatives w.r.t. the unigram counts; must be called before
    DiscountCountsOrder1Backward."""
    deriv_inputs = " ".join(
        "{0}/{1}/float_derivs.1".format(split_work_dir, s)
        for s in range(1, args.num_splits + 1))
    command = ("sum-float-derivs {0}/float.1 ".format(args.work_dir) +
               deriv_inputs + " >{0}/float_derivs.1".format(args.work_dir))
    log_file = "{0}/log/sum_float_counts_order1.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
def MergeCountsOrder1Backward():
    """Backward pass of the order-1 count merge across all splits.

    Stdout goes to /dev/null because merge-counts-backward writes a newline
    there (it terminates the derivatives w.r.t. the scaling factors, which
    are empty in this case).
    """
    pairs = " ".join(
        "{0}/{1}/discount.1 {0}/{1}/discount_derivs.1".format(
            split_work_dir, s)
        for s in range(1, args.num_splits + 1))
    command = ("merge-counts-backward {0}/discount.1 "
               "{0}/discount_derivs.1 ".format(args.work_dir) +
               pairs + ">/dev/null")
    log_file = "{0}/log/merge_counts_order1_backward.log".format(
        args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
def MergeAllOrders(split_index):
    """Merges the float-counts of all orders for one split into that
    split's float.all, then parses the n-gram counts from the log.

    For unigram it takes the merged-across-all-splits counts from the
    top-level work dir; higher orders come from the split's own dir.
    """
    sdir = "{0}/{1}".format(split_work_dir, split_index)
    inputs = []
    for n in range(1, ngram_order + 1):
        src_dir = args.work_dir if n == 1 else sdir
        inputs.append("{0}/float.{1}".format(src_dir, n))
    command = ("merge-float-counts " + " ".join(inputs) +
               ">{0}/float.all".format(sdir))
    log_file = "{0}/log/merge_all_orders.{1}.log".format(args.work_dir,
                                                         split_index)
    RunCommand(command, log_file, args.verbose == 'true')
    ParseNumNgrams(sdir, log_file)
def DiscountCounts(order):
    """Discounts the merged counts of the given order (> 1), producing
    float.{order} plus discount.{order-1} in the work dir."""
    assert order > 1
    work = args.work_dir
    command = ("discount-counts {d1} {d2} {d3} {d4} {work}/merged.{order} "
               "{work}/float.{order} {work}/discount.{orderm1} ".format(
                   d1=d1[order], d2=d2[order], d3=d3[order], d4=d4[order],
                   work=work, order=order, orderm1=order - 1))
    log_file = "{0}/log/discount_counts.{1}.log".format(work, order)
    RunCommand(command, log_file, args.verbose == 'true')
def CreateInitialWorkDir():
    """Sets up work_dir/step0: creates float.all, stats.all and
    protected.all there, plus a num_ngrams soft-link."""
    work0dir = work_dir + "/step0"
    log_dir0 = work0dir + "/log"
    if not os.path.isdir(log_dir0):
        os.makedirs(log_dir0)
    SoftLink(args.lm_dir_in + "/num_ngrams", work0dir + "/num_ngrams")
    # create float.all: soft-link the input LM's float.all when it is not
    # split, otherwise merge its per-split pieces.
    if num_splits is None:
        SoftLink(args.lm_dir_in + "/float.all", work0dir + "/float.all")
    else:
        splits_star = ' '.join(args.lm_dir_in + "/float.all." + str(n)
                               for n in range(1, num_splits + 1))
        command = ("merge-float-counts " + splits_star +
                   " >{0}/float.all".format(work0dir))
        log_file = log_dir0 + "/merge_initial_float_counts.log"
        RunCommand(command, log_file, args.verbose == 'true')

    # create protected.all
    CreateProtectedCounts(work0dir)

    stats_star = ' '.join("{0}/stats.{1}".format(work0dir, n)
                          for n in range(1, ngram_order + 1))

    # create stats.{1,2,3..}, e.g.:
    # 'float-counts-to-float-stats 20000 foo/work/step0/stats.1 \
    #      foo/work/step0/stats.2 <foo/work/step0/float.all'
    command = ("float-counts-to-float-stats {0} ".format(num_words) +
               stats_star + " <{0}/float.all".format(work0dir))
    log_file = log_dir0 + "/float_counts_to_float_stats.log"
    RunCommand(command, log_file, args.verbose == 'true')

    # merge the per-order stats into stats.all, then clean up the pieces.
    command = "merge-float-counts {0} > {1}/stats.all".format(stats_star,
                                                              work0dir)
    log_file = log_dir0 + "/merge_float_counts.log"
    RunCommand(command, log_file, args.verbose == 'true')
    for f in stats_star.split():
        os.remove(f)
def MergeCounts(dest_count_dir, num_jobs, n, o):
    """Merges the per-job split int-counts of order {o} for training set
    {n} into {dest_count_dir}/int.{n}.{o}.

    With a single job there is nothing to merge, so the split file is just
    renamed into place (replacing any stale destination file).
    """
    if num_jobs > 1:
        command = ('merge-int-counts ' + ' '.join([
            '{0}/int.{1}.split{2}.{3}'.format(dest_count_dir, n, j, o)
            for j in range(1, num_jobs + 1)
        ]) + '>{0}/int.{1}.{2}'.format(dest_count_dir, n, o))
        log_file = '{0}/log/merge_counts.{1}.{2}.log'.format(
            dest_count_dir, n, o)
        RunCommand(command, log_file, args.verbose == 'true')
    else:
        assert num_jobs == 1
        # we can just move the file if num-jobs == 1.
        try:
            os.remove('{0}/int.{1}.{2}'.format(dest_count_dir, n, o))
        except OSError:
            # fix: was a bare 'except:'; we only want to ignore a missing
            # (or otherwise unremovable) destination file here.
            pass
        os.rename('{0}/int.{1}.split1.{2}'.format(dest_count_dir, n, o),
                  '{0}/int.{1}.{2}'.format(dest_count_dir, n, o))
def RunEmStep(work_in, work_out):
    """Runs one E-M re-estimation step from work_in into work_out and
    returns the log-likelihood change per word.

    Also sets the global final_logprob_per_word from this step's totals.
    """
    # float_star = 'work_out/float.1 work_out/float.2 ...'
    float_star = " ".join('{0}/float.{1}'.format(work_out, n)
                          for n in range(1, ngram_order + 1))
    command = ('float-counts-estimate {num_words} {work_in}/float.all '
               '{work_in}/stats.all {float_star}'.format(
                   num_words=num_words, work_in=work_in,
                   float_star=float_star))
    log_file = work_out + "/log/float_counts_estimate.log"
    try:
        output = GetCommandStdout(command, log_file, args.verbose == 'true')
        # the stdout of this program will be something like:
        #   1.63388e+06 -7.39182e+06 10.5411 41.237 49.6758
        # i.e. total-count, total-like, then the like-change per order.
        fields = output.split()
        tot_count = float(fields[0])
        tot_like = float(fields[1])
        global final_logprob_per_word
        final_logprob_per_word = tot_like / tot_count
        like_change = sum(float(f) for f in fields[2:])
        like_change_per_word = like_change / tot_count
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))
    command = 'merge-float-counts {0} >{1}/float.all'.format(
        float_star, work_out)
    log_file = work_out + '/log/merge_float_counts.log'
    RunCommand(command, log_file, args.verbose == 'true')
    for f in float_star.split():
        os.remove(f)
    # stats.all, protected.all and num_ngrams are unchanged by this step,
    # so just soft-link them from work_in.
    SoftLink(work_in + "/stats.all", work_out + "/stats.all")
    SoftLink(work_in + "/protected.all", work_out + "/protected.all")
    SoftLink(work_in + "/num_ngrams", work_out + "/num_ngrams")
    return like_change_per_word
def GetNames(text_dir, int_dir):
    """Writes the list of dataset names to {int_dir}/names by running
    get_names.py on {text_dir}."""
    command = "get_names.py {0} > {1}/names".format(text_dir, int_dir)
    log_file = "{int_dir}/log/get_names.log".format(int_dir=int_dir)
    # fix: pass the verbose flag, for consistency with every other
    # RunCommand call in these scripts (it was previously omitted here).
    RunCommand(command, log_file, args.verbose == 'true')
def GetCountsMultiProcess(source_int_dir, dest_count_dir, ngram_order, n,
                          num_proc, max_mem, num_splits=0):
    """Like GetCountsSingleProcess, but distributes the counting over
    {num_proc} processes connected via named pipes (fifos).

    Falls back to the single-process path on Cygwin (fifos don't work
    there), when num_proc <= 1, or when the input is small (< 1MB).
    """
    try:
        file_size = os.path.getsize('{0}/{1}.txt.gz'.format(
            source_int_dir, n))
    except OSError:
        # fix: was a bare 'except:'; only filesystem errors are expected.
        ExitProgram('get_counts.py: error getting file size of '
                    '{0}/{1}.txt.gz'.format(source_int_dir, n))
    if IsCygwin() or num_proc <= 1 or file_size < 1000000:
        if num_proc > 1 and file_size >= 1000000:
            # it's only because of Cygwin that we're not using multiple
            # processes, so this merits a warning.
            print(
                "get_counts.py: cygwin platform detected so named pipes won't work; "
                "using a single process (will be slower)")
        return GetCountsSingleProcess(source_int_dir, dest_count_dir,
                                      ngram_order, n, max_mem, num_splits)

    if num_splits == 0:
        int_counts_output = "/dev/null " + " ".join([
            "{0}/int.{1}.{2}".format(dest_count_dir, n, o)
            for o in range(2, ngram_order + 1)
        ])
    else:
        assert num_splits >= 1
        int_counts_output = '/dev/stdout | split-int-counts ' + \
            ' '.join(["{0}/int.{1}.split{2}".format(dest_count_dir, n, j)
                      for j in range(1, num_splits + 1)])
    try:
        # we want a temporary directory on a local file system for the
        # fifos.
        tempdir = tempfile.mkdtemp()
    except Exception as e:
        ExitProgram("Error creating temporary directory: " + str(e))

    # This has several pipes for the internal processing that write to and
    # read from other internal pipes; and we can't do this using '|' in the
    # shell, we need to use mkfifo. This does not work properly on cygwin.

    log_dir = "{dest_count_dir}/log".format(dest_count_dir=dest_count_dir)
    # remove stale per-process error-marker files from any previous run.
    # fix: plain loop instead of a list comprehension used for side effect.
    for x in glob.glob("{log_dir}/.{n}.*.error".format(log_dir=log_dir,
                                                       n=n)):
        os.remove(x)

    log_file = "{log_dir}/get_counts.{n}.log".format(log_dir=log_dir, n=n)

    test_command = "bash -c 'set -o pipefail; (echo a; echo b) | "\
                   "distribute-input-lines /dev/null /dev/null'"
    # We run the following command just to make sure distribute-input-lines
    # is on the path and compiled, since we get hard-to-debug errors if it
    # fails.
    RunCommand(test_command, log_file)
    if max_mem == '':
        mem_opt = ''
    else:
        mem_opt = "--buffer-size={0}".format(
            DivideMemory(max_mem, num_proc + 1))
    # we use "bash -c '...'" to make sure it gets run in bash, since
    # for example 'set -o pipefail' would only work in bash.
    command = (
        "bash -c 'set -o pipefail; set -e; export LC_ALL=C; mkdir -p {0}; ".
        format(tempdir) + ''.join(
            ['mkfifo {0}/{1}; '.format(tempdir, p)
             for p in range(num_proc)]) +
        'trap "rm -r {0}" SIGINT SIGKILL SIGTERM EXIT; '.format(tempdir) +
        'gunzip -c {0}/{1}.txt.gz | distribute-input-lines '.format(
            source_int_dir, n) +
        ' '.join(['{0}/{1}'.format(tempdir, p) for p in range(num_proc)]) +
        '& ' + 'sort -m {0} '.format(mem_opt) + ' '.join([
            '<(get-text-counts {4} {0} <{1}/{2} | sort {3} || touch {5}/.{6}.{2}.error)'
            .format(
                ngram_order, tempdir, p, mem_opt, "--limit-unk-history"
                if args.limit_unk_history == 'true' else "", log_dir, n)
            for p in range(num_proc)
        ]) + '| uniq -c | get-int-counts {0}'.format(int_counts_output) +
        "'")  # end the quote from the 'bash -c'.
    RunCommand(command, log_file, args.verbose == 'true')
    # any .{n}.{p}.error marker means one of the sub-pipelines failed.
    if glob.glob("{log_dir}/.{n}.*.error".format(log_dir=log_dir, n=n)):
        ExitProgram(
            "Something went wrong for the get-text-counts or sort command for training set {n}."
            .format(n=n))
# Stage: word counts.
word_counts_dir = os.path.join(work_dir, 'word_counts')
# abort early if the text directory does not validate.
if os.system("validate_text_dir.py " + args.text_dir) != 0:
    sys.exit(1)
# collect the data files whose timestamps determine whether the cached
# word counts are still current.
last_done_files = []
for f in os.listdir(args.text_dir):
    if f.endswith(".txt") or f.endswith(".txt.gz"):
        last_done_files.append(os.path.join(args.text_dir, f))
done_file = os.path.join(word_counts_dir, '.done')
# NOTE(review): CheckFreshness appears to return True when the work needs
# (re)doing, i.e. the .done marker is missing or older than
# last_done_files — confirm against its definition.
if not CheckFreshness(done_file, last_done_files):
    LogMessage("Skip getting word counts")
else:
    log_file = os.path.join(log_dir, 'get_word_counts.log')
    LogMessage("Getting word counts... log in " + log_file)
    command = "get_word_counts.py {0} {1}".format(args.text_dir,
                                                  word_counts_dir)
    RunCommand(command, log_file, args.verbose == 'true')
    TouchFile(done_file)

# get unigram weights
unigram_weights = os.path.join(args.text_dir, 'unigram_weights')
# this stage depends only on the word-counts stage's .done marker.
last_done_files = [done_file]
done_file = os.path.join(work_dir, '.unigram_weights.done')
if not CheckFreshness(done_file, last_done_files):
    LogMessage("Skip getting unigram weights")
else:
    log_file = os.path.join(log_dir, 'get_unigram_weights.log')
    LogMessage("Getting unigram weights... log in " + log_file)
    command = "get_unigram_weights.py {0} > {1}".format(
        word_counts_dir, unigram_weights)
    RunCommand(command, log_file, args.verbose == 'true')
    TouchFile(done_file)
def GetObjfAndDeriv(x):
    """Computes the (negated, since we minimize) objective function and its
    derivative at the unconstrained metaparameter vector x.

    Side effects: writes {iteration}.{metaparams,derivs,objf,log} under
    args.optimize_dir, increments the global 'iteration', and records the
    first negated objective value in the global 'value0'.
    """
    global iteration
    # map from the unconstrained to the constrained metaparameter space.
    y = UnconstrainedToConstrained(x)
    metaparameter_file = "{0}/{1}.metaparams".format(args.optimize_dir,
                                                     iteration)
    deriv_file = "{0}/{1}.derivs".format(args.optimize_dir, iteration)
    objf_file = "{0}/{1}.objf".format(args.optimize_dir, iteration)
    log_file = "{0}/{1}.log".format(args.optimize_dir, iteration)
    changed_or_new = WriteMetaparameters(metaparameter_file, y)
    # NOTE(review): prev_metaparameter_file is computed but never used in
    # this function — possibly left over from an earlier version.
    prev_metaparameter_file = "{0}/{1}.metaparams".format(
        args.optimize_dir, iteration - 1)
    enable_caching = True  # if true, enable re-use of files from a previous run.
    # re-use cached objf/derivs when the metaparameters did not change and
    # the cached derivatives are newer than the metaparameter file.
    if enable_caching and (not changed_or_new and os.path.exists(deriv_file)
                           and os.path.exists(objf_file) and
                           os.path.getmtime(deriv_file) >
                           os.path.getmtime(metaparameter_file)):
        print(
            "optimize_metaparameters.py: using previously computed objf and deriv "
            "info from {0} and {1} (presumably you are rerunning after a partially "
            "finished run)".format(deriv_file, objf_file),
            file=sys.stderr)
    else:
        # we need to call get_objf_and_derivs.py
        command = (
            "get_objf_and_derivs{maybe_split}.py {split_opt} --cleanup={cleanup} --derivs-out={derivs} {counts} {metaparams} "
            "{objf} {work}".format(
                derivs=deriv_file,
                counts=args.count_dir,
                metaparams=metaparameter_file,
                maybe_split="_split" if args.num_splits > 1 else "",
                split_opt=("--num-splits={0}".format(args.num_splits)
                           if args.num_splits > 1 else ""),
                cleanup=args.cleanup,
                objf=objf_file,
                work=args.optimize_dir + "/work"))
        RunCommand(command, log_file, verbose=True)
    df_dy = ReadMetaparametersOrDerivs(deriv_file)
    objf = ReadObjf(objf_file)
    iteration += 1
    # map the derivative back into the unconstrained space.
    (x2, df_dx) = ConstrainedToUnconstrained(y, df_dy)
    # check that x == x2, we just changed variables back and forth so it
    # should be the same.
    if math.sqrt(np.dot(x - x2, x - x2)) > 0.001:
        print(
            "optimize_metaparameters.py: warning: difference {0} versus {1}\n".
            format(x, x2))
    print("Evaluation %d: objf=%.6f, deriv-magnitude=%.6f " %
          (iteration, objf, math.sqrt(np.vdot(df_dx, df_dx))),
          file=sys.stderr)
    # we need to negate the objective function and derivatives, since we are
    # minimizing.
    scale = -1.0
    global value0
    if value0 is None:
        value0 = objf * scale
    return (objf * scale, df_dx * scale)
def RunPruneStep(work_in, work_out, threshold):
    """Runs one pruning step with the given threshold.

    Reads float.all and protected.all from work_in; writes pruned
    float-counts (and stats / protected counts) into work_out; updates the
    global current_num_xgrams; returns the likelihood change per word due
    to the pruning.
    """
    # set float_star = 'work_out/float.1 work_out/float.2 ...'
    float_star = " ".join([
        '{0}/float.{1}'.format(work_out, n) for n in range(1, ngram_order + 1)
    ])
    # create work_out/float.{1,2,..}
    log_file = work_out + '/log/float_counts_prune.log'
    command = (
        "float-counts-prune {threshold} {num_words} {work_in}/float.all "
        "{work_in}/protected.all {float_star} 2>>{log_file}".format(
            threshold=threshold,
            num_words=num_words,
            work_in=work_in,
            float_star=float_star,
            log_file=log_file))
    # record the command at the top of the log file before running it.
    with open(log_file, 'w', encoding="utf-8") as f:
        print("# " + command, file=f)
    try:
        print(command, file=sys.stderr)
        # run float-counts-prune directly (not via RunCommand) because we
        # need to parse its stdout line by line.
        p = subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             shell=True,
                             universal_newlines=True)
        # line 1: total word count and total likelihood change.
        [word_count, like_change] = p.stdout.readline().split()
        like_change_per_word = float(like_change) / float(word_count)
        # line 2: n-gram totals — total, shadowed, protected, pruned.
        [tot_xgrams, shadowed, protected, pruned] = \
            p.stdout.readline().split()
        # line 3: per-order n-gram counts after pruning.
        num_ngrams = p.stdout.readline().split()
        # no further output is expected.
        assert p.stdout.readline() == ''
        ret = p.wait()
        assert ret == 0
        global current_num_xgrams
        current_num_xgrams = int(tot_xgrams) - int(pruned)
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))
    WriteNumNgrams(work_out, num_ngrams)
    if args.remove_zeros == 'false':
        # create work_out/float.all.
        command = 'merge-float-counts {0} >{1}/float.all'.format(
            float_star, work_out)
        log_file = work_out + '/log/merge_float_counts.log'
        RunCommand(command, log_file, args.verbose == 'true')
        for f in float_star.split():
            os.remove(f)
        # soft-link work_out/stats.all to work_in/stats.all
        SoftLink(work_in + "/stats.all", work_out + "/stats.all")
    else:
        # in this case we pipe the output of merge-float-counts into
        # float-counts-stats-remove-zeros.
        # set stats_star = 'work_out/stats.1 work_out/stats.2 ..'
        stats_star = " ".join([
            '{0}/stats.{1}'.format(work_out, n)
            for n in range(1, ngram_order + 1)
        ])
        command = (
            'merge-float-counts {float_star} | float-counts-stats-remove-zeros '
            '{num_words} /dev/stdin {work_in}/stats.all {work_out}/float.all '
            '{stats_star}'.format(num_words=num_words,
                                  float_star=float_star,
                                  work_in=work_in,
                                  work_out=work_out,
                                  stats_star=stats_star))
        log_file = work_out + '/log/remove_zeros.log'
        RunCommand(command, log_file, args.verbose == 'true')
        # create work_out/stats.all
        command = 'merge-float-counts {0} >{1}/stats.all'.format(
            stats_star, work_out)
        log_file = work_out + '/log/merge_float_counts.log'
        RunCommand(command, log_file, args.verbose == 'true')
        # the per-order float and stats pieces are no longer needed.
        for f in float_star.split() + stats_star.split():
            os.remove(f)
    # create work_out/protected.all
    CreateProtectedCounts(work_out)
    return like_change_per_word
def DiscountCountsOrder1():
    """Applies the unigram discounting: reads {work_dir}/discount.1 and
    writes the order-1 float-counts to {work_dir}/float.1."""
    command = ("discount-counts-1gram {num_words} "
               "<{work}/discount.1 >{work}/float.1".format(
                   num_words=num_words, work=args.work_dir))
    log_file = "{0}/log/discount_counts_order1.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')