def CombineNumNgrams():
    """Sum the per-split n-gram counts into one total and write it out.

    Reads '<split_work_dir>/<split>/num_ngrams' for each split (format:
    one '<order> <count>' pair per line).  Unigram counts must be
    identical across all splits; counts of higher orders are summed.
    The totals are written to args.work_dir via WriteNumNgrams().
    """
    tot_nums = []
    for split_index in range(1, args.num_splits + 1):
        this_split_work = "{0}/{1}".format(split_work_dir, split_index)
        num_file = this_split_work + "/num_ngrams"
        try:
            # 'with' guarantees the file is closed even if parsing fails.
            with open(num_file, "r") as f:
                for order, line in enumerate(f):
                    num = int(line.split()[1])
                    assert num > 0
                    if order == 0:
                        # the number of unigrams must agree across splits.
                        if len(tot_nums) == 0:
                            tot_nums.append(num)
                        elif tot_nums[0] != num:
                            ExitProgram("get_objf_and_derivs_split.py: "
                                        "num-unigrams are not identical")
                    else:
                        if len(tot_nums) < order + 1:
                            tot_nums.append(num)
                        else:
                            tot_nums[order] += num
        except (OSError, ValueError, IndexError, AssertionError):
            # narrow except: the original bare 'except' also caught the
            # SystemExit raised by ExitProgram above, misreporting a
            # unigram mismatch as a read error.
            ExitProgram("get_objf_and_derivs_split.py: error reading "
                        "num-ngrams from: " + num_file)
    WriteNumNgrams(args.work_dir, tot_nums)
def GetInitialLogprob():
    """Compute the model's log-probability per word before any pruning.

    Runs float-counts-estimate with all per-order outputs directed to
    /dev/null (we only want the objective value printed on stdout) and
    stores the result in the global initial_logprob_per_word.
    """
    work0dir = work_dir + "/step0"
    # no output needed, so the per-order float-count outputs go to /dev/null.
    float_star = ' '.join(['/dev/null' for n in range(1, ngram_order + 1)])
    command = ('float-counts-estimate {num_words} {work0dir}/float.all '
               '{work0dir}/stats.all {float_star} '.format(
                   num_words=num_words, work0dir=work0dir,
                   float_star=float_star))
    try:
        print(command, file=sys.stderr)
        p = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True,
                             universal_newlines=True)
        # the stdout of this program will be something like:
        # 1.63388e+06 -7.39182e+06 10.5411 41.237 49.6758
        # representing: total-count, total-like, and for each order, the
        # like-change for that order.
        line = p.stdout.readline()
        print(line, file=sys.stderr)
        a = line.split()
        tot_count = float(a[0])
        tot_like = float(a[1])
        like_change = 0.0
        logprob_per_word = tot_like / tot_count
        for i in range(2, len(a)):  # for each n-gram order
            like_change += float(a[i])
        like_change_per_word = like_change / tot_count
        assert like_change_per_word < 0.0001  # should be exactly zero.
        # bug fix: wait for the child process and check its exit status;
        # previously the Popen object was abandoned, so a failing command
        # could go unnoticed (and leave a zombie process).
        ret = p.wait()
        assert ret == 0
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))
    global initial_logprob_per_word
    initial_logprob_per_word = logprob_per_word
def MergeCountsBackward(split_index, order):
    """Backprop counterpart of count-merging for one split and order > 1.

    Runs merge-counts-backward, which prints one derivative per training
    set (w.r.t. that set's scaling factor); these are normalized by the
    total dev-set word count and accumulated into the global scale_derivs.
    """
    global scale_derivs
    # merge counts of the specified order > 1; the backprop phase.
    assert order > 1
    command = "merge-counts-backward {swork}/{s}/merged.{order} {swork}/{s}/merged_derivs.{order} ".format(
        swork=split_work_dir, s=split_index, order=order)
    for n in range(1, num_train_sets + 1):
        command += " {split_counts}/{s}/int.{train_set}.{order} {scale}".format(
            split_counts=split_count_dir, s=split_index, train_set=n,
            order=order, scale=train_set_scale[n])
    # for orders less than the highest order, we also have to include the
    # discount counts from the one-higher order, and provide a filename
    # for it to output the derivatives w.r.t. that file.
    if order < ngram_order:
        command += " {swork}/{s}/discount.{order} {swork}/{s}/discount_derivs.{order}".format(
            swork=split_work_dir, s=split_index, order=order)
    log_file = "{0}/log/merge_counts_backward.{1}.{2}.log".format(
        args.work_dir, split_index, order)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        this_scale_derivs = [
            float(n) / num_dev_set_words_total for n in output.split()]
        # bug fix: validate the length of the freshly-parsed derivatives,
        # not of the global accumulator (which trivially always has
        # num_train_sets entries).
        assert len(this_scale_derivs) == num_train_sets
        # the scaling factors are applied for each order > 1, and the
        # derivatives will be a sum over the derivatives for each of these
        # orders (and also a sum over the different split-directories).
        for n in range(num_train_sets):
            scale_derivs[n] += this_scale_derivs[n]
    except (ValueError, AssertionError):
        ExitProgram("get_objf_and_derivs_split.py: unexpected output from command:"
                    + output)
def FindThreshold(initial_threshold):
    """Search for a pruning threshold that reaches args.target_num_ngrams.

    Each iteration runs one prune step followed by two EM steps; after
    the prune step, PruneSizeModel decides (based on the resulting
    current_num_xgrams) whether to declare success, overshoot, backtrack
    to an earlier iteration, or continue with a new threshold.

    Returns a tuple (threshold, iteration); (0.0, 0) signals that the
    very first prune already overshot the target.
    """
    global initial_num_xgrams, current_num_xgrams, num_unigrams, steps
    global logprob_changes, effective_logprob_changes
    model = PruneSizeModel(num_unigrams, args.target_num_ngrams,
                           args.target_lower_threshold,
                           args.target_upper_threshold)
    # model.SetDebug(True)
    model.SetInitialThreshold(initial_threshold, initial_num_xgrams)
    cur_threshold = initial_threshold
    backtrack_iter = 0
    step = 0
    # This maps a iter-index to the step-index of the last step of that
    # iteration.
    iter2step = [ 0 ]
    while True:
        steps += ['prune*1.0']
        # read input from the step directory of the iteration we (possibly)
        # backtracked to; backtrack_iter is 0 until a backtrack happens.
        logprob_change = RunStep(step, cur_threshold,
                                 in_step=iter2step[backtrack_iter])
        logprob_changes.append(logprob_change)
        effective_logprob_changes.append(logprob_change)
        thresholds.append(cur_threshold)
        step += 1
        (action, arguments) = model.GetNextAction(current_num_xgrams)
        if action == 'overshoot':
            return (0.0, 0)
        if action == 'backtrack':
            # drop the logprob changes of the abandoned iterations; this
            # (failed) iteration gets a -1 placeholder in iter2step since
            # its step index must never be used as a backtrack target.
            (cur_threshold, backtrack_iter) = arguments
            assert (iter2step[backtrack_iter] > 0)
            del effective_logprob_changes[iter2step[backtrack_iter]:]
            iter2step.append(-1)
            continue
        # EM steps
        steps += 'EM EM'.split()
        while step < len(steps):
            logprob_change = RunStep(step, 0.0)
            logprob_changes.append(logprob_change)
            effective_logprob_changes.append(logprob_change)
            step += 1
        iter2step.append(step)
        if action == 'success':
            return (cur_threshold, model.iter)
        # action == 'continue':
        if model.iter > args.max_iter:
            ExitProgram(
                "Too many iterations, please set a higher --initial-threshold and rerun."
            )
        cur_threshold = arguments
        backtrack_iter = model.iter
def CopyMetaInfo(source_int_dir, dest_count_dir):
    """Copy the metadata files of an int-dir into a count-dir.

    Copies 'num_train_sets', 'num_words', 'names' and 'words.txt';
    exits with an error message if any copy fails.
    """
    for f in ['num_train_sets', 'num_words', 'names', 'words.txt']:
        # compute the paths outside the 'try' so the error message below
        # always has well-defined values.
        src = source_int_dir + os.path.sep + f
        dest = dest_count_dir + os.path.sep + f
        try:
            shutil.copy(src, dest)
        except OSError:
            # narrow except: a bare 'except' would also swallow
            # KeyboardInterrupt and programming errors.
            ExitProgram('error copying {0} to {1}'.format(src, dest))
def SoftLink(src, dest):
    """Create (or replace) a symlink 'dest' pointing to abspath(src).

    Any existing file or link at 'dest' is removed first; exits with an
    error message if the link cannot be created.
    """
    # lexists (not exists) so that a dangling symlink is also replaced.
    if os.path.lexists(dest):
        os.remove(dest)
    try:
        os.symlink(os.path.abspath(src), dest)
    except OSError:
        # narrow except: os.symlink failures surface as OSError.
        ExitProgram("error linking {0} to {1}".format(
            os.path.abspath(src), dest))
def WriteNumNgrams(out_dir, num_ngrams):
    """Write per-order n-gram counts to <out_dir>/num_ngrams.

    One line per order, formatted '<order> <count>' with orders
    starting at 1.
    """
    out_file = out_dir + "/num_ngrams"
    try:
        # 'with' guarantees the file is closed (and flushed) even if a
        # write fails part-way through.
        with open(out_file, "w") as f:
            for order, num in enumerate(num_ngrams):
                print(str(order + 1) + ' ' + str(num), file=f)
    except OSError:
        # narrow except: the original bare 'except' hid unrelated errors.
        ExitProgram("get_objf_and_derivs_split.py: error writing num-ngrams to: "
                    + out_file)
def WriteNumNgrams(out_dir, num_ngrams):
    """Write per-order n-gram counts to <out_dir>/num_ngrams.

    One line per order, formatted '<order> <count>' with orders
    starting at 1.
    """
    out_file = out_dir + "/num_ngrams"
    try:
        # 'with' guarantees the file is closed (and flushed) even if a
        # write fails part-way through.
        with open(out_file, "w", encoding="utf-8") as f:
            for order, num in enumerate(num_ngrams):
                print(str(order + 1) + ' ' + str(num), file=f)
    except OSError:
        # narrow except: the original bare 'except' hid unrelated errors.
        ExitProgram("error writing num-ngrams to: " + out_file)
def SaveNgramOrder(dest_count_dir, ngram_order):
    """Record the n-gram order in <dest_count_dir>/ngram_order.

    ngram_order must be at least 2 (a unigram LM is not supported here).
    """
    # validate before opening, so a bad order can't leak an open handle.
    assert ngram_order >= 2
    try:
        with open('{0}/ngram_order'.format(dest_count_dir), 'w') as f:
            print(ngram_order, file=f)
    except OSError:
        ExitProgram('error opening file {0}/ngram_order for writing'.format(
            dest_count_dir))
def WriteObjectiveFunction():
    """Print the per-word objective to stderr and write it to args.objf_out."""
    objf = loglike_total / num_dev_set_words_total
    print("get_objf_and_derivs_split.py: objf is {0} over {1} "
          "words".format(objf, num_dev_set_words_total), file=sys.stderr)
    # Write the objective function.
    try:
        # 'with' ensures the file is closed even if the write fails.
        with open(args.objf_out, "w") as f:
            print(str(objf), file=f)
    except OSError:
        # narrow except: a bare 'except' would hide unrelated errors.
        ExitProgram("get_objf_and_derivs_split.py: error writing objective function to: "
                    + args.objf_out)
def ParseNumNgrams(out_dir, merge_all_orders_log):
    """Extract per-order n-gram counts from a merge-all-orders log file.

    Scans the log for a line like 'Write <num1> + <num2> = <tot>
    individual n-grams.' (or just '<num1>' for a unigram model), takes
    the numbers on the left of '=', and writes them to
    <out_dir>/num_ngrams via WriteNumNgrams().
    """
    num_ngrams = []
    try:
        # 'with' guarantees the log file is closed even on error.
        with open(merge_all_orders_log, "r") as f:
            for line in f:
                if line[0] == '#':
                    continue
                m = re.search('Write (.*) individual n-grams.', line)
                if m:
                    # The matched string should be 'num1 + num2 = tot' or
                    # just 'num1' for a unigram model.
                    nums_str = m.group(1).split('=')[0]
                    nums_str = nums_str.strip()
                    num_ngrams = re.split(r'[\+| ]+', nums_str)
    except (OSError, IndexError):
        # narrow except: the original bare 'except' hid unrelated errors.
        ExitProgram("get_objf_and_derivs_split.py: error reading merge_all_orders_log from: "
                    + merge_all_orders_log)
    if len(num_ngrams) == 0:
        ExitProgram("get_objf_and_derivs_split.py: error parsing num_ngrams from: "
                    + merge_all_orders_log)
    WriteNumNgrams(out_dir, num_ngrams)
def GetNumTrainSets(int_dir):
    """Return num_train_sets read from the file at path 'int_dir'.

    Note (review): despite the parameter name, 'int_dir' is opened
    directly, so it is the path of the file itself; its first line is
    expected to look like '<num_train_sets> <name>'.
    """
    # only the first line matters; the original looped over all lines but
    # returned inside the first iteration, and silently returned None on
    # an empty file.
    with open(int_dir) as f:
        line = f.readline()
    try:
        a = line.split()
        assert len(a) == 2
        ans = int(a[0])
    except (AssertionError, ValueError):
        # narrow except (a bare 'except' would also hide real bugs);
        # an empty file now also reaches this clean error path.
        ExitProgram("failed to get the num_train_sets from {0}".format(
            int_dir))
    return ans
def GetNumWords(vocab):
    """Return the highest word-id: the 2nd field of the last line of 'vocab'.

    The vocab file is expected to contain '<word> <id>' lines with ids
    in increasing order, so the last line carries the largest id.
    """
    # read the last line in pure Python instead of shelling out to
    # 'tail -n 1' with shell=True: portable and no subprocess needed.
    line = ''
    with open(vocab) as f:
        for line in f:
            pass
    try:
        a = line.split()
        assert len(a) == 2
        ans = int(a[1])
    except (AssertionError, ValueError):
        # narrow except: also reached for an empty file (line == '').
        ExitProgram("failed to get the num_words from {0}".format(vocab))
    return ans
def DivideMemory(total, n):
    """Divide a sort(1)-style memory spec 'total' into n equal shares.

    'total' is parsed by ParseMemoryString() into (value, unit).  When
    'value' divides evenly by n the unit is kept; otherwise the value is
    converted to the next-smaller unit (G->M, M->K, K->b) so the division
    stays exact.  Exits with an error for byte/percent units that cannot
    be subdivided, or for unknown units.
    """
    (value, unit) = ParseMemoryString(total)
    # bug fix: the original tested 'value / n != float(value) / n', which
    # under Python 3's true division is never unequal, so the conversion
    # branch was dead code and results were silently truncated.  Use an
    # explicit divisibility test and integer division instead.
    if value % n == 0:
        sub_memory = value // n
    else:
        if unit in ['K', 'k', '']:
            sub_memory = value * 1024 // n
            unit = 'b'
        elif unit in ['M', 'm']:
            sub_memory = value * 1024 // n
            unit = 'K'
        elif unit in ['G', 'g']:
            sub_memory = value * 1024 // n
            unit = 'M'
        elif (unit in ['B', 'b', '%']) and (value // n == 0):
            ExitProgram("max_memory for each of the {0} train sets is {1}{2}."
                        "Please reset a larger max_memory value".format(
                            n, float(value) / n, unit))
        else:
            ExitProgram("Invalid format for max_memory. "
                        "Please 'man sort' to see how to set buffer size.")
    return str(int(sub_memory)) + unit
def GetNumWords(lm_dir_in):
    """Return the highest word-id from the last line of <lm_dir_in>/words.txt.

    words.txt contains '<word> <id>' lines with ids in increasing order,
    so the last line carries the largest id.
    """
    command = "tail -n 1 {0}/words.txt".format(lm_dir_in)
    line = subprocess.check_output(command, shell=True,
                                   universal_newlines=True)
    try:
        a = line.split()
        assert len(a) == 2
        ans = int(a[1])
    except (AssertionError, ValueError):
        # narrow except: the original bare 'except' also hid real bugs.
        ExitProgram("error: unexpected output '{0}' from command {1}".format(
            line, command))
    return ans
def WriteDerivs():
    """Write the metaparameter derivatives to args.derivs_out.

    One line per metaparameter: the per-training-set count scales
    (count_scale_N), then D1..D4 for each order >= 2.
    """
    try:
        f = open(args.derivs_out, "w")
    except OSError:
        ExitProgram("get_objf_and_derivs_split.py: error opening --derivs-out={0} for writing".format(
            args.derivs_out))
    # 'with f' ensures the file is closed (and flushed) even if one of
    # the writes below raises.
    with f:
        for n in range(num_train_sets):
            print("count_scale_{0} {1}".format(n + 1, scale_derivs[n]), file=f)
        for o in range(2, ngram_order + 1):
            print("order{0}_D1 {1}".format(o, d1_deriv[o]), file=f)
            print("order{0}_D2 {1}".format(o, d2_deriv[o]), file=f)
            print("order{0}_D3 {1}".format(o, d3_deriv[o]), file=f)
            print("order{0}_D4 {1}".format(o, d4_deriv[o]), file=f)
def RunStep(step_number, threshold, in_step=None):
    """Run one pipeline step and return its log-like change per word.

    step_number: index into the global 'steps' list; the step's output
        directory is work_dir/step<step_number+1>.
    threshold: pruning threshold, scaled by the 'prune*SCALE' step text
        (ignored for 'EM' steps).
    in_step: if given, read input from work_dir/step<in_step> instead of
        work_dir/step<step_number> (used when backtracking).  This
        explicit keyword parameter replaces the old '**kwargs' interface
        and remains compatible with callers passing in_step=...
    """
    if in_step is not None:
        work_in = work_dir + "/step" + str(in_step)
    else:
        work_in = work_dir + "/step" + str(step_number)
    work_out = work_dir + "/step" + str(step_number + 1)
    if not os.path.isdir(work_out + "/log"):
        os.makedirs(work_out + "/log")
    step_text = steps[step_number]
    if step_text[0:6] == 'prune*':
        try:
            scale = float(step_text[6:])
            assert scale != 0.0
        except (ValueError, AssertionError):
            # narrow except: a bare 'except' also hid unrelated errors.
            ExitProgram("invalid step (wrong --steps "
                        "option): '{0}'".format(step_text))
        return RunPruneStep(work_in, work_out, threshold * scale)
    elif step_text == 'EM':
        return RunEmStep(work_in, work_out)
    else:
        ExitProgram("invalid step (wrong --steps "
                    "option): '{0}'".format(step_text))
def FinalizeOutput(final_work_out):
    """Install the final pruned model from its work dir into args.lm_dir_out.

    Moves float.all, copies num_ngrams plus the metadata files from
    args.lm_dir_in, writes was_pruned=true, and removes any stale
    num_splits file left over from a previous model.
    """
    try:
        shutil.move(final_work_out + "/float.all",
                    args.lm_dir_out + "/float.all")
    except OSError:
        # narrow except (shutil.Error subclasses OSError).
        ExitProgram("error moving {0}/float.all to {1}/float.all".format(
            final_work_out, args.lm_dir_out))
    try:
        shutil.copy(final_work_out + "/num_ngrams",
                    args.lm_dir_out + "/num_ngrams")
    except OSError:
        ExitProgram("error copying {0}/num_ngrams to {1}/num_ngrams".format(
            final_work_out, args.lm_dir_out))
    # mark the output directory as containing a pruned model; 'with'
    # guarantees the marker file is closed.
    with open(args.lm_dir_out + "/was_pruned", "w", encoding="utf-8") as f:
        print("true", file=f)
    # use a distinct loop-variable name; the original reused 'f', shadowing
    # the file-handle name above.
    for name in ['names', 'words.txt', 'ngram_order', 'metaparameters']:
        try:
            shutil.copy(args.lm_dir_in + "/" + name,
                        args.lm_dir_out + "/" + name)
        except OSError:
            ExitProgram("error copying {0}/{1} to {2}/{1}".format(
                args.lm_dir_in, name, args.lm_dir_out))
    # a leftover num_splits would misdescribe the (unsplit) output model.
    if os.path.exists(args.lm_dir_out + "/num_splits"):
        os.remove(args.lm_dir_out + "/num_splits")
def RunEmStep(work_in, work_out):
    """Run one EM re-estimation step from work_in into work_out.

    Runs float-counts-estimate to produce per-order float counts and the
    objective value, merges them into work_out/float.all, and soft-links
    the unchanged stats/protected/num_ngrams files from work_in.  Updates
    the global final_logprob_per_word and returns the total
    log-likelihood change per word for this step.
    """
    # set float_star = 'work_out/float.1 work_out/float.2 ...'
    float_star = " ".join([
        '{0}/float.{1}'.format(work_out, n) for n in range(1, ngram_order + 1)
    ])
    command = (
        'float-counts-estimate {num_words} {work_in}/float.all {work_in}/stats.all '
        '{float_star}'.format(num_words=num_words, work_in=work_in,
                              float_star=float_star))
    log_file = work_out + "/log/float_counts_estimate.log"
    try:
        output = GetCommandStdout(command, log_file, args.verbose == 'true')
        # the stdout of this program will be something like:
        # 1.63388e+06 -7.39182e+06 10.5411 41.237 49.6758
        # representing: total-count, total-like, and for each order, the
        # like-change for that order.
        a = output.split()
        tot_count = float(a[0])
        tot_like = float(a[1])
        like_change = 0.0
        global final_logprob_per_word
        final_logprob_per_word = tot_like / tot_count
        for i in range(2, len(a)):  # for each n-gram order
            like_change += float(a[i])
        like_change_per_word = like_change / tot_count
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))
    # combine the per-order outputs into a single float.all, then delete
    # the per-order files.
    command = 'merge-float-counts {0} >{1}/float.all'.format(
        float_star, work_out)
    log_file = work_out + '/log/merge_float_counts.log'
    RunCommand(command, log_file, args.verbose == 'true')
    for f in float_star.split():
        os.remove(f)
    # soft-link work_out/stats.all to work_in/stats.all
    SoftLink(work_in + "/stats.all", work_out + "/stats.all")
    # soft-link work_out/protected.all to work_in/protected.all
    SoftLink(work_in + "/protected.all", work_out + "/protected.all")
    SoftLink(work_in + "/num_ngrams", work_out + "/num_ngrams")
    return like_change_per_word
def ComputeObjfAndFinalDerivs(split_index, need_derivs):
    """Run compute-probs on one split's dev data and accumulate totals.

    Adds this split's dev-set word count and total log-likelihood into
    the globals num_dev_set_words_total and loglike_total.  If
    need_derivs is true, compute-probs is also asked to write per-order
    float-count derivatives for this split.
    """
    global num_dev_set_words_total, loglike_total
    command = "compute-probs {swork}/{s}/float.all {scount}/{s}/int.dev ".format(
        swork=split_work_dir, s=split_index, scount=split_count_dir)
    if need_derivs:
        command += " ".join(["{swork}/{s}/float_derivs.{order}".format(
            swork=split_work_dir, s=split_index, order=o)
            for o in range(1, ngram_order + 1)])
    log_file = "{0}/log/compute_objf_and_final_derivs.{1}.log".format(
        args.work_dir, split_index)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        # expect exactly two fields: <num-dev-words> <total-log-like>
        [num_dev_set_words, tot_objf] = output.split()
        num_dev_set_words_total += int(num_dev_set_words)
        loglike_total += float(tot_objf)
    except ValueError:
        # ValueError covers both a wrong field count and failed parses;
        # the original bare 'except' would also hide real bugs.
        ExitProgram("get_objf_and_derivs_split.py: error interpreting the output of compute-probs: "
                    "output was: " + output)
def DiscountCountsBackward(split_index, order):
    """Backprop counterpart of discounting for one split and order > 1.

    Runs discount-counts-backward and accumulates the derivatives of the
    objective w.r.t. this order's discounting constants D1..D4 into the
    global d1_deriv..d4_deriv arrays, normalized per dev-set word.
    """
    # discount counts of the specified order > 1; backprop version.
    assert order > 1
    this_split_work = "{0}/{1}".format(split_work_dir, split_index)
    command = ("discount-counts-backward {d1} {d2} {d3} {d4} {sdir}/merged.{order} {sdir}/float.{order} "
               "{sdir}/float_derivs.{order} {sdir}/discount.{orderm1} {sdir}/discount_derivs.{orderm1} "
               "{sdir}/merged_derivs.{order}".format(
                   d1=d1[order], d2=d2[order], d3=d3[order], d4=d4[order],
                   sdir=this_split_work, order=order, orderm1=order - 1))
    log_file = "{0}/log/discount_counts_backward.{1}.{2}.log".format(
        args.work_dir, split_index, order)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        # expect exactly four whitespace-separated derivative values.
        [deriv1, deriv2, deriv3, deriv4] = output.split()
    except ValueError:
        # narrow except: ValueError is what an unpack mismatch raises.
        ExitProgram("get_objf_and_derivs_split.py: could not parse output of command: "
                    + output)
    d1_deriv[order] += float(deriv1) / num_dev_set_words_total
    d2_deriv[order] += float(deriv2) / num_dev_set_words_total
    d3_deriv[order] += float(deriv3) / num_dev_set_words_total
    d4_deriv[order] += float(deriv4) / num_dev_set_words_total
def CopyFile(src, dest):
    """Copy file 'src' to 'dest', exiting with an error message on failure."""
    try:
        shutil.copy(src, dest)
    except OSError:
        # narrow except: the original bare 'except' also swallowed
        # KeyboardInterrupt and programming errors.
        ExitProgram("prepare_int_data.py: error copying {0} to {1}".format(
            src, dest))
float(value) / n, unit)) else: ExitProgram("Invalid format for max_memory. " "Please 'man sort' to see how to set buffer size.") return str(int(sub_memory)) + unit # make sure 'scripts' and 'src' directory are on the path os.environ['PATH'] = (os.environ['PATH'] + os.pathsep + os.path.abspath(os.path.dirname(sys.argv[0])) + os.pathsep + os.path.abspath(os.path.dirname(sys.argv[0])) + "/../src") if os.system("validate_int_dir.py " + args.source_int_dir) != 0: ExitProgram("command validate_int_dir.py {0} failed".format( args.source_int_dir)) if args.ngram_order < 2: ExitProgram("ngram-order is {0}; it must be at least 2. If you " "want a unigram LM, do it by hand".format(args.ngram_order)) # read the variable 'num_train_sets' # from the corresponding file in source_int_dir This shouldn't fail # because we just called validate_int-dir.py.. f = open(args.source_int_dir + "/num_train_sets") num_train_sets = int(f.readline()) f.close() if not os.path.isdir(args.dest_count_dir): try: os.makedirs(args.dest_count_dir + '/log')
.format(args.derivs_out)) for n in range(num_train_sets): print("count_scale_{0} {1}".format(n + 1, scale_derivs[n]), file=f) for o in range(2, ngram_order + 1): print("order{0}_D1 {1}".format(o, d1_deriv[o]), file=f) print("order{0}_D2 {1}".format(o, d2_deriv[o]), file=f) print("order{0}_D3 {1}".format(o, d3_deriv[o]), file=f) print("order{0}_D4 {1}".format(o, d4_deriv[o]), file=f) f.close() if not os.path.isdir(args.work_dir + "/log"): try: os.makedirs(args.work_dir + "/log") except: ExitProgram("error creating directory {0}/log".format(args.work_dir)) # for n-gram orders down to 2, do the merging and discounting. for o in range(ngram_order, 1, -1): MergeCounts(o) DiscountCounts(o) DiscountCountsOrder1() MergeAllOrders() ComputeObjfAndFinalDerivs(args.derivs_out is not None) if args.derivs_out is None: if args.cleanup == 'true': Cleanup() sys.exit(0)
def RunPruneStep(work_in, work_out, threshold):
    """Run one pruning step: float-counts-prune at 'threshold'.

    Reads the model in work_in, writes the pruned model (plus its
    supporting stats/protected counts) into work_out, updates the global
    current_num_xgrams, and returns the log-likelihood change per word
    caused by the pruning.
    """
    # set float_star = 'work_out/float.1 work_out/float.2 ...'
    float_star = " ".join([
        '{0}/float.{1}'.format(work_out, n) for n in range(1, ngram_order + 1)
    ])
    # create work_out/float.{1,2,..}
    log_file = work_out + '/log/float_counts_prune.log'
    command = (
        "float-counts-prune {threshold} {num_words} {work_in}/float.all "
        "{work_in}/protected.all {float_star} 2>>{log_file}".format(
            threshold=threshold, num_words=num_words, work_in=work_in,
            float_star=float_star, log_file=log_file))
    # write the command at the top of the log; the command itself then
    # appends its stderr to the same file (note the 2>> above).
    with open(log_file, 'w', encoding="utf-8") as f:
        print("# " + command, file=f)
    try:
        print(command, file=sys.stderr)
        p = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True,
                             universal_newlines=True)
        # stdout line 1: <word-count> <like-change>
        [word_count, like_change] = p.stdout.readline().split()
        like_change_per_word = float(like_change) / float(word_count)
        # stdout line 2: <tot-xgrams> <shadowed> <protected> <pruned>
        [tot_xgrams, shadowed, protected, pruned] = p.stdout.readline().split()
        # stdout line 3: per-order n-gram counts after pruning.
        num_ngrams = p.stdout.readline().split()
        assert p.stdout.readline() == ''
        ret = p.wait()
        assert ret == 0
        global current_num_xgrams
        current_num_xgrams = int(tot_xgrams) - int(pruned)
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))
    WriteNumNgrams(work_out, num_ngrams)
    if args.remove_zeros == 'false':
        # create work_out/float.all.
        command = 'merge-float-counts {0} >{1}/float.all'.format(
            float_star, work_out)
        log_file = work_out + '/log/merge_float_counts.log'
        RunCommand(command, log_file, args.verbose == 'true')
        for f in float_star.split():
            os.remove(f)
        # soft-link work_out/stats.all to work_in/stats.all
        SoftLink(work_in + "/stats.all", work_out + "/stats.all")
    else:
        # in this case we pipe the output of merge-float-counts into
        # float-counts-stats-remove-zeros.
        # set stats_star = 'work_out/stats.1 work_out/stats.2 ..'
        stats_star = " ".join([
            '{0}/stats.{1}'.format(work_out, n)
            for n in range(1, ngram_order + 1)
        ])
        command = (
            'merge-float-counts {float_star} | float-counts-stats-remove-zeros '
            '{num_words} /dev/stdin {work_in}/stats.all {work_out}/float.all '
            '{stats_star}'.format(num_words=num_words, float_star=float_star,
                                  work_in=work_in, work_out=work_out,
                                  stats_star=stats_star))
        log_file = work_out + '/log/remove_zeros.log'
        RunCommand(command, log_file, args.verbose == 'true')
        # create work_out/stats.all
        command = 'merge-float-counts {0} >{1}/stats.all'.format(
            stats_star, work_out)
        log_file = work_out + '/log/merge_float_counts.log'
        RunCommand(command, log_file, args.verbose == 'true')
        for f in float_star.split() + stats_star.split():
            os.remove(f)
    # create work_out/protected.all
    CreateProtectedCounts(work_out)
    return like_change_per_word
def GetCountsMultiProcess(source_int_dir, dest_count_dir, ngram_order, n,
                          num_proc, max_mem, num_splits=0):
    """Extract n-gram counts for training set 'n' with multiple processes.

    The gzipped integerized text is fanned out across 'num_proc' named
    pipes (fifos) by distribute-input-lines, counted in parallel with
    get-text-counts, merge-sorted, and piped into get-int-counts.  Falls
    back to GetCountsSingleProcess() on Cygwin (named pipes unusable),
    when num_proc <= 1, or when the input is small (< 1MB).

    num_splits == 0 writes one output file per order into
    dest_count_dir; otherwise the counts are divided into 'num_splits'
    pieces via split-int-counts.
    """
    try:
        file_size = os.path.getsize('{0}/{1}.txt.gz'.format(source_int_dir, n))
    except:
        ExitProgram('get_counts.py: error getting file size of '
                    '{0}/{1}.txt.gz'.format(source_int_dir, n))
    if IsCygwin() or num_proc <= 1 or file_size < 1000000:
        if num_proc > 1 and file_size >= 1000000:
            # it's only because of Cygwin that we're not using multiple
            # processes this merits a warning.
            print(
                "get_counts.py: cygwin platform detected so named pipes won't work; "
                "using a single process (will be slower)")
        return GetCountsSingleProcess(source_int_dir, dest_count_dir,
                                      ngram_order, n, max_mem, num_splits)
    if num_splits == 0:
        int_counts_output = "/dev/null " + " ".join([
            "{0}/int.{1}.{2}".format(dest_count_dir, n, o)
            for o in range(2, ngram_order + 1)
        ])
    else:
        assert num_splits >= 1
        int_counts_output = '/dev/stdout | split-int-counts ' + \
            ' '.join(["{0}/int.{1}.split{2}".format(dest_count_dir, n, j)
                      for j in range(1, num_splits + 1)])
    try:
        # we want a temporary directory on a local file system
        # for
        tempdir = tempfile.mkdtemp()
    except Exception as e:
        ExitProgram("Error creating temporary directory: " + str(e))
    # This has several pipes for the internal processing that write to and read
    # from other internal pipes; and we can't do this using '|' in the shell, we
    # need to use mkfifo. This does not work properly on cygwin.
    log_dir = "{dest_count_dir}/log".format(dest_count_dir=dest_count_dir)
    # clear any stale per-worker error-marker files from a previous run.
    [
        os.remove(x)
        for x in glob.glob("{log_dir}/.{n}.*.error".format(log_dir=log_dir,
                                                           n=n))
    ]
    log_file = "{log_dir}/get_counts.{n}.log".format(log_dir=log_dir, n=n)
    test_command = "bash -c 'set -o pipefail; (echo a; echo b) | "\
        "distribute-input-lines /dev/null /dev/null'"
    # We run the following command just to make sure distribute-input-lines is
    # on the path and compiled, since we get hard-to-debug errors if it fails.
    RunCommand(test_command, log_file)
    if max_mem == '':
        mem_opt = ''
    else:
        # one share of the memory budget per worker, plus one for the
        # final merge-sort ('sort -m').
        mem_opt = "--buffer-size={0}".format(
            DivideMemory(max_mem, num_proc + 1))
    # we use "bash -c '...'" to make sure it gets run in bash, since
    # for example 'set -o pipefail' would only work in bash.
    command = (
        "bash -c 'set -o pipefail; set -e; export LC_ALL=C; mkdir -p {0}; ".
        format(tempdir) + ''.join(
            ['mkfifo {0}/{1}; '.format(tempdir, p) for p in range(num_proc)]) +
        'trap "rm -r {0}" SIGINT SIGKILL SIGTERM EXIT; '.format(tempdir) +
        'gunzip -c {0}/{1}.txt.gz | distribute-input-lines '.format(
            source_int_dir, n) +
        ' '.join(['{0}/{1}'.format(tempdir, p) for p in range(num_proc)]) +
        '& ' + 'sort -m {0} '.format(mem_opt) + ' '.join([
            '<(get-text-counts {4} {0} <{1}/{2} | sort {3} || touch {5}/.{6}.{2}.error)'
            .format(
                ngram_order, tempdir, p, mem_opt, "--limit-unk-history"
                if args.limit_unk_history == 'true' else "", log_dir, n)
            for p in range(num_proc)
        ]) + '| uniq -c | get-int-counts {0}'.format(int_counts_output) + "'"
    )  # end the quote from the 'bash -c'.
    RunCommand(command, log_file, args.verbose == 'true')
    # a worker that failed inside the process substitution touches an
    # error-marker file; check for any of them now.
    if len(glob.glob("{log_dir}/.{n}.*.error".format(log_dir=log_dir,
                                                     n=n))) > 0:
        ExitProgram(
            "Something went wrong for the get-text-counts or sort command for training set {n}."
            .format(n=n))
log_file = "{int_dir}/log/{int}.log".format(int_dir=args.int_dir, int=int) output = GetCommandStdout(command, log_file) # make sure 'scripts', 'scripts/internal', and 'src' directory are on the path os.environ['PATH'] = (os.environ['PATH'] + os.pathsep + os.path.abspath(os.path.dirname(sys.argv[0])) + os.pathsep + os.path.abspath(os.path.dirname(sys.argv[0])) + "/../src" + os.pathsep + os.path.abspath(os.path.dirname(sys.argv[0])) + "/internal") if os.system("validate_text_dir.py " + args.text_dir) != 0: ExitProgram("command validate_text_dir.py {0} failed".format( args.text_dir)) if os.system("validate_vocab.py " + args.vocab) != 0: ExitProgram("command validate_vocab.py {0} failed".format(args.vocab)) if not os.path.exists( os.path.abspath(os.path.dirname(sys.argv[0])) + "/text_to_int.py"): ExitProgram( "prepare_int_data.py: expected text_to_int.py to be on the path") # create the output data directory if not os.path.exists(args.int_dir + "/log"): os.makedirs(args.int_dir + "/log") # remove any old *.int.gz files in the output data directory filelist = [f for f in os.listdir(args.int_dir) if f.endswith(".int.gz")]
parser.add_argument("lm_dir_in", help="Source directory, for the input language model.") parser.add_argument( "lm_dir_out", help="Output directory where the language model is created.") args = parser.parse_args() # Add the script dir and the src dir to the path. os.environ['PATH'] = (os.environ['PATH'] + os.pathsep + os.path.abspath(os.path.dirname(sys.argv[0])) + os.pathsep + os.path.abspath(os.path.dirname(sys.argv[0])) + "/../src") if os.system("validate_lm_dir.py " + args.lm_dir_in) != 0: ExitProgram("failed to validate input LM-dir") # verify the input string max_memory if args.max_memory != '': # valid string max_memory must have at least two items if len(args.max_memory) >= 2: s = args.max_memory # valid string max_memory can be formatted as: # "a positive integer + a letter or a '%'" or "a positive integer" # the unit of memory size can also be 'T', 'P', 'E', 'Z', or 'Y'. They # are not included here considering their rare use in practice if s[-1] in ['b', 'B', '%', 'k', 'K', 'm', 'M', 'g', 'G' ] or s[-1].isdigit(): for x in s[:-1]: if not x.isdigit(): sys.exit(
"{lm_dir}/float.all.{n} | sort {mem_opt} || touch {lm_dir}/.{n}.error)" .format(opt=('--no-unigram' if n > 1 else ''), ngram_order=ngram_order, num_words=num_words, lm_dir=args.lm_dir, n=n, mem_opt=mem_opt) for n in range(1, num_splits + 1) ]) + " | pre-arpa-to-arpa {lm_dir}/words.txt'".format(lm_dir=args.lm_dir)) print("format_arpa_lm.py: running " + command, file=sys.stderr) ret = os.system(command) if ret != 0: sys.exit("format_arpa_lm.py: command {0} exited with status {1}".format( command, ret)) if len(glob.glob("{lm_dir}/.*.error".format(lm_dir=args.lm_dir))) > 0: ExitProgram( "Something went wrong for the float-counts-to-pre-arpa or sort command." ) print("format_arpa_lm.py: succeeded formatting ARPA lm from {0}".format( args.lm_dir), file=sys.stderr) t1 = time.time() print('Total time formatting to ARPA = ' + str(t1 - t0), file=sys.stderr) print('-' * 100, file=sys.stderr)