Ejemplo n.º 1
0
def MergeCountsBackward(split_index, order):
    """Run the backprop phase of count merging for one split and one order.

    Builds and runs a `merge-counts-backward` command for the given split
    directory and n-gram order, then adds the derivatives it prints (one per
    training set, each divided by `num_dev_set_words_total`) into the global
    `scale_derivs` list.

    Args:
        split_index: index of the split sub-directory to operate on.
        order: n-gram order; must be greater than 1.

    Calls ExitProgram() if the command's stdout cannot be parsed as exactly
    `num_train_sets` floating-point values.
    """
    global scale_derivs
    # merge counts of the specified order > 1; the backprop phase.
    assert order > 1

    command = "merge-counts-backward {swork}/{s}/merged.{order} {swork}/{s}/merged_derivs.{order} ".format(
            swork=split_work_dir, s=split_index, order=order)

    for n in range(1, num_train_sets + 1):
        command += " {split_counts}/{s}/int.{train_set}.{order} {scale}".format(
                split_counts=split_count_dir, s=split_index, train_set=n,
                order=order, scale=train_set_scale[n])
    # for orders less than the highest order, we also have to include the
    # discount counts from the one-higher order, and provide a filename
    # for it to output the derivatives w.r.t. that file.
    if order < ngram_order:
        command += " {swork}/{s}/discount.{order} {swork}/{s}/discount_derivs.{order}".format(
                swork=split_work_dir, s=split_index, order=order)

    log_file = "{0}/log/merge_counts_backward.{1}.{2}.log".format(args.work_dir, split_index, order)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        this_scale_derivs = [float(field) / num_dev_set_words_total
                             for field in output.split()]
        # Validate the values we just parsed (not the global accumulator), so
        # that both missing and surplus output fields are reported.
        if len(this_scale_derivs) != num_train_sets:
            raise ValueError("expected {0} derivatives, got {1}".format(
                num_train_sets, len(this_scale_derivs)))
    except (ValueError, ZeroDivisionError):
        ExitProgram("get_objf_and_derivs_split.py: unexpected output from command:" + output)
    else:
        # the scaling factors are applied for each order > 1, and the
        # derivatives will be a sum over the derivatives for each of these
        # orders (and also a sum over the different split-directories).
        for n in range(num_train_sets):
            scale_derivs[n] += this_scale_derivs[n]
Ejemplo n.º 2
0
def ComputeObjfAndFinalDerivs(need_derivs):
    """Run `compute-probs` on the dev counts and record the objective.

    Sets the globals `num_dev_set_words` (int) and `objf` (total objective
    divided by the number of dev-set words), prints a summary to stderr and
    writes `objf` to the file named by `args.objf_out`.

    Args:
        need_derivs: if true, the per-order `float_derivs.N` filenames are
            appended to the command line for orders 1..ngram_order.

    Exits the program via sys.exit() if the command output cannot be parsed
    or the objective file cannot be written.
    """
    global num_dev_set_words, objf
    command = "compute-probs {work}/float.all {counts}/int.dev ".format(
        work=args.work_dir, counts=args.count_dir)
    if need_derivs:
        command += " ".join([
            "{work}/float_derivs.{order}".format(work=args.work_dir, order=n)
            for n in range(1, ngram_order + 1)
        ])
    log_file = "{0}/log/compute_objf_and_final_derivs.log".format(
        args.work_dir)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        # Parse into locals first so the global word count is never left
        # holding an unconverted string.
        [num_words_field, tot_objf_field] = output.split()
        num_dev_set_words = int(num_words_field)
        objf = float(tot_objf_field) / num_dev_set_words
    except (ValueError, ZeroDivisionError):
        sys.exit(
            "get_objf_and_derivs.py: error interpreting the output of compute-probs: "
            "output was: " + output)
    print("get_objf_and_derivs.py: objf is {0} over {1} "
          "words".format(objf, num_dev_set_words),
          file=sys.stderr)
    # Write the objective function.  The context manager guarantees the
    # handle is closed even if the write itself fails.
    try:
        with open(args.objf_out, "w", encoding="utf-8") as f:
            print(str(objf), file=f)
    except Exception:
        # Any failure to produce the output file is fatal for this script.
        sys.exit(
            "get_objf_and_derivs.py: error writing objective function to: " +
            args.objf_out)
Ejemplo n.º 3
0
def DiscountCountsBackward(order):
    """Run the backprop version of count discounting for one n-gram order.

    Runs `discount-counts-backward` with the discounting constants
    d1..d4 for `order`, and stores the four derivatives it prints, each
    divided by `num_dev_set_words`, in d1_deriv[order]..d4_deriv[order].

    Args:
        order: n-gram order; must be greater than 1.

    Exits the program via sys.exit() if the command's stdout is not exactly
    four floating-point values.
    """
    # discount counts of the specified order > 1; backprop version.
    assert order > 1
    command = (
        "discount-counts-backward {d1} {d2} {d3} {d4} {work}/merged.{order} {work}/float.{order} "
        "{work}/float_derivs.{order} {work}/discount.{orderm1} {work}/discount_derivs.{orderm1} "
        "{work}/merged_derivs.{order}".format(d1=d1[order],
                                              d2=d2[order],
                                              d3=d3[order],
                                              d4=d4[order],
                                              work=args.work_dir,
                                              order=order,
                                              orderm1=order - 1))
    log_file = "{0}/log/discount_counts_backward.{1}.log".format(
        args.work_dir, order)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        # Convert inside the try so that non-numeric output is reported with
        # the same message as a wrong number of fields.
        deriv1, deriv2, deriv3, deriv4 = [float(x) for x in output.split()]
    except ValueError:
        sys.exit(
            "get_objf_and_derivs.py: could not parse output of command: " +
            output)
    d1_deriv[order] = deriv1 / num_dev_set_words
    d2_deriv[order] = deriv2 / num_dev_set_words
    d3_deriv[order] = deriv3 / num_dev_set_words
    d4_deriv[order] = deriv4 / num_dev_set_words
Ejemplo n.º 4
0
def GetData(int, name):
    """Pipe one text source through text_to_int.py into a gzipped output.

    Reads `{args.text_dir}/{name}.txt.gz` if it exists (via gunzip),
    otherwise `{args.text_dir}/{name}.txt` (via cat), pipes it through
    `text_to_int.py {args.vocab}` and gzip, and writes the result to
    `{args.int_dir}/{int}.txt.gz`.

    Args:
        int: basename used for the output file and its log file.  NOTE: this
            parameter name shadows the builtin `int`; it is kept unchanged
            so that existing callers continue to work.
        name: basename of the input text file inside `args.text_dir`.
    """
    # Only the command that reads the input differs between the two cases.
    if os.path.exists(args.text_dir + "/" + name + ".txt.gz"):
        reader = "gunzip -c {text_dir}/{name}.txt.gz".format(
            text_dir=args.text_dir, name=name)
    else:
        reader = "cat {text_dir}/{name}.txt".format(
            text_dir=args.text_dir, name=name)

    log_file = "{int_dir}/log/{int}.log".format(int_dir=args.int_dir,
                                                int=int)
    command = ("set -o pipefail; {reader} | text_to_int.py {vocab} "
               "| gzip -c > {int_dir}/{int}.txt.gz 2>{log_file}".format(
                   reader=reader, vocab=args.vocab, int_dir=args.int_dir,
                   int=int, log_file=log_file))
    # The command's stdout is redirected into the .txt.gz file, so the
    # returned value is not used.
    GetCommandStdout(command, log_file)
Ejemplo n.º 5
0
def RunEmStep(work_in, work_out):
    """Run one estimation step from `work_in` into `work_out`.

    Runs `float-counts-estimate` to produce per-order float.N files in
    `work_out`, merges them into `work_out/float.all`, deletes the per-order
    files, and soft-links stats.all, protected.all and num_ngrams from
    `work_in` into `work_out`.

    Side effect: sets the global `final_logprob_per_word` to
    total-like / total-count as reported by the command.

    Returns:
        The sum of the per-order likelihood changes divided by total-count.
    """
    global final_logprob_per_word

    # float_star = 'work_out/float.1 work_out/float.2 ...'
    per_order_paths = []
    for order in range(1, ngram_order + 1):
        per_order_paths.append('{0}/float.{1}'.format(work_out, order))
    float_star = " ".join(per_order_paths)

    command = (
        'float-counts-estimate {num_words} {work_in}/float.all {work_in}/stats.all '
        '{float_star}'.format(num_words=num_words,
                              work_in=work_in,
                              float_star=float_star))
    log_file = work_out + "/log/float_counts_estimate.log"
    try:
        output = GetCommandStdout(command, log_file, args.verbose == 'true')
        # Expected stdout looks like:
        #   1.63388e+06 -7.39182e+06 10.5411 41.237 49.6758
        # i.e. total-count, total-like, then one like-change per order.
        fields = output.split()
        tot_count = float(fields[0])
        tot_like = float(fields[1])
        final_logprob_per_word = tot_like / tot_count
        like_change = 0.0
        for field in fields[2:]:  # one entry per n-gram order
            like_change += float(field)
        like_change_per_word = like_change / tot_count
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))

    merge_command = 'merge-float-counts {0} >{1}/float.all'.format(
        float_star, work_out)
    merge_log = work_out + '/log/merge_float_counts.log'
    RunCommand(merge_command, merge_log, args.verbose == 'true')
    # The per-order files are no longer needed once merged.
    for path in float_star.split():
        os.remove(path)
    # soft-link stats.all, protected.all and num_ngrams from work_in to work_out
    for basename in ("stats.all", "protected.all", "num_ngrams"):
        SoftLink(work_in + "/" + basename, work_out + "/" + basename)
    return like_change_per_word
Ejemplo n.º 6
0
def ComputeObjfAndFinalDerivs(split_index, need_derivs):
    """Run `compute-probs` for one split and accumulate its totals.

    Adds the number of dev-set words and the total objective printed by the
    command to the globals `num_dev_set_words_total` and `loglike_total`.

    Args:
        split_index: index of the split sub-directory to operate on.
        need_derivs: if true, the per-order `float_derivs.N` filenames are
            appended to the command line for orders 1..ngram_order.

    Calls ExitProgram() if the command's stdout is not an integer followed
    by a floating-point value.
    """
    global num_dev_set_words_total, loglike_total
    command = "compute-probs {swork}/{s}/float.all {scount}/{s}/int.dev ".format(
            swork=split_work_dir, s=split_index, scount=split_count_dir)
    if need_derivs:
        command += " ".join(["{swork}/{s}/float_derivs.{order}".format(
            swork=split_work_dir, s=split_index, order=o)
            for o in range(1, ngram_order + 1)])

    log_file = "{0}/log/compute_objf_and_final_derivs.{1}.log".format(
            args.work_dir, split_index)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        # Parse both fields before touching the global totals, so a failure
        # on the second field cannot leave them partially updated.
        [num_words_field, tot_objf_field] = output.split()
        num_dev_set_words = int(num_words_field)
        tot_objf = float(tot_objf_field)
    except ValueError:
        ExitProgram("get_objf_and_derivs_split.py: error interpreting the output of compute-probs: "
                    "output was: " + output)
    else:
        num_dev_set_words_total += num_dev_set_words
        loglike_total += tot_objf
Ejemplo n.º 7
0
def RunEmStep(work_in, work_out):
    """Perform a single estimation step, reading `work_in`, writing `work_out`.

    Steps: run `float-counts-estimate` (writing work_out/float.1 ..
    work_out/float.<ngram_order>), merge those files into
    work_out/float.all with `merge-float-counts`, remove the per-order
    files, then soft-link stats.all, protected.all and num_ngrams from
    `work_in`.

    Also sets the global `final_logprob_per_word` (total-like divided by
    total-count from the command's output).

    Returns:
        Total likelihood change (summed over orders) per unit of total-count.
    """
    global final_logprob_per_word

    # space-separated list: 'work_out/float.1 work_out/float.2 ...'
    float_star = " ".join('{0}/float.{1}'.format(work_out, order)
                          for order in range(1, ngram_order + 1))

    command = ('float-counts-estimate {num_words} {work_in}/float.all {work_in}/stats.all '
               '{float_star}'.format(num_words=num_words, work_in=work_in,
                                     float_star=float_star))
    log_file = work_out + "/log/float_counts_estimate.log"
    be_verbose = args.verbose == 'true'
    try:
        output = GetCommandStdout(command, log_file, be_verbose)
        # stdout is of the form
        #   <total-count> <total-like> <like-change-order-1> <like-change-order-2> ...
        # e.g. 1.63388e+06 -7.39182e+06 10.5411 41.237 49.6758
        tokens = output.split()
        tot_count = float(tokens[0])
        tot_like = float(tokens[1])
        final_logprob_per_word = tot_like / tot_count
        like_change = 0.0
        for change in map(float, tokens[2:]):  # one value per n-gram order
            like_change += change
        like_change_per_word = like_change / tot_count
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
                command, repr(e)))

    command = 'merge-float-counts {0} >{1}/float.all'.format(float_star, work_out)
    log_file = work_out + '/log/merge_float_counts.log'
    RunCommand(command, log_file, args.verbose == 'true')
    # the per-order float.N files have been merged; delete them.
    for per_order_file in float_star.split():
        os.remove(per_order_file)
    # link the files that are carried over unchanged from work_in.
    carried_over = ["/stats.all", "/protected.all", "/num_ngrams"]
    for suffix in carried_over:
        SoftLink(work_in + suffix, work_out + suffix)
    return like_change_per_word
Ejemplo n.º 8
0
def DiscountCountsBackward(split_index, order):
    """Run the backprop version of count discounting for one split and order.

    Runs `discount-counts-backward` in the split's work directory with the
    discounting constants d1..d4 for `order`, and adds the four derivatives
    it prints, each divided by `num_dev_set_words_total`, to
    d1_deriv[order]..d4_deriv[order].

    Args:
        split_index: index of the split sub-directory to operate on.
        order: n-gram order; must be greater than 1.

    Calls ExitProgram() if the command's stdout is not exactly four
    floating-point values.
    """
    # discount counts of the specified order > 1; backprop version.
    assert order > 1
    this_split_work = "{0}/{1}".format(split_work_dir, split_index)
    command = ("discount-counts-backward {d1} {d2} {d3} {d4} {sdir}/merged.{order} {sdir}/float.{order} "
               "{sdir}/float_derivs.{order} {sdir}/discount.{orderm1} {sdir}/discount_derivs.{orderm1} "
               "{sdir}/merged_derivs.{order}".format(
                   d1=d1[order], d2=d2[order], d3=d3[order], d4=d4[order],
                   sdir=this_split_work, order=order, orderm1=order - 1))
    log_file = "{0}/log/discount_counts_backward.{1}.{2}.log".format(args.work_dir,
                                                                     split_index, order)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        # Convert inside the try so that non-numeric output is reported with
        # the same message as a wrong number of fields.
        deriv1, deriv2, deriv3, deriv4 = [float(x) for x in output.split()]
    except ValueError:
        ExitProgram("get_objf_and_derivs_split.py: could not parse output of command: " + output)
    else:
        # Derivatives are summed over the split directories.
        d1_deriv[order] += deriv1 / num_dev_set_words_total
        d2_deriv[order] += deriv2 / num_dev_set_words_total
        d3_deriv[order] += deriv3 / num_dev_set_words_total
        d4_deriv[order] += deriv4 / num_dev_set_words_total