Code example #1
File: baseline.py Project: ylgh2011/nlp-hw4
def main():
    # tm should translate unknown words as-is with probability 1
    for word in set(sum(french,())):
        if (word,) not in tm:
            tm[(word,)] = [models.phrase(word, 0.0)]

    total_prob = 0
    sys.stderr.write("Decoding %s...\n" % (opts.input,))
    for idx,f in enumerate(french):
        initial_hypothesis = hypothesis(lm.begin(), 0.0, 0, 0, None, None)
        heaps = [{} for _ in f] + [{}]
        heaps[0][lm.begin(), 0, 0] = initial_hypothesis
        for i, heap in enumerate(heaps[:-1]):
            # maintain beam heap
            # front_item = sorted(heap.itervalues(), key=lambda h: -h.logprob)[0]
            # for k in heap.keys():
            #      if heap[k].logprob < front_item.logprob - opts.bwidth:
            #         del heap[k]

            for h in sorted(heap.itervalues(),key=lambda h: -h.logprob)[:opts.s]: # prune
                fopen = prefix1bits(h.coverage)
                for j in xrange(fopen,min(fopen+1+opts.disord, len(f)+1)):
                    for k in xrange(j+1, len(f)+1):
                        if f[j:k] in tm:
                            if (h.coverage & bitmap(range(j, k))) == 0:
                                for phrase in tm[f[j:k]]:
                                    lm_prob = 0
                                    lm_state = h.lm_state
                                    for word in phrase.english.split():
                                        (lm_state, prob) = lm.score(lm_state, word)
                                        lm_prob += prob
                                    lm_prob += lm.end(lm_state) if k == len(f) else 0.0
                                    coverage = h.coverage | bitmap(range(j, k))
                                    logprob = h.logprob + opts.alpha*lm_prob + opts.beta*phrase.logprob + opts.eta*abs(h.end + 1 - j)
                                    
                                    new_hypothesis = hypothesis(lm_state, logprob, coverage, k, h, phrase)

                                    # add to heap
                                    num = onbits(coverage)
                                    if (lm_state, coverage, k) not in heaps[num] or new_hypothesis.logprob > heaps[num][lm_state, coverage, k].logprob:
                                        heaps[num][lm_state, coverage, k] = new_hypothesis


        winner = max(heaps[-1].itervalues(), key=lambda h: h.logprob)
        def extract_english(h): 
            return "" if h.predecessor is None else "%s%s " % (extract_english(h.predecessor), h.phrase.english)
        out = extract_english(winner)
        print out
        sys.stderr.write("#{0}:{2} - {1}\n".format(idx, out , winner.logprob))
        total_prob += winner.logprob

        # if opts.verbose:
        #     def extract_tm_logprob(h):
        #         return 0.0 if h.predecessor is None else h.phrase.logprob + extract_tm_logprob(h.predecessor)
        #     tm_logprob = extract_tm_logprob(winner)
        #     sys.stderr.write("LM = %f, TM = %f, Total = %f\n" % 
        #         (winner.logprob - tm_logprob, tm_logprob, winner.logprob))
    sys.stderr.write("Total score: {0}\n".format(total_prob))
Code example #2
File: seed_decode.py Project: goldcase/decode
def decode(n):
    ret = []
    tm = models.TM(opts.tm, opts.k)
    lm = models.LM(opts.lm)
    french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:min(n,opts.num_sents)]]

    # tm should translate unknown words as-is with probability 1
    for word in set(sum(french,())):
        if (word,) not in tm:
            tm[(word,)] = [models.phrase(word, 0.0)]

    sys.stderr.write("Decoding %s...\n" % (opts.input,))
    for f in french:
        # The following code implements a monotone decoding
        # algorithm (one that doesn't permute the target phrases).
        # Hence all hypotheses in stacks[i] represent translations of
        # the first i words of the input sentence. You should generalize
        # this so that they can represent translations of *any* i words.
        hypothesis = namedtuple("hypothesis", "logprob, lm_state, predecessor, phrase, i, j, f")
        initial_hypothesis = hypothesis(0.0, lm.begin(), None, None, 0, 0, f[0])
        stacks = [{} for _ in f] + [{}]
        stacks[0][lm.begin()] = initial_hypothesis
        for i, stack in enumerate(stacks[:-1]):
            #print "Stack for " + str(french[i]) + ": " + str(stack) + "\n"
            for h in sorted(stack.itervalues(),key=lambda h: -h.logprob)[:opts.s]: # prune
                for j in xrange(i+1,len(f)+1):
                    if f[i:j] in tm:
                        for phrase in tm[f[i:j]]:
                            logprob = h.logprob + phrase.logprob
                            lm_state = h.lm_state
                            for word in phrase.english.split():
                                (lm_state, word_logprob) = lm.score(lm_state, word)
                                logprob += word_logprob
                            logprob += lm.end(lm_state) if j == len(f) else 0.0
                            new_hypothesis = hypothesis(logprob, lm_state, h, phrase, i, j, f)
                            if lm_state not in stacks[j] or stacks[j][lm_state].logprob < logprob: # second case is recombination
                                stacks[j][lm_state] = new_hypothesis
        winner = max(stacks[-1].itervalues(), key=lambda h: h.logprob)
        english_phrases = []
        tm_logprob_phrases = []
        def extract_english(h):
            if h.predecessor is not None:
                english_phrases.insert(0, (h.phrase.english, h.i, h.j, h.f))
                tm_logprob_phrases.insert(0, h.phrase.logprob)

            return "" if h.predecessor is None else "%s%s " % (extract_english(h.predecessor), h.phrase.english)

        ret.append((extract_english(winner), english_phrases, tm_logprob_phrases))

        if opts.verbose:
            def extract_tm_logprob(h):
                return 0.0 if h.predecessor is None else h.phrase.logprob + extract_tm_logprob(h.predecessor)
            tm_logprob = extract_tm_logprob(winner)
            sys.stderr.write("LM = %f, TM = %f, Total = %f\n" %
                (winner.logprob - tm_logprob, tm_logprob, winner.logprob))
    return ret
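The recombination test on stacks[j] above is what keeps decoding tractable: two hypotheses with the same LM state are interchangeable for all future decisions, so only the higher-scoring one needs to survive. A toy illustration with hypothetical values:

from collections import namedtuple

hypothesis = namedtuple("hypothesis", "logprob, lm_state, predecessor, phrase, i, j, f")
stack = {}
for h in [hypothesis(-4.2, ("the",), None, None, 0, 1, ()),
          hypothesis(-3.1, ("the",), None, None, 0, 1, ())]:
    if h.lm_state not in stack or stack[h.lm_state].logprob < h.logprob:
        stack[h.lm_state] = h  # recombination
assert stack[("the",)].logprob == -3.1  # only the better hypothesis survives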
Code example #3
def __init__(self, opts):
    self.opts = opts
    self.tm = models.TM(opts.tm, sys.maxint)
    self.lm = models.LM(opts.lm)
    self.french = [
        tuple(line.strip().split())
        for line in open(opts.input).readlines()
    ]
    # tm should translate unknown words as-is with probability 1
    for word in set(sum(self.french, ())):
        if (word, ) not in self.tm:
            self.tm[(word, )] = [models.phrase(word, 0.0)]
Code example #4
def make_agtsp(f):
    # make AGTSP
    nodes = [Node(0.0, START_SYMBOL, -1, -1, 0, models.phrase('', 0.0))]
    groups = defaultdict(list) # french word => [tsp_tuple, ...]
    groups[nodes[0].word_index] = [nodes[0]] # add startword group
    for i in xrange(len(f)):
        for j in xrange(i+1,len(f)+1):
            if f[i:j] in tm:
                for phrase in tm[f[i:j]]:
                    phrase = Phrase(phrase.english, phrase.logprob, random())
                    for (i_w, word) in enumerate(f[i:j]):
                        word_index = i+i_w
                        n = Node(phrase.logprob, word, word_index, i, j, phrase)
                        nodes.append(n)
                        groups[word_index] = groups[word_index] + [n]
    return nodes, groups
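Node, Phrase, and START_SYMBOL are used above but not shown. A plausible reconstruction with field order inferred from the constructor calls; these definitions are guesses, not the project's actual code:

from collections import namedtuple

Node = namedtuple("Node", "logprob, word, word_index, i, j, phrase")
Phrase = namedtuple("Phrase", "english, logprob, tiebreak")  # tiebreak holds the random() value
START_SYMBOL = "<s>"  # assumed sentence-start marker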
Code example #5
def main(w0 = None):
    # tm should translate unknown words as-is with probability 1

    w = w0
    if w is None:
        # lm_logprob, distortion penalty, direct translation logprob, direct lexicon logprob, inverse translation logprob, inverse lexicon logprob
        if opts.weights == "no weights specify":
            w = [1.0/7] * 7
            # w = [1.76846735947, 0.352553835525, 1.00071564481, 1.49937872683, 0.562198294709, -0.701483985454, 1.80395218437]
        else:
            w = [float(line.strip()) for line in open(opts.weights)]
    sys.stderr.write(str(w) + '\n')

    tm = models.TM(opts.tm, opts.k, opts.mute)
    lm = models.LM(opts.lm, opts.mute)
    # ibm_t = {} 
    ibm_t = init('./data/ibm.t.gz')
    french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]
    french = french[opts.start : opts.end]
    bound_width = float(opts.bwidth)

    for word in set(sum(french,())):
        if (word,) not in tm:
            tm[(word,)] = [models.phrase(word, [0.0, 0.0, 0.0, 0.0])]



    nbest_output = []
    total_prob = 0
    if opts.mute == 0:
        sys.stderr.write("Start decoding %s ...\n" % (opts.input,))
    for idx,f in enumerate(french):
        if opts.mute == 0:
            sys.stderr.write("Decoding sentence #%s ...\n" % (str(idx)))
        initial_hypothesis = hypothesis(lm.begin(), 0.0, 0, 0, None, None, None)
        heaps = [{} for _ in f] + [{}]
        heaps[0][lm.begin(), 0, 0] = initial_hypothesis
        for i, heap in enumerate(heaps[:-1]):
            # maintain beam heap
            # front_item = sorted(heap.itervalues(), key=lambda h: -h.logprob)[0]
            for h in sorted(heap.itervalues(),key=lambda h: -h.logprob)[:opts.s]: # prune
                # if h.logprob < front_item.logprob - float(opts.bwidth):
                #    continue

                fopen = prefix1bits(h.coverage)
                for j in xrange(fopen,min(fopen+1+opts.disord, len(f)+1)):
                    for k in xrange(j+1, len(f)+1):
                        if f[j:k] in tm:
                            if (h.coverage & bitmap(range(j, k))) == 0:
                                for phrase in tm[f[j:k]]:
                                    lm_prob = 0
                                    lm_state = h.lm_state
                                    for word in phrase.english.split():
                                        (lm_state, prob) = lm.score(lm_state, word)
                                        lm_prob += prob
                                    lm_prob += lm.end(lm_state) if k == len(f) else 0.0
                                    coverage = h.coverage | bitmap(range(j, k))
                                    # logprob = h.logprob + lm_prob*w[0] + getDotProduct(phrase.several_logprob, w[2:6]) + abs(h.end+1-j)*w[1] + ibm_model_1_w_score(ibm_t, f, phrase.english)*w[6]
                                    logprob  = h.logprob
                                    logprob += lm_prob*w[0]
                                    logprob += getDotProduct(phrase.several_logprob, w[1:5])
                                    # logprob += opts.diseta*abs(h.end+1-j)*w[1]
                                    logprob += ibm_model_1_w_score(ibm_t, f, phrase.english)*w[5]
                                    logprob += (len(phrase.english.split()) - (k - j)) * w[6]

                                    new_hypothesis = hypothesis(lm_state, logprob, coverage, k, h, phrase, abs(h.end + 1 - j))

                                    # add to heap
                                    num = onbits(coverage)
                                    if (lm_state, coverage, k) not in heaps[num] or new_hypothesis.logprob > heaps[num][lm_state, coverage, k].logprob:
                                        heaps[num][lm_state, coverage, k] = new_hypothesis

        winners = sorted(heaps[-1].itervalues(), key=lambda h: -h.logprob)[0:opts.nbest]

        def get_lm_logprob(test_list):
            stance = []
            for i in test_list:
                stance += (i.split())
            stance = tuple(stance)
            lm_state = ("<s>",)
            score = 0.0
            for word in stance:
                (lm_state, word_score) = lm.score(lm_state, word)
                score += word_score
            return score
        def get_list_and_features(h, idx_self):
            lst = []
            features = [0, 0, 0, 0, 0, 0, 0]
            current_h = h
            while current_h.phrase is not None:
                # print current_h
                lst.append(current_h.phrase.english)
                # features[1] += current_h.distortionPenalty
                features[1] += current_h.phrase.several_logprob[0]      # translation feature 1
                features[2] += current_h.phrase.several_logprob[1]      # translation feature 2
                features[3] += current_h.phrase.several_logprob[2]      # translation feature 3
                features[4] += current_h.phrase.several_logprob[3]      # translation feature 4
                current_h = current_h.predecessor
            lst.reverse()
            features[0] = get_lm_logprob(lst)                           # language model score
            features[5] = ibm_model_1_score(ibm_t, f, lst)
            features[6] = len(lst) - len(french[idx_self])
            return (lst, features)

        for win in winners:
            # s = str(idx) + " ||| "
            (lst, features) = get_list_and_features(win, idx)
            print local_search.local_search(lst, lm)
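getDotProduct is called above but not defined in this snippet. A minimal sketch of what the call sites imply (the project's actual helper may differ):

def getDotProduct(features, weights):
    # weighted sum of the per-phrase feature log-probabilities
    return sum(f * w for f, w in zip(features, weights))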
Code example #6
File: beam7.py Project: patelam/NLP_Final_Project
else:
    w = []
    for line in open(opts.weights):
        w.extend([float(line)])

tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [
    tuple(line.strip().split())
    for line in open(opts.input).readlines()[:opts.num_sents]
]

# tm should translate unknown words as-is with probability 1
for word in set(sum(french, ())):
    if (word, ) not in tm:
        tm[(word, )] = [models.phrase(word, 0.0, [0.0] * 4)]


def getrange(data):
    ranges = []
    for key, group in groupby(enumerate(data), lambda (index, item): index - item):
        group = map(itemgetter(1), group)
        ranges.append(xrange(group[0], group[-1] + 1))
    return ranges


def bitmap(sequence):
    return reduce(lambda x, y: x | y,
                  map(lambda i: long('1' + '0' * i, 2), sequence), 0)
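For reference, getrange groups consecutive indices into contiguous ranges, and bitmap folds a list of bit positions into one integer. Usage sketches with hypothetical inputs:

# getrange([0, 1, 2, 5, 6]) -> [xrange(0, 3), xrange(5, 7)]
# bitmap([0, 1, 4]) == 0b10011 == 19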
Code example #7
    w = [0.2]*5
else:
    w = []
    for line in open(opts.weights):
        w.extend([float(line)])



tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]

# tm should translate unknown words as-is with probability 1
for word in set(sum(french,())):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0, [0.0]*4)]

def getrange(data):
    ranges = []
    for key, group in groupby(enumerate(data), lambda (index, item): index - item):
        group = map(itemgetter(1), group)
        ranges.append(xrange(group[0], group[-1] + 1))
    return ranges

def bitmap(sequence):
    return reduce(lambda x,y: x|y, map(lambda i: long('1'+'0'*i,2), sequence), 0)

def bitmap2str(b, n, on='o', off='.'):
    return '' if n==0 else (on if b&1==1 else off) + bitmap2str(b>>1, n-1, on, off)
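
# Rendering example (hypothetical value): bitmap2str reads the low bit first,
# so bitmap2str(0b01101, 5) == 'o.oo.'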

def cand_phrases(ranges, f):
Code example #8
File: decode2.py Project: abhambh1/mt_spring_2016
optparser.add_option("-s", "--stack-size", dest="s", default=15, type="int", help="Maximum stack size (default=15)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,
                     help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

# store the stack length in a global
stack_size = opts.s

tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [tuple(line.strip().split()) for line in
          open(opts.input).readlines()[:opts.num_sents]]

# tm should translate unknown words as-is with probability 1
for word in set(sum(french, ())):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]


def determine_expense(scope):
    """
    Given a translation scope, evaluate the expense of decoding it.
    :param scope: the scope to evaluate
    :return: the potential expense associated with evaluating this scope
    """
    potential_expense = 0
    start = -1
    for j, translated_flag in enumerate(scope):
        if not translated_flag:
            if start == -1:
                start = j
        else:
Code example #9
File: Decoder.py Project: lynnhl0504/NLP
optparser.add_option("-s", "--stack-size", dest="s", default=1, type="int", help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,  help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

opts.k = 4
tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]




# tm should translate unknown words as-is with probability 1
for word in set(sum(french,())):
  if (word,) not in tm:
    tm[(word,)] = [models.phrase(word, 0.0)]
# values for the model parameters
dd = 5
nn = -4
beta = 2

gooby = tm[("de", "ce")]
#print(len(gooby))
#print(gooby)

class state:
  def __init__(self, e1, e2, b, r, alpha):
    self.e1 = e1
    self.e2 = e2
    self.b = b
    self.r = r
Code example #10
File: decode_dual_old.py Project: mewsicalcat/dreamt
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=sys.maxint, type="int", help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=1, type="int", help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=1, type="int", help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,  help="Verbose mode (default=off)")
optparser.add_option("-e", "--number-iterations", dest="e", default=10, type="int", help="number of iterations (default=10)")
opts = optparser.parse_args()[0]

tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)

french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]

# tm should translate unknown words as-is with probability 1
for word in set(sum(french,())):
  if (word,) not in tm:
    tm[(word,)] = [models.phrase(word, 0.0)]
converged = [0]*(len(french)+1) #starts at 0!!! 
sys.stderr.write("Decoding %s...\n" % (opts.input,))
i_sen = 0
print('Iters\tViolations')
  
for f in french:
  i_sen += 1
  print('SENTENCE ' + str(i_sen))

  u = [0 for _ in f]              #Lagrangian                                                                                                 
  hypothesis = namedtuple("hypothesis", "logprob, lm_state, predecessor, phrase, start, end, num_trans, y_i") 
  initial_hypothesis = hypothesis(0.0, lm.begin(), None, None, 0, 0, 0, [0 for _ in f]) 
  stacks = [{} for _ in f] + [{}]
  stacks[0][(lm.begin(), 0,0)] = initial_hypothesis
  num_words = len(f)
Code example #11
File: swap.py Project: Huluk/mt_decoder
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=1, type="int", help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=1, type="int", help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]

def extract_english(h):
    return "" if h.predecessor is None else "%s%s " % (extract_english(h.predecessor), h.phrase.english)

# tm should translate unknown words as-is with probability 1
for word in set(sum(french,())):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]

# adding empty phrase to the translation dictionary
tm[()] = [models.phrase("", 0.0)]

def update_lm_state(lm_state, logprob, phrase):
    for word in phrase.english.split():
        (lm_state, word_logprob) = lm.score(lm_state, word)
        logprob += word_logprob
    return lm_state, logprob

sys.stderr.write("Decoding %s...\n" % (opts.input,))
for f in french:
    # The following code implements a local-reordering decoding algorithm.
    # All hypotheses in stacks[i] represent translations of the first i
    # words of the input sentence.
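A usage sketch for update_lm_state; the phrase value here is hypothetical, built with the same models.phrase constructor used throughout these examples:

# Thread the LM state and running score through one phrase:
phrase = models.phrase("the house", 0.0)
lm_state, logprob = update_lm_state(lm.begin(), 0.0, phrase)
# lm_state is now the n-gram context ending in "the house";
# logprob has accumulated one LM score per English word.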
Code example #12



########################################################################################################################################
## init for decoder part
lm = models.LM(opts.lm, opts.mute)
tm = models.TM(opts.tm, opts.k, opts.mute)


french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]
bound_width = float(opts.bwidth)

for word in set(sum(french,())):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, [0.0, 0.0, 0.0, 0.0])]



# ibm_t = {}
ibm_t = library.init('./data/ibm.t.gz')


########################################################################################################################################
## init for reranker part
references = [[], [], [], []]
sys.stderr.write("Reading English Sentences ... \n")
def readReference(ref_fileName):
    ref = []
    for i, line in enumerate(open(ref_fileName)):
        # Initialize references to correct english sentences
Code example #13
def get_candidates(inputfile,
                   tm,
                   lm,
                   weights,
                   stack_size=10,
                   nbest=None,
                   simpmode=True,
                   separate_unknown_words=False,
                   verbose=False):
    if nbest is None:
        nbest = stack_size

    print >> sys.stderr, "Decoding: " + inputfile
    print >> sys.stderr, "Reading input..."
    french = [line.strip().split()
              for line in open(inputfile).readlines()]  # list of list
    if simpmode:
        from mafan import simplify
        for li, line in enumerate(french):
            for wi, word in enumerate(line):
                french[li][wi] = simplify(word.decode('utf-8')).encode('utf-8')

    # tm should translate unknown words as-is with a small probability
    # (i.e. only fallback to copying unknown words over as the last resort)
    for i in xrange(len(french)):
        j = 0
        while j < len(french[i]):
            word = french[i][j]
            if (word, ) not in tm:
                flag = True
                if len(word) >= 2 and separate_unknown_words:
                    for separate in xrange(1, len(word)):
                        if (word[:separate], ) in tm and (
                                word[separate:], ) in tm:
                            french[i][j] = word[:separate]
                            j += 1
                            french[i].insert(j, word[separate:])
                            flag = False
                            break
                if flag:
                    tm[(word, )] = [
                        models.phrase(word, [unknown_word_logprob] *
                                      number_of_features_PT)
                    ]
            j += 1

    print >> sys.stderr, "Start decoding..."
    for n, f in enumerate(french):
        if verbose:
            print >> sys.stderr, "Input: " + ' '.join(f)
        # Generate cache for phrase segmentations.
        f_cache = generate_phrase_cache(f, tm)
        # Pre-calculate future cost table
        future_cost_table = precalcuate_future_cost(
            f, tm, weights[:number_of_features_PT])

        # score = dot(features, weights)
        # features = sums of each log feature
        # predecessor = previous hypothesis
        # lm_state = N-gram state (the last one or two words)
        # last_frange = (i, j) the range of last translated phrase in f
        # phrase = the last TM phrase object (correspondence to f[last_frange])
        # coverage = bit string representing the translation coverage on f
        # future_cost = a safe estimation to be added to total_score
        hypothesis = namedtuple(
            "hypothesis",
            "score, features, lm_state, predecessor, last_frange, phrase, coverage, future_cost"
        )
        initial_hypothesis = hypothesis(0.0, [0.0] * number_of_features,
                                        lm.begin(), None, (0, 0), None, 0, 0)

        # stacks[# of covered words in f] (from 0 to |f|)
        stacks = [{} for _ in xrange(len(f) + 1)]
        # stacks[size][(lm_state, last_frange[1], coverage)]:
        # recombination based on (lm_state, last_frange[1], coverage).
        # For different hypotheses with the same tuple, keep the one with the higher score.
        # lm_state affects LM; last_frange affects distortion; coverage affects available choices.
        stacks[0][(lm.begin(), None, 0)] = initial_hypothesis

        for i, stack in enumerate(stacks[:-1]):
            if verbose:
                print >> sys.stderr, "Stack[%d]:" % i

            # Top-k pruning
            s_hypotheses = sorted(stack.values(),
                                  key=lambda h: h.score + h.future_cost,
                                  reverse=True)
            for h in s_hypotheses[:stack_size]:
                if verbose:
                    print >> sys.stderr, h.score, h.lm_state, bin(
                        h.coverage), ' '.join(f[h.last_frange[0]:h.
                                                last_frange[1]]), h.future_cost

                for (f_range, delta_coverage,
                     tm_phrases) in enumerate_phrases(f_cache, h.coverage):
                    # f_range = (i, j) of the enumerated next phrase to be translated
                    # delta_coverage = coverage of f_range
                    # tm_phrases = TM entries corresponding to fphrase f[f_range]
                    length = i + f_range[1] - f_range[0]
                    coverage = h.coverage | delta_coverage
                    distance = abs(f_range[0] - h.last_frange[1])
                    # if distance > max_distance and i < len(stacks) / 2:
                    #   continue

                    # TM might give us multiple candidates for a fphrase.
                    for phrase in tm_phrases:
                        features = h.features[:]  # copy!
                        # Features from phrase table
                        for fid in range(number_of_features_PT):
                            features[fid] += phrase.features[fid]
                        # log_lmprob (N-gram)
                        lm_state = h.lm_state
                        loglm = 0.0
                        for word in phrase.english.split():
                            (lm_state, word_logprob) = lm.score(lm_state, word)
                            loglm += word_logprob
                        # Don't forget the STOP N-gram if we just covered the whole sentence.
                        loglm += lm.end(lm_state) if length == len(f) else 0.0
                        features[4] += loglm
                        # log distortion (distance ** alpha)
                        features[5] += log(alpha) * distance
                        # length of the translation (-length)
                        features[6] += -len(phrase.english.split())

                        score = calculate_total_score(features, weights)
                        future_list = get_future_list(coverage, len(f))
                        future_cost = get_future_cost(future_list,
                                                      future_cost_table)

                        new_state = (lm_state, f_range[1], coverage)
                        new_hypothesis = hypothesis(score, features, lm_state,
                                                    h, f_range, phrase,
                                                    coverage, future_cost)
                        # Recombination
                        if new_state not in stacks[length] or \
                            score + future_cost > stacks[length][new_state].score + stacks[length][new_state].future_cost:
                            stacks[length][new_state] = new_hypothesis

        winners = sorted(stacks[len(f)].values(),
                         key=lambda h: h.score,
                         reverse=True)
        if nbest == 1:
            yield extract_english(winners[0])
        else:
            for s in winners[:nbest]:
                yield ("%d ||| %s |||" + " %f" * number_of_features) % \
                  ((n, extract_english(s)) + tuple(s.features))
    print >> sys.stderr, "Decoding completed"
Code example #14
File: sto_decode.py Project: vcha/sp2013.11-731
def translate(input_sentence, n_iter, reordering_limit):
    def lm_score(phrases):
        score = 0
        lm_state = lm.begin()
        for phrase in phrases:
            for word in phrase.english:
                lm_state, word_logprob = lm.score(lm_state, word)
                score += word_logprob
        score += lm.end(lm_state)
        return score

    def replace_moves(i):
        iphrase = source[i]
        ophrase = target[alignment[i]]
        for alternative in tm[iphrase]:
            if alternative == ophrase:
                continue
            # modify
            replace_apply(i, alternative)
            # score
            tm_delta = alternative.logprob - ophrase.logprob
            yield (i, alternative), tm_delta
            # revert
            replace_apply(i, ophrase)

    def replace_apply(i, alternative):
        target[alignment[i]] = alternative

    def merge_moves(i):
        i1, i2 = source[i - 1], source[i]
        a1, a2 = alignment[i - 1], alignment[i]
        # |a1 - a2| = 1
        a_min = min(a1, a2)  # replace
        a_max = max(a1, a2)  # remove
        # a_max = a_min + 1
        o1, o2 = target[a_min], target[a_max]
        for alternative in tm.get(i1 + i2, []):
            # modify
            merge_apply(i, i1 + i2, alternative, a_min)
            # score
            tm_delta = alternative.logprob - o1.logprob - o2.logprob
            yield (i, i1 + i2, alternative, a_min), tm_delta
            # revert
            split_apply(i, i1, i2, o1, o2, a1, a2)

    def split_apply(i, i1, i2, o1, o2, a1, a2):
        source.insert(i, i2)
        source[i - 1] = i1
        al = min(a1, a2)
        target.insert(al + 1, o2)
        target[al] = o1
        for k, a in enumerate(alignment):
            if a >= al + 1:
                alignment[k] += 1
        alignment.insert(i, a2)
        alignment[i - 1] = a1

    def merge_apply(i, src, tgt, al):
        del source[i]
        source[i - 1] = src
        del target[al + 1]
        target[al] = tgt
        del alignment[i]
        alignment[i - 1] = al
        for k, a in enumerate(alignment):
            if a >= al + 1:
                alignment[k] -= 1

    def split_moves(i):
        src, tgt = source[i], target[alignment[i]]
        al = alignment[i]
        for k in range(1, len(src)):
            i1, i2 = src[:k], src[k:]
            for o1 in tm.get(i1, []):
                for o2 in tm.get(i2, []):
                    # modify
                    split_apply(i + 1, i1, i2, o1, o2, al, al + 1)
                    # score
                    tm_delta = o1.logprob + o2.logprob - tgt.logprob
                    yield (i + 1, i1, i2, o1, o2, al, al + 1), tm_delta
                    # revert
                    merge_apply(i + 1, src, tgt, al)

    def swap_moves(i, j):
        # modify
        swap_apply(i, j)
        # score
        yield (i, j), 0
        # revert
        swap_apply(i, j)

    def swap_apply(i, j):
        target[alignment[i]], target[alignment[j]] = target[alignment[j]], target[alignment[i]]
        alignment[i], alignment[j] = alignment[j], alignment[i]

    def violates_reordering(i, al):
        d_source_left = sum(len(phrase) for phrase in source[:i])
        d_target_left = sum(len(phrase.english) for phrase in target[:al])
        d_source_right = sum(len(phrase) for phrase in source[i + 1 :])
        d_target_right = sum(len(phrase.english) for phrase in target[al + 1 :])
        d = max(abs(d_source_left - d_target_left), abs(d_source_right - d_target_right))
        return d > reordering_limit

    def full_score(moves):
        for m, tm_delta in moves:
            yield m, tm_delta, lm_score(target) - score[1]

    def stochastic_strategy(moves, apply_move):
        choice = None
        for m, tm_delta, lm_delta in full_score(moves):
            if sigmoid(tm_delta + lm_delta, alpha) > random.random():
                choice = m, tm_delta, lm_delta
        if choice:
            m, tm_delta, lm_delta = choice
            apply_move(*m)
            score[0] += tm_delta
            score[1] += lm_delta

    # Make initial decoding easy
    for w in input_sentence:
        if not (w,) in tm:
            tm[(w,)] = [models.phrase((w,), -20)]

    source = [(w,) for w in input_sentence]
    target = [max(tm[(w,)], key=lambda phrase: phrase.logprob) for w in input_sentence]
    alignment = [i for i in range(len(input_sentence))]
    score = [tm_score(target), lm_score(target)]

    logging.info(source_output(source))
    logging.info(target_output(target))
    logging.info(" ".join(map(str, alignment)))
    logging.info("Initial score: %s -> %d", score, score[0] + score[1])

    strategy = stochastic_strategy

    history = [((score[:], source[:], target[:], alignment[:]))]

    for it in xrange(n_iter):
        history.append((score[:], source[:], target[:], alignment[:]))
        alpha = 1 - math.exp(-it * 10.0 / n_iter)

        # replace
        for i in range(len(source)):
            strategy(replace_moves(i), replace_apply)
        # merge
        i = 1
        while True:
            if i >= len(source):
                break
            # adjacent target phrases only:
            if abs(alignment[i] - alignment[i - 1]) == 1:
                strategy(merge_moves(i), merge_apply)
            i += 1
        # swap
        for i in range(0, len(source)):
            for j in range(0, len(source)):
                if i == j:
                    continue
                if violates_reordering(i, alignment[j]) or violates_reordering(j, alignment[i]):
                    continue
                strategy(swap_moves(i, j), swap_apply)
        # split
        for i in range(0, len(source)):
            strategy(split_moves(i), split_apply)

        if it % (n_iter / 100) == 0:
            logging.info("%d | %.2f %s %.2f", it, alpha, target_output(target), score[0] + score[1])

    score, source, target, alignment = max(history, key=lambda t: sum(t[0]))
    logging.info(source_output(source))
    logging.info(target_output(target))
    logging.info(" ".join(map(str, alignment)))
    logging.info("Final score: %s -> %d", score, score[0] + score[1])

    return " ".join(" ".join(phrase.english) for phrase in target)
Code example #15
File: ch_baseline.py Project: Minzc/GhotiNLP
                     action="store_true",
                     default=False,
                     help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [
    tuple(line.strip().split())
    for line in open(opts.input).readlines()[:opts.num_sents]
]

# tm should translate unknown words as-is with probability 1
for word in set(sum(french, ())):
    if (word, ) not in tm:
        tm[(word, )] = [models.phrase(word, 0.0)]

sys.stderr.write("Decoding %s...\n" % (opts.input, ))
for f in french:
    # The following code implements a monotone decoding
    # algorithm (one that doesn't permute the target phrases).
    # Hence all hypotheses in stacks[i] represent translations of
    # the first i words of the input sentence. You should generalize
    # this so that they can represent translations of *any* i words.
    hypothesis = namedtuple("hypothesis",
                            "logprob, lm_state, predecessor, phrase")
    initial_hypothesis = hypothesis(0.0, lm.begin(), None, None)
    stacks = [{} for _ in f] + [{}]
    stacks[0][lm.begin()] = initial_hypothesis
    for i, stack in enumerate(stacks[:-1]):
        for h in sorted(stack.itervalues(),
Code example #16
def handle_unk_words(french, tm):
    for word in set(sum(french,())):
        if (word,) not in tm:
            tm[(word,)] = [models.phrase(word, [0.0,0.0,0.0,0.0], 0.0)]
Code example #17
def get_candidates(input, tm, lm, weights, s=1):

    alpha = 0.95  #reordering parameter
    french = [list(line.strip().split()) for line in open(input).readlines()]
    for li, line in enumerate(french):
        for wi, word in enumerate(line):
            french[li][wi] = simplify(word.decode('utf-8')).encode('utf-8')

    # tm should translate unknown words as-is with probability 1
    for word in set(sum(french, [])):
        if (word, ) not in tm:
            tm[(word, )] = [models.phrase(word, [0.0, 0.0, 0.0, 0.0])]

    def generate_phrase_cache(f):
        cache = []
        for i in range(0, len(f)):
            entries = []
            bitstring = 0
            for j in range(i + 1, len(f) + 1):
                bitstring += 1 << (len(f) - j)
                if tuple(f[i:j]) in tm:
                    entries.append({
                        'end': j,
                        'bitstring': bitstring,
                        'phrase': tm[tuple(f[i:j])]
                    })
            cache.append(entries)
        return cache

    def enumerate_phrases(f_cache, coverage):
        for i in range(0, len(f_cache)):
            bitstring = 0
            for entry in f_cache[i]:
                if (entry['bitstring'] & coverage) == 0:
                    yield ((i, entry['end']), entry['bitstring'],
                           entry['phrase'])

    def precalcuate_future_cost(f):
        phraseCheapestTable = {}
        futureCostTable = {}
        for i in range(0, len(f)):
            for j in range(i + 1, len(f) + 1):
                # f is a list here, so slices must be converted to tuples to match tm's keys
                if tuple(f[i:j]) in tm:
                    phraseCheapestTable[i, j] = -sys.maxint
                    for phrase in tm[tuple(f[i:j])]:
                        if phrase.logprob > phraseCheapestTable[i, j]:
                            phraseCheapestTable[i, j] = phrase.logprob
        for i in range(0, len(f)):
            futureCostTable[i, 1] = phraseCheapestTable[i, i + 1]
            for j in range(2, len(f) + 1 - i):
                if (i, i + j) in phraseCheapestTable:
                    futureCostTable[i, j] = phraseCheapestTable[i, i + j]
                else:
                    futureCostTable[i, j] = -sys.maxint
                for k in range(1, j):
                    if (((i + k, i + j) in phraseCheapestTable) and
                        (futureCostTable[i, j] < futureCostTable[i, k] +
                         phraseCheapestTable[i + k, i + j])):
                        futureCostTable[i, j] = futureCostTable[
                            i, k] + phraseCheapestTable[i + k, i + j]
        return futureCostTable

    def get_future_list(bitstring):
        bitList = bin(bitstring)[2:]
        futureList = []
        count = 0
        index = 0
        findZeroBit = False
        for i in range(len(bitList)):
            if bitList[i] == '0':
                if not findZeroBit:
                    index = i
                findZeroBit = True
                count = count + 1
            else:
                if findZeroBit:
                    futureList.append((index, count))
                findZeroBit = False
                count = 0
        if findZeroBit:
            futureList.append((index, count))
        return futureList

    def get_future_cost(bitList, futureCostTable):
        cost = 0
        for item in bitList:
            cost = cost + futureCostTable[item]
        return cost

    def extract_english(h):
        return "" if h.predecessor is None else "%s%s " % (extract_english(
            h.predecessor), h.phrase.english)

    results = []
    sys.stderr.write("Decoding %s...\n" % (input, ))
    for n, f in enumerate(french):
        # Generate cache for phrase segmentations.
        f_cache = generate_phrase_cache(f)
        # Pre-calculate future cost table
        #future_cost_table = precalcuate_future_cost(f)

        # logprob = log_lmprob + log_tmprob + distortion_penalty
        # predecessor = previous hypothesis
        # lm_state = N-gram state (the last one or two words)
        # last_frange = (i, j) the range of last translated phrase in f
        # phrase = the last TM phrase object (correspondence to f[last_frange])
        # coverage = bit string representing the translation coverage on f
        # future_cost
        hypothesis = namedtuple(
            "hypothesis",
            "logprob, features, lm_score, lm_state, predecessor, last_frange, phrase, coverage"
        )
        initial_hypothesis = hypothesis(0.0, [0.0, 0.0, 0.0, 0.0], 0.0,
                                        lm.begin(), None, (0, 0), None, 0)
        # stacks[# of covered words in f] (from 0 to |f|)
        stacks = [{} for _ in range(len(f) + 1)]
        # stacks[size][(lm_state, last_frange, coverage)]:
        # recombination based on (lm_state, last_frange, coverage).
        # For different hypotheses with the same tuple, keep the one with the higher logprob.
        # lm_state affects LM; last_frange affects distortion; coverage affects available choices.
        stacks[0][(lm.begin(), None, 0)] = initial_hypothesis
        for i, stack in enumerate(stacks[:-1]):

            # Top-k pruning
            for h in sorted(stack.itervalues(), key=lambda h: -h.logprob)[:s]:
                for (f_range, delta_coverage,
                     tm_phrases) in enumerate_phrases(f_cache, h.coverage):
                    # f_range = (i, j) of the enumerated next phrase to be translated
                    # delta_coverage = coverage of f_range
                    # tm_phrases = TM entries corresponding to fphrase f[f_range]
                    length = i + f_range[1] - f_range[0]
                    coverage = h.coverage | delta_coverage
                    distance = f_range[0] - h.last_frange[1]

                    # TM might give us multiple candidates for a fphrase.
                    for phrase in tm_phrases:
                        # log_tmprob and distortion
                        features = map(add, h.features, phrase.features)
                        # log_lmprob (N-gram)
                        lm_state = h.lm_state
                        lm_score = h.lm_score
                        for word in phrase.english.split():
                            (lm_state, word_logprob) = lm.score(lm_state, word)
                            lm_score += word_logprob
                        # Don't forget the STOP N-gram if we just covered the whole sentence.
                        lm_score += lm.end(lm_state) if length == len(
                            f) else 0.0

                        # Future cost.
                        #future_list = get_future_list(delta_coverage)
                        #future_cost = get_future_cost(future_list, future_cost_table)

                        logprob = sum(
                            p * q
                            for p, q in zip((features + [lm_score]), weights))
                        new_state = (lm_state, f_range, coverage)
                        new_hypothesis = hypothesis(logprob, features,
                                                    lm_score, lm_state, h,
                                                    f_range, phrase, coverage)
                        if new_state not in stacks[length] or \
                            logprob > stacks[length][new_state].logprob:  # recombination
                            stacks[length][new_state] = new_hypothesis

        winner = sorted(stacks[len(f)].itervalues(),
                        key=lambda h: h.logprob,
                        reverse=True)[0:100]
        for i in range(len(winner)):
            results += [
                "%d ||| %s ||| %f %f %f %f %f" %
                (n, extract_english(winner[i]), winner[i].features[0],
                 winner[i].features[1], winner[i].features[2],
                 winner[i].features[3], winner[i].lm_score)
            ]

    return results
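A worked example for get_future_list (hypothetical coverage value): bin(0b1001101)[2:] is '1001101', whose zero runs give [(1, 2), (5, 1)]. Note that bin() drops leading zeros, so an uncovered prefix of the sentence shortens the scanned string; this does not affect the decoder as shown, since the future-cost calls above are commented out.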
Code example #18
File: baseline.py Project: ylgh2011/nlp-hw4
def main():
    # tm should translate unknown words as-is with probability 1
    for word in set(sum(french,())):
        if (word,) not in tm:
            tm[(word,)] = [models.phrase(word, 0.0)]

    total_prob = 0
    if opts.mute == 0:
        sys.stderr.write("Decoding %s...\n" % (opts.input,))
    for idx,f in enumerate(french):
        initial_hypothesis = hypothesis(lm.begin(), 0.0, 0, 0, None, None)
        heaps = [{} for _ in f] + [{}]
        heaps[0][lm.begin(), 0, 0] = initial_hypothesis
        for i, heap in enumerate(heaps[:-1]):
            # maintain beam heap
            # front_item = sorted(heap.itervalues(), key=lambda h: -h.logprob)[0]
            # for k in heap.keys():
            #      if heap[k].logprob < front_item.logprob - bound_width:
            #         del heap[k]

            for h in sorted(heap.itervalues(),key=lambda h: -h.logprob)[:opts.s]: # prune
                fopen = prefix1bits(h.coverage)
                for j in xrange(fopen,min(fopen+1+opts.disord, len(f)+1)):
                    for k in xrange(j+1, len(f)+1):
                        if f[j:k] in tm:
                            if (h.coverage & bitmap(range(j, k))) == 0:
                                for phrase in tm[f[j:k]]:
                                    lm_prob = 0
                                    lm_state = h.lm_state
                                    for word in phrase.english.split():
                                        (lm_state, prob) = lm.score(lm_state, word)
                                        lm_prob += prob
                                    lm_prob += lm.end(lm_state) if k == len(f) else 0.0
                                    coverage = h.coverage | bitmap(range(j, k))
                                    logprob = h.logprob + opts.alpha*lm_prob + opts.beta*phrase.logprob # + eta*abs(h.end + 1 - j)

                                    new_hypothesis = hypothesis(lm_state, logprob, coverage, k, h, phrase)

                                    # add to heap
                                    num = onbits(coverage)
                                    if (lm_state, coverage, k) not in heaps[num] or new_hypothesis.logprob > heaps[num][lm_state, coverage, k].logprob:
                                        heaps[num][lm_state, coverage, k] = new_hypothesis


        winner = max(heaps[-1].itervalues(), key=lambda h: h.logprob)
        eng_list = ["<s>"]
        def get_list(h, output_list):
            if h.predecessor is not None:
                get_list(h.predecessor, output_list)
                output_list.append(h.phrase.english)
        def get_prob(test_list):
            stance = []
            for i in test_list:
                stance += (i.split())
            stance = tuple(stance)
            lm_state = (stance[0],)
            score = 0.0
            for word in stance[1:]:
                (lm_state, word_score) = lm.score(lm_state, word)
                score += word_score
            return score
        get_list(winner, eng_list)
        eng_list.append("</s>")

        if opts.mute == 0:
            sys.stderr.write("Start local search ...\n")

        while True:
            best_list = copy.deepcopy(eng_list)
            
            # insert
            for i in range(1,len(eng_list)-1):
                for j in range(1, i):
                    now_list = copy.deepcopy(eng_list)
                    now_list.pop(i)
                    now_list.insert(j, eng_list[i])
                    if get_prob(now_list) > get_prob(best_list):
                        best_list = now_list

                for j in range(i+2, len(eng_list)-1):
                    now_list = copy.deepcopy(eng_list)
                    now_list.insert(j, eng_list[i])
                    now_list.pop(i)
                    if get_prob(now_list) > get_prob(best_list):
                        best_list = now_list
            # swap
            for i in range(1,len(eng_list)-2):
                for j in range(i+1,len(eng_list)-1):
                    now_list = copy.deepcopy(eng_list)
                    now_list[i], now_list[j] = now_list[j], now_list[i]
                    if get_prob(now_list) > get_prob(best_list):
                        best_list = now_list
            
            if get_prob(best_list) == get_prob(eng_list):
                break
            else:
                eng_list = best_list

        for i in eng_list[1:-1]:
            print i,
        print

        if opts.mute == 0:
            sys.stderr.write("#{0}:{2} - {1}\n".format(idx, eng_list , get_prob(eng_list)))
Code example #19
def main():
    # tm should translate unknown words as-is with probability 1
    for word in set(sum(french, ())):
        if (word, ) not in tm:
            tm[(word, )] = [models.phrase(word, 0.0)]

    total_prob = 0
    if opts.mute == 0:
        sys.stderr.write("Decoding %s...\n" % (opts.input, ))
    for idx, f in enumerate(french):
        initial_hypothesis = hypothesis(lm.begin(), 0.0, 0, 0, None, None)
        heaps = [{} for _ in f] + [{}]
        heaps[0][lm.begin(), 0, 0] = initial_hypothesis
        for i, heap in enumerate(heaps[:-1]):
            # maintain beam heap
            # front_item = sorted(heap.itervalues(), key=lambda h: -h.logprob)[0]
            # for k in heap.keys():
            #      if heap[k].logprob < front_item.logprob - bound_width:
            #         del heap[k]

            for h in sorted(heap.itervalues(),
                            key=lambda h: -h.logprob)[:opts.s]:  # prune
                fopen = prefix1bits(h.coverage)
                for j in xrange(fopen, min(fopen + 1 + opts.disord,
                                           len(f) + 1)):
                    for k in xrange(j + 1, len(f) + 1):
                        if f[j:k] in tm:
                            if (h.coverage & bitmap(range(j, k))) == 0:
                                for phrase in tm[f[j:k]]:
                                    lm_prob = 0
                                    lm_state = h.lm_state
                                    for word in phrase.english.split():
                                        (lm_state,
                                         prob) = lm.score(lm_state, word)
                                        lm_prob += prob
                                    lm_prob += lm.end(lm_state) if k == len(
                                        f) else 0.0
                                    coverage = h.coverage | bitmap(range(j, k))
                                    logprob = h.logprob + opts.alpha * lm_prob + opts.beta * phrase.logprob  # + eta*abs(h.end + 1 - j)

                                    new_hypothesis = hypothesis(
                                        lm_state, logprob, coverage, k, h,
                                        phrase)

                                    # add to heap
                                    num = onbits(coverage)
                                    if (lm_state, coverage, k) not in heaps[
                                            num] or new_hypothesis.logprob > heaps[
                                                num][lm_state, coverage,
                                                     k].logprob:
                                        heaps[num][lm_state, coverage,
                                                   k] = new_hypothesis

        winner = max(heaps[-1].itervalues(), key=lambda h: h.logprob)
        eng_list = ["<s>"]

        def get_list(h, output_list):
            if h.predecessor is not None:
                get_list(h.predecessor, output_list)
                output_list.append(h.phrase.english)

        def get_prob(test_list):
            stance = []
            for i in test_list:
                stance += (i.split())
            stance = tuple(stance)
            lm_state = (stance[0], )
            score = 0.0
            for word in stance[1:]:
                (lm_state, word_score) = lm.score(lm_state, word)
                score += word_score
            return score

        get_list(winner, eng_list)
        eng_list.append("</s>")

        if opts.mute == 0:
            sys.stderr.write("Start local search ...\n")

        while True:
            best_list = copy.deepcopy(eng_list)

            # insert
            for i in range(1, len(eng_list) - 1):
                for j in range(1, i):
                    now_list = copy.deepcopy(eng_list)
                    now_list.pop(i)
                    now_list.insert(j, eng_list[i])
                    if get_prob(now_list) > get_prob(best_list):
                        best_list = now_list

                for j in range(i + 2, len(eng_list) - 1):
                    now_list = copy.deepcopy(eng_list)
                    now_list.insert(j, eng_list[i])
                    now_list.pop(i)
                    if get_prob(now_list) > get_prob(best_list):
                        best_list = now_list
            # swap
            for i in range(1, len(eng_list) - 2):
                for j in range(i + 1, len(eng_list) - 1):
                    now_list = copy.deepcopy(eng_list)
                    now_list[i], now_list[j] = now_list[j], now_list[i]
                    if get_prob(now_list) > get_prob(best_list):
                        best_list = now_list

            if get_prob(best_list) == get_prob(eng_list):
                break
            else:
                eng_list = best_list

        for i in eng_list[1:-1]:
            print i,
        print

        if opts.mute == 0:
            sys.stderr.write("#{0}:{2} - {1}\n".format(idx, eng_list,
                                                       get_prob(eng_list)))
Code example #20
File: baseline.py Project: ylgh2011/nlp-hw4
def main():
    # tm should translate unknown words as-is with probability 1
    for word in set(sum(french, ())):
        if (word, ) not in tm:
            tm[(word, )] = [models.phrase(word, 0.0)]

    total_prob = 0
    sys.stderr.write("Decoding %s...\n" % (opts.input, ))
    for idx, f in enumerate(french):
        initial_hypothesis = hypothesis(lm.begin(), 0.0, 0, 0, None, None)
        heaps = [{} for _ in f] + [{}]
        heaps[0][lm.begin(), 0, 0] = initial_hypothesis
        for i, heap in enumerate(heaps[:-1]):
            # maintain beam heap
            # front_item = sorted(heap.itervalues(), key=lambda h: -h.logprob)[0]
            # for k in heap.keys():
            #      if heap[k].logprob < front_item.logprob - opts.bwidth:
            #         del heap[k]

            for h in sorted(heap.itervalues(),
                            key=lambda h: -h.logprob)[:opts.s]:  # prune
                fopen = prefix1bits(h.coverage)
                for j in xrange(fopen, min(fopen + 1 + opts.disord,
                                           len(f) + 1)):
                    for k in xrange(j + 1, len(f) + 1):
                        if f[j:k] in tm:
                            if (h.coverage & bitmap(range(j, k))) == 0:
                                for phrase in tm[f[j:k]]:
                                    lm_prob = 0
                                    lm_state = h.lm_state
                                    for word in phrase.english.split():
                                        (lm_state,
                                         prob) = lm.score(lm_state, word)
                                        lm_prob += prob
                                    lm_prob += lm.end(lm_state) if k == len(
                                        f) else 0.0
                                    coverage = h.coverage | bitmap(range(j, k))
                                    logprob = h.logprob + opts.alpha * lm_prob + opts.beta * phrase.logprob + opts.eta * abs(
                                        h.end + 1 - j)

                                    new_hypothesis = hypothesis(
                                        lm_state, logprob, coverage, k, h,
                                        phrase)

                                    # add to heap
                                    num = onbits(coverage)
                                    if (lm_state, coverage, k) not in heaps[
                                            num] or new_hypothesis.logprob > heaps[
                                                num][lm_state, coverage,
                                                     k].logprob:
                                        heaps[num][lm_state, coverage,
                                                   k] = new_hypothesis

        winner = max(heaps[-1].itervalues(), key=lambda h: h.logprob)

        def extract_english(h):
            return "" if h.predecessor is None else "%s%s " % (extract_english(
                h.predecessor), h.phrase.english)

        out = extract_english(winner)
        print out
        sys.stderr.write("#{0}:{2} - {1}\n".format(idx, out, winner.logprob))
        total_prob += winner.logprob

        # if opts.verbose:
        #     def extract_tm_logprob(h):
        #         return 0.0 if h.predecessor is None else h.phrase.logprob + extract_tm_logprob(h.predecessor)
        #     tm_logprob = extract_tm_logprob(winner)
        #     sys.stderr.write("LM = %f, TM = %f, Total = %f\n" %
        #         (winner.logprob - tm_logprob, tm_logprob, winner.logprob))
    sys.stderr.write("Total score: {0}\n".format(total_prob))
Code Example #21
File: swap.py  Project: Huluk/mt_decoder
tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [
    tuple(line.strip().split())
    for line in open(opts.input).readlines()[:opts.num_sents]
]


def extract_english(h):
    return "" if h.predecessor is None else "%s%s " % (extract_english(
        h.predecessor), h.phrase.english)


# tm should translate unknown words as-is with probability 1
for word in set(sum(french, ())):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]

# adding empty phrase to the translation dictionary
tm[()] = [models.phrase("", 0.0)]


def update_lm_state(lm_state, logprob, phrase):
    # Score the phrase's English words through the LM, returning the new LM
    # state and the accumulated log-probability.
    for word in phrase.english.split():
        (lm_state, word_logprob) = lm.score(lm_state, word)
        logprob += word_logprob
    return lm_state, logprob


sys.stderr.write("Decoding %s...\n" % (opts.input, ))
for f in french:
    # The following code implements a local-reordering decoding algorithm.
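The example breaks off here. As a rough illustration, a local-reordering pass along these lines can extend each hypothesis with two adjacent source phrases f[i:j] and f[j:k] and emit their translations in swapped order. What follows is a hypothetical sketch, not the original code: it reuses update_lm_state and the empty phrase tm[()] from above (with k == j, the empty second phrase reduces this to an ordinary monotone extension), and the names hypothesis, left, and right are illustrative.

from collections import namedtuple

# Hypothetical sketch of local reordering: consume f[i:j] and f[j:k] from the
# source, but emit the translation of f[j:k] first.
hypothesis = namedtuple("hypothesis", "logprob, lm_state, predecessor, phrase")

stacks = [{} for _ in f] + [{}]
stacks[0][lm.begin()] = hypothesis(0.0, lm.begin(), None, None)
for i, stack in enumerate(stacks[:-1]):
    for h in sorted(stack.itervalues(), key=lambda h: -h.logprob)[:opts.s]:
        for j in xrange(i + 1, len(f) + 1):
            if f[i:j] not in tm:
                continue
            for k in xrange(j, len(f) + 1):
                if f[j:k] not in tm:
                    continue
                for right in tm[f[j:k]]:      # emitted first (the swap)
                    for left in tm[f[i:j]]:   # emitted second
                        lm_state, logprob = update_lm_state(
                            h.lm_state, h.logprob + right.logprob, right)
                        mid = hypothesis(logprob, lm_state, h, right)
                        lm_state, logprob = update_lm_state(
                            lm_state, logprob + left.logprob, left)
                        if k == len(f):
                            logprob += lm.end(lm_state)
                        if (lm_state not in stacks[k]
                                or stacks[k][lm_state].logprob < logprob):
                            stacks[k][lm_state] = hypothesis(
                                logprob, lm_state, mid, left)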
Code Example #22
File: decode.py  Project: query/mt-submissions
def decode(tm, lm, source_sentence,
           stack_size=1, max_reordering=None):
    """Return the most probable decoding of *source_sentence* under the
    provided probabilistic translation and language models."""
    # Compute the future cost table.
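    # future_costs[(start, end)] will hold an estimate of the best
    # log-probability for translating source span [start, end): the better
    # of the top single-phrase translation (scored through the LM) and the
    # best split into two sub-spans.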
    future_costs = {}
    for segment_length in xrange(1, len(source_sentence) + 1):
        for start in xrange(len(source_sentence) - segment_length + 1):
            end = start + segment_length
            future_costs[(start, end)] = float('-inf')
            candidates = tm.get(source_sentence[start:end], [])
            if candidates:
                logprob = candidates[0].logprob
                lm_state = tuple()
                for target_word in candidates[0].english.split():
                    lm_state, word_logprob = lm.score(lm_state, target_word)
                    logprob += word_logprob
                future_costs[(start, end)] = logprob
            for mid in xrange(start + 1, end):
                future_costs[(start, end)] = max(
                    future_costs[(start, mid)] + future_costs[(mid, end)],
                    future_costs[(start, end)])
    # Actually start decoding.
    initial = Hypothesis(0.0, future_costs[(0, len(source_sentence))],
                         (False,) * len(source_sentence),
                         lm.begin(), None, None)
    # We add 1 here because we need to have stacks for both ends: 0 and
    # len(source_sentence).
    stacks = [{} for _ in xrange(len(source_sentence) + 1)]
    stacks[0][lm.begin()] = initial
    # Iterate over every stack but the last.  It's not possible to add
    # anything to a hypothesis in the last stack anyway, so we skip it.
    for i, stack in enumerate(stacks[:-1]):
        # Take only the best *stack_size* hypotheses, ranked by
        # log-probability alone; ranking by the sum of the log-probability
        # and the future cost was found, somewhat surprisingly, to hurt the
        # model score.
        hypotheses = sorted(stack.itervalues(),
                            key=lambda h: -h.logprob)[:stack_size]
        for hypothesis in hypotheses:
            # Save ourselves a couple of levels of indentation later on.
            def untranslated_segments():
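                # Yields (start, end) spans of source words that are still
                # untranslated; when max_reordering is set, start positions
                # are limited to within max_reordering of the current stack.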
                if max_reordering is None:
                    starts = xrange(len(source_sentence))
                else:
                    starts = xrange(min(i + max_reordering,
                                        len(source_sentence)))
                for start in starts:
                    if hypothesis.coverage[start]:
                        continue
                    ends = xrange(start, len(source_sentence))
                    for end in ends:
                        if hypothesis.coverage[end]:
                            break
                        yield (start, end + 1)
            # Iterate over blocks of untranslated source words.
            for start, end in untranslated_segments():
                source_phrase = source_sentence[start:end]
                # Get all of the potential candidate translations.
                candidates = tm.get(source_phrase, [])
                # Translate unknown unigrams to themselves.
                if not candidates and len(source_phrase) == 1:
                    candidates.append(models.phrase(source_phrase[0], 0.0))
                for candidate in candidates:
                    logprob = hypothesis.logprob + candidate.logprob
                    # Make a new coverage vector with the appropriate
                    # elements set to True.  This isn't pretty.  Sorry.
                    coverage = (hypothesis.coverage[:start] +
                                (True,) * (end - start) +
                                hypothesis.coverage[end:])
                    # Find the future cost estimate for this hypothesis
                    # by summing over contiguous incomplete segments.
                    future_cost = 0.0
                    cost_start = None
                    for cost_i, covered in enumerate(coverage + (True,)):
                        if covered:
                            if cost_start is not None:
                                future_cost += \
                                    future_costs[(cost_start, cost_i)]
                            cost_start = None
                        else:
                            if cost_start is None:
                                cost_start = cost_i
                    # Make a new LM state.
                    lm_state = hypothesis.lm_state
                    for target_word in candidate.english.split():
                        lm_state, word_logprob = \
                            lm.score(lm_state, target_word)
                        logprob += word_logprob
                    # Add the final transition probability if the end of
                    # this segment is also the end of the sentence.
                    if end == len(source_sentence):
                        logprob += lm.end(lm_state)
                    # If the new hypothesis is the best hypothesis for
                    # its state and number of completed words, push it
                    # onto the stack, replacing any that is present.
                    completed = sum(int(x) for x in coverage)
                    if (lm_state not in stacks[completed] or
                            (stacks[completed][lm_state].logprob +
                             stacks[completed][lm_state].future_cost) <
                            logprob + future_cost):
                        stacks[completed][lm_state] = Hypothesis(
                            logprob, future_cost, coverage,
                            lm_state, hypothesis, candidate)
    # We don't need to specify a key, since we're looking for the best
    # log-probability, and that's the first element of a hypothesis.
    best = max(stacks[-1].itervalues())
    current = best
    decoding = []
    while current.candidate:
        decoding.insert(0, current.candidate.english)
        current = current.predecessor
    return tuple(decoding)
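For context, a driver for decode() might look like the following. This is a hypothetical sketch: it assumes the models.TM/models.LM constructors and the option names used in the other examples, and it supplies a Hypothesis namedtuple whose field order matches the constructor calls above.

from collections import namedtuple

# Field order matches decode()'s constructor calls; putting logprob first is
# what lets the final max() run without an explicit key.
Hypothesis = namedtuple(
    "Hypothesis",
    "logprob, future_cost, coverage, lm_state, predecessor, candidate")

tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [tuple(line.strip().split())
          for line in open(opts.input).readlines()[:opts.num_sents]]
for f in french:
    print " ".join(decode(tm, lm, f, stack_size=opts.s,
                          max_reordering=opts.disord))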