Exemple #1
0
def computeBleu(system, reference):
    """Return the smoothed BLEU score of `system` against `reference`.

    The sufficient statistics from bleu.bleu_stats are summed onto a
    10-entry zero vector; zip caps the result at 10 entries, matching
    the statistic count expected by bleu.smoothed_bleu.
    """
    base = [0] * 10
    stats = [zero + stat
             for zero, stat in zip(base, bleu.bleu_stats(system, reference))]
    return bleu.smoothed_bleu(stats)
def main(opts, references, input_nbest, theta0=None):
    """Train reranker weights with a sampled pairwise perceptron.

    opts        -- options object; uses epo (epochs), xi (max pairs per
                   sentence) and eta (learning rate), and is forwarded to
                   get_sample for its sampling settings.
    references  -- reference sentences, indexed by source-sentence id.
    input_nbest -- iterable of "i ||| sentence ||| features" lines.
    theta0      -- optional initial weight vector; defaults to uniform.

    Returns the averaged weight vector as a newline-joined string,
    one weight per line.
    """
    entry = namedtuple("entry", "sentence, smoothed_bleu, feature_list")

    # Build the per-sentence n-best lists, scoring every candidate with
    # smoothed BLEU against its reference.  (The original dead
    # `nbests = None; if nbests is None:` scaffolding — a leftover from a
    # disk-cache variant — has been removed; it was always taken.)
    nbests = []
    sys.stderr.write("No nbests on disk, so calculating ndests ... \n")
    for j, line in enumerate(input_nbest):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)

        stats = list(bleu.bleu_stats(sentence, references[i]))
        smoothed_bleu_score = bleu.smoothed_bleu(stats)

        feature_list = [float(x) for x in features.split()]
        if len(nbests) <= i:
            nbests.append([])
        nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))

        if j % 5000 == 0:
            sys.stderr.write(".")

    arg_num = len(nbests[0][0].feature_list)

    theta = theta0
    if theta is None:
        theta = [1.0 / arg_num for _ in xrange(arg_num)]  # uniform initialization

    # Accumulators for the averaged perceptron.
    avg_theta = [0.0 for _ in xrange(arg_num)]
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest, opts)
            # Sort pairs by smoothed-BLEU gap so the most informative
            # (clearest-preference) pairs are used first.
            sample.sort(key=lambda p: p[0].smoothed_bleu - p[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Perceptron update when the model ranks the lower-BLEU
                # candidate at least as high as the higher-BLEU one.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)

                avg_theta = vector_plus(avg_theta, theta)
                avg_cnt += 1

        sys.stderr.write("Mistake:  %s\n" % (mistake,))

    # Averaged weights; fall back to uniform if no updates were accumulated.
    weights = [avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
               for avg in avg_theta]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of return the averaged-out weights, return the weights that maximize the BLEU score
    return "\n".join([str(weight) for weight in weights])
Exemple #3
0
def main():
    """Train 6 reranker weights with a per-feature pairwise update and print them.

    Reads references from opts.en and an n-best list from opts.nbest
    ("i ||| sentence ||| features"), scores candidates with BLEU and
    smoothed BLEU, then runs opts.epo epochs of updates on sampled pairs.
    """
    # nbests[i] holds (sentence, bleu, smoothed_bleu, features) tuples
    # for source sentence i.
    nbests = defaultdict(list)
    references = {}
    for i, line in enumerate(open(opts.en)):
        '''
        Initialize references to correct english sentences
        '''
        references[i] = line

    for line in open(opts.nbest):
        (i, sentence, features) = line.strip().split("|||")
        stats = list(bleu_stats(sentence, references[int(i)]))
        bleu_score = bleu(stats)
        smoothed_bleu_score = smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        nbests[int(i)].append(
            (sentence, bleu_score, smoothed_bleu_score, feature_list))

    theta = [1.0 / 6 for _ in xrange(6)]  #initialization

    for i in range(0, opts.epo):
        mistake = 0
        for nbest in nbests:
            # Sample candidate pairs and sort by smoothed-BLEU gap
            # (tuple index 2) so the clearest pairs come first.
            sample = get_sample(nbests[nbest])
            sample.sort(key=lambda i: i[0][2] - i[1][2], reverse=True)
            # NOTE(review): the loops below reuse `i`, shadowing the epoch
            # counter — harmless as written, but confirm before refactoring.
            for i in range(0, min(len(sample), opts.xi)):
                for j in range(0, 6):
                    # Per-feature update when the weighted feature of the
                    # lower-BLEU candidate matches or beats the higher one's.
                    if theta[j] * sample[i][0][3][j] <= theta[j] * sample[i][
                            1][3][j]:
                        mistake = mistake + 1
                        theta[j] = theta[j] + opts.eta * (sample[i][0][3][j] -
                                                          sample[i][1][3][j])
        sys.stderr.write("Mistake:  %s\n" % (mistake, ))
    print "\n".join([str(weight) for weight in theta])
Exemple #4
0
def main():
    """Train 6 reranker weights per-feature and print them (variant of the
    sampled pairwise trainer; same flow as its sibling above)."""
    # nbests[i] holds (sentence, bleu, smoothed_bleu, features) tuples.
    nbests = defaultdict(list)
    references = {}
    for i, line in enumerate(open(opts.en)):
        '''
        Initialize references to correct english sentences
        '''
        references[i] = line

    for line in open(opts.nbest):
        (i, sentence, features) = line.strip().split("|||")
        stats =  list(bleu_stats(sentence, references[int(i)]))
        bleu_score = bleu(stats)
        smoothed_bleu_score = smoothed_bleu(stats)
        # making the feature string to float list 
        feature_list = [float(x) for x in features.split()]
        nbests[int(i)].append((sentence, bleu_score, smoothed_bleu_score, feature_list))

    theta = [1.0/6 for _ in xrange(6)] #initialization
    

    for i in range(0, opts.epo):
        mistake = 0;
        for nbest in nbests:
            # Sort sampled pairs by smoothed-BLEU gap (tuple index 2).
            sample = get_sample(nbests[nbest])
            sample.sort(key=lambda i: i[0][2] - i[1][2], reverse=True)
            # NOTE(review): inner loops shadow the epoch counter `i`.
            for i in range(0, min(len(sample), opts.xi)):
                for j in range(0, 6):
                    # Per-feature update on a ranking mistake.
                    if theta[j] * sample[i][0][3][j] <= theta[j] * sample[i][1][3][j]:
                        mistake = mistake + 1
                        theta[j] = theta[j] + opts.eta * (sample[i][0][3][j] - sample[i][1][3][j])
        sys.stderr.write("Mistake:  %s\n" % (mistake,))
    print "\n".join([str(weight) for weight in theta])
Exemple #5
0
def main():
    """Averaged-perceptron reranker trainer with an on-disk n-best cache.

    References come from opts.en; the candidate datastructure is read from
    opts.nbestDS if present, otherwise rebuilt from opts.nbest and cached.
    Prints the averaged weight vector, one weight per line.
    """
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i%100 == 0:
            sys.stderr.write(".")

    # Prefer the cached datastructure; rebuild only when missing.
    sys.stderr.write("\nTry reading nbests datastructure from disk ... \n")
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("No nbests on disk, so calculating ndests ... \n")
        for j,line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            # bleu_score = bleu.bleu(stats)
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # making the feature string to float list
            feature_list = [float(x) for x in features.split()]
            if len(nbests)<=i:
                nbests.append([])
            # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
            nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))

            if j%5000 == 0:
                sys.stderr.write(".")
        write_ds_to_file(nbests, opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0/arg_num for _ in xrange(arg_num)] #initialization

    # Accumulators for weight averaging (averaged perceptron).
    avg_theta = [ 0.0 for _ in xrange(arg_num)]
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        mistake = 0;
        for nbest in nbests:
            # Sort sampled pairs by smoothed-BLEU gap, clearest first.
            sample = get_sample(nbest)
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu, reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Update when the lower-BLEU candidate scores at least as high.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
                    
                avg_theta = vector_plus(avg_theta, theta)
                avg_cnt += 1

        sys.stderr.write("Mistake:  %s\n" % (mistake,))
    

    # Averaged weights; uniform fallback if no updates were accumulated.
    weights = [ avg / avg_cnt if avg_cnt !=0 else 1/float(arg_num) for avg in avg_theta ]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that maximize the BLEU score    
    print "\n".join([str(weight) for weight in weights])
Exemple #6
0
 def bleu(self, hyp, ref):
     """Score `hyp` against `ref` with smoothed BLEU, truncating both
     sequences at the first end-of-sentence token when present."""
     def truncate(tokens):
         # Drop everything from the end token onwards.
         tokens = list(tokens)
         if self._end_token in tokens:
             tokens = tokens[: tokens.index(self._end_token)]
         return tokens
     return smoothed_bleu(truncate(hyp), truncate(ref))
Exemple #7
0
 def bleu(self, hyp, ref):
     """Score `hyp` against `ref` with smoothed BLEU, truncating both
     sequences at the first end-of-sentence token when present."""
     hyp = list(hyp)
     ref = list(ref)
     # Cut each sequence just before its end token, if one occurs.
     if self._end_token in hyp:
         hyp = hyp[:hyp.index(self._end_token)]
     if self._end_token in ref:
         ref = ref[:ref.index(self._end_token)]
     return smoothed_bleu(hyp, ref)
Exemple #8
0
 def run(self, data_x):
     """Run the model on one batch and return its extracted costs.

     data_x unpacks as (_, _, tgt_tokens, tgt_masks).  The mean
     sentence-level smoothed BLEU of the outputs against the
     length-truncated targets is attached to output_vars as `.bleu`.
     """
     output_vars = self.compute(*data_x)
     _, _, tgt_tokens, tgt_masks = data_x
     bleus = []
     for i in range(tgt_tokens.shape[0]):
         # Only score the first target_len tokens of each sequence
         # (target_len comes from summing the sequence's mask).
         target_len = int(tgt_masks[i].sum())
         ref_tokens = tgt_tokens[i, :target_len]
         # NOTE(review): the hypothesis is truncated to the *reference*
         # length — confirm this is intended rather than the output length.
         out_tokens = output_vars.outputs[i, :target_len]
         bleus.append(smoothed_bleu(out_tokens, ref_tokens))
     output_vars.bleu = numpy.mean(bleus)
     if self._criteria == 'mixed':
         # Mixed criterion trades model cost off against BLEU.
         output_vars.mixed = output_vars.cost - output_vars.bleu
     return self._extract_costs(output_vars)
Exemple #9
0
def main():
    """Pairwise perceptron reranker trainer; prints the final weight vector.

    References come from opts.en, candidates from opts.nbest
    ("i ||| sentence ||| features").  Each candidate is scored with BLEU
    and smoothed BLEU before opts.epo epochs of pairwise updates.
    """
    nbests = []
    references = []
    sys.stderr.write("Reading English Sentences")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")

    sys.stderr.write("\nReading ndests")
    for j, line in enumerate(open(opts.nbest)):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        stats = list(bleu_stats(sentence, references[i]))
        bleu_score = bleu(stats)
        smoothed_bleu_score = smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        if len(nbests) <= i:
            nbests.append([])
        nbests[i].append(
            entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
        if j % 5000 == 0:
            sys.stderr.write(".")

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization

    sys.stderr.write("\nTraining...\n")
    for i in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            # Sort sampled pairs by smoothed-BLEU gap, clearest first.
            sample = get_sample(nbest)
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu,
                        reverse=True)
            # NOTE(review): the loop below reuses `i`, shadowing the epoch
            # counter — harmless as written, but confirm before refactoring.
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Update when the lower-BLEU candidate scores at least as high.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1),
                                        opts.eta)


#                for j in xrange(arg_num):
#                    if theta[j] * sample[i][0][3][j] <= theta[j] * sample[i][1][3][j]:
#                        mistake = mistake + 1
#                        theta[j] += opts.eta * (sample[i][0].feature_list[j] - sample[i][1].feature_list[j])
        sys.stderr.write("Mistake:  %s\n" % (mistake, ))
    print "\n".join([str(weight) for weight in theta])
Exemple #10
0
def main():
    nbests = []
    references = []
    sys.stderr.write("Reading English Sentences")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i%100 == 0:
            sys.stderr.write(".")

    sys.stderr.write("\nReading ndests")
    for j,line in enumerate(open(opts.nbest)):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        stats = list(bleu_stats(sentence, references[i]))
        # bleu_score = bleu(stats)
        smoothed_bleu_score = smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        if j == 10:
            break
        if len(nbests)<=i:
            nbests.append([])
        # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
        nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
        if j%5000 == 0:
            sys.stderr.write(".")

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0/arg_num for _ in xrange(arg_num)] #initialization

    avg_theta = [ 0 for _ in xrange(arg_num)]
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for i in xrange(opts.epo):
        mistake = 0;
        for nbest in nbests:
            sample = get_sample(nbest)
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu, reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
                    avg_theta = vector_plus(avg_theta, theta)
                    avg_cnt += 1
        sys.stderr.write("Mistake:  %s\n" % (mistake,))
    final_theta = [ t / avg_cnt for t in avg_theta]
    print "\n".join([str(weight) for weight in final_theta])
Exemple #11
0
def get_nbest(nbest, source, target):
    """Load an n-best file and score every candidate with smoothed BLEU.

    nbest  -- path to a file of "i ||| sentence ||| features" lines.
    source -- path to the source sentences (read but currently unused;
              kept so existing callers are unaffected).
    target -- path to the tokenized reference sentences.

    Returns a list indexed by sentence id; element i is a list of
    translation(features, smoothed_bleu) tuples for sentence i.

    The unused local `original_feature_count` from the original has been
    removed; no behavior changes.
    """
    # NOTE(review): `src` is read but never used below — kept only so the
    # file-read side effect and signature stay identical; confirm whether
    # any caller depends on it before removing.
    src = [line.strip().split() for line in open(source).readlines()]
    ref = [line.strip().split() for line in open(target).readlines()]
    translations = [
        line.strip().split("|||") for line in open(nbest).readlines()
    ]
    nbests = [[] for _ in ref]
    sys.stderr.write("Computing smoothed bleu...")
    translation = namedtuple("translation", "features, smoothed_bleu")
    for (i, sentence, features) in translations:
        # Normalize the raw fields: int id, stripped text, float features.
        (i, sentence,
         features) = (int(i), sentence.strip(),
                      [float(f) for f in features.strip().split()])
        sentence_split = sentence.strip().split()
        stats = tuple(bleu.bleu_stats(sentence_split, ref[i]))
        nbests[i].append(translation(features, bleu.smoothed_bleu(stats)))
    return nbests
Exemple #12
0
# Script section: parse options, load references and the n-best list, and
# set up the training hyperparameters for the pairwise ranking loop below.
optparser.add_option("-r", "--reference", dest="reference", default="dev/all.cn-en.en0", help="English reference sentences")
(opts, _) = optparser.parse_args()
# Tokenized references, truncated to opts.length sentences.
ref = [(line.strip().split()) for line in open(opts.reference).readlines()]
ref = ref[:int(opts.length)]
nbests = []

for n, line in enumerate(open(opts.nbest)):
    (i, sentence, features) = line.strip().split("|||")
    (i, sentence) = (int(i), sentence.strip())
    features = [float(h) for h in features.strip().split()]
    # Stop once candidates refer to sentences beyond the loaded references.
    if len(ref) <= i:
        break
    while len(nbests) <= i:
        nbests.append([])
    scores = tuple(bleu_stats(sentence.split(), ref[i]))
    bleu_scores = smoothed_bleu(scores)
    # NOTE(review): inverse_scores is computed but never used here.
    inverse_scores = tuple([-x for x in scores])
    nbests[i].append(translation_candidate(sentence, scores, features, bleu_scores))
    if n % 2000 == 0:
        sys.stderr.write(".")
sys.stderr.write("\n")
# Sort each sentence's candidates by smoothed BLEU, best first.
for i in xrange(len(nbests)):
    nbests[i] = sorted(nbests[i], key=lambda h: h.smoothed_bleu)
    nbests[i] = nbests[i][::-1]

# Training hyperparameters: uniform initial weights over 5 features,
# k/r sampling sizes and the ranking margin.
num_features = 5
w = [float(1)/5]*num_features
updates = [0]*num_features
k = 5
r = 5
margin = 0.2
Exemple #13
0
# In[150]:

# Build per-sentence candidate lists from the n-best file, pairing each
# candidate with its smoothed score.
nbests = []
for n, line in enumerate(open(opts.nbest)):
    (i, sentence, features) = line.strip().split("|||")
    (i, sentence) = (int(i), sentence.strip())
    features = np.array([float(it) for it in features.split()])
    # Stop once candidates refer to sentences beyond the loaded references.
    if len(ref) <= i:
        break

    while len(nbests) <= i:
        nbests.append([])

    scores = tuple(bleu.bleu_stats(sentence.split(), ref[i]))
    inverse_scores = tuple([-x for x in scores])
    # NOTE(review): smoothed_bleu is applied to the *negated* statistics
    # here — the other snippets in this file pass the raw stats; confirm
    # this is intentional.
    smoothed_score = bleu.smoothed_bleu(inverse_scores)

    nbests[i].append((translation_candidate(sentence, inverse_scores,
                                            features), smoothed_score))

    if n % 2000 == 0:
        sys.stderr.write(".")

    # small size for testing, delete it when release
    # if n > 4000:
    #     break

# In[151]:

# Part 1: compute a smoothed BLEU score for each candidate translation and
# cache the resulting datastructure with cPickle so later runs can skip it.
tau = 5000
alpha = 0.1
cnt = 0  #count # of sentence
#we can run the first part for only once and save it.
###1st part,compute blue score for each candidate translation.
for line in open(opts.nbest):
    cnt = cnt + 1
    #print '{0}\r'.format("\rIteration: %d/%d." %(cnt, 432303)),
    (i, sentence, features) = line.strip().split("|||")
    if len(nbests) <= int(i):
        nbests.append([])
    features = [float(h) for h in features.strip().split()]
    # Sum the BLEU sufficient statistics onto a 10-entry zero vector.
    stats = [0 for kk in xrange(10)]  #code from score-reranker.py
    stats = [
        sum(scores) for scores in zip(
            stats, bleu.bleu_stats(sentence.strip().split(), ref[int(i)]))
    ]
    score = bleu.smoothed_bleu(stats)
    nbests[int(i)].append(candidate(sentence.strip(), features, score))
cPickle.dump(nbests, open(
    'my_nbests_add.p',
    'wb'))  #save the result. no need to run the first part each time
#print "finished calculating nbests."
nbests = cPickle.load(open('my_nbests_add.p', 'rb'))  #load pickled file

# Part 2: hyperparameters for learning the weight vector.
#2nd part,learn the optimal weight
epochs = 20  #setup parameters mentioned in pseudocode
tau_maxsize = 100  #5000
xi = 10  #50
tau = []
alpha = 0.05
eta = 0.1
theta = [1.0 / num_features
Exemple #15
0
def main():
    """Perceptron trainer that keeps per-epoch averaged weights and prints
    the epoch's weights achieving the best corpus BLEU on the training data.
    """
    nbests = []
    references = []
    sys.stderr.write("Reading English Sentences")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i%100 == 0:
            sys.stderr.write(".")

    sys.stderr.write("\nReading ndests")
    for j,line in enumerate(open(opts.nbest)):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        stats = list(bleu.bleu_stats(sentence, references[i]))
        # bleu_score = bleu.bleu(stats)
        smoothed_bleu_score = bleu.smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        if len(nbests)<=i:
            nbests.append([])
        # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
        nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))

        if j%5000 == 0:
            sys.stderr.write(".")

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0/arg_num for _ in xrange(arg_num)] #initialization

    # weights[j] will hold the averaged weight vector after epoch j.
    weights = [ [] for _ in xrange(opts.epo)]
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        # Per-epoch averaging accumulators.
        avg_theta = [ 0.0 for _ in xrange(arg_num)]
        avg_cnt = 0
        mistake = 0;
        for nbest in nbests:
            # Sort sampled pairs by smoothed-BLEU gap, clearest first.
            sample = get_sample(nbest)
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu, reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Update when the lower-BLEU candidate scores at least as high.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
                    
                avg_theta = vector_plus(avg_theta, theta)
                avg_cnt += 1

        sys.stderr.write("Mistake:  %s\n" % (mistake,))
        weights[j] = [ avg / avg_cnt if avg_cnt !=0 else 1/float(arg_num) for avg in avg_theta ]



    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that maximize the BLEU score    
    # print "\n".join([str(weight) for weight in final_theta])

    # Rescore the n-best lists with each epoch's weights and measure the
    # corpus BLEU of the resulting 1-best output.
    bleu_score = [0 for _ in weights]
    for j, w in enumerate(weights):
        trans = []
        translation = namedtuple("translation", "english, score")
        system = []
        for i, nbest in enumerate(nbests):
            # for one sentence
            for et in nbest:
                if len(trans) <= int(i):
                    trans.append([])

                trans[int(i)].append(translation(et.sentence, sum([x*y for x,y in zip(w, et.feature_list)])))

            # NOTE(review): this re-appends the 1-best of *every* sentence
            # seen so far on each iteration of the outer loop — confirm the
            # indentation is intended before relying on `system`'s contents.
            for tran in trans:
                system.append(sorted(tran, key=lambda x: -x.score)[0].english)
        
        # Corpus-level BLEU from summed sufficient statistics.
        stats = [0 for i in xrange(10)]
        for (r,s) in zip(references, system):
            stats = [sum(scores) for scores in zip(stats, bleu.bleu_stats(s,r))]

        bleu_score[j] = bleu.bleu(stats)

    # Pick the (first) epoch whose weights maximize training-data BLEU.
    idx = [i for i, bscore in enumerate(bleu_score) if bscore == max(bleu_score)][0]
    sys.stderr.write("Maximum BLEU score of training data is: {}\n".format(max(bleu_score)))
    sys.stderr.write("Corresponding weights are: {}\n".format(" ".join([ str(w) for w in weights[idx] ])))
    print "\n".join([str(weight) for weight in weights[idx]])
def main():
    """Margin-based pairwise ranking trainer over cached n-best lists.

    For each sentence, candidates in the top-r band that are outscored by
    bottom-k candidates (and vice versa, by margin tau) accumulate +/-1
    coefficients in mu; theta is then moved by the mu-weighted feature sum.
    Prints the final (non-averaged) weight vector.
    """
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i%100 == 0:
            sys.stderr.write(".")

    # Prefer the cached datastructure; rebuild and cache when missing.
    sys.stderr.write("\nTry reading %s from disk ... \n" % opts.nbestDS)
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("%s is not on disk, so calculating it ... \n" % opts.nbestDS)
        for j,line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            # bleu_score = bleu.bleu(stats)
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # making the feature string to float list
            feature_list = [float(x) for x in features.split()]
            if len(nbests)<=i:
                nbests.append([])
            # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
            nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))

            if j%5000 == 0:
                sys.stderr.write(".")
        sys.stderr.write("\nWriting %s to disk ... \n" % opts.nbestDS)
        write_ds_to_file(nbests, opts.nbestDS)
        sys.stderr.write("Finish writing %s\n" % opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0/arg_num for _ in xrange(arg_num)] #initialization

    # avg_theta = [ 0.0 for _ in xrange(arg_num)]
    # avg_cnt = 0

    tau = opts.tau # positive learning margin
    sys.stderr.write("\nTraining...\n")
    for iter_num in xrange(opts.epo):
        sys.stderr.write("\nIteration#{} ".format(iter_num + 1))
        cnt = 0;
        # sentence wise updating

        for i, nbest in enumerate(nbests):
            # y: candidates sorted by smoothed BLEU, best first.
            y = sorted(nbest, key = lambda h: h.smoothed_bleu, reverse = True)
            mu = [0.0]*len(nbest)
            w_times_x = [0.0]*len(nbest)
            for j, best in enumerate(nbest):
                # calculate linear function result
                w_times_x[j] = dot_product(theta, best.feature_list)

            # processing pairs 
            # Band boundaries: top_r / bottom_k are fractions of the list.
            top_r = int(len(y)*opts.r)
            bottom_k = int(len(y)*opts.k)
            for j in xrange(len(nbest) - 1):
                for l in xrange(j+1, len(nbest)):
                    # Penalize j / reward l when a top-band j is outscored
                    # by a bottom-band l by less than the margin, and the
                    # symmetric case; otherwise count the pair as ignored.
                    if nbest[j].smoothed_bleu <= y[top_r].smoothed_bleu \
                    and nbest[l].smoothed_bleu >= y[- bottom_k].smoothed_bleu \
                    and w_times_x[j] > w_times_x[l] + tau:
                        mu[j] = mu[j] + 1
                        mu[l] = mu[l] - 1
                    elif nbest[j].smoothed_bleu >= y[- bottom_k].smoothed_bleu \
                    and nbest[l].smoothed_bleu <= y[top_r].smoothed_bleu \
                    and w_times_x[j] > w_times_x[l] - tau:
                        mu[j] = mu[j] - 1
                        mu[l] = mu[l] + 1
                    else:
                        cnt += 1
                if (j + 1) % 100 == 0:
                    sys.stderr.write(".")

            # Apply the accumulated mu-weighted feature-vector update.
            vector_sum = [0 for _ in xrange(len(nbest[0].feature_list))]
            for m, best in enumerate(nbest):
                vector_sum = vector_plus(vector_sum, scale_product(mu[m], best.feature_list))

            theta = vector_plus(theta, vector_sum, opts.eta)

            # avg_theta = vector_plus(avg_theta, theta)
            # avg_cnt += 1

        sys.stderr.write("\n Non-supported vectors:  %s\n" % (cnt,))
    

    # weights = [ avg / avg_cnt if avg_cnt !=0 else 1/float(arg_num) for avg in avg_theta ]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that maximize the BLEU score    
    print "\n".join([str(weight) for weight in theta])
Exemple #17
0
def main():
    """Averaged-perceptron reranker trainer with an on-disk n-best cache;
    prints the averaged weight vector, one weight per line."""
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")

    # Prefer the cached datastructure; rebuild and cache when missing.
    sys.stderr.write("\nTry reading nbests datastructure from disk ... \n")
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("No nbests on disk, so calculating ndests ... \n")
        for j, line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            # bleu_score = bleu.bleu(stats)
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # making the feature string to float list
            feature_list = [float(x) for x in features.split()]
            if len(nbests) <= i:
                nbests.append([])
            # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
            nbests[i].append(entry(sentence, smoothed_bleu_score,
                                   feature_list))

            if j % 5000 == 0:
                sys.stderr.write(".")
        write_ds_to_file(nbests, opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization

    # Accumulators for weight averaging (averaged perceptron).
    avg_theta = [0.0 for _ in xrange(arg_num)]
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            # Sort sampled pairs by smoothed-BLEU gap, clearest first.
            sample = get_sample(nbest)
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Update when the lower-BLEU candidate scores at least as high.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1),
                                        opts.eta)

                avg_theta = vector_plus(avg_theta, theta)
                avg_cnt += 1

        sys.stderr.write("Mistake:  %s\n" % (mistake, ))

    # Averaged weights; uniform fallback if no updates were accumulated.
    weights = [
        avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
        for avg in avg_theta
    ]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that maximize the BLEU score
    print "\n".join([str(weight) for weight in weights])
Exemple #18
0
                     help="English reference sentences")
# Script section: parse options, load (truncated) references and the n-best
# list, then set up training hyperparameters for the ranking loop below.
(opts, _) = optparser.parse_args()
ref = [(line.strip().split()) for line in open(opts.reference).readlines()]
ref = ref[:int(opts.length)]
nbests = []

for n, line in enumerate(open(opts.nbest)):
    (i, sentence, features) = line.strip().split("|||")
    (i, sentence) = (int(i), sentence.strip())
    features = [float(h) for h in features.strip().split()]
    # Stop once candidates refer to sentences beyond the loaded references.
    if len(ref) <= i:
        break
    while len(nbests) <= i:
        nbests.append([])
    scores = tuple(bleu_stats(sentence.split(), ref[i]))
    bleu_scores = smoothed_bleu(scores)
    # NOTE(review): inverse_scores is computed but never used here.
    inverse_scores = tuple([-x for x in scores])
    nbests[i].append(
        translation_candidate(sentence, scores, features, bleu_scores))
    if n % 2000 == 0:
        sys.stderr.write(".")
sys.stderr.write("\n")
# Sort each sentence's candidates by smoothed BLEU, best first.
for i in xrange(len(nbests)):
    nbests[i] = sorted(nbests[i], key=lambda h: h.smoothed_bleu)
    nbests[i] = nbests[i][::-1]

# Training hyperparameters: uniform initial weights over 5 features and
# the k/r sampling sizes.
num_features = 5
w = [float(1) / 5] * num_features
updates = [0] * num_features
k = 5
r = 5
Exemple #19
0
def main():
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")

    sys.stderr.write("\nTry reading %s from disk ... \n" % opts.nbestDS)
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        nbests = []
        sys.stderr.write("%s is not on disk, so calculating it ... \n" %
                         opts.nbestDS)
        for j, line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            # bleu_score = bleu.bleu(stats)
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # making the feature string to float list
            feature_list = [float(x) for x in features.split()]
            if len(nbests) <= i:
                nbests.append([])
            # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
            nbests[i].append(entry(sentence, smoothed_bleu_score,
                                   feature_list))

            if j % 5000 == 0:
                sys.stderr.write(".")
        sys.stderr.write("\nWriting %s to disk ... \n" % opts.nbestDS)
        write_ds_to_file(nbests, opts.nbestDS)
        sys.stderr.write("Finish writing %s\n" % opts.nbestDS)

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization

    # avg_theta = [ 0.0 for _ in xrange(arg_num)]
    # avg_cnt = 0

    tau = opts.tau  # positive learning margin
    sys.stderr.write("\nTraining...\n")
    for iter_num in xrange(opts.epo):
        sys.stderr.write("\nIteration#{} ".format(iter_num + 1))
        cnt = 0
        # sentence wise updating

        for i, nbest in enumerate(nbests):
            y = sorted(nbest, key=lambda h: h.smoothed_bleu, reverse=True)
            mu = [0.0] * len(nbest)
            w_times_x = [0.0] * len(nbest)
            for j, best in enumerate(nbest):
                # calculate linear function result
                w_times_x[j] = dot_product(theta, best.feature_list)

            # processing pairs
            top_r = int(len(y) * opts.r)
            bottom_k = int(len(y) * opts.k)
            for j in xrange(len(nbest) - 1):
                for l in xrange(j + 1, len(nbest)):
                    yj = nbest[j].smoothed_bleu
                    yl = nbest[l].smoothed_bleu
                    if yj < yl \
                    and dist(yj, yl) > opts.epsilon \
                    and w_times_x[j] - w_times_x[l] < g_learn(yj, yl)*tau:
                        mu[j] = mu[j] + g_learn(yj, yl)
                        mu[l] = mu[l] - g_learn(yj, yl)
                    elif yj > yl \
                    and dist(yj, yl) > opts.epsilon \
                    and w_times_x[l] - w_times_x[y] < g_learn(yl, yj)*tau:
                        mu[j] = mu[j] - g_learn(yl, yj)
                        mu[l] = mu[l] + g_learn(yl, yj)
                    else:
                        cnt += 1
                if (j + 1) % 10000 == 0:
                    sys.stderr.write(".")

            vector_sum = [0 for _ in xrange(len(nbest[0].feature_list))]
            for m, best in enumerate(nbest):
                vector_sum = vector_plus(
                    vector_sum, scale_product(mu[m], best.feature_list))

            theta = vector_plus(theta, vector_sum, opts.eta)

            # avg_theta = vector_plus(avg_theta, theta)
            # avg_cnt += 1

        sys.stderr.write("\n Non-supported vectors:  %s\n" % (cnt, ))

    # weights = [ avg / avg_cnt if avg_cnt !=0 else 1/float(arg_num) for avg in avg_theta ]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that maximize the BLEU score
    print "\n".join([str(weight) for weight in theta])
    #sys.stderr.write(opts.nbest)
    #sys.stderr.write(opts.train_en1)
    for n, line in enumerate(open(opts.nbest, 'r')):
        (i, sentence, features) = line.strip().split("|||")
        features = [float(h) for h in features.strip().split()]
        features = features  #   + align_feature + wc_feature
        (i, sentence) = (int(i), sentence.strip())
        if len(ref_en1) <= i:
            break
        # using my own bleu
        scores = tuple(
            bleu.bleu_stats_modified(
                sentence.split(),
                [ref_en1[i], ref_en2[i], ref_en3[i], ref_en4[i]]))
        smoothed_scores = bleu.smoothed_bleu(scores)

        nbests[i].append(
            translation_candidate(sentence, smoothed_scores, features, 0))
        if n % 10000 == 0:
            sys.stderr.write(".")
            sys.stderr.write("Smoothed Score...%f" % smoothed_scores)

    nbests = assign_rank(nbests)  # ranking
    sys.stderr.write("Ranking Completed ...")
    sys.stderr.write("Percpeptron Started %d" % len(nbests))

    if opts.theta is None:
        theta = [0.0 for _ in xrange(len(features))]  #initialize theta
    else:
        theta = [float(line) for line in open(opts.theta, 'r')]
Exemple #21
0
def train(nbest_candidates,
          reference_files,
          init_weights=None,
          epochs=5,
          alpha=0.04,
          tau=100,
          xi=20,
          eta=0.0001):
    """Train reranker weights with a sampled pairwise perceptron (PRO-style).

    nbest_candidates: iterable of "i ||| sentence ||| features" lines.
    reference_files:  list of paths; file k holds reference k for every sentence.
    init_weights:     optional np.array of starting weights (default: uniform).
    epochs/alpha/tau/xi/eta: training epochs, min smoothed-BLEU gap for a
    sampled pair, samples per n-best list, pairs kept per list, learning rate.

    Returns the weight vector averaged over epochs (np.array).
    """
    # initialization
    print >> sys.stderr, "Initializing training data"
    candidate = namedtuple("candidate",
                           "sentence, features, bleu, smoothed_bleu")
    refs = []
    for reference_file in reference_files:
        refs.append([line.strip().split() for line in open(reference_file)])

    nbests = []
    n = -1  # so the retrieved-count below is correct even with no input
    for n, line in enumerate(nbest_candidates):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        sentence = sentence.strip()
        features = np.array([float(h) for h in features.strip().split()])

        # Score against every reference set and keep the stats of the one
        # with the best smoothed BLEU, so the stored (bleu, smoothed_bleu)
        # pair comes from the SAME reference.  (Previously `bleu` was the
        # leftover loop value from whichever reference happened to be last.)
        best_bleu = 0.0
        max_smoothed = -float('inf')
        for ref in refs:
            stats = tuple(bleu.bleu_stats(sentence.split(), ref[i]))
            smoothed = bleu.smoothed_bleu(stats)
            if smoothed > max_smoothed:
                max_smoothed = smoothed
                best_bleu = bleu.bleu(stats)

        while len(nbests) <= i:
            nbests.append([])
        nbests[i].append(candidate(sentence, features, best_bleu,
                                   max_smoothed))

        if n % 2000 == 0:
            sys.stderr.write(".")
    # n is the last 0-based index, so n + 1 candidates were read
    print >> sys.stderr, "\nRetrieved %d candidates for %d sentences" % (
        n + 1, len(nbests))

    # set weights to default (uniform) unless the caller supplied some
    dim = len(nbests[0][0].features)
    w = init_weights if init_weights is not None else \
        np.array([1.0 / dim] * dim)
    assert len(w) == dim
    w_sum = np.zeros(dim)

    # training
    random.seed()
    for epoch in range(epochs):
        print >> sys.stderr, "Training epoch %d:" % epoch
        mistakes = 0
        for nbest in nbests:
            if len(nbest) < 2:
                continue  # cannot form a pair from a single candidate

            # Sample up to tau candidate pairs whose smoothed-BLEU gap
            # exceeds alpha, oriented as (better, worse).
            sample = []
            for _ in range(tau):
                (s1, s2) = (nbest[k]
                            for k in random.sample(range(len(nbest)), 2))
                if fabs(s1.smoothed_bleu - s2.smoothed_bleu) > alpha:
                    if s1.smoothed_bleu > s2.smoothed_bleu:
                        sample.append((s1, s2))
                    else:
                        sample.append((s2, s1))

            # keep only the xi most-separated pairs
            sample.sort(key=lambda s: s[0].smoothed_bleu - s[1].smoothed_bleu,
                        reverse=True)
            for (s1, s2) in sample[:xi]:
                if np.dot(w, s1.features) <= np.dot(w, s2.features):
                    mistakes += 1
                    # perceptron update: move toward the better candidate
                    w += eta * (s1.features - s2.features)

        w_sum += w
        print >> sys.stderr, "Number of mistakes: %d" % mistakes

    # return the per-epoch average of the weight vector
    w = w_sum / float(epochs)
    return w
Exemple #22
0
def main():
    nbests = []
    references = []
    sys.stderr.write("Reading English Sentences")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")

    sys.stderr.write("\nReading ndests")
    for j, line in enumerate(open(opts.nbest)):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        stats = list(bleu.bleu_stats(sentence, references[i]))
        # bleu_score = bleu.bleu(stats)
        smoothed_bleu_score = bleu.smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        if len(nbests) <= i:
            nbests.append([])
        # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
        nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))

        if j % 5000 == 0:
            sys.stderr.write(".")

    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization

    weights = [[] for _ in xrange(opts.epo)]
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        avg_theta = [0.0 for _ in xrange(arg_num)]
        avg_cnt = 0
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest)
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1),
                                        opts.eta)

                avg_theta = vector_plus(avg_theta, theta)
                avg_cnt += 1

        sys.stderr.write("Mistake:  %s\n" % (mistake, ))
        weights[j] = [
            avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
            for avg in avg_theta
        ]

    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that maximize the BLEU score
    # print "\n".join([str(weight) for weight in final_theta])

    bleu_score = [0 for _ in weights]
    for j, w in enumerate(weights):
        trans = []
        translation = namedtuple("translation", "english, score")
        system = []
        for i, nbest in enumerate(nbests):
            # for one sentence
            for et in nbest:
                if len(trans) <= int(i):
                    trans.append([])

                trans[int(i)].append(
                    translation(
                        et.sentence,
                        sum([x * y for x, y in zip(w, et.feature_list)])))

            for tran in trans:
                system.append(sorted(tran, key=lambda x: -x.score)[0].english)

        stats = [0 for i in xrange(10)]
        for (r, s) in zip(references, system):
            stats = [
                sum(scores) for scores in zip(stats, bleu.bleu_stats(s, r))
            ]

        bleu_score[j] = bleu.bleu(stats)

    idx = [
        i for i, bscore in enumerate(bleu_score) if bscore == max(bleu_score)
    ][0]
    sys.stderr.write("Maximum BLEU score of training data is: {}\n".format(
        max(bleu_score)))
    sys.stderr.write("Corresponding weights are: {}\n".format(" ".join(
        [str(w) for w in weights[idx]])))
    print "\n".join([str(weight) for weight in weights[idx]])