def computeBleu(system, reference):
    """Return the smoothed BLEU score of `system` against `reference`.

    The original seeded a list of ten zeros and element-wise summed it
    with the statistics of this single (system, reference) pair; that
    sum is an identity, so the stats are used directly.
    """
    # bleu.bleu_stats yields the 10 sufficient statistics for BLEU.
    stats = list(bleu.bleu_stats(system, reference))
    return bleu.smoothed_bleu(stats)
def main(opts, references, input_nbest, theta0=None):
    """Train reranker weights with an averaged pairwise-ranking perceptron.

    opts        -- options object providing epo (epochs), xi (pairs kept per
                   sentence), eta (learning rate).
    references  -- reference translations indexed by sentence id.
    input_nbest -- iterable of "i ||| sentence ||| features" lines.
    theta0      -- optional initial weight vector; defaults to uniform.

    Returns the averaged weights joined with newlines, one weight per line.
    """
    entry = namedtuple("entry", "sentence, smoothed_bleu, feature_list")
    # Build one scored n-best list per source sentence. (The original
    # wrapped this in a vacuous `nbests = None` / `if nbests is None:`
    # guard that was always true; removed as dead code.)
    nbests = []
    sys.stderr.write("No nbests on disk, so calculating ndests ... \n")
    for j, line in enumerate(input_nbest):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        stats = list(bleu.bleu_stats(sentence, references[i]))
        smoothed_bleu_score = bleu.smoothed_bleu(stats)
        # Feature string -> list of floats.
        feature_list = [float(x) for x in features.split()]
        if len(nbests) <= i:
            nbests.append([])
        nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
        if j % 5000 == 0:
            sys.stderr.write(".")
    arg_num = len(nbests[0][0].feature_list)
    theta = theta0
    if theta is None:
        theta = [1.0 / arg_num for _ in xrange(arg_num)]  # uniform initialization
    avg_theta = [0.0 for _ in xrange(arg_num)]  # running sum for averaging
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest, opts)
            # Largest BLEU gap first; train on the top opts.xi pairs.
            sample.sort(key=lambda p: p[0].smoothed_bleu - p[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Perceptron update when the lower-BLEU candidate scores
                # at least as high under the current weights.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
            avg_theta = vector_plus(avg_theta, theta)
            avg_cnt += 1
        sys.stderr.write("Mistake: %s\n" % (mistake,))
    weights = [avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
               for avg in avg_theta]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # NOTE(review): the original trailing comment says to return the weights
    # that maximize BLEU, but the code returns the averaged weights;
    # behavior preserved as-is.
    return "\n".join([str(weight) for weight in weights])
def main():
    """Train per-coordinate perceptron weights over n-best lists and print them."""
    nbests = defaultdict(list)
    references = {}
    for i, line in enumerate(open(opts.en)):
        ''' Initialize references to correct english sentences '''
        references[i] = line
    for line in open(opts.nbest):
        (i, sentence, features) = line.strip().split("|||")
        stats = list(bleu_stats(sentence, references[int(i)]))
        bleu_score = bleu(stats)
        smoothed_bleu_score = smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        nbests[int(i)].append(
            (sentence, bleu_score, smoothed_bleu_score, feature_list))
    theta = [1.0 / 6 for _ in xrange(6)]  #initialization
    for i in range(0, opts.epo):
        mistake = 0
        for nbest in nbests:
            # nbests is a dict, so `nbest` here is a sentence id (key).
            sample = get_sample(nbests[nbest])
            # Sort pairs by the gap in smoothed BLEU (tuple slot 2), largest first.
            sample.sort(key=lambda i: i[0][2] - i[1][2], reverse=True)
            # NOTE(review): this loop reuses `i`, shadowing the epoch counter;
            # harmless here because the outer `i` is not read afterwards.
            for i in range(0, min(len(sample), opts.xi)):
                for j in range(0, 6):
                    # Per-coordinate update; tuple slot 3 is the feature list.
                    if theta[j] * sample[i][0][3][j] <= theta[j] * sample[i][1][3][j]:
                        mistake = mistake + 1
                        theta[j] = theta[j] + opts.eta * (
                            sample[i][0][3][j] - sample[i][1][3][j])
        sys.stderr.write("Mistake: %s\n" % (mistake, ))
    print "\n".join([str(weight) for weight in theta])
def main(): nbests = defaultdict(list) references = {} for i, line in enumerate(open(opts.en)): ''' Initialize references to correct english sentences ''' references[i] = line for line in open(opts.nbest): (i, sentence, features) = line.strip().split("|||") stats = list(bleu_stats(sentence, references[int(i)])) bleu_score = bleu(stats) smoothed_bleu_score = smoothed_bleu(stats) # making the feature string to float list feature_list = [float(x) for x in features.split()] nbests[int(i)].append((sentence, bleu_score, smoothed_bleu_score, feature_list)) theta = [1.0/6 for _ in xrange(6)] #initialization for i in range(0, opts.epo): mistake = 0; for nbest in nbests: sample = get_sample(nbests[nbest]) sample.sort(key=lambda i: i[0][2] - i[1][2], reverse=True) for i in range(0, min(len(sample), opts.xi)): for j in range(0, 6): if theta[j] * sample[i][0][3][j] <= theta[j] * sample[i][1][3][j]: mistake = mistake + 1 theta[j] = theta[j] + opts.eta * (sample[i][0][3][j] - sample[i][1][3][j]) sys.stderr.write("Mistake: %s\n" % (mistake,)) print "\n".join([str(weight) for weight in theta])
def main():
    """Averaged pairwise-ranking perceptron with an on-disk n-best cache.

    Reads references from opts.en, loads (or computes and saves) the scored
    n-best lists, trains for opts.epo epochs, and prints the averaged
    weight vector, one weight per line.
    """
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")
    sys.stderr.write("\nTry reading nbests datastructure from disk ... \n")
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        # Cache miss: score every candidate, then persist for the next run.
        nbests = []
        sys.stderr.write("No nbests on disk, so calculating ndests ... \n")
        for j, line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # making the feature string to float list
            feature_list = [float(x) for x in features.split()]
            if len(nbests) <= i:
                nbests.append([])
            nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
            if j % 5000 == 0:
                sys.stderr.write(".")
        write_ds_to_file(nbests, opts.nbestDS)
    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization
    avg_theta = [0.0 for _ in xrange(arg_num)]  # running sum for averaging
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest)
            # Largest smoothed-BLEU gap first; train on the top opts.xi pairs.
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Update toward (v1 - v2) when the pair is mis-ranked.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
            avg_theta = vector_plus(avg_theta, theta)
            avg_cnt += 1
        sys.stderr.write("Mistake: %s\n" % (mistake,))
    weights = [avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
               for avg in avg_theta]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that
    # maximize the BLEU score
    print "\n".join([str(weight) for weight in weights])
def bleu(self, hyp, ref):
    """Smoothed BLEU of `hyp` against `ref`, both truncated at the end token."""
    def _truncate(tokens):
        # Keep everything before the first end-of-sequence marker, if any.
        seq = list(tokens)
        if self._end_token in seq:
            return seq[:seq.index(self._end_token)]
        return seq
    return smoothed_bleu(_truncate(hyp), _truncate(ref))
def bleu(self, hyp, ref):
    """Compute smoothed BLEU, ignoring tokens after the end-of-sentence marker."""
    end = self._end_token
    hyp = list(hyp)
    ref = list(ref)
    try:
        hyp = hyp[:hyp.index(end)]
    except ValueError:
        pass  # no end token: score the full hypothesis
    try:
        ref = ref[:ref.index(end)]
    except ValueError:
        pass  # no end token: use the full reference
    return smoothed_bleu(hyp, ref)
def run(self, data_x):
    """Run the model on a batch, attach batch-mean smoothed BLEU, return costs."""
    output_vars = self.compute(*data_x)
    _, _, tgt_tokens, tgt_masks = data_x

    def _sentence_bleu(idx):
        # The mask sum gives the unpadded target length for row idx.
        n = int(tgt_masks[idx].sum())
        return smoothed_bleu(output_vars.outputs[idx, :n], tgt_tokens[idx, :n])

    scores = [_sentence_bleu(row) for row in range(tgt_tokens.shape[0])]
    output_vars.bleu = numpy.mean(scores)
    if self._criteria == 'mixed':
        # Combined objective: model cost minus the BLEU reward.
        output_vars.mixed = output_vars.cost - output_vars.bleu
    return self._extract_costs(output_vars)
def main():
    """Pairwise-ranking perceptron over n-best lists; prints the final weights."""
    nbests = []
    references = []
    sys.stderr.write("Reading English Sentences")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")
    sys.stderr.write("\nReading ndests")
    for j, line in enumerate(open(opts.nbest)):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        stats = list(bleu_stats(sentence, references[i]))
        bleu_score = bleu(stats)
        smoothed_bleu_score = smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        if len(nbests) <= i:
            nbests.append([])
        nbests[i].append(
            entry(sentence, bleu_score, smoothed_bleu_score, feature_list))
        if j % 5000 == 0:
            sys.stderr.write(".")
    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization
    sys.stderr.write("\nTraining...\n")
    for i in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest)
            # Largest smoothed-BLEU gap first; train on the top opts.xi pairs.
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Update theta toward (v1 - v2) when the pair is mis-ranked.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
        sys.stderr.write("Mistake: %s\n" % (mistake, ))
    print "\n".join([str(weight) for weight in theta])
def main(): nbests = [] references = [] sys.stderr.write("Reading English Sentences") for i, line in enumerate(open(opts.en)): '''Initialize references to correct english sentences''' references.append(line) if i%100 == 0: sys.stderr.write(".") sys.stderr.write("\nReading ndests") for j,line in enumerate(open(opts.nbest)): (i, sentence, features) = line.strip().split("|||") i = int(i) stats = list(bleu_stats(sentence, references[i])) # bleu_score = bleu(stats) smoothed_bleu_score = smoothed_bleu(stats) # making the feature string to float list feature_list = [float(x) for x in features.split()] if j == 10: break if len(nbests)<=i: nbests.append([]) # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list)) nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list)) if j%5000 == 0: sys.stderr.write(".") arg_num = len(nbests[0][0].feature_list) theta = [1.0/arg_num for _ in xrange(arg_num)] #initialization avg_theta = [ 0 for _ in xrange(arg_num)] avg_cnt = 0 sys.stderr.write("\nTraining...\n") for i in xrange(opts.epo): mistake = 0; for nbest in nbests: sample = get_sample(nbest) sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu, reverse=True) for i in xrange(min(len(sample), opts.xi)): v1 = sample[i][0].feature_list v2 = sample[i][1].feature_list if dot_product(theta, v1) <= dot_product(theta, v2): mistake += 1 theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta) avg_theta = vector_plus(avg_theta, theta) avg_cnt += 1 sys.stderr.write("Mistake: %s\n" % (mistake,)) final_theta = [ t / avg_cnt for t in avg_theta] print "\n".join([str(weight) for weight in final_theta])
def get_nbest(nbest, source, target):
    """Read an n-best file and score every candidate with smoothed BLEU.

    Returns one list of (features, smoothed_bleu) namedtuples per target
    sentence, indexed by the sentence id in the n-best file.
    """
    translation = namedtuple("translation", "features, smoothed_bleu")
    src = [line.strip().split() for line in open(source).readlines()]
    ref = [line.strip().split() for line in open(target).readlines()]
    entries = [line.strip().split("|||") for line in open(nbest).readlines()]
    nbests = [[] for _ in ref]
    original_feature_count = 0
    sys.stderr.write("Computing smoothed bleu...")
    for raw_id, raw_sentence, raw_features in entries:
        sent_id = int(raw_id)
        tokens = raw_sentence.strip().split()
        feats = [float(f) for f in raw_features.strip().split()]
        # Sufficient statistics against this sentence's reference.
        stats = tuple(bleu.bleu_stats(tokens, ref[sent_id]))
        nbests[sent_id].append(translation(feats, bleu.smoothed_bleu(stats)))
    return nbests
optparser.add_option("-r", "--reference", dest="reference",
                     default="dev/all.cn-en.en0",
                     help="English reference sentences")
(opts, _) = optparser.parse_args()
# Tokenized references, capped at opts.length sentences.
ref = [(line.strip().split()) for line in open(opts.reference).readlines()]
ref = ref[:int(opts.length)]
nbests = []
for n, line in enumerate(open(opts.nbest)):
    (i, sentence, features) = line.strip().split("|||")
    (i, sentence) = (int(i), sentence.strip())
    features = [float(h) for h in features.strip().split()]
    if len(ref) <= i:
        break
    while len(nbests) <= i:
        nbests.append([])
    scores = tuple(bleu_stats(sentence.split(), ref[i]))
    bleu_scores = smoothed_bleu(scores)
    # NOTE(review): inverse_scores is computed but never used in this chunk.
    inverse_scores = tuple([-x for x in scores])
    nbests[i].append(translation_candidate(sentence, scores, features, bleu_scores))
    if n % 2000 == 0:
        sys.stderr.write(".")
sys.stderr.write("\n")
# Sort each n-best list by smoothed BLEU, best candidate first.
for i in xrange(len(nbests)):
    nbests[i] = sorted(nbests[i], key=lambda h: h.smoothed_bleu)
    nbests[i] = nbests[i][::-1]
# Training hyperparameters.
num_features = 5
w = [float(1) / 5] * num_features  # uniform starting weights
updates = [0] * num_features
k = 5
r = 5
margin = 0.2
# In[150]: nbests = [] for n, line in enumerate(open(opts.nbest)): (i, sentence, features) = line.strip().split("|||") (i, sentence) = (int(i), sentence.strip()) features = np.array([float(it) for it in features.split()]) if len(ref) <= i: break while len(nbests) <= i: nbests.append([]) scores = tuple(bleu.bleu_stats(sentence.split(), ref[i])) inverse_scores = tuple([-x for x in scores]) smoothed_score = bleu.smoothed_bleu(inverse_scores) nbests[i].append((translation_candidate(sentence, inverse_scores, features), smoothed_score)) if n % 2000 == 0: sys.stderr.write(".") # small size for testing, delete it when release # if n > 4000: # break # In[151]: tau = 5000 alpha = 0.1
cnt = 0 #count # of sentence #we can run the first part for only once and save it. ###1st part,compute blue score for each candidate translation. for line in open(opts.nbest): cnt = cnt + 1 #print '{0}\r'.format("\rIteration: %d/%d." %(cnt, 432303)), (i, sentence, features) = line.strip().split("|||") if len(nbests) <= int(i): nbests.append([]) features = [float(h) for h in features.strip().split()] stats = [0 for kk in xrange(10)] #code from score-reranker.py stats = [ sum(scores) for scores in zip( stats, bleu.bleu_stats(sentence.strip().split(), ref[int(i)])) ] score = bleu.smoothed_bleu(stats) nbests[int(i)].append(candidate(sentence.strip(), features, score)) cPickle.dump(nbests, open( 'my_nbests_add.p', 'wb')) #save the result. no need to run the first part each time #print "finished calculating nbests." nbests = cPickle.load(open('my_nbests_add.p', 'rb')) #load pickled file #2nd part,learn the optimal weight epochs = 20 #setup parameters mentioned in pseudocode tau_maxsize = 100 #5000 xi = 10 #50 tau = [] alpha = 0.05 eta = 0.1 theta = [1.0 / num_features
def main():
    """Train per-epoch averaged weights, then print the epoch's weights that
    achieve the highest corpus BLEU on the training data."""
    nbests = []
    references = []
    sys.stderr.write("Reading English Sentences")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")
    sys.stderr.write("\nReading ndests")
    for j, line in enumerate(open(opts.nbest)):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        stats = list(bleu.bleu_stats(sentence, references[i]))
        smoothed_bleu_score = bleu.smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        if len(nbests) <= i:
            nbests.append([])
        nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
        if j % 5000 == 0:
            sys.stderr.write(".")
    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization
    weights = [[] for _ in xrange(opts.epo)]  # averaged weights per epoch
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        # Averaging accumulators are reset each epoch here (unlike the
        # globally-averaged variants elsewhere in this file).
        avg_theta = [0.0 for _ in xrange(arg_num)]
        avg_cnt = 0
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest)
            # Largest smoothed-BLEU gap first; train on the top opts.xi pairs.
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
            avg_theta = vector_plus(avg_theta, theta)
            avg_cnt += 1
        sys.stderr.write("Mistake: %s\n" % (mistake,))
        weights[j] = [avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
                      for avg in avg_theta]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that
    # maximize the BLEU score
    # Rescore the training data with each epoch's weights and keep the best.
    bleu_score = [0 for _ in weights]
    for j, w in enumerate(weights):
        trans = []
        translation = namedtuple("translation", "english, score")
        system = []
        for i, nbest in enumerate(nbests):
            # for one sentence
            for et in nbest:
                if len(trans) <= int(i):
                    trans.append([])
                trans[int(i)].append(translation(
                    et.sentence,
                    sum([x * y for x, y in zip(w, et.feature_list)])))
        for tran in trans:
            # 1-best candidate under this weight vector.
            system.append(sorted(tran, key=lambda x: -x.score)[0].english)
        stats = [0 for i in xrange(10)]
        for (r, s) in zip(references, system):
            stats = [sum(scores) for scores in zip(stats, bleu.bleu_stats(s, r))]
        bleu_score[j] = bleu.bleu(stats)
    idx = [i for i, bscore in enumerate(bleu_score) if bscore == max(bleu_score)][0]
    sys.stderr.write("Maximum BLEU score of training data is: {}\n".format(
        max(bleu_score)))
    sys.stderr.write("Corresponding weights are: {}\n".format(
        " ".join([str(w) for w in weights[idx]])))
    print "\n".join([str(weight) for weight in weights[idx]])
def main():
    """Margin-based pairwise trainer (top-r vs bottom-k candidates); prints theta."""
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")
    sys.stderr.write("\nTry reading %s from disk ... \n" % opts.nbestDS)
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        # Cache miss: score every candidate and persist the structure.
        nbests = []
        sys.stderr.write("%s is not on disk, so calculating it ... \n" % opts.nbestDS)
        for j, line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # making the feature string to float list
            feature_list = [float(x) for x in features.split()]
            if len(nbests) <= i:
                nbests.append([])
            nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
            if j % 5000 == 0:
                sys.stderr.write(".")
        sys.stderr.write("\nWriting %s to disk ... \n" % opts.nbestDS)
        write_ds_to_file(nbests, opts.nbestDS)
        sys.stderr.write("Finish writing %s\n" % opts.nbestDS)
    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization
    tau = opts.tau  # positive learning margin
    sys.stderr.write("\nTraining...\n")
    for iter_num in xrange(opts.epo):
        sys.stderr.write("\nIteration#{} ".format(iter_num + 1))
        cnt = 0  # pairs that trigger no update ("non-supported vectors")
        # sentence wise updating
        for i, nbest in enumerate(nbests):
            # Candidates sorted by smoothed BLEU, best first.
            y = sorted(nbest, key=lambda h: h.smoothed_bleu, reverse=True)
            mu = [0.0] * len(nbest)
            w_times_x = [0.0] * len(nbest)
            for j, best in enumerate(nbest):
                # calculate linear function result
                w_times_x[j] = dot_product(theta, best.feature_list)
            # processing pairs
            top_r = int(len(y) * opts.r)
            bottom_k = int(len(y) * opts.k)
            # NOTE(review): when bottom_k == 0, y[-bottom_k] is y[0] (the best
            # candidate), which may not be the intended threshold — confirm.
            for j in xrange(len(nbest) - 1):
                for l in xrange(j + 1, len(nbest)):
                    if nbest[j].smoothed_bleu <= y[top_r].smoothed_bleu \
                            and nbest[l].smoothed_bleu >= y[-bottom_k].smoothed_bleu \
                            and w_times_x[j] > w_times_x[l] + tau:
                        mu[j] = mu[j] + 1
                        mu[l] = mu[l] - 1
                    elif nbest[j].smoothed_bleu >= y[-bottom_k].smoothed_bleu \
                            and nbest[l].smoothed_bleu <= y[top_r].smoothed_bleu \
                            and w_times_x[j] > w_times_x[l] - tau:
                        mu[j] = mu[j] - 1
                        mu[l] = mu[l] + 1
                    else:
                        cnt += 1
                if (j + 1) % 100 == 0:
                    sys.stderr.write(".")
            # Apply the accumulated pair votes as one weighted feature sum.
            vector_sum = [0 for _ in xrange(len(nbest[0].feature_list))]
            for m, best in enumerate(nbest):
                vector_sum = vector_plus(vector_sum, scale_product(mu[m], best.feature_list))
            theta = vector_plus(theta, vector_sum, opts.eta)
        sys.stderr.write("\n Non-supported vectors: %s\n" % (cnt,))
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that
    # maximize the BLEU score
    print "\n".join([str(weight) for weight in theta])
def main():
    """Averaged pairwise-ranking perceptron with an on-disk n-best cache;
    prints the averaged weight vector, one weight per line."""
    references = []
    sys.stderr.write("Reading English Sentences\n")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")
    sys.stderr.write("\nTry reading nbests datastructure from disk ... \n")
    nbests = read_ds_from_file(opts.nbestDS)
    if nbests is None:
        # Cache miss: score every candidate, then persist for the next run.
        nbests = []
        sys.stderr.write("No nbests on disk, so calculating ndests ... \n")
        for j, line in enumerate(open(opts.nbest)):
            (i, sentence, features) = line.strip().split("|||")
            i = int(i)
            stats = list(bleu.bleu_stats(sentence, references[i]))
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            # making the feature string to float list
            feature_list = [float(x) for x in features.split()]
            if len(nbests) <= i:
                nbests.append([])
            nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
            if j % 5000 == 0:
                sys.stderr.write(".")
        write_ds_to_file(nbests, opts.nbestDS)
    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization
    avg_theta = [0.0 for _ in xrange(arg_num)]  # running sum for averaging
    avg_cnt = 0
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest)
            # Largest smoothed-BLEU gap first; train on the top opts.xi pairs.
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                # Update toward (v1 - v2) when the pair is mis-ranked.
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
            avg_theta = vector_plus(avg_theta, theta)
            avg_cnt += 1
        sys.stderr.write("Mistake: %s\n" % (mistake, ))
    weights = [
        avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
        for avg in avg_theta
    ]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # instead of print the averaged-out weights, print the weights that
    # maximize the BLEU score
    print "\n".join([str(weight) for weight in weights])
help="English reference sentences")
# NOTE(review): the opening `optparser.add_option(` for the fragment above
# lies outside this chunk; reproduced verbatim.
(opts, _) = optparser.parse_args()
# Tokenized references, capped at opts.length sentences.
ref = [(line.strip().split()) for line in open(opts.reference).readlines()]
ref = ref[:int(opts.length)]
nbests = []
for n, line in enumerate(open(opts.nbest)):
    (i, sentence, features) = line.strip().split("|||")
    (i, sentence) = (int(i), sentence.strip())
    features = [float(h) for h in features.strip().split()]
    if len(ref) <= i:
        break
    while len(nbests) <= i:
        nbests.append([])
    scores = tuple(bleu_stats(sentence.split(), ref[i]))
    bleu_scores = smoothed_bleu(scores)
    # NOTE(review): inverse_scores is computed but never used in this chunk.
    inverse_scores = tuple([-x for x in scores])
    nbests[i].append(
        translation_candidate(sentence, scores, features, bleu_scores))
    if n % 2000 == 0:
        sys.stderr.write(".")
sys.stderr.write("\n")
# Sort each n-best list by smoothed BLEU, best candidate first.
for i in xrange(len(nbests)):
    nbests[i] = sorted(nbests[i], key=lambda h: h.smoothed_bleu)
    nbests[i] = nbests[i][::-1]
# Training hyperparameters.
num_features = 5
w = [float(1) / 5] * num_features  # uniform starting weights
updates = [0] * num_features
k = 5
r = 5
def main(): references = [] sys.stderr.write("Reading English Sentences\n") for i, line in enumerate(open(opts.en)): '''Initialize references to correct english sentences''' references.append(line) if i % 100 == 0: sys.stderr.write(".") sys.stderr.write("\nTry reading %s from disk ... \n" % opts.nbestDS) nbests = read_ds_from_file(opts.nbestDS) if nbests is None: nbests = [] sys.stderr.write("%s is not on disk, so calculating it ... \n" % opts.nbestDS) for j, line in enumerate(open(opts.nbest)): (i, sentence, features) = line.strip().split("|||") i = int(i) stats = list(bleu.bleu_stats(sentence, references[i])) # bleu_score = bleu.bleu(stats) smoothed_bleu_score = bleu.smoothed_bleu(stats) # making the feature string to float list feature_list = [float(x) for x in features.split()] if len(nbests) <= i: nbests.append([]) # nbests[i].append(entry(sentence, bleu_score, smoothed_bleu_score, feature_list)) nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list)) if j % 5000 == 0: sys.stderr.write(".") sys.stderr.write("\nWriting %s to disk ... 
\n" % opts.nbestDS) write_ds_to_file(nbests, opts.nbestDS) sys.stderr.write("Finish writing %s\n" % opts.nbestDS) arg_num = len(nbests[0][0].feature_list) theta = [1.0 / arg_num for _ in xrange(arg_num)] #initialization # avg_theta = [ 0.0 for _ in xrange(arg_num)] # avg_cnt = 0 tau = opts.tau # positive learning margin sys.stderr.write("\nTraining...\n") for iter_num in xrange(opts.epo): sys.stderr.write("\nIteration#{} ".format(iter_num + 1)) cnt = 0 # sentence wise updating for i, nbest in enumerate(nbests): y = sorted(nbest, key=lambda h: h.smoothed_bleu, reverse=True) mu = [0.0] * len(nbest) w_times_x = [0.0] * len(nbest) for j, best in enumerate(nbest): # calculate linear function result w_times_x[j] = dot_product(theta, best.feature_list) # processing pairs top_r = int(len(y) * opts.r) bottom_k = int(len(y) * opts.k) for j in xrange(len(nbest) - 1): for l in xrange(j + 1, len(nbest)): yj = nbest[j].smoothed_bleu yl = nbest[l].smoothed_bleu if yj < yl \ and dist(yj, yl) > opts.epsilon \ and w_times_x[j] - w_times_x[l] < g_learn(yj, yl)*tau: mu[j] = mu[j] + g_learn(yj, yl) mu[l] = mu[l] - g_learn(yj, yl) elif yj > yl \ and dist(yj, yl) > opts.epsilon \ and w_times_x[l] - w_times_x[y] < g_learn(yl, yj)*tau: mu[j] = mu[j] - g_learn(yl, yj) mu[l] = mu[l] + g_learn(yl, yj) else: cnt += 1 if (j + 1) % 10000 == 0: sys.stderr.write(".") vector_sum = [0 for _ in xrange(len(nbest[0].feature_list))] for m, best in enumerate(nbest): vector_sum = vector_plus( vector_sum, scale_product(mu[m], best.feature_list)) theta = vector_plus(theta, vector_sum, opts.eta) # avg_theta = vector_plus(avg_theta, theta) # avg_cnt += 1 sys.stderr.write("\n Non-supported vectors: %s\n" % (cnt, )) # weights = [ avg / avg_cnt if avg_cnt !=0 else 1/float(arg_num) for avg in avg_theta ] sys.stderr.write("Computing best BLEU score and outputing...\n") # instead of print the averaged-out weights, print the weights that maximize the BLEU score print "\n".join([str(weight) for weight in theta])
#sys.stderr.write(opts.nbest) #sys.stderr.write(opts.train_en1) for n, line in enumerate(open(opts.nbest, 'r')): (i, sentence, features) = line.strip().split("|||") features = [float(h) for h in features.strip().split()] features = features # + align_feature + wc_feature (i, sentence) = (int(i), sentence.strip()) if len(ref_en1) <= i: break # using my own bleu scores = tuple( bleu.bleu_stats_modified( sentence.split(), [ref_en1[i], ref_en2[i], ref_en3[i], ref_en4[i]])) smoothed_scores = bleu.smoothed_bleu(scores) nbests[i].append( translation_candidate(sentence, smoothed_scores, features, 0)) if n % 10000 == 0: sys.stderr.write(".") sys.stderr.write("Smoothed Score...%f" % smoothed_scores) nbests = assign_rank(nbests) # ranking sys.stderr.write("Ranking Completed ...") sys.stderr.write("Percpeptron Started %d" % len(nbests)) if opts.theta is None: theta = [0.0 for _ in xrange(len(features))] #initialize theta else: theta = [float(line) for line in open(opts.theta, 'r')]
def train(nbest_candidates, reference_files, init_weights=None,
          epochs=5, alpha=0.04, tau=100, xi=20, eta=0.0001):
    """Train reranker weights with a sampled pairwise (PRO-style) perceptron.

    nbest_candidates -- iterable of "i ||| sentence ||| features" lines.
    reference_files  -- paths to reference translation files (multi-reference).
    init_weights     -- optional starting weight vector (numpy array).
    epochs, alpha, tau, xi, eta -- passes over the data, minimum smoothed-BLEU
        gap for a sampled pair, samples drawn per sentence, pairs kept per
        sentence, and learning rate.

    Returns the weight vector averaged over epochs.
    """
    # initialization
    print >> sys.stderr, "Initializing training data"
    candidate = namedtuple("candidate", "sentence, features, bleu, smoothed_bleu")
    refs = []
    for reference_file in reference_files:
        refs.append([line.strip().split() for line in open(reference_file)])
    nbests = []
    for n, line in enumerate(nbest_candidates):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        sentence = sentence.strip()
        features = np.array([float(h) for h in features.strip().split()])
        # calculate bleu score and smoothed bleu score
        max_bleu_score = -float('inf')
        for ref in refs:
            stats = tuple(bleu.bleu_stats(sentence.split(), ref[i]))
            bleu_score = bleu.bleu(stats)
            smoothed_bleu_score = bleu.smoothed_bleu(stats)
            max_bleu_score = max(max_bleu_score, smoothed_bleu_score)
        while len(nbests) <= i:
            nbests.append([])
        # NOTE(review): `bleu_score` stored below comes from the LAST
        # reference only, while the smoothed score is the max over all
        # references — confirm this asymmetry is intended.
        nbests[i].append(candidate(sentence, features, bleu_score, max_bleu_score))
        if n % 2000 == 0:
            sys.stderr.write(".")
    print >> sys.stderr, "\nRetrieved %d candidates for %d sentences" % (
        n, len(nbests))
    # set weights to default
    w = init_weights if init_weights is not None else \
        np.array([1.0/len(nbests[0][0].features)] * len(nbests[0][0].features))
    assert len(w) == len(nbests[0][0].features)
    w_sum = np.zeros(len(nbests[0][0].features))
    # training
    random.seed()
    for i in range(epochs):
        print >> sys.stderr, "Training epoch %d:" % i
        mistakes = 0
        for nbest in nbests:
            if len(nbest) < 2:
                continue
            # Sample candidate pairs whose smoothed-BLEU gap exceeds alpha,
            # ordered so the better translation comes first.
            sample = []
            for j in range(tau):
                (s1, s2) = (nbest[k] for k in random.sample(range(len(nbest)), 2))
                if fabs(s1.smoothed_bleu - s2.smoothed_bleu) > alpha:
                    if s1.smoothed_bleu > s2.smoothed_bleu:
                        sample.append((s1, s2))
                    else:
                        sample.append((s2, s1))
                else:
                    continue
            sample.sort(key=lambda s: s[0].smoothed_bleu - s[1].smoothed_bleu,
                        reverse=True)
            for (s1, s2) in sample[:xi]:
                # Perceptron update when the worse candidate outscores the better.
                if np.dot(w, s1.features) <= np.dot(w, s2.features):
                    mistakes += 1
                    w += eta * (s1.features - s2.features)  # this is vector addition!
        # NOTE(review): indentation of the accumulation below is ambiguous in
        # the mangled source; placed at epoch level, consistent with the
        # final division by `epochs`.
        w_sum += w
        print >> sys.stderr, "Number of mistakes: %d" % mistakes
    w = w_sum / float(epochs)
    return w
def main():
    """Train per-epoch averaged weights, then print the epoch's weights that
    achieve the highest corpus BLEU on the training data."""
    nbests = []
    references = []
    sys.stderr.write("Reading English Sentences")
    for i, line in enumerate(open(opts.en)):
        '''Initialize references to correct english sentences'''
        references.append(line)
        if i % 100 == 0:
            sys.stderr.write(".")
    sys.stderr.write("\nReading ndests")
    for j, line in enumerate(open(opts.nbest)):
        (i, sentence, features) = line.strip().split("|||")
        i = int(i)
        stats = list(bleu.bleu_stats(sentence, references[i]))
        smoothed_bleu_score = bleu.smoothed_bleu(stats)
        # making the feature string to float list
        feature_list = [float(x) for x in features.split()]
        if len(nbests) <= i:
            nbests.append([])
        nbests[i].append(entry(sentence, smoothed_bleu_score, feature_list))
        if j % 5000 == 0:
            sys.stderr.write(".")
    arg_num = len(nbests[0][0].feature_list)
    theta = [1.0 / arg_num for _ in xrange(arg_num)]  #initialization
    weights = [[] for _ in xrange(opts.epo)]  # averaged weights per epoch
    sys.stderr.write("\nTraining...\n")
    for j in xrange(opts.epo):
        # Averaging accumulators reset each epoch for this variant.
        avg_theta = [0.0 for _ in xrange(arg_num)]
        avg_cnt = 0
        mistake = 0
        for nbest in nbests:
            sample = get_sample(nbest)
            # Largest smoothed-BLEU gap first; train on the top opts.xi pairs.
            sample.sort(key=lambda i: i[0].smoothed_bleu - i[1].smoothed_bleu,
                        reverse=True)
            for i in xrange(min(len(sample), opts.xi)):
                v1 = sample[i][0].feature_list
                v2 = sample[i][1].feature_list
                if dot_product(theta, v1) <= dot_product(theta, v2):
                    mistake += 1
                    theta = vector_plus(theta, vector_plus(v1, v2, -1), opts.eta)
            avg_theta = vector_plus(avg_theta, theta)
            avg_cnt += 1
        sys.stderr.write("Mistake: %s\n" % (mistake, ))
        weights[j] = [
            avg / avg_cnt if avg_cnt != 0 else 1 / float(arg_num)
            for avg in avg_theta
        ]
    sys.stderr.write("Computing best BLEU score and outputing...\n")
    # Rescore the training data with each epoch's weights and keep the best.
    bleu_score = [0 for _ in weights]
    for j, w in enumerate(weights):
        trans = []
        translation = namedtuple("translation", "english, score")
        system = []
        for i, nbest in enumerate(nbests):
            # for one sentence
            for et in nbest:
                if len(trans) <= int(i):
                    trans.append([])
                trans[int(i)].append(
                    translation(
                        et.sentence,
                        sum([x * y for x, y in zip(w, et.feature_list)])))
        for tran in trans:
            # 1-best candidate under this weight vector.
            system.append(sorted(tran, key=lambda x: -x.score)[0].english)
        stats = [0 for i in xrange(10)]
        for (r, s) in zip(references, system):
            stats = [
                sum(scores) for scores in zip(stats, bleu.bleu_stats(s, r))
            ]
        bleu_score[j] = bleu.bleu(stats)
    idx = [
        i for i, bscore in enumerate(bleu_score) if bscore == max(bleu_score)
    ][0]
    sys.stderr.write("Maximum BLEU score of training data is: {}\n".format(
        max(bleu_score)))
    sys.stderr.write("Corresponding weights are: {}\n".format(" ".join(
        [str(w) for w in weights[idx]])))
    print "\n".join([str(weight) for weight in weights[idx]])