import numpy as np
from nltk.corpus import comtrans


def get_translated_corpus_stats(langs=['EN-FR']):
    # Validate languages and get the corresponding file IDs
    # (validate_langs_and_get_fileids is a helper defined elsewhere in this project)
    fileids = validate_langs_and_get_fileids(langs)
    # Get aligned sentences
    align_sents = comtrans.aligned_sents(fileids=fileids)
    # Get character-level parallel corpus
    sources = []
    targets = []
    for a_sent in align_sents:
        tmp_src = [ord(ch) for ch in ' '.join(a_sent.words)]
        tmp_tgt = [ord(ch) for ch in ' '.join(a_sent.mots)]
        sources.append(tmp_src)
        targets.append(tmp_tgt)
    # Get lengths
    src_lens = []
    tgt_lens = []
    for src, tgt in zip(sources, targets):
        src_lens.append(len(src))
        tgt_lens.append(len(tgt))
    # Return [median, mean, std] stats as computed by numpy
    np_src = np.array(src_lens)
    src_stats = [np.median(np_src), np.mean(np_src), np.std(np_src)]
    np_tgt = np.array(tgt_lens)
    tgt_stats = [np.median(np_tgt), np.mean(np_tgt), np.std(np_tgt)]
    return src_stats, tgt_stats

def get_translated_corpus_chars(langs=['EN-FR'], min_len=75, max_len=250):
    # Validate languages and get the corresponding file IDs
    fileids = validate_langs_and_get_fileids(langs)
    # Validate min_len
    if min_len > max_len:
        raise ValueError(
            "'min_len' must not be greater than 'max_len'."
        )
    # Load the desired parallel corpus(es)
    align_sents = comtrans.aligned_sents(fileids=fileids)
    # Make a character-level parallel corpus
    all_bytes = []
    sources = []
    targets = []
    for a_sent in align_sents:
        tmp_src = [ord(ch) for ch in ' '.join(a_sent.words)]
        tmp_tgt = [ord(ch) for ch in ' '.join(a_sent.mots)]
        sources.append(tmp_src)
        targets.append(tmp_tgt)
        new_bytes = np.unique(tmp_src + tmp_tgt)
        for b in new_bytes:
            if b not in all_bytes:
                all_bytes.append(b)
    # Translate all possible bytes into sequential values
    index2char, char2index = get_mapping_items(all_bytes)
    # Remove short and long sentences
    src = []
    tgt = []
    for s, t in zip(sources, targets):
        if min_len <= len(s) < max_len and min_len <= len(t) < max_len:
            src.append(s)
            tgt.append(t)
    # Convert char bytes to encoded lists of sequential indices
    for i in range(len(src)):
        src[i] = [char2index[ch] for ch in src[i]]
        tgt[i] = [char2index[ch] for ch in tgt[i]]
    # Add <EOS> to the end of each sentence, then pad to "max_len" length
    for i in range(len(src)):
        src[i] += [1]
        src[i] += [0] * (max_len - len(src[i]))
        tgt[i] += [1]
        tgt[i] += [0] * (max_len - len(tgt[i]))
    # Return source, target, and the means of converting them back into char values
    return src, tgt, index2char

def compute_avg_aer(aligned_sents, model, n):
    total_aer = 0
    for index, aligned_sent in enumerate(aligned_sents[:50]):
        my_als = model.align(aligned_sent)
        gold_als = comtrans.aligned_sents()[:350][index]
        aer = gold_als.alignment_error_rate(my_als)
        total_aer += aer
    avg_aer = float(total_aer) / float(n)
    return avg_aer

def test_IBMModel2(numSentences):
    no_of_sentences = numSentences
    sentences = comtrans.aligned_sents()[:no_of_sentences]
    sent_pairs = []
    # Add the None value to the foreign sentences and construct them
    # into the correct format to use with ibm_model2()
    for sentence in sentences:
        eng_words = sentence.mots
        foreign_words = [None] + sentence.words
        sent_pairs.append((eng_words, foreign_words))
    align, t_ef = ibm_model2(sent_pairs, 15)
    fin_align = []  # The list of final alignments
    # Find the best alignment for each of the words in the sentences
    for (e, f) in sent_pairs:
        l_e = len(e)
        l_f = len(f)
        curr_align = []
        for i in range(1, l_e + 1):
            max_prob = -1
            for j in range(1, l_f + 1):
                prob = align[j][i][l_e][l_f]
                if max_prob < prob:
                    max_prob = prob
                    max_j = j
            curr_align.append((i - 1, max_j - 1))
        fin_align.append(curr_align)
    # Calculate the precision of the alignments
    avg_precision = 0
    count = 0
    for sent_alignments in fin_align:
        algn = ''
        for (e, f) in sent_alignments:
            algn += "%d-%d " % (f, e)
        avg_precision += sentences[count].precision(algn)
        count += 1
    avg_precision /= count
    return avg_precision

def makeMixedText(minLen=MIN_LEN, maxLen=MAX_LEN, numForeign=NUM_FOREIGN):
    numSents = len(comtrans.sents(fileids=['alignment-de-en.txt']))
    foreign = np.zeros(numSents, dtype=bool)
    gt = []
    while len(gt) < numForeign:
        a = np.random.randint(minLen, numSents - minLen)
        b = a + np.random.randint(minLen, maxLen + 1)
        if not foreign[(a - minLen):(b + minLen)].any():
            foreign[a:b] = True
            gt.append((a, b))
    return [
        s.words if foreign[i] else s.mots
        for i, s in enumerate(
            comtrans.aligned_sents(fileids=['alignment-de-en.txt']))
    ], gt

def _load_corpus(self, mode='train'):
    # Load the en-fr parallel corpus
    from nltk.corpus import comtrans
    als = comtrans.aligned_sents('alignment-en-fr.txt')

    # Make a character-level parallel corpus
    all_byte, sources, targets = [], [], []
    for al in als:
        src = [ord(ch) for ch in ' '.join(al.words)]  # source language byte stream
        tgt = [ord(ch) for ch in ' '.join(al.mots)]   # target language byte stream
        sources.append(src)
        targets.append(tgt)
        all_byte.extend(src + tgt)

    # Make the vocabulary
    self.index2byte = [0, 1] + list(np.unique(all_byte))  # add <EMP>, <EOS> tokens
    self.byte2index = {}
    for i, b in enumerate(self.index2byte):
        self.byte2index[b] = i
    self.voca_size = len(self.index2byte)
    self.max_len = 150

    # Remove short and long sentences
    src, tgt = [], []
    for s, t in zip(sources, targets):
        if 50 <= len(s) < self.max_len and 50 <= len(t) < self.max_len:
            src.append(s)
            tgt.append(t)

    # Convert to index lists and add <EOS> to the end of each sentence
    for i in range(len(src)):
        src[i] = [self.byte2index[ch] for ch in src[i]] + [1]
        tgt[i] = [self.byte2index[ch] for ch in tgt[i]] + [1]

    # Zero-padding
    for i in range(len(tgt)):
        src[i] += [0] * (self.max_len - len(src[i]))
        tgt[i] += [0] * (self.max_len - len(tgt[i]))

    # Swap source and target: French -> English
    return tgt, src

def retrieve_corpora(self, corpora_name):
    try:
        als = comtrans.aligned_sents(corpora_name)
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
            + ': Comtrans download of corpora "' + str(corpora_name) + '" exception: ' \
            + str(ex) + '.'
        Log.error(errmsg)
        raise Exception(errmsg)

    sentences_l1 = [sent.words for sent in als]
    sentences_l2 = [sent.mots for sent in als]
    Log.info('Sentences length = ' + str(len(sentences_l1)))

    # Filter by length
    (sentences_l1, sentences_l2) = self.filter_pair_sentence_length(
        sentences_arr_l1=sentences_l1,
        sentences_arr_l2=sentences_l2,
        max_len=20
    )
    Log.info('Sentences length after filtering = ' + str(len(sentences_l1)))

    assert len(sentences_l1) == len(sentences_l2)
    return (sentences_l1, sentences_l2)

    for t_key in t.keys():
        t[t_key] = cout_dict[t_key] / cout_dict[t_key[0]]
    for t_key in t2.keys():
        t2[t_key] = cout_dict2[t_key] / cout_dict2[t_key[0]]
    for q_key in q.keys():
        q[q_key] = cout_dict[q_key] / cout_dict[q_key[1:]]
    for q_key in q2.keys():
        q2[q_key] = cout_dict2[q_key] / cout_dict2[q_key[1:]]
    return (t, q)


def main(aligned_sents):
    ba = BerkeleyAligner(aligned_sents, 10)
    A.save_model_output(aligned_sents, ba, "ba.txt")
    avg_aer = A.compute_avg_aer(aligned_sents, ba, 50)

    # Report the AER for each of the first 20 sentences
    for i, aligned_sent in enumerate(aligned_sents[:20]):
        print("ba, aer of sentence " + str(i) + " " +
              str(A.compute_avg_aer([aligned_sent], ba, 1)))

    print('Berkeley Aligner')
    print('---------------------------')
    print('Average AER: {0:.3f}\n'.format(avg_aer))


if __name__ == "__main__":
    aligned_sents = comtrans.aligned_sents()[:350]
    main(aligned_sents)

from nltk.corpus import comtrans

print(comtrans.aligned_sents()[0])
print(comtrans.aligned_sents()[0].words)
print(comtrans.aligned_sents()[0].mots)
print(comtrans.aligned_sents()[0].alignment)

from nltk.corpus import comtrans

import A
import B
import EC

if __name__ == '__main__':
    aligned_sents = comtrans.aligned_sents()[:350]
    A.main(aligned_sents)
    B.main(aligned_sents)
    EC.main(aligned_sents)

def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):
    als = comtrans.aligned_sents(translated_sentences_l1_l2)
    sentences_l1 = [sentence.words for sentence in als]
    sentences_l2 = [sentence.mots for sentence in als]
    return sentences_l1, sentences_l2

def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit', '-u', type=int, default=512,
                        help='Number of units')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Disabled small comtrans-based data path, kept for reference;
    # the active branch loads the WMT giga-fren data instead.
    if False:
        sentences = comtrans.aligned_sents('alignment-en-fr.txt')
        source_ids = collections.defaultdict(lambda: len(source_ids))
        target_ids = collections.defaultdict(lambda: len(target_ids))
        target_ids['eos']  # reserve index 0 for 'eos'
        data = []
        for sentence in sentences:
            source = numpy.array([source_ids[w] for w in sentence.words], 'i')
            target = numpy.array([target_ids[w] for w in sentence.mots], 'i')
            data.append((source, target))
        print('Source vocabulary: %d' % len(source_ids))
        print('Target vocabulary: %d' % len(target_ids))

        test_data = data[:len(data) // 10]
        train_data = data[len(data) // 10:]
    else:
        # Check files
        en_path = os.path.join(args.input, 'giga-fren.release2.fixed.en')
        source_vocab = ['<eos>', '<unk>'] + europal.count_words(en_path)
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'giga-fren.release2.fixed.fr')
        target_vocab = ['<eos>', '<unk>'] + europal.count_words(fr_path)
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        test_data = [(s, t)
                     for s, t in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index for index, word in enumerate(source_vocab)}
        target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    # Define the model
    model = net.Seq2seq(15, len(source_ids), len(target_ids), args.unit)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    # Set up the optimizer
    optimizer = chainer.optimizers.NesterovAG(lr=0.25, momentum=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(0.1))

    # Set up the trainer
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data, args.batchsize,
                                                 repeat=False, shuffle=False)
    iter_per_epoch = len(train_data) // args.batchsize
    print('Number of iter/epoch =', iter_per_epoch)

    updater = training.StandardUpdater(
        train_iter, optimizer,
        converter=seq2seq_pad_concat_convert, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    log_trigger = (min(1000, iter_per_epoch // 2), 'iteration')

    def floor_step(trigger):
        floored = trigger[0] - trigger[0] % log_trigger[0]
        if floored <= 0:
            floored = trigger[0]
        return (floored, trigger[1])

    # Validate every half epoch
    eval_trigger = floor_step((iter_per_epoch // 2, 'iteration'))
    fail_trigger = FailMinValueTrigger('val/main/perp', eval_trigger)
    record_trigger = training.triggers.MaxValueTrigger('val/main/perp',
                                                       eval_trigger)

    evaluator = extensions.Evaluator(
        test_iter, model,
        converter=seq2seq_pad_concat_convert, device=args.gpu)
    evaluator.default_name = 'val'
    trainer.extend(evaluator, trigger=eval_trigger)

    # Only if validation perplexity fails to improve,
    # the lr is decayed (down to 1e-4).
    trainer.extend(extensions.ExponentialShift('lr', 0.1, target=1e-4),
                   trigger=fail_trigger)
    trainer.extend(extensions.observe_lr(), trigger=eval_trigger)

    # Only if the model achieves the best validation score,
    # save the model.
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}.npz'),
        trigger=record_trigger)

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('# result : ' + ' '.join(words))
        print('# expect : ' + target)

    @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one('Who are we ?', 'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words[i] for i in source])
        target = ' '.join([target_words[i] for i in target])
        translate_one(source, target)

    # Generation test
    trainer.extend(translate, trigger=(min(200, iter_per_epoch), 'iteration'))

    # Calculate BLEU every half epoch
    trainer.extend(
        CalculateBleu(model, test_data, 'val/main/bleu',
                      device=args.gpu, batch=args.batchsize // 4),
        trigger=floor_step((iter_per_epoch // 2, 'iteration')))

    # Log
    trainer.extend(extensions.LogReport(trigger=log_trigger),
                   trigger=log_trigger)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration',
        'main/loss', 'val/main/loss',
        'main/perp', 'val/main/perp',
        'main/acc', 'val/main/acc',
        'val/main/bleu',
        'lr', 'elapsed_time'
    ]), trigger=log_trigger)

    print('start training')
    trainer.run()

    for englishword in englishvocab:
        for foreignword in foreignvocab:
            t[englishword][foreignword] = round(t[englishword][foreignword], 4)
    for lf in all_lf:
        for le in all_le:
            for j in all_j:
                for i in all_i:
                    alignment[i][j][le][lf] = round(alignment[i][j][le][lf], 4)
    return t, alignment


# Basic sentence pairs
ibm2(sentpairs)

# Set up comtrans sentence pairs
germanenglishsentpairs = [(align_sent.words, align_sent.mots)
                          for align_sent in comtrans.aligned_sents()[:100]]

# Run IBM Model 2 on the comtrans sentence pairs; increase the number of
# sentence pairs if required
t, a = ibm2(germanenglishsentpairs[:10])

"""
IBM Model 3

This model builds upon IBM Model 2 by adding fertility, to handle cases
where one foreign word is translated into two English words, cases where a
flavoring particle is dropped in the translation, and cases where English
words are added without any corresponding word in the foreign source.
"""

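# A minimal sketch, not part of the snippets above: NLTK ships its own
# EM-trained IBM models, so the fertility-based Model 3 just described can
# be tried directly on comtrans data. The corpus slice and iteration count
# below are arbitrary choices to keep the run short.
from nltk.corpus import comtrans
from nltk.translate import IBMModel3

bitext = comtrans.aligned_sents()[:50]
ibm3 = IBMModel3(bitext, 5)  # 5 EM iterations

# Training sets a Viterbi alignment on each input AlignedSent
# (note: this overwrites the gold alignment stored on the sentence).
print(bitext[0].words)
print(bitext[0].mots)
print(bitext[0].alignment)
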
def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):
    print("Retrieving corpora: {}".format(translated_sentences_l1_l2))
    als = comtrans.aligned_sents(translated_sentences_l1_l2)
    sentences_l1 = [sent.words for sent in als]
    sentences_l2 = [sent.mots for sent in als]
    return sentences_l1, sentences_l2