Code example #1
def get_translated_corpus_stats(langs=['EN-FR']):
    # Validate languages and get the corresponding comtrans file IDs
    fileids = validate_langs_and_get_fileids(langs)

    # Get aligned sentences
    align_sents = comtrans.aligned_sents(fileids=fileids)

    # Get character-level parallel corpus
    sources = []
    targets = []

    for a_sent in align_sents:
        tmp_src = [ord(ch) for ch in ' '.join(a_sent.words)]
        tmp_tgt = [ord(ch) for ch in ' '.join(a_sent.mots)]

        sources.append(tmp_src)
        targets.append(tmp_tgt)

    # Get lengths
    src_lens = []
    tgt_lens = []
    for src, tgt in zip(sources, targets):
        src_lens.append(len(src))
        tgt_lens.append(len(tgt))

    # Compute length statistics (median, mean, standard deviation)
    np_src = np.array(src_lens)
    src_stats = [np.median(np_src), np.mean(np_src), np.std(np_src)]

    np_tgt = np.array(tgt_lens)
    tgt_stats = [np.median(np_tgt), np.mean(np_tgt), np.std(np_tgt)]

    return src_stats, tgt_stats
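The helper validate_langs_and_get_fileids is not shown in this excerpt (which also assumes numpy has been imported as np and comtrans from nltk.corpus). A minimal sketch of what it might look like, mapping language-pair labels onto the comtrans file IDs used elsewhere on this page; the accepted labels are an assumption:

import numpy as np
from nltk.corpus import comtrans

# Hypothetical mapping from language-pair labels to comtrans file IDs;
# only the pairs that appear in these examples are listed.
_LANG_TO_FILEID = {
    'EN-FR': 'alignment-en-fr.txt',
    'DE-EN': 'alignment-de-en.txt',
}

def validate_langs_and_get_fileids(langs):
    # Reject unknown language pairs and return the matching file IDs
    fileids = []
    for lang in langs:
        key = lang.upper()
        if key not in _LANG_TO_FILEID:
            raise ValueError('Unsupported language pair: {}'.format(lang))
        fileids.append(_LANG_TO_FILEID[key])
    return fileids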
Code example #2
def get_translated_corpus_chars(langs=['EN-FR'], min_len=75, max_len=250):
    # Validate languages and get the corresponding comtrans file IDs
    fileids = validate_langs_and_get_fileids(langs)

    # Validate min_len
    if min_len > max_len:
        raise ValueError(
            "The 'min_len' variable should be smaller than 'max_len' variable."
        )

    # Load desired parallel corpus(es)
    align_sents = comtrans.aligned_sents(fileids=fileids)

    # Make character-level parallel corpus
    all_bytes = []
    sources = []
    targets = []

    for a_sent in align_sents:
        tmp_src = [ord(ch) for ch in ' '.join(a_sent.words)]
        tmp_tgt = [ord(ch) for ch in ' '.join(a_sent.mots)]

        sources.append(tmp_src)
        targets.append(tmp_tgt)

        new_bytes = np.unique(tmp_src + tmp_tgt)
        for b in new_bytes:
            if b not in all_bytes:
                all_bytes.append(b)

    # Translate all possible bytes into sequential values
    index2char, char2index = get_mapping_items(all_bytes)

    # Remove short and long sentences
    src = []
    tgt = []

    for s, t in zip(sources, targets):
        if min_len <= len(s) < max_len and min_len <= len(t) < max_len:
            src.append(s)
            tgt.append(t)

    # Convert char bytes to encoded list of sequential indices
    for i in range(len(src)):
        src[i] = [char2index[ch] for ch in src[i]]
        tgt[i] = [char2index[ch] for ch in tgt[i]]

    # Append <EOS> (index 1) to each sentence, then zero-pad to max_len
    for i in range(len(src)):
        src[i] += [1]
        src[i] += [0] * (max_len - len(src[i]))

        tgt[i] += [1]
        tgt[i] += [0] * (max_len - len(tgt[i]))

    # Return source, target, and means of converting them back into char values
    return src, tgt, index2char
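Similarly, get_mapping_items is not part of the excerpt. A minimal sketch, assuming it follows the same vocabulary convention as code example #6 below (index 0 reserved for padding, index 1 for <EOS>):

def get_mapping_items(all_bytes):
    # Reserve 0 for <PAD> and 1 for <EOS>, then give every byte value
    # seen in the corpus a sequential index.
    index2char = [0, 1] + sorted(set(all_bytes))
    char2index = {b: i for i, b in enumerate(index2char)}
    return index2char, char2index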
Code example #3
File: A.py Project: asubhangi/NLP
def compute_avg_aer(aligned_sents, model, n):
    total_aer = 0
    # aligned_sents is assumed to be a prefix of comtrans.aligned_sents()[:350],
    # so the gold alignment at the same index is the reference for the model's
    # predicted alignment
    for index, aligned_sent in enumerate(aligned_sents[:50]):
        my_als = model.align(aligned_sent)
        gold_als = comtrans.aligned_sents()[:350][index]
        aer = gold_als.alignment_error_rate(my_als)
        total_aer += aer

    avg_aer = float(total_aer) / float(n)
    return avg_aer
Code example #4
File: Project1.py Project: calvintl/COMP90042
def test_IBMModel2(numSentences):

    no_of_sentences = numSentences
    
    sentences = comtrans.aligned_sents()[:no_of_sentences]
    sent_pairs = []

    # Prepend a None token (the IBM-model null word) to each foreign sentence
    # and build the (English, foreign) pairs in the format ibm_model2() expects
    for sentence in sentences:
        eng_words = sentence.mots
        foreign_words = [None] + sentence.words
        sent_pairs.append((eng_words,foreign_words))

    align, t_ef = ibm_model2(sent_pairs, 15)

    fin_align = []  # The list of final alignments

    # Finding the best alignment for each of the words in the sentences
    for (e, f) in sent_pairs:
        l_e = len(e)
        l_f = len(f)
        curr_align = []
        for i in range(1, l_e+1):
            max_prob = -1
            for j in range(1, l_f+1):
                prob = align[j][i][l_e][l_f]
                if max_prob < prob:
                    max_prob = prob
                    max_j = j
            curr_align.append((i-1, max_j-1))
            
        fin_align.append(curr_align)

    # Calculating the precision of the alignments
    avg_precision = 0
    count = 0
    for sent_alignments in fin_align:
        algn = ''
        for (e, f) in sent_alignments:
            algn += "%d-%d " %(f,e)

        avg_precision += sentences[count].precision(algn)
        count += 1

    avg_precision /= count

    return avg_precision
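A possible invocation, assuming ibm_model2 (the project's own EM implementation called above) is defined in the same module:

# Hypothetical usage: average alignment precision over the first 100
# comtrans sentence pairs (the 15 passed inside test_IBMModel2 is
# presumably the number of EM iterations given to ibm_model2).
avg_precision = test_IBMModel2(100)
print('Average precision: %.3f' % avg_precision)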
Code example #5
def makeMixedText(minLen=MIN_LEN, maxLen=MAX_LEN, numForeign=NUM_FOREIGN):

    numSents = len(comtrans.sents(fileids=['alignment-de-en.txt']))

    foreign = np.zeros(numSents, dtype=bool)
    gt = []
    while len(gt) < numForeign:
        a = np.random.randint(minLen, numSents - minLen)
        b = a + np.random.randint(minLen, maxLen + 1)
        if not foreign[(a - minLen):(b + minLen)].any():
            foreign[a:b] = True
            gt.append((a, b))

    return [
        s.words if foreign[i] else s.mots for i, s in enumerate(
            comtrans.aligned_sents(fileids=['alignment-de-en.txt']))
    ], gt
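The module-level defaults MIN_LEN, MAX_LEN, and NUM_FOREIGN are not included in this excerpt; the values below are purely illustrative, and the call shows the shape of the return value:

import numpy as np
from nltk.corpus import comtrans

# Illustrative values only -- the original project's settings are not shown.
MIN_LEN = 5        # minimum length (in sentences) of an inserted foreign block
MAX_LEN = 20       # maximum length of an inserted foreign block
NUM_FOREIGN = 10   # how many foreign blocks to embed

mixed_sents, spans = makeMixedText()
print(len(mixed_sents), 'sentences,', len(spans), 'foreign spans:', spans)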
Code example #6
    def _load_corpus(self, mode='train'):

        # load en-fr parallel corpus
        from nltk.corpus import comtrans
        als = comtrans.aligned_sents('alignment-en-fr.txt')

        # make character-level parallel corpus
        all_byte, sources, targets = [], [], []
        for al in als:
            src = [ord(ch)
                   for ch in ' '.join(al.words)]  # source language byte stream
            tgt = [ord(ch)
                   for ch in ' '.join(al.mots)]  # target language byte stream
            sources.append(src)
            targets.append(tgt)
            all_byte.extend(src + tgt)

        # make vocabulary
        self.index2byte = [0, 1] + list(
            np.unique(all_byte))  # add <EMP>, <EOS> tokens
        self.byte2index = {}
        for i, b in enumerate(self.index2byte):
            self.byte2index[b] = i
        self.voca_size = len(self.index2byte)
        self.max_len = 150

        # remove sentences that are too short or too long
        src, tgt = [], []
        for s, t in zip(sources, targets):
            if 50 <= len(s) < self.max_len and 50 <= len(t) < self.max_len:
                src.append(s)
                tgt.append(t)

        # convert to index list and add <EOS> to end of sentence
        for i in range(len(src)):
            src[i] = [self.byte2index[ch] for ch in src[i]] + [1]
            tgt[i] = [self.byte2index[ch] for ch in tgt[i]] + [1]

        # zero-padding
        for i in range(len(tgt)):
            src[i] += [0] * (self.max_len - len(src[i]))
            tgt[i] += [0] * (self.max_len - len(tgt[i]))

        # swap source and target : french -> english
        return tgt, src
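A small decoding helper could sit alongside this method; the sketch below is hypothetical (not part of the original class) and assumes the index2byte table built above, where indices 0 (<EMP>) and 1 (<EOS>) carry no character value:

    def _decode(self, index_seq):
        # Hypothetical helper: map sequential indices back to byte values and
        # then to characters, skipping the <EMP> padding (0) and <EOS> (1) tokens
        return ''.join(chr(self.index2byte[i]) for i in index_seq if i > 1)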
Code example #7
    def retrieve_corpora(self, corpora_name):
        try:
            als = comtrans.aligned_sents(corpora_name)
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Comtrans download of corpora "' + str(corpora_name) + '" exception: ' \
                     + str(ex) + '.'
            Log.error(errmsg)
            raise Exception(errmsg)

        sentences_l1 = [sent.words for sent in als]
        sentences_l2 = [sent.mots for sent in als]
        Log.info('Sentences length = ' + str(len(sentences_l1)))

        # Filter length
        (sentences_l1, sentences_l2) = self.filter_pair_sentence_length(
            sentences_arr_l1=sentences_l1,
            sentences_arr_l2=sentences_l2,
            max_len=20)
        Log.info('Sentences length after filtering = ' +
                 str(len(sentences_l1)))
        assert len(sentences_l1) == len(sentences_l2)
        return (sentences_l1, sentences_l2)
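The method filter_pair_sentence_length is not included in this excerpt; a minimal sketch of what such a filter might do (keeping only pairs where both token lists fit within max_len) is:

    def filter_pair_sentence_length(self, sentences_arr_l1, sentences_arr_l2, max_len):
        # Hypothetical sketch: keep a pair only if both sides have at most max_len tokens
        filtered_l1, filtered_l2 = [], []
        for s1, s2 in zip(sentences_arr_l1, sentences_arr_l2):
            if len(s1) <= max_len and len(s2) <= max_len:
                filtered_l1.append(s1)
                filtered_l2.append(s2)
        return filtered_l1, filtered_l2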
Code example #8
File: B.py Project: actondong/NLP
            t[t_key] = cout_dict[t_key]/cout_dict[t_key[0]]
        for t_key in t2.keys():
            t2[t_key] = cout_dict2[t_key]/cout_dict2[t_key[0]]
        for q_key in q.keys():
            q[q_key] = cout_dict[q_key]/cout_dict[q_key[1:]]
        for q_key in q2.keys():
            q2[q_key] = cout_dict2[q_key]/cout_dict2[q_key[1:]]
        return (t, q)

def main(aligned_sents):
    ba = BerkeleyAligner(aligned_sents, 10)
    A.save_model_output(aligned_sents, ba, "ba.txt")
    avg_aer = A.compute_avg_aer(aligned_sents, ba, 50)
    # Report the AER for each of the first 20 sentences
    for i, aligned_sent in enumerate(aligned_sents[:20]):
        print("ba, aer of sentence " + str(i) + " " + str(A.compute_avg_aer([aligned_sent], ba, 1)))

    print ('Berkeley Aligner')
    print ('---------------------------')
    print('Average AER: {0:.3f}\n'.format(avg_aer))
if __name__ == "__main__":
    aligned_sents = comtrans.aligned_sents()[:350]
    main(aligned_sents)
#    ba = BerkeleyAligner(aligned_sents, 20)
#    A.save_model_output(aligned_sents,ba,"ba.txt")
#    avg_aer = A.compute_avg_aer(aligned_sents,ba,50)
#    print ('Berkeley Aligner')
#    print ('---------------------------')
#    print('Average AER: {0:.3f}\n'.format(avg_aer))
#
Code example #9
from nltk.corpus import comtrans
first_sent = comtrans.aligned_sents()[0]
print(first_sent)
print(first_sent.words)
print(first_sent.mots)
print(first_sent.alignment)
Code example #10
from nltk.corpus import comtrans
import A
import B
import EC

if __name__ == '__main__':
    aligned_sents = comtrans.aligned_sents()[:350]
    A.main(aligned_sents)
    B.main(aligned_sents)
    EC.main(aligned_sents)
Code example #11
def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):
    als = comtrans.aligned_sents(translated_sentences_l1_l2)
    sentences_l1 = [sentence.words for sentence in als]
    sentences_l2 = [sentence.mots for sentence in als]
    return sentences_l1, sentences_l2
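A quick usage sketch, assuming comtrans has been imported as in the other examples:

sentences_l1, sentences_l2 = retrieve_corpora()
print(len(sentences_l1), 'aligned sentence pairs')
print(sentences_l1[0])   # token list taken from .words
print(sentences_l2[0])   # token list taken from .mots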
Code example #12
def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=512,
                        help='Number of units')
    parser.add_argument('--input',
                        '-i',
                        type=str,
                        default='wmt',
                        help='Input directory')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # The comtrans branch below builds a small toy en-fr dataset from NLTK;
    # it is disabled here in favour of the WMT giga-fren data.
    if False:
        sentences = comtrans.aligned_sents('alignment-en-fr.txt')
        source_ids = collections.defaultdict(lambda: len(source_ids))
        target_ids = collections.defaultdict(lambda: len(target_ids))
        target_ids['eos']
        data = []
        for sentence in sentences:
            source = numpy.array([source_ids[w] for w in sentence.words], 'i')
            target = numpy.array([target_ids[w] for w in sentence.mots], 'i')
            data.append((source, target))
        print('Source vocabulary: %d' % len(source_ids))
        print('Target vocabulary: %d' % len(target_ids))

        test_data = data[:len(data) // 10]
        train_data = data[len(data) // 10:]
    else:
        # Check file
        en_path = os.path.join(args.input, 'giga-fren.release2.fixed.en')
        source_vocab = ['<eos>', '<unk>'] + europal.count_words(en_path)
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'giga-fren.release2.fixed.fr')
        target_vocab = ['<eos>', '<unk>'] + europal.count_words(fr_path)
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        test_data = [(s, t) for s, t in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index for index, word in enumerate(source_vocab)}
        target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    # Define Model
    model = net.Seq2seq(15, len(source_ids), len(target_ids), args.unit)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    # Setup Optimizer
    optimizer = chainer.optimizers.NesterovAG(lr=0.25, momentum=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(0.1))

    # Setup Trainer
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)
    iter_per_epoch = len(train_data) // args.batchsize
    print('Number of iter/epoch =', iter_per_epoch)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       converter=seq2seq_pad_concat_convert,
                                       device=args.gpu)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    log_trigger = (min(1000, iter_per_epoch // 2), 'iteration')

    def floor_step(trigger):
        floored = trigger[0] - trigger[0] % log_trigger[0]
        if floored <= 0:
            floored = trigger[0]
        return (floored, trigger[1])

    # Validation every half epoch
    eval_trigger = floor_step((iter_per_epoch // 2, 'iteration'))
    fail_trigger = FailMinValueTrigger('val/main/perp', eval_trigger)
    record_trigger = training.triggers.MaxValueTrigger('val/main/perp',
                                                       eval_trigger)

    evaluator = extensions.Evaluator(test_iter,
                                     model,
                                     converter=seq2seq_pad_concat_convert,
                                     device=args.gpu)
    evaluator.default_name = 'val'
    trainer.extend(evaluator, trigger=eval_trigger)
    # Only if validation perplexity fails to be improved,
    # lr is decayed (until 1e-4).
    trainer.extend(extensions.ExponentialShift('lr', 0.1, target=1e-4),
                   trigger=fail_trigger)
    trainer.extend(extensions.observe_lr(), trigger=eval_trigger)
    # Only if a model gets best validation score,
    # save the model
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}.npz'),
                   trigger=record_trigger)

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one('Who are we ?', 'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words[i] for i in source])
        target = ' '.join([target_words[i] for i in target])
        translate_one(source, target)

    # Generation test
    trainer.extend(translate, trigger=(min(200, iter_per_epoch), 'iteration'))
    # Calculate BLEU every half epoch
    trainer.extend(CalculateBleu(model,
                                 test_data,
                                 'val/main/bleu',
                                 device=args.gpu,
                                 batch=args.batchsize // 4),
                   trigger=floor_step((iter_per_epoch // 2, 'iteration')))

    # Log
    trainer.extend(extensions.LogReport(trigger=log_trigger),
                   trigger=log_trigger)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'val/main/loss', 'main/perp',
        'val/main/perp', 'main/acc', 'val/main/acc', 'val/main/bleu', 'lr',
        'elapsed_time'
    ]),
                   trigger=log_trigger)

    print('start training')
    trainer.run()
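A possible way to launch this training script, with flags taken from the argparse definitions above (the file name seq2seq.py is hypothetical):

python seq2seq.py --gpu 0 --unit 512 --batchsize 64 --epoch 20 --input wmt --out result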
Code example #13
File: ibm2.py Project: chinyeelee/CLIRProject1
    for englishword in englishvocab:
        for foreignword in foreignvocab:
            t[englishword][foreignword] = round(t[englishword][foreignword], 4)
    for lf in all_lf:
        for le in all_le:
            for j in all_j:
                for i in all_i:
                    alignment[i][j][le][lf] = round(alignment[i][j][le][lf], 4)
    return t, alignment

# Basic sentence pairs
ibm2(sentpairs)

# Set up comtrans sentence pairs
germanenglishsentpairs = [(align_sent.words , align_sent.mots)
                          for align_sent in comtrans.aligned_sents()[:100]]

# Run IBM Model 2 on comtrans sentence pairs; increase the number of sentence
# pairs if required
t, a = ibm2(germanenglishsentpairs[:10])


"""
IBM Model 3

This model builds upon IBM Model 2 by adding fertility, to handle the cases
where a single foreign word is translated into two English words, or where a
flavoring particle is dropped in the translation. Furthermore, it also takes
into account English words that are added without any corresponding word in
the foreign source.
Code example #14
def retrieve_corpora(translated_sentences_l1_l2='alignment-de-en.txt'):
    print("Retrieving corpora: {}".format(translated_sentences_l1_l2))
    als = comtrans.aligned_sents(translated_sentences_l1_l2)
    sentences_l1 = [sent.words for sent in als]
    sentences_l2 = [sent.mots for sent in als]
    return sentences_l1, sentences_l2