from fasttext import FastVector
from tqdm import tqdm


def main():
    ja_dic = FastVector(vector_file='../vecmap/data/wiki.ja.vec')
    en_dic = FastVector(vector_file='../vecmap/data/wiki.en.vec')
    print("loaded the dictionaries")

    ja_dic.apply_transform('alignment_matrices/ja.txt')
    en_dic.apply_transform('alignment_matrices/en.txt')
    print("transformed the dictionaries")

    idx = 0
    result = {}
    result_f = open("en_ja_multifast.txt", "w")
    en_word_list = list(en_dic.word2id.keys())
    print("The total length of English pretrained vector : " +
          str(len(en_word_list)))

    for en_word in tqdm(en_word_list):
        ja_words = ja_dic.translate_k_nearest_neighbour(en_dic[en_word], k=15)
        result[en_word] = ja_words
        idx += 1
        result_str = ",".join(result[en_word])
        result_f.write(str(idx) + "," + en_word + "," + result_str + "\n")
        if idx > 5000:
            break  # stop after the first 5000 words so the output file still gets closed

    result_f.close()
Example #2
def main2():
    for zzz in LANGUAGE_LIST:
        lang = zzz[0]
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang),
            pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" %
                                  (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
Example #3
def main():
    # first get the English one
    lang = "en"
    system(
        "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
        % (OUT_DIR, lang, lang),
        pp=True)
    # en_dict = FastVector(vector_file='%s/wiki.en.vec' % OUT_DIR)
    for zzz in LANGUAGE_LIST:
        lang, fnames = zzz[0], zzz[1]
        printing("Dealing with lang %s." % lang)
        for curf in ["train", "dev", "test"]:
            out_fname = "%s/%s_%s.conllu" % (OUT_DIR, lang, curf)
            fout = zopen(out_fname, "w")
            for fname in fnames:
                last_name = fname.split("-")[-1].lower()
                path_name = "%s/%s/%s_%s-ud-%s.conllu" % (UD2_DIR, fname, lang,
                                                          last_name, curf)
                if os.path.exists(path_name):
                    with zopen(path_name) as fin:
                        deal_conll_file(fin, fout)
            fout.close()
            # stat
            system('cat %s | grep -E "^$" | wc' % out_fname, pp=True)
            system('cat %s | grep -Ev "^$" | wc' % out_fname, pp=True)
            system(
                "cat %s | grep -Ev '^$' | cut -f 5 -d $'\t'| grep -Ev 'PUNCT|SYM' | wc"
                % out_fname,
                pp=True)
        # get original embed
        system(
            "wget -nc -O %s/wiki.%s.vec https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.%s.vec"
            % (OUT_DIR, lang, lang),
            pp=True)
        # project with LIB-matrix
        lang_dict = FastVector(vector_file='%s/wiki.%s.vec' % (OUT_DIR, lang))
        lang_dict.apply_transform("%s/alignment_matrices/%s.txt" %
                                  (LIB_DIR, lang))
        lang_dict.export("%s/wiki.multi.%s.vec" % (OUT_DIR, lang))
Example #4
    # return the orthogonal transformation that aligns the source language to the target
    return np.matmul(U, V)
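# For reference, a minimal self-contained sketch of the Procrustes step that the
# fragment above ends with (an illustrative reconstruction assuming row-aligned,
# length-normalised source/target matrices; not necessarily the exact original body):
import numpy as np


def learn_transformation_sketch(source_matrix, target_matrix):
    # orthogonal Procrustes: SVD of the cross-covariance matrix
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)  # numpy returns V already transposed
    # U @ V is the orthogonal map that best aligns the source space to the target space
    return np.matmul(U, V)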


lang1_dictionary = FastVector(vector_file=args.lang1)
lang2_dictionary = FastVector(vector_file=args.lang2)

bilingual_dictionary = []
with open(args.dict, "r") as file_object:
    for line in file_object:
        line = line.rstrip('\n')
        w_lang2, w_lang1 = line.split('\t')
        if (w_lang1 in lang1_dictionary.word2id
                and w_lang2 in lang2_dictionary.word2id):
            bilingual_dictionary.append((w_lang2, w_lang1))

print("Dic Size: " + str(len(bilingual_dictionary)))

# form the training matrices
source_matrix, target_matrix = make_training_matrices(lang1_dictionary,
                                                      lang2_dictionary,
                                                      bilingual_dictionary)
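# For reference, a hedged sketch of what a make_training_matrices helper of this
# kind typically does (the body below is an assumption, not necessarily this
# script's implementation): look up the vector of each dictionary pair and stack
# them into row-aligned source/target matrices.
def make_training_matrices_sketch(source_dictionary, target_dictionary,
                                  bilingual_dictionary):
    source_rows, target_rows = [], []
    for source_word, target_word in bilingual_dictionary:
        if (source_word in source_dictionary.word2id
                and target_word in target_dictionary.word2id):
            source_rows.append(source_dictionary[source_word])
            target_rows.append(target_dictionary[target_word])
    # row i of both matrices holds the vectors of the i-th dictionary pair
    return np.array(source_rows), np.array(target_rows)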
# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
lang1_dictionary.apply_transform(transform)

lang1_dictionary.export(args.out1)
lang2_dictionary.export(args.out2)
Example #5
ru_words = set(ru_dictionary.word2id.keys())
fr_words = set(fr_dictionary.word2id.keys())
overlap = list(ru_words & fr_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]

# Let's align the French vectors to the Russian vectors, using only this "free" dictionary that we acquired without any bilingual expert knowledge.

# In[ ]:

# form the training matrices
source_matrix, target_matrix = make_training_matrices(fr_dictionary,
                                                      ru_dictionary,
                                                      bilingual_dictionary)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
fr_dictionary.apply_transform(transform)

# Finally, we re-evaluate the similarity of "chat" and "кот":

# In[4]:

fr_vector = fr_dictionary["chat"]
ru_vector = ru_dictionary["кот"]
print(FastVector.cosine_similarity(fr_vector, ru_vector))

# "chat" and "кот" are pretty similar after all :)
#
# Use this simple "identical strings" trick to align other language pairs for yourself, or prepare your own expert bilingual dictionaries for optimal performance.
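# If you prepare an expert bilingual dictionary instead (one tab-separated
# "french<TAB>russian" pair per line), it can replace the identical-strings list.
# A hedged sketch; the file name 'fr_ru_dictionary.tsv' is a hypothetical placeholder:

expert_pairs = []
with open('fr_ru_dictionary.tsv') as dict_file:
    for dict_line in dict_file:
        fr_word, ru_word = dict_line.rstrip('\n').split('\t')
        # keep only pairs covered by both embedding vocabularies
        if fr_word in fr_dictionary.word2id and ru_word in ru_dictionary.word2id:
            expert_pairs.append((fr_word, ru_word))
# expert_pairs can then be passed to make_training_matrices in place of bilingual_dictionary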
Example #6
from fasttext import FastVector
import json

ja_dic = FastVector(vector_file='../vecmap/data/wiki.ja.vec')
en_dic = FastVector(vector_file='../vecmap/data/wiki.en.vec')
print("loaded the dictionaries")

ja_dic.apply_transform('alignment_matrices/ja.txt')
en_dic.apply_transform('alignment_matrices/en.txt')
print("transformed the dictionaries")

en_word_list = [
    "cat", "dog", "apple", "car", "train", "school", "student", "teacher"
]
ja_word_list = ["猫", "犬", "りんご", "車", "電車", "学校", "生徒", "先生"]

result_f = open("multi_fast.txt", "w")
result = {}
# ja_word_list: 20 nearest English neighbours for each Japanese word
for ja_word in ja_word_list:
    en_words = en_dic.translate_k_nearest_neighbour(ja_dic[ja_word], k=20)
    result[ja_word] = en_words
    result_str = ",".join(result[ja_word])
    result_f.write(ja_word + "," + result_str + "\n")

# en_word_list: 20 nearest Japanese neighbours for each English word
for en_word in en_word_list:
    ja_words = ja_dic.translate_k_nearest_neighbour(en_dic[en_word], k=20)
    result[en_word] = ja_words
    result_str = ",".join(result[en_word])
    result_f.write(en_word + "," + result_str + "\n")

result_f.close()
Example #7
def train(args):
    train_data_src = read_corpus(args.train_src, source='src')
    train_data_tgt = read_corpus(args.train_tar, source='tgt')
    train_data_src_mono = read_corpus(args.train_src_mono, source='src')
    train_data_tgt_mono = read_corpus(args.train_tar_mono, source='tgt')

    dev_data_src = read_corpus(args.dev_src, source='src')
    dev_data_tgt = read_corpus(args.dev_tar, source='tgt')

    train_data = list(
        zip(train_data_src, train_data_tgt, train_data_src_mono,
            train_data_tgt_mono))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args.batch_size)
    valid_niter = int(args.valid_iter)
    log_every = int(args.log_every)
    model_save_path = args.save_path

    "Vocab dict"
    vocab_src = pickle.load(open(args.vocab_src, 'rb'))
    vocab_tar = pickle.load(open(args.vocab_tar, 'rb'))

    "Optimizer params"
    s2s_param = []
    t2t_param = []
    s2t_param = []
    t2s_param = []

    "Embed"  # pretrained (and fixed), cross-lingual embeddings
    args.embed_size = 300
    from fasttext import FastVector
    src_embed_path = 'embed/' + args.embed_src
    tar_embed_path = 'embed/' + args.embed_tar
    try:
        vectors_src = pickle.load(open(src_embed_path, 'rb'))
    except FileNotFoundError:
        vectors_src = FastVector(vector_file=args.embed_src)
        vectors_src.apply_transform(args.embed_alignment)
        pickle.dump(vectors_src, open(src_embed_path, 'wb+'))
    try:
        vectors_tar = pickle.load(open(tar_embed_path, 'rb'))
    except FileNotFoundError:
        vectors_tar = FastVector(
            vector_file=args.embed_tar)  # tar is en, no alignment required
        pickle.dump(vectors_tar, open(tar_embed_path, 'wb+'))

    src2embed = lambda word: (torch.FloatTensor(vectors_src[word])
                              if word in vectors_src else torch.zeros(300))
    tar2embed = lambda word: (torch.FloatTensor(vectors_tar[word])
                              if word in vectors_tar else torch.zeros(300))
    embedder_src = Embedder(
        vocab_src.dict_size(), args.embed_size,
        nn.Embedding.from_pretrained(
            torch.stack([
                src2embed(word.lower() if word is not None else word)
                for word in vocab_src.id2word
            ], dim=0),
            freeze=True))
    embedder_tar = Embedder(
        vocab_tar.dict_size(), args.embed_size,
        nn.Embedding.from_pretrained(
            torch.stack([
                tar2embed(word.lower() if word is not None else word)
                for word in vocab_tar.id2word
            ], dim=0),
            freeze=True))

    "Generator"
    gen_src = EmbeddingGenerator(args.hidden_size, args.embed_size).cuda()
    gen_src_wrapper = WrapperEmbeddingGenerator(gen_src, embedder_src).cuda()
    gen_tar = EmbeddingGenerator(args.hidden_size, args.embed_size).cuda()
    gen_tar_wrapper = WrapperEmbeddingGenerator(gen_tar, embedder_tar).cuda()

    if args.gen_src != "":
        gen_src_wrapper.load_weight(args.gen_src)
    else:
        [s2s_param, s2t_param,
         t2s_param] = add_to_optimizer(gen_src_wrapper,
                                       [s2s_param, s2t_param, t2s_param])
    if args.gen_tar != "":
        gen_tar_wrapper.load_weight(args.gen_tar)
    else:
        [s2t_param, t2s_param,
         t2t_param] = add_to_optimizer(gen_tar_wrapper,
                                       [s2t_param, t2s_param, t2t_param])

    if args.multi_gpu:
        gen_src_wrapper = nn.DataParallel(gen_src_wrapper, device_ids=[0, 1])
        gen_tar_wrapper = nn.DataParallel(gen_tar_wrapper, device_ids=[0, 1])

    "encoder"  # shared encoder
    encoder = GRUEncoder(args.embed_size,
                         args.hidden_size,
                         bidirectional=args.encoder_bidir,
                         layers=args.encoder_layer,
                         dropout=args.dropout).cuda()
    if args.multi_gpu:
        encoder = nn.DataParallel(encoder, device_ids=[0, 1])

    [s2s_param, s2t_param, t2s_param, t2t_param
     ] = add_to_optimizer(encoder,
                          [s2s_param, s2t_param, t2s_param, t2t_param])

    "Decoder"
    decoder_src = AttentionDecoder(args.embed_size,
                                   args.hidden_size,
                                   1,
                                   args.dropout,
                                   input_feed=True).cuda()
    decoder_tar = AttentionDecoder(args.embed_size,
                                   args.hidden_size,
                                   1,
                                   args.dropout,
                                   input_feed=True).cuda()
    if args.multi_gpu:
        decoder_src = nn.DataParallel(decoder_src, device_ids=[0, 1])
        decoder_tar = nn.DataParallel(decoder_tar, device_ids=[0, 1])

    [s2s_param, s2t_param,
     t2s_param] = add_to_optimizer(decoder_src,
                                   [s2s_param, s2t_param, t2s_param])
    [s2t_param, t2s_param,
     t2t_param] = add_to_optimizer(decoder_tar,
                                   [s2t_param, t2s_param, t2t_param])

    "Translators"
    s2s_model = MT(vocab_src,
                   vocab_src,
                   embedder_src,
                   embedder_src,
                   gen_src_wrapper,
                   encoder,
                   decoder_src,
                   denoising=True,
                   multi_gpu=args.multi_gpu)
    t2t_model = MT(vocab_tar,
                   vocab_tar,
                   embedder_tar,
                   embedder_tar,
                   gen_tar_wrapper,
                   encoder,
                   decoder_tar,
                   denoising=True,
                   multi_gpu=args.multi_gpu)
    s2t_model = MT(vocab_src,
                   vocab_tar,
                   embedder_src,
                   embedder_tar,
                   gen_tar_wrapper,
                   encoder,
                   decoder_tar,
                   denoising=False,
                   multi_gpu=args.multi_gpu)
    t2s_model = MT(vocab_tar,
                   vocab_src,
                   embedder_tar,
                   embedder_src,
                   gen_src_wrapper,
                   encoder,
                   decoder_src,
                   denoising=False,
                   multi_gpu=args.multi_gpu)

    "optimizers"
    s2s_optimizer = torch.optim.Adam(s2s_param, lr=args.lr)
    t2t_optimizer = torch.optim.Adam(t2t_param, lr=args.lr)
    s2t_optimizer = torch.optim.Adam(s2t_param, lr=args.lr)
    t2s_optimizer = torch.optim.Adam(t2s_param, lr=args.lr)

    def save_model():
        # save embedder
        if args.embed_src == "":
            embedder_src.save_weight(args.save_path + "/embed_src.bin")
        if args.embed_tar == "":
            embedder_tar.save_weight(args.save_path + "/embed_tar.bin")

        # save generator
        if args.gen_src == "":
            gen_src_wrapper.save_weight(args.save_path + "/gen_src.bin")
        if args.gen_tar == "":
            gen_tar_wrapper.save_weight(args.save_path + "/gen_tar.bin")

        # save encoder
        encoder.save_weight(args.save_path + "/encoder.bin")

        # save decoder
        decoder_src.save_weight(args.save_path + "/decoder_src.bin")
        decoder_tar.save_weight(args.save_path + "/decoder_tar.bin")

        # save optimizer

        print("all models saved")

    def train_step(mt, optimizer, src_sents, tar_sents):
        optimizer.zero_grad()
        loss = mt.get_loss(src_sents, tar_sents, train=True)
        res = loss.cpu().detach().item()  # batch loss before length normalisation
        loss.div(args.batch_size).backward()
        optimizer.step()
        return res

    def train_step_backtranslate(mt, optimizer, src_sents, max_ratio):
        tar_sents = mt.greedy(src_sents, max_ratio, mode=False)
        res = train_step(mt, optimizer, src_sents, tar_sents)
        return res

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cumulative_tgt_words = report_tgt_words = 0
    cumulative_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)
    while True:
        epoch += 1

        for src_sents, tgt_sents, src_mono_sents, tgt_mono_sents in batch_iter(
                train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            batch_size = len(src_sents)

            srclen = max(map(len, src_sents))
            tar_len = max(map(len, tgt_sents))
            print("SRCLEN {} TARLEN {}".format(srclen, tar_len))

            model = s2t_model
            # (batch_size)
            train_step(s2s_model, s2s_optimizer, src_sents, src_sents)
            print("finish s2s")
            train_step(t2t_model, t2t_optimizer, tgt_sents, tgt_sents)
            print("finish t2t")
            train_step(s2s_model, s2s_optimizer, src_mono_sents, src_sents)
            print("finish s2s mono")
            train_step(t2t_model, t2t_optimizer, tgt_mono_sents, tgt_sents)
            print("finish t2t mono")

            train_step(t2s_model, t2s_optimizer, tgt_sents, src_sents)
            print("finish t2s")
            loss = train_step(model, s2t_optimizer, src_sents, tgt_sents)
            print("finish s2t")

            train_step_backtranslate(s2t_model, s2t_optimizer, src_sents,
                                     (tar_len / srclen))
            print("finish s2t back")
            train_step_backtranslate(t2s_model, t2s_optimizer, tgt_sents,
                                     (srclen / tar_len))
            print("finish t2s back")
            train_step_backtranslate(s2t_model, s2t_optimizer, src_mono_sents,
                                     (tar_len / srclen))
            print("finish s2t back mono")
            train_step_backtranslate(t2s_model, t2s_optimizer, tgt_mono_sents,
                                     (srclen / tar_len))
            print("finish t2s back mono")
            os.system("nvidia-smi")
            report_loss += loss
            cum_loss += loss

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cumulative_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cumulative_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cumulative_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # The following code performs validation on the dev set and controls the learning schedule:
            # if the dev score is better than the last checkpoint, the current model is saved;
            # otherwise, we tolerate the degradation for up to `--patience` validation rounds.
            # Once patience is exhausted, we reload the previously saved best model (and the state
            # of the optimizer), halve the learning rate, and continue training. This repeats for
            # up to `--max-num-trial` times.
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cumulative_examples,
                       np.exp(cum_loss / cumulative_tgt_words),
                       cumulative_examples),
                    file=sys.stderr)

                cum_loss = cumulative_examples = cumulative_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = model.evaluate_ppl(
                    dev_data, batch_size=args.batch_size
                )  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' %
                      (train_iter, dev_ppl),
                      file=sys.stderr)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                save_model()

                # if is_better:
                #     patience = 0
                #     print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                #     model.save(model_save_path)

                #     # You may also save the optimizer's state
                # elif patience < int(args.patience):
                #     patience += 1
                #     print('hit patience %d' % patience, file=sys.stderr)

                #     if patience == int(args.patience):
                #         num_trial += 1
                #         print('hit #%d trial' % num_trial, file=sys.stderr)
                #         if num_trial == int(args.max_num_trail):
                #             print('early stop!', file=sys.stderr)
                #             exit(0)

                #         # decay learning rate, and restore from previously best checkpoint
                #         lr = lr * float(args.lr_decay)
                #         print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                #         # load model
                #         model_save_path

                #         print('restore parameters of the optimizers', file=sys.stderr)
                #         # You may also need to load the state of the optimizer saved before

                #         # reset patience
                #         patience = 0

                if epoch == int(args.max_epoch):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
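# The commented-out block above describes the usual patience-based schedule:
# tolerate a few validations without improvement, then decay the learning rate
# (and reload the best checkpoint), giving up after --max-num-trial restarts.
# A minimal standalone sketch of the learning-rate part of that idea (a
# hypothetical helper, not part of the original training loop; checkpoint
# reloading is omitted for brevity):
def decay_lr_on_plateau(optimizer, hist_valid_scores, valid_metric, state,
                        patience_limit, lr_decay=0.5):
    """Track dev scores, update `state` ({'patience': 0, 'trials': 0}), decay LR on plateau."""
    is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
    hist_valid_scores.append(valid_metric)
    if is_better:
        state['patience'] = 0  # new best score: reset patience
        return
    state['patience'] += 1
    if state['patience'] >= patience_limit:
        state['trials'] += 1  # one more restart used up
        state['patience'] = 0
        for group in optimizer.param_groups:
            group['lr'] *= lr_decay  # halve (by default) the learning rate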
Example #8
en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]

# going to print 0.0004326613965749648
print(FastVector.cosine_similarity(en_vector, zh_vector))

zh_words = set(zh_dictionary.word2id.keys())
en_words = set(en_dictionary.word2id.keys())
overlap = list(zh_words & en_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]

# form the training matrices
source_matrix, target_matrix = make_training_matrices(en_dictionary,
                                                      zh_dictionary,
                                                      bilingual_dictionary)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
en_dictionary.apply_transform(transform)

en_vector = en_dictionary["love"]
zh_vector = zh_dictionary["爱"]

# going to print 0.18727020978991674
print(FastVector.cosine_similarity(en_vector, zh_vector))

en_dictionary.export("cc.en.aligned.to.zh.vec")

embedding = gluonnlp.embedding.FastText.from_file('cc.en.aligned.to.zh.vec')
embedding.serialize('cc.en.300.aligned.to.zh.vec.npz')
Example #9
print "Readling Dictionary"
BI_DICT = codecs.open("o.s2t_f", "r").readlines()
BI_DICT = parse_BI(BI_DICT)
print "Readling Dictionary (END)"

# SRC_WORD = "昨天"
# TGT_WORD = "yesterday"
SRC_WORD = "钥匙"
TGT_WORD = "keys"
en_dictionary = FastVector(vector_file='en.emb.orig.vec')
other_dictionary = FastVector(vector_file='tizh.emb.orig.vec')

test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

# form the training matrices
print "Learning SVD"
source_matrix, target_matrix = make_training_matrices(other_dictionary,
                                                      en_dictionary, BI_DICT)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
other_dictionary.apply_transform(transform)
# zh
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)
# ti
SRC_WORD = "กุญ"
test_word(other_dictionary, en_dictionary, SRC_WORD, TGT_WORD)

print "Writing transform Qe out"
other_dictionary.export("Qe")
Example #10
if __name__ == "__main__":
    # load the datasets and perform split into training and test set
    data_dir = os.path.join(os.getcwd(), "expcode", "numerical_code")
    en_corpus = pickle.load(open(os.path.join(data_dir, 'english_vocab.pkl'),
                                 'rb'))[:100]  # CHANGE THIS WHEN WE HAVE DB
    fr_corpus = pickle.load(open(os.path.join(data_dir, 'french_vocab.pkl'),
                                 'rb'))[:100]  # CHANGE THIS WHEN WE HAVE DB

    # load the pretrained fastText embeddings and align them into a common space
    en_dict = FastVector(
        vector_file='/Users/williamst-arnaud/Downloads/cc.en.300.vec')
    fr_dict = FastVector(
        vector_file='/Users/williamst-arnaud/Downloads/cc.fr.300.vec')

    en_dict.apply_transform(
        '/Users/williamst-arnaud/Downloads/fastText_multilingual-master/alignment_matrices/en.txt'
    )
    fr_dict.apply_transform(
        '/Users/williamst-arnaud/Downloads/fastText_multilingual-master/alignment_matrices/fr.txt'
    )

    # number of items in dataset
    n = len(en_corpus)

    start = time.time()
    w, l, losses = train(en_corpus,
                         fr_corpus,
                         2 * 300,
                         lamb=1. / n,
                         nb_epochs=5 * n)
    print(time.time() - start)