Beispiel #1
0
def main():
    """Entry point: prepare word vectors (mode 0) or run a GAN-based
    embedding-mapping experiment (modes 1, 2 and 7).

    Modes:
        0 -- preprocessing via util.Utils
        1 -- train with TrainerThu
        2 -- train with TrainerFb
        7 -- load raw word2vec files and (optionally) center them
    Any other mode raises ValueError.
    """
    params = parse_arguments()

    if params.mode == 0:
        # Mode 0: preprocessing / vector preparation only.
        u = util.Utils(params)
        u.run()

    else:
        print("Reading embedding numpy files...")
        # Only the training modes (1, 2) run on the GPU.
        use_cuda = params.mode in [1, 2]
        src_emb_array, tgt_emb_array = util.load_npy_two(params.data_dir, 'src.npy', 'tgt.npy')
        print("Done.")
        print("Converting arrays to embedding layers...")
        src_emb = util.convert_to_embeddings(src_emb_array, use_cuda)
        tgt_emb = util.convert_to_embeddings(tgt_emb_array, use_cuda)
        print("Done.")

        if params.center_embeddings > 0:
            util.center_embeddings(src_emb.weight.data)
            util.center_embeddings(tgt_emb.weight.data)

        # BUGFIX: the original used a separate `if` for mode 1 followed by an
        # `if`/`elif`/`else` chain for modes 2/7, so mode 1 also fell through
        # to the final `else` and raised. A single chain fixes that.
        if params.mode == 1:            # GAN   tune
            t = TrainerThu(params)
            g = t.train(src_emb, tgt_emb)

        elif params.mode == 2:          # GAN   tune
            t = TrainerFb(params)
            g = t.train(src_emb, tgt_emb)

        elif params.mode == 7:
            we1 = WordEmbeddings()
            we1.load_from_word2vec_new('/data/data/' + 'wiki.en.vec')
            we2 = WordEmbeddings()
            we2.load_from_word2vec_new('/data/data/' + 'wiki.zh.vec')
            if params.center_embeddings > 0:
                we1.center_embeddings()
                we2.center_embeddings()

        else:
            # BUGFIX: raising a plain string is a TypeError in Python 3;
            # raise a proper exception instead.
            raise ValueError("Invalid flag!")
Beispiel #2
0
def main():
    """Entry point: prepare word vectors (mode 0), train a generator
    (mode 1), or evaluate a previously-saved generator (mode 2).

    Raises:
        ValueError: if ``params.mode`` is not 0, 1 or 2.
    """
    params = parse_arguments()

    if params.mode == 0:  # Preparing word vectors
        u = util.Utils(params)
        u.run()
        return

    # Fix all RNG seeds so training / evaluation runs are reproducible.
    np.random.seed(42)
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)

    src_emb_array, tgt_emb_array = util.load_npy_two(params.data_dir,
                                                     'src.npy', 'tgt.npy')
    src_emb = util.convert_to_embeddings(src_emb_array)
    tgt_emb = util.convert_to_embeddings(tgt_emb_array)
    # L2-normalise every embedding row in place.
    src_emb.weight.data /= src_emb.weight.data.norm(2, dim=1).view((-1, 1))
    tgt_emb.weight.data /= tgt_emb.weight.data.norm(2, dim=1).view((-1, 1))
    evaluator = Evaluator(params, tgt_emb.weight.data, [1, 5, 10],
                          ['nn', 'csls'])

    if params.mode == 1:
        t = Trainer(params)
        g = t.train(src_emb, tgt_emb, evaluator)

    elif params.mode == 2:
        # NOTE(review): g_input_size / g_output_size are assumed to be
        # module-level constants defined elsewhere in this file — confirm.
        model_file_path = path.join(params.model_dir, params.model_file_name)
        g = Generator(input_size=g_input_size, output_size=g_output_size)
        g.load_state_dict(torch.load(model_file_path, map_location='cpu'))

        if torch.cuda.is_available():
            g = g.cuda()

        all_precisions = evaluator.get_all_precisions(g(src_emb.weight).data)
        print(json.dumps(all_precisions))

    else:
        # BUGFIX: raising a plain string is a TypeError in Python 3.
        raise ValueError("Invalid flag!")
Beispiel #3
0
def main():
    """Entry point: prepare word vectors (mode 0), train (mode 1), or
    evaluate a saved generator, optionally with a context/attention
    model (mode 2).

    Raises:
        ValueError: if ``params.mode`` is not 0, 1 or 2.
        FileNotFoundError: in mode 2 with context, if the pre-computed
            k-NN pickle is missing.
    """
    params = parse_arguments()
    params = construct_file_args(params)

    if params.context == 1:
        # Context vectors are concatenated to the word vectors, doubling
        # the generator's input width.
        params.g_input_size *= 2

    if params.mode == 0:
        u = util.Utils(params)
        u.run()

    else:
        print("Reading embedding numpy files...", end='')
        emb_str = ['src', 'tgt']
        file_ext = '.npy'

        # Only training (mode 1) runs on the GPU.
        use_cuda = params.mode == 1

        emb_arrays = [
            util.load_npy_one(params.data_dir,
                              e + '_' + params.suffix_str + file_ext)
            for e in emb_str
        ]
        print("Done.")

        print("Converting arrays to embedding layers...", end='')
        embs = [
            util.convert_to_embeddings(arr, use_cuda) for arr in emb_arrays
        ]
        print("Done.")

        if params.center_embeddings > 0:
            for e in embs:
                util.center_embeddings(e.weight.data)

        if params.mode == 1:
            t = Trainer(params)
            g = t.train(embs[0], embs[1])

        elif params.mode == 2:
            params = _get_eval_params(params)
            evaluator = Evaluator(params, embs[0].weight.data,
                                  embs[1].weight.data)

            model_file_path = os.path.join(params.model_dir,
                                           params.model_file_name)
            attn_file_path = os.path.join(params.model_dir,
                                          params.attn_file_name)

            g = Generator(input_size=params.g_input_size,
                          hidden_size=params.g_hidden_size,
                          output_size=params.g_output_size,
                          hyperparams=get_hyperparams(params, disc=False))
            g.load_state_dict(torch.load(model_file_path, map_location='cpu'))
            evaluator.W = g.map1.weight.data

            if params.context > 0:
                try:
                    with open('full_knn_list_' + params.suffix_str + '.pkl',
                              'rb') as f:
                        knn_list = pickle.load(f)
                except FileNotFoundError:
                    # BUGFIX: the original only printed and fell through,
                    # which then raised a NameError on `knn_list` below.
                    # Report the problem and propagate the real error.
                    print("k-nn file not found!")
                    raise
                knn_emb = util.convert_to_embeddings(knn_list, use_cuda=False)
                attn = Attention(atype=params.atype,
                                 input_size=2 * params.g_input_size,
                                 hidden_size=params.a_hidden_size)

                # Only the learned attention variants have weights to load.
                if params.atype in ['mlp', 'bilinear']:
                    attn.load_state_dict(
                        torch.load(attn_file_path, map_location='cpu'))

                indices = torch.arange(params.top_frequent_words).type(
                    torch.LongTensor)
                mapped_src_emb = g(
                    construct_input(knn_emb,
                                    indices,
                                    embs[0],
                                    attn,
                                    params.atype,
                                    context=params.context)).data

            else:
                mapped_src_emb = g(embs[0].weight).data

            evaluator.get_all_precisions(mapped_src_emb)

        else:
            # BUGFIX: raising a plain string is a TypeError in Python 3.
            raise ValueError("Invalid flag!")
Beispiel #4
0
def main():
    """Entry point: prepare word vectors (mode 0), train a cycle-GAN style
    dual mapper (mode 1), or evaluate a saved generator (mode 2).

    Raises:
        ValueError: if ``params.mode`` is not 0, 1 or 2.
    """
    params = parse_arguments()

    if params.mode == 0:
        u = util.Utils(params)
        u.run()

    else:
        print("Reading embedding numpy files...")
        # NOTE(review): mode 0 never reaches this branch, so listing it here
        # is redundant; kept as-is to preserve the original condition.
        use_cuda = False
        if params.mode in [0, 1, 2, 3, 4, 5, 6]:
            use_cuda = True
        src_emb_array, tgt_emb_array = util.load_npy_two(
            params.data_dir, 'src.npy', 'tgt.npy')
        print("Done.")
        print("Converting arrays to embedding layers...")
        src_emb = util.convert_to_embeddings(src_emb_array, use_cuda)
        tgt_emb = util.convert_to_embeddings(tgt_emb_array, use_cuda)
        print("Done.")

        if params.center_embeddings > 0:
            util.center_embeddings(src_emb.weight.data)
            util.center_embeddings(tgt_emb.weight.data)

        if params.mode == 1:  # cycle GAN tune2
            t = TrainerDual(params)
            g = t.train(src_emb, tgt_emb)

        elif params.mode == 2:
            params = _get_eval_params(params)
            evaluator = Evaluator(params, src_emb.weight.data,
                                  tgt_emb.weight.data)

            model_file_path = os.path.join(params.model_dir,
                                           params.model_file_name)
            # NOTE(review): g_input_size / g_output_size are assumed to be
            # module-level constants defined elsewhere — confirm.
            g = Generator(input_size=g_input_size, output_size=g_output_size)
            g.load_state_dict(torch.load(model_file_path, map_location='cpu'))

            mapped_src_emb = g(src_emb.weight).data
            evaluator.get_all_precisions(mapped_src_emb)

        else:
            # BUGFIX: raising a plain string is a TypeError in Python 3.
            raise ValueError("Invalid flag!")
Beispiel #5
0
def main():
    """Entry point: prepare word vectors (mode -1), export transformed
    embeddings from trained autoencoders (mode 1), or train a BiAAE
    model (mode 0).

    Raises:
        ValueError: if ``params.mode`` is none of -1, 0, 1.
    """
    params = parse_arguments()

    if params.mode == -1:
        u = util.Utils(params)
        u.run()

    else:
        if params.mode == 1:
            # TODO(review): `file1` is referenced below but never defined in
            # this function — this raises NameError at runtime. The matching
            # checkpoint path (analogous to file2) must be supplied.
            file2 = params.data_dir + '../src/tune32/best/' + 'G_BA_seed_430_mf_75000_lr_0.1_p@1_37.933.t7'

            we1 = WordEmbeddings()
            we1.load_from_word2vec_new('/data/data/' + 'wiki.en.vec')
            we2 = WordEmbeddings()
            we2.load_from_word2vec_new('/data/data/' + 'wiki.zh.vec')
            if params.center_embeddings > 0:
                we1.center_embeddings()
                we2.center_embeddings()

            # Restore the two trained autoencoders from their checkpoints.
            X_AE = AE(params).cuda()
            Y_AE = AE(params).cuda()
            X_AE.load_state_dict(torch.load(file1))
            Y_AE.load_state_dict(torch.load(file2))

            src = Variable(torch.from_numpy(we1.vectors)).cuda().float()
            tgt = Variable(torch.from_numpy(we2.vectors)).cuda().float()

            # Encode both languages into the shared latent space.
            X_Z = X_AE.encode(src).data
            Y_Z = Y_AE.encode(tgt).data
            '''
            mstart_time=timer()
            for method in ['nn','csls_knn_10']:
                results = get_word_translation_accuracy(
                    'en', we1.word2id, X_Z,
                    'zh', we2.word2id, Y_Z,
                    method=method,
                    small_data=False
                )
                acc = results[0][1]
                print('{} takes {:.2f}s'.format(method, timer() - mstart_time))
                print('Method:{} score:{:.4f}'.format(method,acc))   

            '''
            we1.transformed_vectors = X_Z.cpu().numpy()
            we2.transformed_vectors = Y_Z.cpu().numpy()

            we1.save_transformed_vectors(params.data_dir + 'transformed' +
                                         '.' + 'en')
            we2.save_transformed_vectors(params.data_dir + 'transformed' +
                                         '.' + 'zh')

            exit()

        print("Reading embedding numpy files...")
        # Only training (mode 0) runs on the GPU.
        use_cuda = False
        if params.mode in [0]:
            use_cuda = True
        src_emb_array, tgt_emb_array = util.load_npy_two(
            params.data_dir, 'src.npy', 'tgt.npy')
        print("Done.")
        print("Converting arrays to embedding layers...")
        src_emb = util.convert_to_embeddings(src_emb_array, use_cuda)
        tgt_emb = util.convert_to_embeddings(tgt_emb_array, use_cuda)
        print("Done.")

        if params.center_embeddings > 0:
            util.center_embeddings(src_emb.weight.data)
            util.center_embeddings(tgt_emb.weight.data)

        if params.mode == 0:  # BiAAE-new  tune
            t = TrainerAAE(params)
            g = t.train(src_emb, tgt_emb)

        else:
            # BUGFIX: raising a plain string is a TypeError in Python 3.
            raise ValueError("Invalid flag!")
Beispiel #6
0
def main():
    """Entry point: prepare word vectors (mode 0), train on subword/word
    embeddings (mode 1), or evaluate a saved generator with a k-NN
    attention context (mode 2).

    Raises:
        ValueError: if ``params.mode`` is not 0, 1 or 2.
    """
    params = parse_arguments()

    if params.mode == 0:
        u = util.Utils(params)
        u.run()
    else:
        print("Reading embedding numpy files...")
        # Only training (mode 1) runs on the GPU.
        use_cuda = False
        if params.mode == 1:
            use_cuda = True

        src = params.src_lang
        tgt = params.tgt_lang

        suffix_str = src + '_' + tgt

        src_data = util.load_subword_embeddings(
            os.path.join(params.data_dir, params.src_file))
        tgt_data = util.load_word_embeddings(
            os.path.join(params.data_dir, params.tgt_file))
        print("Done.")

        if params.center_embeddings > 0:  # centering
            src_data['E'].center()
            tgt_data['E'].center()

        if params.mode == 1:
            # Memorize the original word embeddings before training
            # transforms them.
            src_data['vecs'].copy_(src_data['F'](src_data['seqs'],
                                                 src_data['E'],
                                                 transform=False).data)
            t = Trainer(params)
            g = t.train(src_data, tgt_data)

        elif params.mode == 2:
            params = _get_eval_params(params)

            model_file_path = os.path.join(params.model_dir,
                                           params.model_file_name)
            g = Generator(input_size=params.g_input_size,
                          hidden_size=params.g_hidden_size,
                          output_size=params.g_output_size,
                          hyperparams=get_hyperparams(params, disc=False))
            g.load_state_dict(torch.load(model_file_path, map_location='cpu'))

            try:
                knn_list = pickle.load(
                    open('full_knn_list_' + suffix_str + '.pkl', 'rb'))
            except FileNotFoundError:
                # NOTE(review): this only warns and falls through, so
                # `knn_list` is undefined below when the file is missing —
                # consider re-raising here.
                print("k-nn file not found!")
            knn_emb = util.convert_to_embeddings(knn_list, use_cuda=False)

            attn = Attention(atype=params.atype)
            indices = torch.arange(params.top_frequent_words).type(
                torch.LongTensor)

            # TODO(review): `src_emb` and `evaluator` are never defined in
            # this function (this mode was apparently copied from a sibling
            # script that built them) — this raises NameError at runtime.
            mapped_src_emb = g(src_emb.weight).data
            evaluator.get_all_precisions(mapped_src_emb)

        else:
            # BUGFIX: raising a plain string is a TypeError in Python 3.
            raise ValueError("Invalid flag!")