Esempio n. 1
0
def _read_labeled_vectors(path):
    """Load a space-delimited word-vector file as ``(raw, vectors)``.

    ``raw`` is the whole file as a 2-D string array whose first column is
    the word; ``vectors`` is the remaining columns converted to floats.
    """
    # Builtin ``str``/``float`` replace the ``np.str``/``np.float`` aliases,
    # which no longer exist in current NumPy releases.
    # ``ndmin=2`` keeps a single-row file two-dimensional so the column
    # slicing below still works.
    raw = np.loadtxt(path, dtype=str, delimiter=' ', ndmin=2)
    return raw, raw[:, 1:].astype(float)


def _write_labeled_vectors(path, labels, vectors):
    """Write ``labels`` (one column) followed by ``vectors``, space-delimited."""
    table = np.column_stack((labels, vectors.astype(str)))
    np.savetxt(path, table, fmt="%s", delimiter=' ')


def project_vectors(origForeignVecFile,
                    origEnVecFile,
                    subsetEnVecFile,
                    subsetForeignVecFile,
                    outputEnFile,
                    outputForeignFile,
                    NUMCC=40):
    """Fit CCA on dictionary word vectors and project both full vocabularies.

    The vectors of the dictionary (translation-pair) words are fed to CCA;
    the fitted model then transforms the complete English and foreign
    matrices, and the row-normalized projections are written out with the
    original word column in front.

    Every input file is space-delimited with the word in the first column
    and the vector components in the remaining columns.

    :param origForeignVecFile: path of the full foreign-language vectors
    :param origEnVecFile: path of the full English vectors
    :param subsetEnVecFile: path of the English vectors of dictionary words
    :param subsetForeignVecFile: path of the foreign vectors of dictionary
        words (passed to ``fit`` together with ``subsetEnVecFile``)
    :param outputEnFile: destination of the projected English vectors
    :param outputForeignFile: destination of the projected foreign vectors
    :param NUMCC: number of CCA components (``n_components``)
    """
    # Read the data; the leading word column is kept only for the output.
    en_raw, origEnVecs = _read_labeled_vectors(origEnVecFile)
    foreign_raw, origForeignVecs = _read_labeled_vectors(origForeignVecFile)
    _, subsetEnVecs = _read_labeled_vectors(subsetEnVecFile)
    _, subsetForeignVecs = _read_labeled_vectors(subsetForeignVecFile)

    # Pre-processing: normalize each row of the training subsets.
    # NOTE(review): the full matrices are transformed without this
    # normalization (it was commented out in the original) -- confirm that
    # the asymmetry is intended.
    subsetEnVecs = preprocessing.normalize(subsetEnVecs)
    subsetForeignVecs = preprocessing.normalize(subsetForeignVecs)

    # Train CCA on the dictionary pairs.
    cca = CCA(n_components=NUMCC)
    cca.fit(subsetEnVecs, subsetForeignVecs)
    # Single-argument call form prints identically on Python 2 and 3.
    print(cca.get_params())

    # Project the full vocabularies, then normalize each projected row.
    X_c, Y_c = cca.transform(origEnVecs, origForeignVecs)
    _write_labeled_vectors(outputEnFile, en_raw[:, :1],
                           preprocessing.normalize(X_c))
    _write_labeled_vectors(outputForeignFile, foreign_raw[:, :1],
                           preprocessing.normalize(Y_c))
    print("work over!")
Esempio n. 2
0
def CCA_project_vectors(args,
                        src_dico,
                        tgt_dico,
                        src_full,
                        tgt_full,
                        src_train,
                        tgt_train,
                        NUM_dim=100):
    """Fit CCA on the training pairs and export the projected embeddings.

    The model is fitted on ``src_train`` / ``tgt_train`` and then used to
    transform ``src_full`` / ``tgt_full``. Each projection is passed through
    ``utils.norm_embeddings`` and written with ``utils.export_embeddings``
    to ``output/<src_lang>-<tgt_lang>/projected.<lang>``; the directory is
    created when it does not exist yet.

    :param args: object providing the ``src_lang`` and ``tgt_lang`` attributes
    :param src_dico: indexable; element 0 is handed to the source export
    :param tgt_dico: indexable; element 0 is handed to the target export
    :param src_full: full source matrix to transform
    :param tgt_full: full target matrix to transform
    :param src_train: source training matrix for ``fit``
    :param tgt_train: target training matrix for ``fit``
    :param NUM_dim: number of CCA components (``n_components``)
    """
    print('Exporting embeddings...')
    out_dir = "output/{}-{}/".format(args.src_lang, args.tgt_lang)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    model = CCA(n_components=NUM_dim)
    print("Fitting...")
    model.fit(src_train, tgt_train)
    print(model.get_params())

    projected_src, projected_tgt = model.transform(src_full, tgt_full)
    src_out = utils.norm_embeddings(projected_src)
    tgt_out = utils.norm_embeddings(projected_tgt)

    print("Exporting embeddings...")
    # Source side first, then target, each under its own language suffix.
    targets = (
        (src_dico, src_out, args.src_lang),
        (tgt_dico, tgt_out, args.tgt_lang),
    )
    for dico, embeddings, lang in targets:
        utils.export_embeddings(dico[0], embeddings,
                                out_dir + 'projected.{}'.format(lang))
    print("work over!")
Esempio n. 3
0
#Y = N.array(Z)[:,3:6].tolist()
# Every report line below uses a single pre-formatted argument, so the script
# runs on Python 3 and still emits exactly what the Python 2 print statements
# did (no space is inserted after the label's trailing newline).
print("X=\n%s" % (X,))
print("Y=\n%s" % (Y,))


# Correlation matrices computed from the transposed data blocks.
# NOTE(review): presumably N is numpy, so each column of X / Y is treated as
# one variable -- confirm against the import outside this fragment.
Rx = N.corrcoef(X.T)
Ry = N.corrcoef(Y.T)

# Fit a CCA model with a single component on the two blocks.
cca = CCA(n_components=1)
cca.fit(X, Y)

print("Rx:\n%s" % (Rx,))
print("Ry:\n%s" % (Ry,))
print("x_weights:\n%s" % (cca.x_weights_,))
print("y_weights:\n%s" % (cca.y_weights_,))
print("x_loadings:\n%s" % (cca.x_loadings_,))
print("y_loadings:\n%s" % (cca.y_loadings_,))
print("x_scores_:\n%s" % (cca.x_scores_,))
print("y_scores_:\n%s" % (cca.y_scores_,))

# Manually derived loadings: correlation matrix times the fitted weights,
# printed next to the model's own loadings above (presumably for comparison).
loadings_man_x = N.dot(Rx, cca.x_weights_)
loadings_man_y = N.dot(Ry, cca.y_weights_)
print("loadings_man_x:\n%s" % (loadings_man_x,))
print("loadings_man_y:\n%s" % (loadings_man_y,))

print(cca.get_params())

#X_c, Y_c = cca.transform(X, Y)
#print "\n\n", X_c,"\n\n", Y_c