def _load_labelled_vectors(path):
    """Read a space-delimited embedding file and split labels from vectors.

    :param path: file to read; every row is expected to hold a label in
        column 0 followed by the vector components.
    :return: tuple ``(labels, vectors)``. ``labels`` is column 0 kept as a
        2-D string array so it can be column-stacked back onto projected
        vectors; ``vectors`` is the remaining columns converted to float.
    """
    # ndmin=2 keeps a one-row file two-dimensional so the column slicing
    # below does not fail on it.
    table = np.loadtxt(path, dtype=str, delimiter=' ', ndmin=2)
    return table[:, :1], table[:, 1:].astype(float)


def project_vectors(origForeignVecFile, origEnVecFile, subsetEnVecFile,
                    subsetForeignVecFile, outputEnFile, outputForeignFile,
                    NUMCC=40):
    """Fit CCA on dictionary vectors and write projected bilingual vectors.

    The dictionary (subset) vectors of both languages are fed to CCA; the
    fitted model is then applied to the full vector matrices and the
    projected, normalised vectors are written out with their original labels.

    :param origForeignVecFile: file with the full foreign-language vectors.
    :param origEnVecFile: file with the full English vectors.
    :param subsetEnVecFile: file with the English vectors of dictionary
        entries (training data).
    :param subsetForeignVecFile: file with the foreign-language vectors of
        dictionary entries (training data).
    :param outputEnFile: destination for the projected English vectors.
    :param outputForeignFile: destination for the projected foreign vectors.
    :param NUMCC: value passed to ``CCA(n_components=...)``.
    """
    # Load the data: strip the leading word of each row, keep the vectors.
    # The labels of the two full files are kept for the output; the labels
    # of the subset files are not needed.
    en_labels, origEnVecs = _load_labelled_vectors(origEnVecFile)
    foreign_labels, origForeignVecs = _load_labelled_vectors(origForeignVecFile)
    _, subsetEnVecs = _load_labelled_vectors(subsetEnVecFile)
    _, subsetForeignVecs = _load_labelled_vectors(subsetForeignVecFile)

    # Pre-processing: normalise each row of the training vectors. The full
    # matrices are deliberately passed to the model as loaded.
    subsetEnVecs = preprocessing.normalize(subsetEnVecs)
    subsetForeignVecs = preprocessing.normalize(subsetForeignVecs)

    # Train CCA on the dictionary pairs.
    cca = CCA(n_components=NUMCC)
    cca.fit(subsetEnVecs, subsetForeignVecs)
    print(cca.get_params())

    # Project the full vocabularies with the fitted model.
    X_c, Y_c = cca.transform(origEnVecs, origForeignVecs)

    # Normalise the projections, re-attach the labels and save as text.
    outputs = (
        (outputEnFile, en_labels, X_c),
        (outputForeignFile, foreign_labels, Y_c),
    )
    for out_path, labels, projected in outputs:
        normalised = preprocessing.normalize(projected)
        rows = np.column_stack((labels, normalised.astype(str)))
        np.savetxt(out_path, rows, fmt="%s", delimiter=' ')

    print("work over!")
def CCA_project_vectors(args, src_dico, tgt_dico, src_full, tgt_full, src_train, tgt_train, NUM_dim=100):
    """Fit CCA on the training pairs and export projected embeddings.

    The model is fitted on ``src_train`` / ``tgt_train``, applied to
    ``src_full`` / ``tgt_full``, and both results are passed through
    ``utils.norm_embeddings`` before being handed to
    ``utils.export_embeddings``.

    :param args: object providing ``src_lang`` and ``tgt_lang``; both are
        used to build the output directory and file names.
    :param src_dico: indexable object; its element 0 is passed to the
        exporter together with the source embeddings.
    :param tgt_dico: same as ``src_dico`` for the target side.
    :param src_full: source embeddings to project.
    :param tgt_full: target embeddings to project.
    :param src_train: source-side data used to fit the model.
    :param tgt_train: target-side data used to fit the model.
    :param NUM_dim: value passed to ``CCA(n_components=...)``.
    """
    print('Exporting embeddings...')

    # Results go to output/<src>-<tgt>/, created on first use.
    out_dir = "output/{}-{}/".format(args.src_lang, args.tgt_lang)
    dir_missing = not os.path.exists(out_dir)
    if dir_missing:
        os.makedirs(out_dir)

    model = CCA(n_components=NUM_dim)
    print("Fitting...")
    model.fit(src_train, tgt_train)
    print(model.get_params())

    projected_src, projected_tgt = model.transform(src_full, tgt_full)
    src_out = utils.norm_embeddings(projected_src)
    tgt_out = utils.norm_embeddings(projected_tgt)

    print("Exporting embeddings...")
    utils.export_embeddings(
        src_dico[0],
        src_out,
        out_dir + 'projected.{}'.format(args.src_lang),
    )
    utils.export_embeddings(
        tgt_dico[0],
        tgt_out,
        out_dir + 'projected.{}'.format(args.tgt_lang),
    )
    print("work over!")
# Fit a one-component CCA on X and Y and print the fitted attributes next to
# loadings recomputed by hand from the correlation matrices.
# NOTE(review): X, Y, CCA and the `N` alias (presumably numpy) are defined
# earlier in the script, outside this excerpt.
# Alternative Y kept from the original script: Y = N.array(Z)[:, 3:6].tolist()

# Each label and value is printed with its own single-argument print call so
# the output is the same text under Python 2 and Python 3.
for label, value in (("X=", X), ("Y=", Y)):
    print(label)
    print(value)

# Correlation matrices of the columns of X and of Y.
Rx = N.corrcoef(X.T)
Ry = N.corrcoef(Y.T)

cca = CCA(n_components=1)
cca.fit(X, Y)

# Manual loadings: correlation matrix times the fitted weights, printed
# after the model's own loadings for comparison.
loadings_man_x = N.dot(Rx, cca.x_weights_)
loadings_man_y = N.dot(Ry, cca.y_weights_)

# NOTE(review): x_scores_ / y_scores_ may be unavailable in newer
# scikit-learn releases -- confirm against the installed version.
for label, value in (
    ("Rx:", Rx),
    ("Ry:", Ry),
    ("x_weights:", cca.x_weights_),
    ("y_weights:", cca.y_weights_),
    ("x_loadings:", cca.x_loadings_),
    ("y_loadings:", cca.y_loadings_),
    ("x_scores_:", cca.x_scores_),
    ("y_scores_:", cca.y_scores_),
    ("loadings_man_x:", loadings_man_x),
    ("loadings_man_y:", loadings_man_y),
):
    print(label)
    print(value)

print(cca.get_params())

# The original script also carried a disabled step that projected the data
# with cca.transform(X, Y) and printed both results.