Example #1
from sklearn.cross_decomposition import CCA
import numpy as np


def find_correlation_cca_method1(signal, reference_signals, n_components=2):
    r"""
    Perform canonical correlation analysis (CCA)
    Reference: https://github.com/aaravindravi/Brain-computer-interfaces/blob/master/notebook_12_class_cca.ipynb

    Args:
        signal : ndarray, shape (channel,time)
            Input signal in time domain
        reference_signals : ndarray, shape (len(flick_freq),2*num_harmonics,time)
            Required sinusoidal reference templates corresponding to the flicker frequency for SSVEP classification
        n_components : int, default: 2
            number of components to keep (for sklearn.cross_decomposition.CCA)
    Returns:
        result : array, size: len(flick_freq)
            Maximum canonical correlation with each reference signal
    Dependencies:
        CCA : sklearn.cross_decomposition.CCA
        np : numpy package
    """

    cca = CCA(n_components)
    corr = np.zeros(n_components)
    result = np.zeros(reference_signals.shape[0])
    for freq_idx in range(0, reference_signals.shape[0]):
        cca_x = signal.T
        cca_y = np.squeeze(reference_signals[freq_idx, :, :]).T
        cca.fit(cca_x, cca_y)
        a, b = cca.transform(cca_x, cca_y)
        for ind_val in range(0, n_components):
            corr[ind_val] = np.corrcoef(a[:, ind_val], b[:, ind_val])[0, 1]
        result[freq_idx] = np.max(corr)
    return result
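A minimal usage sketch (not part of the original snippet; the sampling rate, sizes, and flicker frequencies below are assumptions): build sinusoidal reference templates for two target frequencies and score a synthetic 8-channel signal.

import numpy as np

fs, n_samples, n_channels = 250, 1000, 8   # assumed sampling rate and sizes
t = np.arange(n_samples) / fs
flick_freqs = [10.0, 12.0]                 # hypothetical SSVEP target frequencies
num_harmonics = 2
# shape: (len(flick_freqs), 2*num_harmonics, time), as the docstring requires
reference_signals = np.stack([
    np.vstack([f(2 * np.pi * h * fr * t)
               for h in range(1, num_harmonics + 1)
               for f in (np.sin, np.cos)])
    for fr in flick_freqs
])
signal = np.random.randn(n_channels, n_samples)
print(find_correlation_cca_method1(signal, reference_signals))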
Example #2
    def map_spaces(self, algo, src_mapped_embed=None, trg_mapped_embed=None):

        # (There may be duplicates in self.shared_vocab_src and/or self.shared_vocab_trg,
        # swap_vocab can be used to only inspect one-to-one translations)
        src_embed = self.model_src[self.shared_vocab_src]
        trg_embed = self.model_trg[self.shared_vocab_trg]

        os.makedirs(algo, exist_ok=True)

        if algo == "procrustes":
            logging.info(
                "Calculating Rotation Matrix (Procrustes Problem) and applying it to first embedding"
            )
            #ortho, _ = orthogonal_procrustes(src_embed, trg_embed)
            # does the same as
            u, _, vt = np.linalg.svd(trg_embed.T.dot(src_embed))
            w = vt.T.dot(u.T)
            self.model_src.vectors.dot(w, out=self.model_src.vectors)

        elif algo == "noise":
            logging.info(
                "Calculating Rotation Matrix with noise aware algorithm and applying it to first embedding"
            )
            transform_matrix, alpha, clean_indices, noisy_indices = noise_aware(
                src_embed, trg_embed)
            #write cleaned vocab to file
            with open("vocab.clean.txt", 'w') as v:
                for src, trg in np.asarray(self.shared_vocab)[clean_indices]:
                    v.write("{}\t{}\n".format(src, trg))
            self.model_src.vectors.dot(transform_matrix,
                                       out=self.model_src.vectors)
            logging.info("Percentage of clean indices: {}".format(alpha))

        elif algo == "cca":
            logging.info(
                "Calculating Mapping based on CCA and applying it to both embeddings"
            )
            cca = CCA(n_components=100, max_iter=5000)
            cca.fit(src_embed, trg_embed)
            self.model_src.vectors, self.model_trg.vectors = cca.transform(
                self.model_src.vectors, self.model_trg.vectors)

        elif algo == "gcca":
            logging.info(
                "Calculating Mapping based on GCCA and applying it to both embeddings"
            )
            gcca = GCCA()
            gcca.fit([src_embed, trg_embed])
            transform_l = gcca.transform_as_list(
                (self.model_src.vectors, self.model_trg.vectors))
            # gcca computes positive and negative correlations (eigenvalues), sorted in ascending order.
            # We are only interested in the positive portion
            self.model_src.vectors = transform_l[0][:, 100:]
            self.model_trg.vectors = transform_l[1][:, 100:]

        # save transformed model(s)
        if src_mapped_embed:
            self.model_src.save(os.path.join(algo, src_mapped_embed))
        if trg_mapped_embed:
            self.model_trg.save(os.path.join(algo, trg_mapped_embed))
Example #3
def CCA_project_vectors(args,
                        src_dico,
                        tgt_dico,
                        src_full,
                        tgt_full,
                        src_train,
                        tgt_train,
                        NUM_dim=100):

    print('Exporting embeddings...')
    OutputDir = "output/{}-{}/".format(args.src_lang, args.tgt_lang)
    if not os.path.exists(OutputDir):
        os.makedirs(OutputDir)

    cca = CCA(n_components=NUM_dim)
    print("Fitting...")
    cca.fit(src_train, tgt_train)
    print(cca.get_params())
    X_c, Y_c = cca.transform(src_full, tgt_full)
    src_out, tgt_out = utils.norm_embeddings(X_c), utils.norm_embeddings(Y_c)
    print("Exporting embeddings...")
    utils.export_embeddings(src_dico[0], src_out,
                            OutputDir + 'projected.{}'.format(args.src_lang))
    utils.export_embeddings(tgt_dico[0], tgt_out,
                            OutputDir + 'projected.{}'.format(args.tgt_lang))
    print("work over!")
Example #4
class CCAFusion(TransformerMixin, BaseEstimator):
    def __init__(self, c1, c2):
        self.pipes = [c1, c2]
        self.max_iter = 500
        self.cca = None

    def fit(self, X, y=None, **fit_params):
        C = []
        n_components = None
        for pipe in self.pipes:
            c = pipe.fit_transform(X, y)
            if hasattr(c, 'toarray'):
                c = c.toarray()
            if n_components is None:
                n_components = c.shape[1]
            else:
                n_components = min(c.shape[1], n_components)
            C += [c]
        self.cca = CCA(n_components=n_components, max_iter=self.max_iter)
        self.cca.fit(*C)
        return self

    def transform(self, X, y=None):
        C = []
        for pipe in self.pipes:
            c = pipe.transform(X)
            if hasattr(c, 'toarray'):
                c = c.toarray()
            C += [c]
        return self.cca.transform(*C)[0]

    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X, y, **fit_params).transform(X, y)
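A hypothetical usage sketch for CCAFusion (the extractors and data below are assumptions, not from the original): fuse two sklearn feature extractors and project onto the shared canonical space.

import numpy as np
from sklearn.decomposition import PCA

X = np.random.randn(300, 40)                       # synthetic feature matrix
fusion = CCAFusion(PCA(n_components=5), PCA(n_components=8))
Z = fusion.fit_transform(X)                        # shape (300, 5), the smaller width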
    def train_eval(self, train_index, test_index, ignore_eval=False):
        normalized_train, normalized_test = normalize_by_train(self.source[train_index], self.source[test_index])

        if self.comp is not None:
            if self.use_scikit is not None:
                if self.use_scikit == 'cca':
                    dim_reduction = CCA(n_components=self.comp)
                else:
                    dim_reduction = PCA(n_components=self.comp)
                # fit cca according to train data only
                dim_reduction.fit(normalized_train, self.target[train_index])
                # convert source into lower dimensional representation
                normalized_train = dim_reduction.transform(normalized_train)
                normalized_test = dim_reduction.transform(normalized_test)
            else:
                _, wa, _ = tutorial_on_cca(normalized_train, self.target[train_index])
                normalized_train = normalized_train @ wa[:, :self.comp]
                normalized_test = normalized_test @ wa[:, :self.comp]

        model = self.build_model()


        model.fit(normalized_train, self.target[train_index])

        prediction = model.predict(normalized_test)

        # res_df.to_csv(f"{self.out_name}/res1.csv")
        if not ignore_eval:
            return self.evaluate_regression(prediction, test_index)
        else:
            return prediction
    def train_eval(self, train_index, test_index):
        train_source, test_source = self.source[train_index], self.source[
            test_index]
        train_target, test_target = self.target[train_index], self.target[
            test_index]

        train_source, test_source = scale_train_test(train_source, test_source)
        train_target, _ = scale_train_test(train_target, test_target)

        # rho, w_t, w_s, _ = evaluate_cca_wa_wb(train_target, train_source)
        cca = CCA(n_components=min(train_source.shape[1],
                                   train_target.shape[1]),
                  max_iter=1000)
        cca.fit(train_source, train_target)
        w_s = cca.x_rotations_
        w_t = cca.y_rotations_

        predicted_target = test_source @ w_s @ np.linalg.pinv(w_t)
        predicted_target = unscale_prediction(train_target, predicted_target)

        if self.target_encoder is not None:
            test_target = self.original_target[test_index]
            predicted_target = self.target_encoder.decode(
                torch.as_tensor(predicted_target)).detach().numpy()

        scores = np.zeros(self.original_target.shape[1])
        for i in range(self.original_target.shape[1]):
            predicted = predicted_target[:, i]
            actual = test_target[:, i]
            r, pval = pearsonr(predicted, actual)
            scores[i] = r

        return scores
Example #7
    def fit_cca(self, outfile=''):

        # fits linear CCA constraint and replaces pretrained name embeddings with CCA transformed embeddings

        self.load_embeddings()
        self.extract_pretrained_prototype_embeddings()

        items, vectors = zip(
            *[(k, v) for k, v in self.pretrained_prototype_embeddings.items()
              if k in self.exemplar_to_concept])
        concept_embs = Reach(vectors, items)

        train_vectors = []
        for x in items:
            train_vectors.append(self.train_embeddings[x])
        train_vectors = Reach.normalize(train_vectors)

        cca = CCA(n_components=self.train_embeddings.size, max_iter=10000)
        cca.fit(train_vectors, concept_embs.norm_vectors)

        # transform all name embeddings using the CCA mapping
        all_name_embeddings = deepcopy(self.pretrained_name_embeddings)
        items = [x for _, x in sorted(all_name_embeddings.indices.items())]
        projected_name_embeddings = cca.transform(
            all_name_embeddings.norm_vectors)
        new_name_embeddings = Reach(projected_name_embeddings, items)

        self.pretrained_name_embeddings = new_name_embeddings
        self.load_embeddings()

        if outfile:
            with open('{}_cca.p'.format(outfile), 'wb') as f:
                pickle.dump(cca, f)
class CCA_method:
    def __init__(self, n_latents):

        self._n_latents = n_latents
        self._cca = CCA(n_components=n_latents,
                        scale=False,
                        max_iter=10000,
                        tol=1e-8)
        self._Q = np.eye(self._n_latents)

    def fit(self, X, Y):

        # projections U'X, V'Y such that U'X and V'Y are maximally correlated
        self._cca.fit(X, Y)

        # get time-course of projected data
        UX, VY = self._cca.transform(X, Y)

        # learn linear regression VY = UX * Q
        # (Q will be optimal in least-squares sense)
        self._Q = np.linalg.pinv(UX).dot(VY)

    def predict(self, X):

        # transform source data into latent space
        UX = self._cca.transform(X)

        # predict latent activity in target space
        QUX = UX.dot(self._Q)

        # predict observed activity in target space
        Ypred = QUX.dot(self._cca.y_loadings_.T)

        return Ypred
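A small usage sketch for CCA_method (synthetic data; purely illustrative): fit on paired views, then predict one view from the other through the latent regression described in the comments.

import numpy as np

X = np.random.randn(200, 10)
Y = np.random.randn(200, 8)
model = CCA_method(n_latents=3)
model.fit(X, Y)
Y_pred = model.predict(X)                  # (200, 8) reconstruction in Y space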
Example #9
def visualize_with_cca(X, y, title):
    cca = CCA(n_components=2)
    cca.fit(X, y)
    X_cca = cca.transform(X)
    Xax = X_cca[:, 0]
    Yax = X_cca[:, 1]
    labels = (y > 0).astype(int)
    cdict = {0: 'red', 1: 'green'}
    labl = {0: 'home_loss', 1: 'home_win'}
    marker = {0: '*', 1: 'o'}
    alpha = {0: .3, 1: .5}

    fig, ax = plt.subplots(figsize=(7, 5))
    fig.patch.set_facecolor('white')

    for l in np.unique(labels):
        ix = np.where(labels == l)
        ax.scatter(Xax[ix],
                   Yax[ix],
                   c=cdict[l],
                   s=40,
                   label=labl[l],
                   marker=marker[l],
                   alpha=alpha[l])

    plt.xlabel("First Principal Component", fontsize=14)
    plt.ylabel("Second Principal Component", fontsize=14)
    plt.legend()
    plt.title(title)
    plt.show()
Example #10
def cca_classify(X_eeg_signals, Yi_frequency_signals):
    cca = CCA(1)
    corr_results = []
    for fr in range(0, Yi_frequency_signals.shape[0]):
        X = X_eeg_signals
        Yi = Yi_frequency_signals[fr, :, :]
        # compute the correlation between X and Yi
        cca.fit(X.T, np.squeeze(Yi).T)
        X_train_r, Yi_train_r = cca.transform(X.T, np.squeeze(Yi).T)
        corr = np.corrcoef(X_train_r[:, 0], Yi_train_r[:, 0])[0, 1]
        # obtain the correlation of X with each Yi
        corr_results.append(corr)
    if corr_results[np.argmax(corr_results)] > 0.50:
        # set a threshold
        global index
        global all_data
        classify_result = np.argmax(corr_results) + 1
        print(corr_results)
        index += 1
        # save the data
        TT = pd.DataFrame(X_eeg_signals)
        all_data = pd.concat([all_data, np.transpose(TT[1:9])])  # DataFrame.append was removed in pandas 2.0
        if index == 50:
            # save the data
            all_data = pd.DataFrame(all_data)
            all_data.to_csv('./j_8_all_data.csv', index=False)
        return classify_result
    else:
        return -1
Example #11
def doCCA(metrics, color):
    inp = np.array([metrics[m] for m in metricsInput2]).T.astype(float)
    out = np.array([metrics[m] for m in metricsOutput2]).T.astype(float)
    inp0 = np.zeros(len(metricsInput2))
    out0 = np.zeros(len(metricsOutput2))
    inp = np.vstack((inp, inp0))
    out = np.vstack((out, out0))
    cca = CCA(n_components=1, scale=False)
    cca.fit(inp, out)
    inp_cca = inp.dot(cca.x_weights_)
    out_cca = out.dot(cca.y_weights_)

    # Create linear regression object
    regr = linear_model.LinearRegression()
    # Train the model using the training sets
    regr.fit(inp_cca, out_cca)
    cca_regr = regr.predict(inp_cca)
    # The coefficients
    print('Coefficients: \n', regr.coef_)

    plt.scatter(inp_cca, out_cca, c=color)
    plt.plot(inp_cca, cca_regr, color=color, linewidth=0.5)

    logging.info('cca')
    logging.info(cca.x_rotations_)
    logging.info(cca.y_rotations_)
def cca(m1, m2, preprocessing=None):
    """
    Use CCA to decompose two views and plot result.

    Params:
        m1, m2: Each column is an example and each row is a feature.
        preprocessing: If None, we don't do pre-processing; if 'orth', we adjust center to 0 and perform PCA.
    """
    # Adjust means to be 0 and perform PCA.
    if preprocessing == "orth":
        # Zero means.
        m1 -= np.mean(m1, axis=1, keepdims=True)

        # print("m1=", np.sum(m1, axis=1))
        m2 -= np.mean(m2, axis=1, keepdims=True)

        # PCA.

    cca = CCA(n_components=3, max_iter=100)
    cca.fit(m1.T, m2.T)

    X_c = cca.transform(m1.T)

    fig, ax = plt.subplots()
    ax.set_title('Fig.2.(c)')
    # ax.set_color_cycle(['blue', 'green', 'red'])
    ax.set_prop_cycle('color', ['blue', 'red', 'green'])
    ax.plot(X_c)
    # ax.plot(Y_c)
    plt.show()
Example #13
def perform(arrs):
    blocks_cnt = sum([arr.shape[1] * arr.shape[2] for arr in arrs])
    X = np.zeros((blocks_cnt, 16))
    Y = np.zeros((blocks_cnt, 64))
    for c in range(3):
        for i, arr in enumerate(arrs):
            height = arr.shape[1]
            width = arr.shape[2]
            for y in range(height):
                for x in range(width):
                    X[y * width + x] = np.hstack(
                        [arr[c][y][x - 1][:][-1], arr[c][y - 1][x][-1][:]])
                    Y[y * width + x] = arr[c][y][x].ravel()

        X_mc = (X - X.mean()) / (X.std())
        Y_mc = (Y - Y.mean()) / (Y.std())

        ca = CCA(n_components=1)
        ca.fit(X_mc, Y_mc)

        print(f'\nColor {c}:')
        weights = ca.x_weights_.ravel()
        print(weights.shape)
        print(', '.join(map(str, weights)))
        print(ca.n_iter_)
def project_vectors(origForeignVecFile,
                    origEnVecFile,
                    subsetEnVecFile,
                    subsetForeignVecFile,
                    outputEnFile,
                    outputForeignFile,
                    NUMCC=40):
    '''
    Feed the dictionary vectors into CCA to obtain the projection vectors, then build the bilingual vectors.
    :param origForeignVecFile: foreign-language vector matrix
    :param origEnVecFile: English vector matrix
    :param subsetEnVecFile: English vector matrix of the dictionary entries
    :param subsetForeignVecFile: foreign-language vector matrix of the dictionary entries
    :param outputEnFile: the newly derived English word vectors
    :param outputForeignFile: the newly derived foreign word vectors
    :param truncRatio: training coefficient of the model
    '''
    '''Read in the data, strip the leading word tokens, and keep only the word vectors'''

    tmp = np.loadtxt(origEnVecFile, dtype=np.str, delimiter=' ')
    origEnVecs = tmp[:, 1:].astype(np.float)
    tmp2 = np.loadtxt(origForeignVecFile, dtype=np.str, delimiter=' ')
    origForeignVecs = tmp2[:, 1:].astype(np.float)
    tmp3 = np.loadtxt(subsetEnVecFile, dtype=np.str, delimiter=' ')
    subsetEnVecs = tmp3[:, 1:].astype(np.float)
    tmp4 = np.loadtxt(subsetForeignVecFile, dtype=np.str, delimiter=' ')
    subsetForeignVecs = tmp4[:, 1:].astype(np.float)
    '''Preprocess: normalize each row'''
    #origEnVecs=preprocessing.normalize(origEnVecs)
    #origForeignVecs=preprocessing.normalize(origForeignVecs)
    subsetEnVecs = preprocessing.normalize(subsetEnVecs)
    subsetForeignVecs = preprocessing.normalize(subsetForeignVecs)
    '''Train the CCA'''
    '''
    num = [NUMCC]
    regs = [1e-1]
    cca = rcca.CCACrossValidate(regs=regs,numCCs=num,kernelcca=False,cutoff=0.1)
    cca.train([subsetEnVecs, subsetForeignVecs])
    '''
    cca = CCA(n_components=NUMCC)
    cca.fit(subsetEnVecs, subsetForeignVecs)
    print cca.get_params()
    X_c, Y_c = cca.transform(origEnVecs, origForeignVecs)
    '''Generate the projected vectors'''
    #tmpOutput = rcca._listdot([d.T for d in [origEnVecs, origForeignVecs]], cca.ws)
    origEnVecsProjected = preprocessing.normalize(X_c)
    #origEnVecsProjected = preprocessing.scale(tmpOutput[0])
    origEnVecsProjected = np.column_stack(
        (tmp[:, :1], origEnVecsProjected.astype(np.str)))
    origForeignVecsProjected = preprocessing.normalize(Y_c)
    #origForeignVecsProjected = preprocessing.scale(tmpOutput[1])
    origForeignVecsProjected = np.column_stack(
        (tmp2[:, :1], origForeignVecsProjected.astype(np.str)))
    np.savetxt(outputEnFile, origEnVecsProjected, fmt="%s", delimiter=' ')
    np.savetxt(outputForeignFile,
               origForeignVecsProjected,
               fmt="%s",
               delimiter=' ')
    print "work over!"
Example #15
def cca_fit(X, Y):
    cca = CCA(n_components=1)
    cca.fit(X, Y)

    X = list(itertools.islice(X, 10))
    Y = list(itertools.islice(Y, 10))

    return cca.score(X, Y)
Example #16
def _CCA(data, graph, n):
    cca = CCA(n_components=n)
    adjacencyMatrix = createAffinityMatrix(graph)
    cca.fit(data, adjacencyMatrix)
    X_c, Y_c = cca.transform(data, adjacencyMatrix)

    writeCSV(X_c, 'CCA_X')
    writeCSV(Y_c, 'CCA_Y')
Example #17
def cca_d_h(d_var, h_var, components_num):
	cca=CCA(n_components=components_num, scale=True, max_iter=2000)
	cca.fit(d_var, h_var)
	d_c,h_c=cca.transform(d_var, h_var)  
	ah = np.linalg.inv((h_var.T).dot(h_var)).dot(h_var.T).dot(h_c)
	ad = np.linalg.inv((d_var.T).dot(d_var)).dot(d_var.T).dot(d_c)

	return d_c, h_c, ad, ah
Example #18
    def mean_canonical_correlations(scaled_features, df):

        cca = CCA(1)
        cca.fit(scaled_features, df.iloc[:, -1])
        X_c, Y_c = cca.transform(scaled_features, df.iloc[:, -1])

        mean_canonical_correlation = np.mean(X_c)
        return mean_canonical_correlation
def canonical_correlation_analysis(occurences_a, occurences_b):
    occurences_a = pd.Series(occurences_a, dtype="category")
    occurences_a = pd.get_dummies(occurences_a)
    occurences_b = pd.DataFrame(dict(occurences_b))  # DataFrame.from_items was removed from pandas
    occurences_b = pd.get_dummies(occurences_b)
    cca = CCA(n_components=1)
    cca.fit(occurences_a, occurences_b)
    return cca.score(occurences_a, occurences_b)
Example #20
def cca_analysis(X, Y, X_dev, Y_dev):
	cca = CCA(n_components=1, max_iter=2000)
	cca.fit(X, Y)
	X_dev_c, Y_dev_c = cca.transform(X_dev, Y_dev)

	corrcoef = np.corrcoef(X_dev_c.T, Y_dev_c.T)[0,1]

	return corrcoef
Example #21
def cca_score(X, Y):
    # Calculate the CCA score of the first component pair
    ca = CCA(n_components=1)
    ca.fit(X, Y)
    Xc, Yc = ca.transform(X, Y)
    score = np.corrcoef(Xc[:, 0], Yc[:, 0])

    return score[0][1]
Example #22
    def CCA_analysis(self, recordings_index, trial_index, brain_area):
        path = self.all_data_path + '/' + self.selected_recordings[
            recordings_index]

        #Prepare rates
        rates = self.convert_one_population_to_rates(recordings_index,
                                                     trial_index, brain_area).T

        #Prepare behavior
        trials = np.load(path + '/' + 'trials.intervals.npy')
        #Behavioral data
        mot_timestamps = np.load(path + '/' + 'face.timestamps.npy')
        mot_energy = np.load(path + '/' + 'face.motionEnergy.npy')

        beh_range = np.bitwise_and(
            mot_timestamps[:, 1] >= trials[trial_index][0],
            mot_timestamps[:, 1] <= trials[trial_index][1])
        #print(np.where(beh_range==True))
        #print(mot_timestamps[beh_range])
        beh_subset = mot_energy[beh_range]

        beh_subset_aligned = self.align_rate_and_behavior(
            beh_subset, rates[:, 0]).reshape(-1, 1)

        from sklearn.cross_decomposition import CCA

        cca = CCA(n_components=2)
        cca.fit(rates, beh_subset_aligned)
        X_train_r, Y_train_r = cca.transform(rates, beh_subset_aligned)
        print(X_train_r.shape)
        print(Y_train_r.shape)
        plt.scatter(X_train_r[:, 0],
                    Y_train_r[:],
                    label="train",
                    marker="*",
                    c="b",
                    s=50)

        plt.show()

        plt.scatter(X_train_r[:, 1],
                    Y_train_r[:],
                    label="train",
                    marker="*",
                    c="b",
                    s=50)

        plt.show()
        #rates_test=self.convert_one_population_to_rates(recordings_index,2,brain_area).T

        #X_test_r, Y_test_r = cca.transform(rates_test, beh_subset_aligned)
        #plt.scatter(X_test_r[:, 0], Y_test_r[:], label="test",
        #marker="^", c="b", s=50)

        #plt.show()

        print(beh_subset_aligned.shape)
        print(rates.shape)
def get_cca(chip_cors, rna_vec):
    Y_vec = np.array([[each_val / max(chip_cors) for each_val in chip_cors]])
    X_vec = np.array([[each_val / max(rna_vec) for each_val in rna_vec]])
    Y_vec = Y_vec.transpose()
    X_vec = X_vec.transpose()
    cca_obj = CCA(n_components=1)
    cca_obj.fit(X_vec, Y_vec)
    r_squared_canonical = cca_obj.score(X_vec, Y_vec)
    return r_squared_canonical
Example #24
def cca(src_dict, tgt_dict, bi_dict, dim=250):

    #with open('../data/seed_embedding.dat', 'wb') as f:
    #    pickle.dump(x, f)
    #    pickle.dump(y, f)
    cca_model = CCA(n_components=dim)
    src_mat, tgt_mat = make_training_matrices(src_dict, tgt_dict, bi_dict)
    cca_model.fit(src_mat, tgt_mat)
    return cca_model.transform(src_dict.embed, tgt_dict.embed)
Example #25
def do_cca(X, y, X_orig, n_components=10, permutations=10):
    '''
    Performs a CCA using components
    Projects scores back to edge space
    '''
    cca = CCA(n_components=n_components)
    cca.fit(X, y)

    # save the latent component correlation
    cca.mode_r = []
    for component in range(n_components):
        cca.mode_r.append(
            np.corrcoef(cca.x_scores_[:, component],
                        cca.y_scores_[:, component])[0, 1])

    # correlate behaviour with LC score
    cca.y_score_correlation = np.zeros((np.shape(y)[1], n_components))
    for component in range(n_components):
        for beh in range(np.shape(y)[1]):
            cca.y_score_correlation[beh, component] = np.corrcoef(
                y[:, beh].T, cca.y_scores_[:, component])[0, 1]

    # correlate edges with LC score
    cca.x_score_correlation = np.zeros((np.shape(X_orig)[1], n_components))
    for component in range(n_components):
        cca.x_score_correlation[:, component] = np.corrcoef(
            cca.x_scores_[:, component], X_orig.T)[1::, 0]

    # non parametric max T tests for component significance
    max_r = []
    for perm in tqdm(range(permutations)):
        #shuffle the behaviour for each permutation
        y_shuffle = shuffle(y)

        #perform a new CCA with shuffled data
        cca_perm = []
        cca_perm = CCA(n_components=n_components)
        cca_perm.fit(X, y_shuffle)

        # save the latent component correlation
        mode_r_perm = []
        for component in range(n_components):
            mode_r_perm.append(
                np.corrcoef(cca_perm.x_scores_[:, component],
                            cca_perm.y_scores_[:, component])[0, 1])

        # take the max r value
        max_r.append(np.max(mode_r_perm))

    # Compute adjusted p-values via percentile
    max_r = np.asarray(max_r)
    p_adj = []
    for component in range(n_components):
        p_adj.append(np.mean(max_r >= cca.mode_r[component]))

    return cca, p_adj
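A hypothetical call for do_cca (shapes and names are assumptions; the snippet's own dependencies — numpy as np, sklearn.utils.shuffle, tqdm — are expected to be in scope): X would typically hold component scores derived from the original edge matrix X_orig, and y the behavioural measures.

import numpy as np

n_subjects = 60
X_orig = np.random.randn(n_subjects, 500)  # subjects x edges
X = X_orig[:, :30]                         # stand-in for component scores
y = np.random.randn(n_subjects, 12)        # subjects x behavioural measures
cca_model, p_adj = do_cca(X, y, X_orig, n_components=5, permutations=100)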
Example #26
    def cca_feature(self, data, parameter_list):
        cca = CCA(1)
        result = []
        for i in range(parameter_list[-1]):
            reference_signals = self.reference_signals(parameter_list[1][i], parameter_list[2], parameter_list[3])
            cca.fit(data.T, reference_signals.T)
            x, y = cca.transform(data.T, np.squeeze(reference_signals).T)
            corr = np.corrcoef(x[:, 0], y[:, 0])[0, 1]
            result.append(corr)
        return result
Example #27
def fit_CCA(tr_block, data_builder):
    '''Fit a CCA to some 100-odd sampled pixel points.
    '''
    # train on number of points
    num_points = 100
    PixelPoints = data_builder.sample_random_pixels()
    points_array_ipw = []
    points_array_refl = []
    for yr in [14, 15]:
        doy_strings = data_builder.club_days(tr_block[tr_block[0][:, 1] == yr])
        days_in_sorted = doy_strings.keys()
        days_in_sorted.sort()
        ipw_files, refl_files = data_builder.sort_IPW_refl_files_imgs(yr)
        for set_ in days_in_sorted:
            print 'Building data set for year: %d and string of days %s' % (
                yr, set_)
            # Get the required files only
            temp_ipw_files = filter(
                lambda x: re.findall('\d+', x)[1] in doy_strings[set_],
                ipw_files)
            temp_refl_files = filter(
                lambda x: re.findall('\d+', x)[1] in doy_strings[set_],
                refl_files)
            temp_ipw_files = map(
                lambda x: '../data/dataset/20' + str(yr) + os.sep + x,
                temp_ipw_files)
            temp_refl_files = map(
                lambda x: '../data/dataset/20' + str(yr) + os.sep + x,
                temp_refl_files)
            for x_, y_ in zip(PixelPoints[:num_points, 0],
                              PixelPoints[:num_points, 1]):
                temp_array = data_builder.build_features_and_truth_imgs(
                    temp_ipw_files, temp_refl_files, x_, y_)
                points_array_ipw.append(temp_array[1])
                points_array_refl.append(temp_array[2])
    X_ = np.vstack(points_array_ipw)
    Y_ = np.vstack(points_array_refl)
    mdl = CCA(n_components=10)
    print 'Fitting a CCA...'
    mdl.fit(X_[:, :1089], Y_[:, :1089])
    ipw_frames = X_[:, 2178:-1]
    refl_frames = Y_[:, 2178:]
    del X_
    del Y_
    ipw_frames = ipw_frames[~np.any(np.isnan(ipw_frames), axis=1), :]
    refl_frames = refl_frames[~np.any(np.isnan(refl_frames), axis=1), :]

    #    indices = [(x*1089,(x+1)*1089)for x in range(4) ]
    #    # the number of components times 4
    #    ipw_refl_fusion = np.zeros((ipw_frames.shape[0],80))
    print 'Building the feature fusion..'

    return mdl
Example #28
def load_mutation_data():
    if os.path.isfile(mutation_pickle_path):
        pickle_load = pickle.load(open(mutation_pickle_path, 'rb'))
        return pickle_load[0], pickle_load[1]

    gene_effect_df = pd.read_csv(
        r"C:\Users\Nitay\Documents\courses\roded-seminar\Achilles_gene_dependency.csv"
    )
    mutations_df = pd.read_csv(
        r"C:\Users\Nitay\Documents\courses\roded-seminar\CCLE_mutations.csv")

    mutations_df = mutations_df[mutations_df["isDeleterious"].fillna(False)]

    gene_effect_df = gene_effect_df.set_index("Unnamed: 0").T
    gene_effect_df.columns.names = ["cell_line"]
    gene_effect_df.index.names = ["gene"]

    def clean_gene_name(name):
        return name.split("(")[0].strip()

    clean_gene_effect_df = gene_effect_df.rename(index=clean_gene_name)

    common_genes = set(clean_gene_effect_df.index).intersection(
        set(mutations_df['Hugo_Symbol']))
    mutations_cell_line = set(mutations_df['DepMap_ID'])

    new_mutations_df = pd.DataFrame(np.zeros(
        (len(common_genes), len(mutations_cell_line))),
                                    columns=mutations_cell_line,
                                    index=common_genes)
    for i, row in mutations_df.iterrows():
        cell_line = row["DepMap_ID"]
        gene = row['Hugo_Symbol']
        if gene in common_genes and cell_line in mutations_cell_line:
            new_mutations_df.loc[gene, cell_line] = 1

    filtered_gene_effect_df = clean_gene_effect_df.filter(items=common_genes,
                                                          axis=0)
    filtered_mutations_df = new_mutations_df.loc[new_mutations_df.sum(1) > 0,
                                                 new_mutations_df.sum(0) > 0]

    from sklearn.cross_decomposition import CCA
    Y = filtered_gene_effect_df.values
    X = filtered_mutations_df.values
    cca = CCA(n_components=10)
    cca.fit(X, Y)
    X_c = cca.transform(X)
    filtered_mutations_df = pd.DataFrame(X_c)

    pickle.dump([filtered_gene_effect_df, filtered_mutations_df],
                open(mutation_pickle_path, "wb"))

    return filtered_gene_effect_df, filtered_mutations_df
Example #29
def cca(vocab1, vocab2, cca_model=None, dim=300, max_iter=1000, thre=0.5):
    if not cca_model:
        cca_model = CCA(n_components=dim, max_iter=max_iter)
        try:
            cca_model.fit(vocab1, vocab2)
            [cca_vec1, cca_vec2] = cca_model.transform(vocab1, vocab2)
        except Exception:
            print('svd cannot converge, try smaller dim')
            raise
    else:
        [cca_vec1, cca_vec2] = cca_model.transform(vocab1, vocab2)
    comb_cca = (thre * cca_vec1 + (1 - thre) * cca_vec2)
    return comb_cca, cca_vec1, cca_vec2, cca_model
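A brief usage sketch for this helper (synthetic matrices; the shapes are assumptions, and dim is kept small so the inner SVD converges): align two embedding matrices and take the thresholded combination.

import numpy as np

vocab1 = np.random.randn(500, 300)
vocab2 = np.random.randn(500, 300)
comb, vec1, vec2, model = cca(vocab1, vocab2, dim=20, max_iter=500)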
Example #30
    def predict(self):
        if self.k_CCA is None:
            if self.verbose: print('Going to compute best components first')
            self.determine_CCA_components()

        # self.cca_predictions, _ = self.ccaCV.predict(self.features, self.ccaCV.ws)
        cca = CCA(n_components=self.k_CCA)
        cca.fit(self.features[:6000], self.graph[:6000])
        self.cca_predictions = cca.transform(self.features)
        if self.verbose:
            print('Produced predictions')
            print('Size of predictions {}'.format(self.cca_predictions.shape))
Example #31
def canonical_approach():
    from sklearn.cross_decomposition import CCA

    (X, Y), cities = pull_xy_data()

    cca = CCA(n_components=2)
    cca.fit(X, Y)

    ccaX, ccaY = cca.transform(X, Y)

    plot(ccaX, cities, ["CC01", "CC02", "CC03"], 1)

    return "OK What Now?"
    def __init__(self, dataset, n=None, tol=1e-4):
        if n is None:
            n = int(numpy.ceil(numpy.sqrt(len(dataset.attributes))))

        self.dataset = dataset
        self.attributes = random.sample(dataset.attributes, n)

        cca = CCA(n_components=1, tol=tol)
        cca.fit(
            dataset.X.take([a.index for a in self.attributes], 1),
            dataset.y)

        self.linear_combination = LinearCombination(
            self.attributes,
            cca.x_weights_.transpose()[0])
Example #33
def cca_for_ssvep(input_data, sampling_rate, compared_frequencies):

    # TODO: Strick input checks, exceptions and avoid crashing and processing errors

    # Pre-allocate SSVEP signals matrix to be compared with original EEG recordings using CCA
    number_time_points = input_data.shape[1]
    number_harmonics = 2
    cca_base_signal_matrix = [[] for loop_var in compared_frequencies]

    # Pre-allocate output: one correlation coefficient (Rho) for each target SSVEP frequency
    # Note: Row 1 is for default Rho scores, Row 2 is for the Rho scores After cca transformation
    cca_rho_values = numpy.zeros([1, len(compared_frequencies)], dtype='float')

    # For each target frequency, fill Y matrix with sine and cosine signals for every harmonic
    for loop_frequencies in range(len(compared_frequencies)):

        # For this current SSVEP frequency, pre-allocate the harmonics matrix
        cca_base_signal_matrix[loop_frequencies] = numpy.zeros([number_harmonics * 2, number_time_points])
        time_points_count = numpy.arange(number_time_points, dtype='float')
        time_points_count = time_points_count / sampling_rate

        # Generate sine and cosine reference signals, for every harmonic
        for loop_harmonics in range(number_harmonics):

            # Compute the reference signals for current harmonic
            base_constant = 2 * numpy.pi * (loop_harmonics + 1) * compared_frequencies[loop_frequencies]
            base_sine_signal = numpy.sin((base_constant * time_points_count))
            base_cosine_signal = numpy.cos((base_constant * time_points_count))

            # Copy signals back to reference matrix
            base_position = loop_harmonics + 1
            sine_position = (2 * (base_position - 1) + 1)
            cosine_position = 2 * base_position
            cca_base_signal_matrix[loop_frequencies][sine_position - 1, :] = base_sine_signal
            cca_base_signal_matrix[loop_frequencies][cosine_position - 1, :] = base_cosine_signal

        # After the loop, extract the y_matrix from reference matrix for current SSVEP frequency
        y_matrix = cca_base_signal_matrix[loop_frequencies]

        # Create a CCA object and compute the correlation score
        cca_object = CCA(n_components=number_harmonics)
        cca_object.fit(numpy.transpose(input_data), numpy.transpose(y_matrix))
        values_x, values_y = cca_object.transform(numpy.transpose(input_data), numpy.transpose(y_matrix))
        cca_rho_values[0, loop_frequencies] = numpy.corrcoef(values_x[:, 0], values_y[:, 0])[0, 1]   # Rho of the first canonical pair

    # After loop return and exit
    return cca_rho_values
Example #35
def test_cca_implementation():
    X = np.random.multivariate_normal(np.random.randint(50,100,(10)).astype('float'),np.identity(10),200)
    Y = np.random.multivariate_normal(np.random.randint(80,200,(6)).astype('float'),np.identity(6),200)

    X_test = np.random.multivariate_normal(np.random.randint(50,100,(10)).astype('float'),np.identity(10),20)
    Y_test = np.random.multivariate_normal(np.random.randint(50,100,(6)).astype('float'),np.identity(6),20)
    
    mdl_test = CCA(n_components = 6)
    mdl_test.fit(X,Y)
    
    Y_pred = mdl_test.predict(X)
    
    print Y_pred
    print '-'*50
#    print Y_test

    from sklearn.cross_decomposition import CCA as CCA_sklearn
    
    mdl_actual = CCA_sklearn(n_components = 6)
    mdl_actual.fit(X,Y)
    
    print '-'*50
    Y_actual = mdl_actual.predict(X)
    print Y_actual
Example #36
[ 138, 33, 68,  2, 110,  43]
]

print X.shape

#X = N.array(Z)[:,0:3].tolist()
#Y = N.array(Z)[:,3:6].tolist()
print 'X=\n',X
print 'Y=\n',Y


Rx = N.corrcoef(X.T)
Ry = N.corrcoef(Y.T)

cca = CCA(n_components=1)
cca.fit(X, Y)

print "Rx:\n", Rx
print "Ry:\n", Ry
print "x_weights:\n", cca.x_weights_
print "y_weights:\n", cca.y_weights_
print "x_loadings:\n", cca.x_loadings_
print "y_loadings:\n", cca.y_loadings_
print "x_scores_:\n", cca.x_scores_
print "y_scores_:\n", cca.y_scores_

loadings_man_x = N.dot(Rx, cca.x_weights_)
loadings_man_y = N.dot(Ry, cca.y_weights_)
print "loadings_man_x:\n",loadings_man_x
print "loadings_man_y:\n",loadings_man_y
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

# PLS regression, with univariate response, a.k.a. PLS1

n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

# #############################################################################
# CCA (PLS mode B with symmetric deflation)

cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
def mainExec(name_file, features):
    '''
    Based on a list of image names and image features, learn a CCA model based on Stacked Auxiliary Embedding and
    save this model to disk.
    :param name_file
    :param features
    :return:
    '''
    print "Creating vocabulary"
    voc = readVocabulary()
    print "Generating document vectors"
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print "Weighing vectors"
    weightedVectors = weight_tfidf(occurrenceVectors, idf)

    sentenceMatrix = []
    imagematrix = []
    print "Creating matrices"
    currentSentence = 0
    for i in weightedVectors.keys():
        if isLargeEnough(i):
            currentSentence += 1
            print "current Sentence: " + str(currentSentence)
            for j in range(len(weightedVectors[i])):
                weightedVectors[i][j] = float(weightedVectors[i][j])
            if currentSentence == 1:
                sentenceMatrix = weightedVectors[i]
                imagematrix = getImage(i,name_file, features)
            elif currentSentence ==2:
                sentenceMatrix = np.concatenate(([sentenceMatrix], [weightedVectors[i]]), axis = 0)
                imagematrix = np.concatenate(([imagematrix], [getImage(i,name_file, features)]), axis = 0)
            else:
                sentenceMatrix = np.concatenate((sentenceMatrix, [weightedVectors[i]]), axis = 0)
                imagematrix = np.concatenate((imagematrix, [getImage(i,name_file, features)]), axis = 0)

    print "Modelling cca"
    cca = CCA(n_components=128)
    cca.fit(sentenceMatrix, imagematrix)
    pickle.dump(cca, open("ccasnippetmodel.p",'w+'))

    idf = np.zeros(len(voc))
    trainingimages = []
    trainingsentences = []
    dp = getDataProvider('flickr30k')
    currentPair = 0
    for pair in dp.sampleImageSentencePair():
        currentPair += 1
        if currentPair % 100 == 0:
            print "Current pair: " + str(currentPair)
        img = pair['image']['feat']
        trainingimages.append(img)
        sentence = getFullSentence(pair)
        for i in range(len(sentence)):
            if sentence[i] > 0:
                idf[i] += 1
        trainingsentences.append(sentence)
    for i in range(len(trainingsentences)):
        trainingsentences[i] = trainingsentences[i]*idf

    trans_img, trans_sent = cca.transform(trainingimages, trainingsentences)
    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)

    augmented_imgs = []
    augmented_sentences = []
    for i in range(len(trans_img)):
        # list.extend returns None, so build the augmented vector by concatenation
        augm_img = list(trainingimages[i]) + list(phi(3000, nn_img, trans_img[i]))
        augmented_imgs.append(augm_img)

    for i in range(len(trans_sent)):
        augm_sent = list(trainingsentences[i]) + list(phi(3000, nn_sent, trans_sent[i]))
        augmented_sentences.append(augm_sent)

    augmentedcca = CCA(n_components= 96)
    augmentedcca.fit(augmented_sentences, augmented_imgs)

    pickle.dump(cca, open("augmentedcca.p",'w+'))
def main():
    sess = tf.InteractiveSession()

    X1_data, X2_data, Y_data, baseline_data, labels_data = read_inputs()
    
    # set up the DCCA network
    keep_input = tf.placeholder("float")
    keep_hidden = tf.placeholder("float")
    X1_in, X1_out = build_network(273, 1500, 1500, 1500, 50, keep_input, keep_hidden)
    X2_in, X2_out = build_network(112, 1500, 1500, 1500, 50, keep_input, keep_hidden)

    # define the DCCA cost function
    U = tf.placeholder("float", [50, 40])
    V = tf.placeholder("float", [50, 40])
    UtF = tf.matmul(tf.transpose(U), tf.transpose(X1_out))
    GtV = tf.matmul(X2_out, V)
    canon_corr = tf.mul(1./BATCH, tf.reduce_sum(tf.mul(tf.matmul(UtF, GtV), tf.constant(np.eye(40), dtype = tf.float32))))

    corr_step = tf.train.AdamOptimizer(1e-6).minimize(- canon_corr)

    sess.run(tf.initialize_all_variables())

    # train the network
    print "Training DCCA"
    for i in range(0, EPOCHS):
        for j in range(0, len(X1_data.train), int(BATCH)):
            X1_in_batch = X1_data.train[j:(j + BATCH)]
            X2_in_batch = X2_data.train[j:(j + BATCH)]

            X1_out_batch = X1_out.eval(feed_dict = {
                X1_in : X1_in_batch,
                keep_input : 1.0,
                keep_hidden : 1.0})
            X2_out_batch = X2_out.eval(feed_dict = {
                X2_in : X2_in_batch,
                keep_input : 1.0,
                keep_hidden : 1.0})

            # compute CCA on the output layers
            cca = CCA(n_components = 40)
            cca.fit(X1_out_batch, X2_out_batch)
            U_batch = cca.x_weights_
            V_batch = cca.y_weights_

            # perform gradient step
            corr_step.run(feed_dict = {
                X1_in : X1_in_batch,
                X2_in : X2_in_batch,
                U : U_batch,
                V : V_batch,
                keep_input : 0.9,
                keep_hidden : 0.8})

            # print useful info
            print "EPOCH", i, "/ COST", canon_corr.eval(feed_dict = {
                X1_in : X1_in_batch,
                X2_in : X2_in_batch,
                U : U_batch,
                V : V_batch,
                keep_input : 1.0,
                keep_hidden : 1.0})

    # train the softmax classifier
    print "Training softmax"
    W_s = weight_variable([89, 39])
    b_s = bias_variable([39])
    baseline = tf.placeholder("float", [None, 39])
    y_true = tf.placeholder("float", [None, 39])

    # define the cost
    X1_baseline_combo = tf.concat(1, [X1_out, baseline])
    y_pred = tf.nn.softmax(tf.matmul(X1_baseline_combo, W_s) + b_s)
    lr_cost = - tf.reduce_sum(y_true * tf.log(tf.clip_by_value(y_pred, 1e-10, 1.0)))
    lr_step = tf.train.AdamOptimizer(1e-4).minimize(lr_cost)

    # set up accuracy checking
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    sess.run(tf.initialize_all_variables())

    for i in range(0, EPOCHS):
        for j in range(0, len(X1_data.train), int(BATCH)):
            lr_step.run(feed_dict = {
                X1_in : X1_data.train[j:(j + BATCH)],
                y_true : Y_data.train[j:(j + BATCH)],
                baseline : baseline_data.train[j:(j + BATCH)],
                keep_input : 1.0,
                keep_hidden : 1.0})

        print i, accuracy.eval(feed_dict = {
            X1_in : X1_data.dev,
            y_true : Y_data.dev,
            baseline : baseline_data.dev,
            keep_input : 1.0,
            keep_hidden : 1.0})

    print "Test accuracy:", accuracy.eval(feed_dict = {
        X1_in : X1_data.test,
        y_true : Y_data.test,
        baseline : baseline_data.test,
        keep_input : 1.0,
        keep_hidden : 1.0})

    # project the data and print it to file
    X1_train_proj = X1_baseline_combo.eval(feed_dict = {
        X1_in : X1_data.train,
        baseline : baseline_data.train,
        keep_input : 1.0,
        keep_hidden : 1.0})

    X1_dev_proj = X1_baseline_combo.eval(feed_dict = {
        X1_in : X1_data.dev,
        baseline : baseline_data.dev,
        keep_input : 1.0,
        keep_hidden : 1.0})

    X1_test_proj = X1_baseline_combo.eval(feed_dict = {
        X1_in : X1_data.test,
        baseline : baseline_data.test,
        keep_input : 1.0,
        keep_hidden : 1.0})

    scipy.io.savemat("dcca_projected_data.mat", {'dataTr' : X1_train_proj, "PhonesTr" : labels_data.train, "dataDev" : X1_dev_proj, "PhonesDev" : labels_data.dev, "dataTest" : X1_test_proj, "PhonesTest" : labels_data.test})
Example #40
	session.execute("USE TweetsXiaohu")
	# session.execute("DROP TABLE IF EXISTS Tweet")
	rows = session.execute("SELECT text, hashtags FROM Tweet limit 1000")
	X, Y = [], []
	for row in rows:
		X.append(row.text)
		Y.append([x.lower() for x in row.hashtags])
	vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', decode_error='ignore')
	# print(vectorizer)

	X = vectorizer.fit_transform(X).toarray()
	# print '40', X
	# print type(X)
	Y_indicator = LabelBinarizer().fit(Y).transform(Y)
	cca = CCA(n_components = 100, max_iter=10)
	cca.fit(X, Y_indicator)
	X = cca.transform(X)
	# print '45', X
	# print type(X)
	classif = OneVsRestClassifier(SVC(kernel='linear'))
	classif.fit(X, Y)

	for row in rows:
		# row = rows[0]
		# print vectorizer.transform([row.text]).toarray()
		# print cca.predict(vectorizer.transform([row.text]).toarray())
		transformed = vectorizer.transform([row.text]).toarray()
		# print '55', transformed
		ccad = cca.transform(transformed)
		# print '57', ccad
		predicts = classif.predict(ccad)
Example #41
# check type of array
#print(np.dtype(data_selection))

# force dtype = float32
data_selection = data_selection.astype(np.float32, copy=False)

# complete cases
data_selection = data_selection[~np.isnan(data_selection).any(axis=1)]
data_selection = data_selection[np.isfinite(data_selection).any(axis=1)]

# target variable / covariates
y = data_selection[:,0:3]
x = data_selection[:,4:]

# split test-train
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size=0.2, random_state=0)


cca = CCA(n_components=1,scale=True)
cca.fit(x_train, y_train)
#CCA(copy=True, max_iter=500, n_components=1, scale=True, tol=1e-06),
X_train_r, Y_train_r = cca.transform(x_train,y_train)
X_test_r, Y_test_r = cca.transform(x_test, y_test)

print(type(X_train_r))
print(np.shape(X_train_r))
print(np.shape(Y_train_r))
print(np.shape(x))

print(np.corrcoef(X_train_r[:,0],Y_train_r[:,0]))
print(np.corrcoef(X_test_r[:,0],Y_test_r[:,0]))
    plt.figure()
    for i in range (5):
        plt.plot(nComponents,plsRegScores[i,:],lw=3)

    plt.xlim(1,np.amax(nComponents))
    plt.title('PLS Regression accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.legend (['LR','LDA','GNB','Linear SVM','rbf SVM'],loc='lower right')
    plt.grid(True)

if (0):
    #%% Canonical Correlation Analysis
    nComponents = np.arange(1,nClasses +1)
    cca = CCA(n_components=nClasses)
    cca.fit(Xtrain,Ytrain)
    XtrainT = cca.transform(Xtrain)
    XtestT = cca.transform(Xtest)
    ccaScores = np.zeros((5,len(nComponents)))
    for i,n in enumerate(nComponents):
        ccaScores[:,i] = util.classify(XtrainT[:,0:n],XtestT[:,0:n],labelsTrain,labelsTest)
    
    cca = CCA(n_components=3)
    cca.fit(Xtrain,Ytrain)
    xt = cca.transform(Xtrain)
    fig = plt.figure()
    util.plotData(fig,xt,labelsTrain,classColors)
    plt.title('First 3 components of projected data')
    

    #%% Plot accuracies for CCA
class CCA_Model:
    def __init__(self,n_components):
        self.n_components = n_components
        self.cca = CCA(n_components=n_components)
        self.ntop  = 10


    def learn_model(self,X_chanel, Y_chanel,Y_Distinct=None):
        """

        :param X_chanel: array-like for the X channel
        :param Y_chanel: array-like for the Y channel
        :return:

        """
        print "Start learning..."

        self.x_dim  = len(X_chanel[0])
        self.y_dim = len(Y_chanel[0])
        self.cca.fit(X_chanel,Y_chanel)
        if Y_Distinct is None:
            self.X_transform ,self.Y_transform = self.cca.transform(X_chanel,Y_chanel)
        else:
            self.X_transform ,self.Y_transform = self.cca.transform(X_chanel,Y_Distinct)

        print "Learning completed"


    def get_bet_match_index_transform_x2y(self,x_transform):
        shape = self.Y_transform.shape
        scores = np.ndarray(shape[0],dtype=float)
        for i in xrange(shape[0]):
            scores[i] = np.dot(self.Y_transform[i],x_transform)
            #scores[i] = entropy(x_transform,self.Y_transform[i])

        indices = (-scores).argsort()[:self.ntop]
        return [indices, scores[indices]]


    def get_bet_match_index_transform_y2x(self,y_transform):
        shape = self.X_transform.shape
        scores = np.ndarray(shape[0], dtype=float)
        for i in xrange(shape[0]):
            scores[i] = np.dot(self.X_transform[i], y_transform)
            #scores[i] = entropy(y_transform,self.X_transform[i])
        indices = (-scores).argsort()[:self.ntop]

        return [indices, scores[indices]]

    def get_best_match_cross_indices_x2y(self,x_inputs):
        x_transformes = self.cca.transform(x_inputs)
        results = []
        for x_transform in x_transformes:
            results.append(self.get_bet_match_index_transform_x2y(x_transform))
        return results

    def get_best_match_cross_indices_y2x(self,y_inputs):
        _, y_transformes = self.cca.transform([[0 for i in xrange(self.x_dim)]],y_inputs)
        results = []
        for y_transform in y_transformes:
            results.append(self.get_bet_match_index_transform_y2x(y_transform))
        return results
    plt.xlabel("Y comp. 1")
    plt.ylabel("Y comp. 2")
    plt.title('Y comp. 1 vs Y comp. 2 , (test corr = %.2f)'% numpy.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
    plt.legend(loc="best")
    plt.xticks(())
    plt.yticks(())
    plt.savefig(output_file)
    plt.close()

# PLSCA
plsca = PLSCanonical(n_components=2)
plsca.fit(Xtrain, Ytrain)
# PLSCanonical(algorithm='nipals', copy=True, max_iter=500, n_components=2,
#       scale=True, tol=1e-06)
X_train_r, Y_train_r = plsca.transform(Xtrain, Ytrain)
X_test_r, Y_test_r = plsca.transform(Xtest, Ytest)
do_plot(X_train_r,Y_train_r,X_test_r,Y_test_r,'%s/PLSCA_2comp_norm.pdf' %output_folder)

# CCA
# probably not necessary, but just in case the data was modified in some way
Ytrain = norm.loc[train,:]
Ytest = norm.loc[holdout,:]
Xtrain = numpy.array(X.loc[train,:]) 
Xtest = X.loc[holdout,:]
cca = CCA(n_components=2)
cca.fit(Xtrain, Ytrain)
# CCA(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06)
X_train_r, Y_train_r = cca.transform(Xtrain, Ytrain)
X_test_r, Y_test_r = cca.transform(Xtest, Ytest)
do_plot(X_train_r,Y_train_r,X_test_r,Y_test_r,'%s/CCA_2comp_norm.pdf' %output_folder)