import numpy as np
import rcca


def perform_pyrcca_cca(X: np.ndarray, Y: np.ndarray):
    """Fit a plain (non-kernel) CCA between two views with matching sample counts."""
    assert X.shape[0] == Y.shape[0], 'X and Y must have the same number of samples'
    n, p, q = X.shape[0], X.shape[1], Y.shape[1]
    # The number of canonical components cannot exceed any of the three dimensions.
    n_components = min(n, p, q)
    cca = rcca.CCA(kernelcca=False, reg=1e4, numCC=n_components)
    cca.train([X, Y])
    return cca
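A minimal usage sketch for the function above; the random data and shapes are arbitrary:

# Hypothetical usage with two random views over the same 50 samples.
rng = np.random.default_rng(0)
X = rng.standard_normal((50, 8))
Y = rng.standard_normal((50, 4))
cca = perform_pyrcca_cca(X, Y)
print(cca.cancorrs)  # one canonical correlation per component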
Example 2
def lc_rcca(datasets, kernelcca=True, reg=0.1, numCC=2, verbose=False):
    # `datasets` contains two views: X and Y
    cca = rcca.CCA(kernelcca=kernelcca, reg=reg, numCC=numCC, verbose=verbose)
    cca.train(datasets)
    # Correlation of the first pair of canonical variates
    corr_firstVariate = cca.cancorrs[0]
    return corr_firstVariate, cca
Example 3
 def __compute_fitness(self):
     """
     For each dataset, computes the average out of sample correlation between signature exposure and
     gene expression over all k, folds and repeats. CCA coefficients are computed using training samples and then
     correlation is evaluated on test samples.
     """
     corr_all = {}
     for dataset in DATASETS:
         corr_all[dataset.dataset_name] = []
         rel_ge_data = dataset.train_ge_data
         for k in dataset.k_range:
             rel_exposures = self.train_exposures[dataset.dataset_name][k]
             for rep in np.arange(REPEATS):
                 folds = dataset.cca_folds['train'][rep]
                 folds_in_dataset = len(folds)
                 for fold_i in np.arange(folds_in_dataset):
                     not_fold = np.setdiff1d(np.arange(folds_in_dataset), fold_i)
                     rel_indices_train = np.concatenate(
                         [folds[f] for f in not_fold])
                     rel_indices_val = folds[fold_i]
                     cca = rcca.CCA(reg=1e-4, numCC=1, verbose=False)
                     cca.train([
                         rel_exposures[rel_indices_train],
                         rel_ge_data[rel_indices_train]
                     ])
                     corr = self.__compute_correlation(
                         cca, rel_exposures[rel_indices_val],
                         rel_ge_data[rel_indices_val])
                     corr_all[dataset.dataset_name].append(corr)
         self.fitness_scores[dataset.dataset_name] = np.round(
             np.mean(corr_all[dataset.dataset_name]), 4)
     print('Correlation:', self.fitness_scores)
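The same train/validate pattern, reduced to a standalone sketch. The synthetic data, the regularization value, and the use of a plain Pearson correlation in place of __compute_correlation are illustrative assumptions:

# Minimal out-of-sample CCA correlation on synthetic data.
import numpy as np
import rcca

rng = np.random.default_rng(0)
exposures = rng.standard_normal((60, 5))    # stand-in for signature exposures
expression = rng.standard_normal((60, 12))  # stand-in for gene expression

train_idx, val_idx = np.arange(40), np.arange(40, 60)
cca = rcca.CCA(kernelcca=False, reg=1e-4, numCC=1, verbose=False)
cca.train([exposures[train_idx], expression[train_idx]])

# Project the held-out samples onto the learned weights and correlate them.
u = exposures[val_idx] @ cca.ws[0][:, 0]
v = expression[val_idx] @ cca.ws[1][:, 0]
print(np.corrcoef(u, v)[0, 1])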
Example 4
def main():
    parser = OptionParser()
    parser.add_option('--out', dest='out')
    parser.add_option('--matrices', dest='ms', default='',
                      help='Comma-delimited .npy files; nodes should be aligned')
    opts, args = parser.parse_args()

    matrices = opts.ms.split(',')
    m1 = np.load(matrices[0])
    m2 = np.load(matrices[1])

    # Set up Pyrcca
    cca = rcca.CCA(kernelcca=False, numCC=2, reg=0.5)
    # Find canonical components
    training = cca.train([m1, m2])
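
    # train() returns the fitted CCA object, so correlations and components
    # can be read back directly. The lines below are a hypothetical
    # continuation; the script parses --out but never uses it.
    print(training.cancorrs)         # per-component canonical correlations
    comps1, comps2 = training.comps  # canonical components of m1 and m2
    np.save(opts.out, comps1)        # hypothetical use of the --out option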
Example 5
def run_cca(data1, data2, test1, test2, numCC, reg):
    cca = rcca.CCA(kernelcca=False, numCC=numCC, reg=reg)
    cca.train([data1, data2])
    # Test how well the CCA mapping generalizes to held-out data
    corrs = cca.validate([test1, test2])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plot0 = ax.bar(np.arange(corrs[0].shape[0]),
                   corrs[0],
                   0.3,
                   color="steelblue")
    plot1 = ax.bar(np.arange(corrs[1].shape[0]) + 0.35,
                   corrs[1],
                   0.3,
                   color="orangered")
    ax.legend([plot0[0], plot1[0]], ["Dataset 1", "Dataset 2"])
    ax.set_ylabel("Prediction correlation")
    ax.set_xticks(np.arange(0, corrs[0].shape[0], 20) + 0.325)
    ax.set_xticklabels(["%d" % i for i in range(0, corrs[0].shape[0], 20)])
    ax.set_xlabel("Test data m=113")
    fig.savefig(str(PROJDIR) + '/Prediction.png', dpi=fig.dpi)

    #printMatrix(trainCaptureC, str(PROJDIR)+'/trainCaptureC_chr1', '1-q_value', 1, 'upper')
    #printMatrix(valiCaptureC, str(PROJDIR)+'/valiCaptureC_chr1', '1-q_value', 1, 'upper')
    #printMatrix(testCaptureC, str(PROJDIR)+'/testCaptureC_chr1', '1-q_value', 1, 'upper')

    #printMatrix(PPMatrix, str(PROJDIR)+'/PPMatrixHiC_chr1', '1-q_value', 1, 'upper')
    #DiffKMatrix10=DiffusionKernel(PPMatrix, 10)
    #DiffKMatrix5=DiffusionKernel(PPMatrix, 5)
    #DiffKMatrix1=DiffusionKernel(PPMatrix, 1)
    #Difference15=np.subtract(DiffKMatrix5, DiffKMatrix1)
    #Difference110=np.subtract(DiffKMatrix10, DiffKMatrix1)
    #printMatrix(Difference15, str(PROJDIR)+'/DiffKMatrix15HiC_chr1', 'Exp(5*H)-Exp(1*H)', 0.001, 'lower')
    #printMatrix(Difference110, str(PROJDIR)+'/DiffKMatrix110HiC_chr1', 'Exp(10*H)-Exp(1*H)', 0.001, 'lower')

    #train = train_vali_test(PPMatrix, 0.9, 0.1)
    #printMatrix(train, str(PROJDIR)+'/train_chr1', '1-q_value', 1, 'upper')
    #np.savetxt(output, PPMatrix)
    # cca was already trained above; train() returns self, so return the fitted object
    return cca
Example 6
def cca_rcca(spikes, stimulus, filter_length, n_components, regularization,
             whiten):

    ncells = int(spikes.shape[1] / filter_length[1])
    if whiten:
        spikes, spikes_rotation = whiten_data(spikes)

    cca = rcca.CCA(kernelcca=False, reg=regularization, numCC=n_components)
    cca.train([spikes, stimulus])

    spikes_res = cca.ws[0]
    # Derotate the spikes to be able to interpret the responses
    if whiten:
        spikes_res = spikes_rotation @ spikes_res

    resp_comps = np.swapaxes(spikes_res, 1, 0)
    resp_comps = resp_comps.reshape((n_components, ncells, filter_length[1]))

    stim_comps = np.swapaxes(cca.ws[1], 1, 0)
    stim_comps = stim_comps.reshape((n_components, 2, filter_length[0]))

    cancorrs = cca.cancorrs

    return resp_comps, stim_comps, cancorrs
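cca_rcca above (and cca_omb_components further down) assume a whiten_data helper that is not shown in these examples. A minimal SVD-based sketch with the same calling convention (whitened data plus the matrix that derotates CCA weights back into the original feature space) might look like this; the name and exact behavior of the original helper are assumptions:

# Hypothetical stand-in for the whiten_data helper used above.
import numpy as np

def whiten_data(x, eps=1e-10):
    """PCA/SVD whitening; returns (whitened data, derotation matrix)."""
    x = x - x.mean(axis=0)  # center each feature
    u, s, vt = np.linalg.svd(x, full_matrices=False)
    rotation = vt.T / (s + eps)  # features x components
    # The whitened data lives in component space; `rotation @ w` maps weights
    # learned there back to the original features, as done above with
    # spikes_rotation @ cca.ws[0].
    return x @ rotation, rotation

Example 7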
def fit_cca_clf_loop(
    h_load_file: str,
    n_seeds: int = 3,
    rescale: bool = True,
    save_file: str = None,
    **kwargs,
):
    save_file = 'results_cca_clf_{}.df'.format(
        now()) if save_file is None else save_file
    default_args = {
        'seeds': [int(2**i) for i in range(max(1, n_seeds))],
        'min_nb_trials': 100,
        'target': True,
        'global_normalize': True,
        'augment_data': False,
        'xv_folds': 5,
        'timepoint': 45,
        'num_ccs': np.arange(5, 91, 5),
        'cca_regs': np.logspace(-3, -1.5, num=20),
        'clf_regs': np.logspace(-3, -1.4, num=20),
        'clf_max_iter': int(1e3),
        'clf_tol': 1e-4,
    }
    for k in default_args:
        if k in kwargs:
            default_args[k] = kwargs[k]

    results = pd.DataFrame()
    warnings.filterwarnings('ignore', category=RuntimeWarning)
    for random_state in tqdm(default_args['seeds']):
        np.random.seed(random_state)

        for fold in tqdm(range(default_args['xv_folds']), leave=False):
            data_trn, data_tst = prepare_cca_data(
                h_load_file=h_load_file,
                min_nb_trials=default_args['min_nb_trials'],
                timepoint=default_args['timepoint'],
                target=default_args['target'],
                normalize_mode='zscore',
                augment=default_args['augment_data'],
                xv_folds=default_args['xv_folds'],
                which_fold=fold,
                verbose=False,
            )
            train_list, y_trn = data_trn['processed'], data_trn['labels']
            test_list, y_tst = data_tst['processed'], data_tst['labels']

            for n_components in tqdm(default_args['num_ccs'], leave=False):
                train_list_centered = [
                    (item - item.mean()) / np.sqrt(n_components)
                    if rescale else (item - item.mean())
                    for item in train_list
                ]
                for reg in tqdm(default_args['cca_regs'], leave=False):
                    cca = rcca.CCA(
                        kernelcca=True,
                        ktype='linear',
                        numCC=n_components,
                        reg=reg,
                        cutoff=1e-15,
                        verbose=False,
                    ).train(train_list_centered)
                    testcorrs = cca.validate(test_list)

                    corrs = []
                    for item in testcorrs:
                        corrs.append(np.mean(np.abs(item)))
                    pred_r = np.mean(corrs)

                    x_trn = [x @ w for x, w in zip(train_list, cca.ws)]
                    x_tst = [x @ w for x, w in zip(test_list, cca.ws)]
                    x_trn, x_tst = tuple(map(np.concatenate, [x_trn, x_tst]))

                    for C in default_args['clf_regs']:
                        clf = LogisticRegression(
                            C=C,
                            penalty='l1',
                            solver='liblinear',
                            class_weight='balanced',
                            max_iter=default_args['clf_max_iter'],
                            tol=default_args['clf_tol'],
                            random_state=random_state,
                        ).fit(x_trn, y_trn)
                        y_pred = clf.predict(x_tst)

                        balacc = balanced_accuracy_score(y_tst, y_pred)
                        mcc = matthews_corrcoef(y_tst, y_pred)

                        data_dict = {
                            'seed': [random_state] * 3,
                            'fold': [fold] * 3,
                            'n_components': [n_components] * 3,
                            'cca_reg': [reg] * 3,
                            'clf_reg': [C] * 3,
                            'metric': ['mcc', 'bal_acc', 'pred_r'],
                            'value': [mcc, balacc, pred_r],
                        }
                        results = pd.concat(
                            [results,
                             pd.DataFrame.from_dict(data_dict)])
                save_obj(obj=results,
                         file_name=save_file,
                         save_dir='./results',
                         mode='df',
                         verbose=False)

    results = reset_df(results)
    save_obj(obj=results,
             file_name=save_file,
             save_dir='./results',
             mode='df',
             verbose=True)
    best = extract_best_hyperparams(results, metric='mcc', verbose=True)

    return results, best, default_args
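Example 8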
def get_best_cca_clf(
    h_load_file: str,
    best: dict,
    min_nb_trials: int = -1,
    time_range: range = range(45, 46),
    target: bool = True,
    global_normalize: bool = True,
    augment_data: bool = False,
    xv_folds: int = 5,
    which_fold: int = 0,
    random_state: int = 42,
):

    data_trn, data_tst = prepare_cca_data(
        h_load_file=h_load_file,
        min_nb_trials=min_nb_trials,
        target=target,
        global_normalize=global_normalize,
        augment_data=augment_data,
        xv_folds=xv_folds,
        which_fold=which_fold,
        time_range=time_range,
        verbose=False,
    )
    train_list, y_trn = data_trn['processed'], data_trn['labels']
    test_list, y_tst = data_tst['processed'], data_tst['labels']

    cca = rcca.CCA(
        kernelcca=True,
        ktype='linear',
        reg=best['cca_reg'],
        numCC=best['n_components'],
        verbose=False,
    )
    cca.train([item / np.sqrt(best['n_components']) for item in train_list])
    testcorrs = cca.validate(test_list)

    corrs = []
    for item in testcorrs:
        corrs.append(np.mean(np.abs(item)))
    pred_r = np.mean(corrs)

    x_trn = [x @ w for x, w in zip(train_list, cca.ws)]
    x_tst = [x @ w for x, w in zip(test_list, cca.ws)]
    x_trn, x_tst = tuple(map(np.concatenate, [x_trn, x_tst]))

    clf = LogisticRegression(
        random_state=random_state,
        C=best['clf_reg'],
        penalty='l1',
        solver='liblinear',
        class_weight='balanced',
        max_iter=int(1e4),
        tol=1e-6,
    ).fit(x_trn, y_trn)
    y_pred = clf.predict(x_tst)

    balacc = balanced_accuracy_score(y_tst, y_pred)
    mcc = matthews_corrcoef(y_tst, y_pred)

    msg = "[PROGRESS] fitting done. results:\n"
    msg += "corr: {:.3f},   balanced accuracy: {:.3f},   mcc: {:.3f}"
    msg = msg.format(pred_r, balacc, mcc)
    print(msg)

    comps_trn, comps_df_trn = extract_components(data_trn, cca)
    comps_tst, comps_df_tst = extract_components(data_tst, cca)

    output = {
        'data_trn': data_trn,
        'data_tst': data_tst,
        'cca': cca,
        'clf': clf,
        'comps_trn': comps_trn,
        'comps_tst': comps_tst,
        'comps_df_trn': comps_df_trn,
        'comps_df_tst': comps_df_tst,
    }
    return output
Example 9
def cca_omb_components(exp: str,
                       stim_nr: int,
                       n_components: int = 6,
                       regularization=None,
                       filter_length=None,
                       maxframes=None,
                       shufflespikes: bool = False,
                       savedir: str = None,
                       savefig: bool = True,
                       sort_by_nspikes: bool = True,
                       select_cells: list = None,
                       plot_first_ncells: int = None,
                       whiten: bool = False):
    """
    Analyze OMB responses using cannonical correlation analysis and plot the results.

    Parameters
    ---
    n_components:
        Number of components that will be requested from the CCA anaylsis. More numbers mean
        the algortihm will stop at a later point. That means components of analyses with fewer
        n_components are going to be identical to the first n components of the higher-number
        component analyses.
    regularization:
        The regularization parameter to be passed onto rcca.CCA.
    filter_length:
        The length of the time window to be considered in the past for the stimulus and the responses.
        Can be different for stimulus and response, if a tuple is given.
    maxframes: int
        Number of frames to load in the the experiment object. Used to avoid memory and performance
        issues.
    shufflespikes: bool
        Whether to randomize the spikes, to validate the results
    savedir: str
        Custom directory to save the figures and data files. If None, will be saved in the experiment
        directory under appropritate path.
    savefig: bool
        Whether to save the figures.
    sort_by_nspikes: bool
        Wheter to sort the cell weights array by the number of spikes during the stimulus.
    select_cells: list
       A list of indexes for the subset of cells to perform the analysis for.
    plot_first_ncells: int
        Number of cells to plot in the cell plots.
    """
    if regularization is None:
        regularization = 0

    cca = rcca.CCA(kernelcca=False, reg=regularization, numCC=n_components)

    st = OMB(exp, stim_nr, maxframes=maxframes)

    if filter_length is None:
        filter_length = st.filter_length

    if isinstance(filter_length, int):
        filter_length = (filter_length, filter_length)

    if isinstance(savedir, str):
        savedir = Path(savedir)

    if savedir is None:
        savedir = st.stim_dir / 'CCA'
        savedir.mkdir(exist_ok=True, parents=True)

    spikes = st.allspikes()
    # Set the mean to zero for spikes
    spikes -= spikes.mean(axis=1)[:, None]

    bgsteps = st.bgsteps

    if select_cells is not None:
        if not isinstance(select_cells, np.ndarray):
            select_cells = np.array(select_cells)
        spikes = spikes[select_cells]
        st.nclusters = len(select_cells)
        # Convert to list for a cleaner string representation:
        # np.array prints as "array([....])" with newline characters,
        # which is problematic in filenames.
        select_cells = list(select_cells)

    nspikes_percell = spikes.sum(axis=1)

    if shufflespikes:
        spikes = spikeshuffler.shufflebyrow(spikes)

    figsavename = f'{n_components=}_{shufflespikes=}_{select_cells=}_{regularization=}_{filter_length=}_{whiten=}'
    # If the file name gets too long due to the list of selected cells, summarize it.
    if len(figsavename) > 200:
        figsavename = f'{n_components=}_{shufflespikes=}_select_cells={len(select_cells)}cells-index{select_cells[0]}to{select_cells[-1]}_{regularization=}_{filter_length=}_{whiten=}'

    #sp_train, sp_test, stim_train, stim_test = train_test_split(spikes, bgsteps)

    stimulus = mft.packdims(st.bgsteps, filter_length[0])
    spikes = mft.packdims(spikes, filter_length[1])

    if whiten:
        spikes, spikes_rotation = whiten_data(spikes)

    cca.train([spikes, stimulus])

    # import IPython.core.debugger as ipdb; ipdb.set_trace()
    spikes_res = cca.ws[0]
    # Derotate the data to be able to interpret the responses
    if whiten:
        spikes_res = spikes_rotation @ spikes_res

    cells = np.swapaxes(spikes_res, 1, 0)
    cells = cells.reshape((n_components, st.nclusters, filter_length[1]))

    nsp_argsorted = np.argsort(nspikes_percell)
    cells_sorted_nsp = cells[:, nsp_argsorted, :]

    if sort_by_nspikes:
        cells_toplot = cells_sorted_nsp
    else:
        cells_toplot = cells

    if plot_first_ncells is not None:
        cells_toplot = cells_toplot[:, :plot_first_ncells, ...]

    motionfilt_x = cca.ws[1][:filter_length[0]].T
    motionfilt_y = cca.ws[1][filter_length[0]:].T

    motionfilt_r, motionfilt_theta = mft.cart2pol(motionfilt_x, motionfilt_y)
    #%%
    nrows, ncols = plf.numsubplots(n_components)
    fig_cells, axes_cells = plt.subplots(nrows, ncols, figsize=(10, 10))

    for i in range(n_components):
        ax = axes_cells.flat[i]
        im = ax.imshow(cells[i, :],
                       cmap='RdBu_r',
                       vmin=asc.absmin(cells),
                       vmax=asc.absmax(cells),
                       aspect='auto',
                       interpolation='nearest')
        ax.set_title(f'{i}')
    fig_cells.suptitle(f'Cells default order {shufflespikes=}')
    if savefig:
        fig_cells.savefig(savedir / f'{figsavename}_cells_default_order.pdf')
    plt.close(fig_cells)

    nsubplots = plf.numsubplots(n_components)
    height_list = [1, 1, 1, 3]  # ratios of the plots in each component

    # Create a time vector for the stimulus plots
    t_stim = -np.arange(0, filter_length[0] * st.frame_duration,
                        st.frame_duration)[::-1] * 1000
    t_response = -np.arange(0, filter_length[1] * st.frame_duration,
                            st.frame_duration)[::-1] * 1000
    xtick_loc_params = dict(nbins=4, steps=[2, 5, 10], integer=True)

    nsubrows = len(height_list)
    height_ratios = nsubplots[0] * height_list
    fig, axes = plt.subplots(nrows=nsubplots[0] * nsubrows,
                             ncols=nsubplots[1],
                             gridspec_kw={'height_ratios': height_ratios},
                             figsize=(11, 10))

    for row, ax_row in enumerate(axes):
        for col, ax in enumerate(ax_row):
            mode_i = int(row / nsubrows) * nsubplots[1] + col
            # ax.text(0.5, 0.5, f'{mode_i}')
            ax.set_yticks([])
            # Plot motion filters
            if row % nsubrows == 0:

                ax.plot(t_stim,
                        motionfilt_x[mode_i, :],
                        marker='o',
                        markersize=1)
                ax.plot(t_stim,
                        motionfilt_y[mode_i, :],
                        marker='o',
                        markersize=1)
                if col == 0:
                    ax.set_ylabel('Motion',
                                  rotation=0,
                                  ha='right',
                                  va='center')
                ax.set_ylim(cca.ws[1].min(), cca.ws[1].max())

                # Draw a horizontal line for zero and prevent rescaling of x-axis
                xlims = ax.get_xlim()
                ax.hlines(0,
                          *ax.get_xlim(),
                          colors='k',
                          linestyles='dashed',
                          alpha=0.3)
                ax.set_xlim(*xlims)

                # ax.set_title(f'Component {mode_i}', fontweight='bold')

                ax.xaxis.set_major_locator(MaxNLocator(**xtick_loc_params))

                if not mode_i == 0 or filter_length[0] == filter_length[1]:
                    ax.xaxis.set_ticklabels([])
                else:
                    ax.tick_params(axis='x', labelsize=8)

            # Plot magnitude of motion
            elif row % nsubrows == 1:
                ax.plot(t_stim,
                        motionfilt_r[mode_i, :],
                        color='k',
                        marker='o',
                        markersize=1)
                if col == 0:
                    ax.set_ylabel('Magnitude',
                                  rotation=0,
                                  ha='right',
                                  va='center')
                ax.set_ylim(motionfilt_r.min(), motionfilt_r.max())
                ax.xaxis.set_ticklabels([])
                ax.xaxis.set_major_locator(MaxNLocator(**xtick_loc_params))
            # Plot direction of motion
            elif row % nsubrows == 2:
                ax.plot(t_stim,
                        motionfilt_theta[mode_i, :],
                        color='r',
                        marker='o',
                        markersize=1)
                if mode_i == 0:
                    ax.yaxis.set_ticks([-np.pi, 0, np.pi])
                    ax.yaxis.set_ticklabels(['-π', 0, 'π'])
                ax.xaxis.set_ticklabels([])
                ax.xaxis.set_major_locator(MaxNLocator(**xtick_loc_params))
            # Plot cell weights
            elif row % nsubrows == nsubrows - 1:
                im = ax.imshow(cells_toplot[mode_i, :],
                               cmap='RdBu_r',
                               vmin=asc.absmin(cells),
                               vmax=asc.absmax(cells),
                               aspect='auto',
                               interpolation='nearest',
                               extent=[
                                   t_response[0], t_response[-1], 0,
                                   cells_toplot.shape[1]
                               ])
                ax.xaxis.set_major_locator(MaxNLocator(**xtick_loc_params))
                if row == axes.shape[0] - 1:
                    ax.set_xlabel('Time before spike [ms]')
                    # ax.set_xticks(np.array([0, .25, .5, .75, 1]) * cells_toplot.shape[-1])
                    # ax.xaxis.set_ticklabels(-np.round((ax.get_xticks()*st.frame_duration), 2)[::-1])
                else:
                    ax.xaxis.set_ticklabels([])

                plf.integerticks(ax, 5, which='y')
                if col == 0:
                    ax.set_ylabel(
                        f'Cells\n{"(sorted nsp)"*sort_by_nspikes}\n{("(first " + str(plot_first_ncells)+ " cells)")*(type(plot_first_ncells) is int) }',
                        rotation=0,
                        ha='right',
                        va='center')
                else:
                    ax.yaxis.set_ticklabels([])
                if mode_i == n_components - 1:
                    plf.colorbar(im)
            # Add ticks on the right side of the plots
            if col == nsubplots[1] - 1 and row % nsubrows != nsubrows - 1:
                plf.integerticks(ax, 3, which='y')
                ax.yaxis.tick_right()

    fig.suptitle(
        f'CCA components of {st.exp_foldername}\n{shufflespikes=} {n_components=}\n{sort_by_nspikes=}\n'
        + f'{select_cells=} {regularization=} {filter_length=}')
    fig.subplots_adjust(wspace=0.1, hspace=0.3)
    if savefig:
        fig.savefig(savedir / f'{figsavename}_cellsandcomponents.pdf')
    # plt.show()
    plt.close(fig)

    #%%
    fig_corrs = plt.figure()
    plt.plot(cca.cancorrs, 'ko')
    # plt.ylim([0.17, 0.24])
    plt.xlabel('Component index')
    plt.ylabel('Correlation')
    plt.title(f'Canonical correlations {shufflespikes=}')
    if savefig:
        fig_corrs.savefig(savedir / f'{figsavename}_correlation_coeffs.pdf')
    # plt.show()
    plt.close(fig_corrs)

    fig_nlt, axes_nlt = plt.subplots(nrows, ncols, figsize=(10, 10))
    for i, ax in enumerate(axes_nlt.flatten()):
        # Reshape to perform the convolution as a matrix multiplication

        generator_motion = stimulus @ cca.ws[1][..., i]
        generator_cells = spikes @ cca.ws[0][..., i]

        nonlinearity, bins = nlt.calc_nonlin(generator_cells, generator_motion)
        # ax.scatter(generator_motion, generator_cells, s=1, alpha=0.5, facecolor='grey')
        ax.plot(bins, nonlinearity, 'k')
        if i == 0:
            all_nonlinearities = np.empty((n_components, *nonlinearity.shape))
            all_bins = np.empty((n_components, *bins.shape))
        all_nonlinearities[i, ...] = nonlinearity
        all_bins[i, ...] = bins

    nlt_xlims = []
    nlt_ylims = []
    for i, ax in enumerate(axes_nlt.flatten()):
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()

        nlt_xlims.extend(xlim)
        nlt_ylims.extend(ylim)
    nlt_maxx, nlt_minx = max(nlt_xlims), min(nlt_xlims)
    nlt_maxy, nlt_miny = max(nlt_ylims), min(nlt_ylims)

    for i, axes_row in enumerate(axes_nlt):
        for j, ax in enumerate(axes_row):
            if i == nrows - 1:
                ax.set_xlabel('Generator (motion)')
            if j == 0:
                ax.set_ylabel('Generator (cells)')
            else:
                ax.yaxis.set_ticklabels([])
            ax.set_xlim([nlt_minx, nlt_maxx])
            ax.set_ylim([nlt_miny, nlt_maxy])

    fig_nlt.suptitle(f'Nonlinearities\n{figsavename}')
    if savefig:
        fig_nlt.savefig(savedir / f'{figsavename}_nonlinearity.png')
    plt.close(fig_nlt)
    keystosave = [
        'n_components',
        'cells',
        'motionfilt_x',
        'motionfilt_y',
        'motionfilt_r',
        'motionfilt_theta',
        'cells_sorted_nsp',
        'select_cells',
        'regularization',
        'filter_length',
    ]
    datadict = dict()
    for key in keystosave:
        datadict[key] = locals()[key]
    np.savez(savedir / figsavename, **datadict)
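Example 10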
def run_cca_loop(n_seeds: int = 10, reg: float = 0.1, **kwargs):
    default_args = {
        'sample_sizes': [int(10 ** i) for i in range(2, 5)],
        'seeds': [int(2 ** i) for i in range(n_seeds)],
        'sigmas': np.linspace(0, 5, num=11),
        'normal': [True],
        'three_d': [False],
        'num_expts': [10, 20, 30],
        'min_ncs': [int(2**i) for i in range(1, 7)],
        'max_ncs': [int(4**i) for i in range(1, 7)],
    }
    for k in default_args:
        if k in kwargs:
            default_args[k] = list(kwargs[k]) if isinstance(kwargs[k], (list, tuple, np.ndarray)) else [kwargs[k]]

    df = pd.DataFrame()
    for n_samples in tqdm(default_args['sample_sizes']):
        for three_d in default_args['three_d']:
            train, test = generate_source_signal(n_samples=n_samples, three_d=three_d)
            for seed in tqdm(default_args['seeds'], leave=False):
                for normal in default_args['normal']:
                    for sigma in default_args['sigmas']:
                        for num_expts in default_args['num_expts']:
                            for min_nc in default_args['min_ncs']:
                                for max_nc in [item for item in default_args['max_ncs'] if item > min_nc]:
                                    # create sim
                                    sim = create_cca_simulation(
                                        train=train,
                                        test=test,
                                        num_expts=num_expts,
                                        min_num_cells=min_nc,
                                        max_num_cells=max_nc,
                                        n_samples=n_samples,
                                        three_d=three_d,
                                        angle_spacing=1.0,
                                        magnitude_range=None,
                                        sigma=sigma,
                                        normal=normal,
                                        seed=seed,
                                    )
                                    # fit CCA
                                    cca = rcca.CCA(
                                        kernelcca=True,
                                        ktype='linear',
                                        reg=reg,
                                        numCC=sim['metadata']['dim_z'],
                                        verbose=False,
                                    )
                                    cca.train(sim['x_train'])
                                    # get results
                                    results = visualize_cca_results(cca, sim, verbose=False)
                                    results.update({
                                        'n_samples': n_samples,
                                        'three_d': three_d,
                                        'seed': seed,
                                        'normal': normal,
                                        'sigma': sigma,
                                        'num_expts': num_expts,
                                        'min_nc': min_nc,
                                        'max_nc': max_nc,
                                    })
                                    results = {k: [v] for k, v in results.items()}
                                    df = pd.concat([df, pd.DataFrame.from_dict(results)])
    return reset_df(df), default_args
Example 11
# select the relevant columns
data = data[['Collaborative Teachers %',
             'Supportive Environment %',
             'Effective School Leadership %',
             'Strong Family-Community Ties %',
             'Trust %',
             'Average ELA Proficiency',
             'Average Math Proficiency']]
# drop missing values
data = data.dropna()
# separate X and Y groups
X = data[['Collaborative Teachers %',
          'Supportive Environment %',
          'Strong Family-Community Ties %',
          'Trust %']]

Y = data[['Average ELA Proficiency',
          'Average Math Proficiency']]
# strip the '%' sign and convert the percentage strings to numbers
X = X.copy()
for col in X.columns:
    X[col] = X[col].str.strip('%').astype('int')
# Standardise the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=True, with_std=True)
X_sc = sc.fit_transform(X)
Y_sc = sc.fit_transform(Y)
import rcca
nComponents = 2 # min(p,q)=2
cca = rcca.CCA(kernelcca=False, reg=0., numCC=nComponents)
# train on data
cca.train([X_sc, Y_sc])
print('Canonical Correlation Per Component Pair:', cca.cancorrs)
print('% Shared Variance:', cca.cancorrs ** 2)
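Squaring the canonical correlations gives the proportion of variance shared by each pair of canonical variates, which is what the last line reports.

Example 12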
        videoObj = cv2.VideoCapture("input/" + input_video)
        fps = videoObj.get(cv2.CAP_PROP_FPS)
        num_frames = int(videoObj.get(cv2.CAP_PROP_FRAME_COUNT))

        video_features = extract_video_features()
        audio_features = extract_audio_features()

        # Calculates potential black frames at the beginning of videos and removes related audio features
        if len(audio_features) >= num_frames:
            black_frames = abs(num_frames - len(audio_features))
            audio_features = audio_features[black_frames:]
        else:
            black_frames = 0

        # Loads the CCA Model
        cca = rcca.CCA()
        cca.load("Model.hdf5")

        print("Detecting speakers...")
        speakers = detect_speakers()

        print("Building output...")
        build_output()

        # Clean up temporary and input files
        for file in os.listdir("input"):
            if os.path.isdir("input/" + file):
                shutil.rmtree("input/" + file)
            elif not file == input_video:
                os.remove("input/" + file)
Example 13
 def _CCA(self):
     """Fit CCA between self.X and self.Y; return (X weights, canonical correlations, Y weights)."""
     cca = rcca.CCA(kernelcca=False, reg=self.reg, numCC=self.n_components)
     cca.train([self.X, self.Y])
     return cca.ws[0], cca.cancorrs, cca.ws[1]
Example 14
def cca(training: List[EmbeddingCollection],
        test: List[EmbeddingCollection],
        ncomponents: int,
        reg=.01,
        verbose=False):
    """Applies CCA to extract canonical components.

  Args:
    training: A list where each item is a collection of `EmbeddingMatrix` objects used to train CCA.
    test: A list of the same size as `training` but where the collections are used for testing.
    ncomponents: Number of canonical components.
    reg: Regularization parameter.
    verbose: Sets the pyrcca.CCA's verbosity.

  Returns:
    A tuple consisting of:
      - List[EmbeddingCollection] containing canonical components for the training sets.
      - None or List[EmbeddingCollection] containing canonical components for the test sets.
      - An `rcca.CCA` object.
  """
    # Validate the arguments.
    if len(training) < 2:
        raise ValueError('Expected at least 2 training collections')
    reference = training[0]
    for collection in training[1:]:
        if len(collection) != len(reference):
            raise ValueError(
                'Number of embedding matrices within each collection is not consistent in the training set'
            )
        for em in collection:
            if em.items != reference[0].items:
                raise ValueError(
                    'Training embedding matrices do not contain the same items'
                )
    if test is not None and len(test) != 0:
        if len(test) != len(training):
            raise ValueError(
                'Number of collections must be the same in the training and test sets'
            )
        for collection in test:
            for em in collection:
                if em.items != test[0][0].items:
                    raise ValueError(
                        'Test embedding matrices do not contain the same items'
                    )

    _cca = rcca.CCA(kernelcca=False,
                    reg=reg,
                    numCC=ncomponents,
                    verbose=verbose)
    # Training.
    training_set, items = _cca_pack(training)
    _cca.train(training_set)
    training_cc_collection = _cca_unpack(training, _cca.comps, items)

    if test is None or len(test) == 0:
        return training_cc_collection, None, _cca

    # Test: project the test data onto the learned canonical weights.
    def _recon(data, ws):
        def _listdot(d1, d2):
            return [np.dot(x[0].T, x[1]) for x in zip(d1, d2)]

        ccomp = _listdot([d.T for d in data], ws)
        return ccomp

    test_set, items = _cca_pack(test)
    comps = _recon(test_set, _cca.ws)
    comps = np.array(comps)
    test_cc_collection = _cca_unpack(test, comps, items)
    return training_cc_collection, test_cc_collection, _cca
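Example 15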
def fit_cca_loop(
    h_load_file: str,
    rescale: bool = False,
    save_file: str = None,
    **kwargs,
):
    save_file = 'results_cca_{}.df'.format(
        now()) if save_file is None else save_file
    default_args = {
        'min_nb_trials': 100,
        'target': True,
        'global_normalize': True,
        'augment_data': False,
        'xv_folds': 5,
        'time_range': range(45, 46),
        'num_ccs': np.arange(5, 91, 5),
        'cca_regs': np.logspace(-3, -1.5, num=20),
        'cutoffs': np.logspace(-18, -12, num=3),
    }
    for k in default_args:
        if k in kwargs:
            default_args[k] = kwargs[k]

    results = pd.DataFrame()
    warnings.filterwarnings('ignore', category=RuntimeWarning)
    for fold in tqdm(range(default_args['xv_folds']), leave=False):
        data_trn, data_tst = prepare_cca_data(
            h_load_file=h_load_file,
            min_nb_trials=default_args['min_nb_trials'],
            target=default_args['target'],
            global_normalize=default_args['global_normalize'],
            augment_data=default_args['augment_data'],
            xv_folds=default_args['xv_folds'],
            which_fold=fold,
            time_range=default_args['time_range'],
            verbose=False,
        )
        train_list, y_trn = data_trn['processed'], data_trn['labels']
        test_list, y_tst = data_tst['processed'], data_tst['labels']

        for n_components in tqdm(default_args['num_ccs'], leave=False):
            train_list_centered = [
                (item - item.mean()) / np.sqrt(n_components)
                if rescale else (item - item.mean())
                for item in train_list
            ]
            for reg in tqdm(default_args['cca_regs'], leave=False):
                for cutoff in tqdm(default_args['cutoffs'], leave=False):
                    cca = rcca.CCA(
                        kernelcca=True,
                        ktype='linear',
                        numCC=n_components,
                        reg=reg,
                        cutoff=cutoff,
                        verbose=False,
                    )
                    cca.train(train_list_centered)
                    testcorrs = cca.validate(test_list)

                    corrs = []
                    for item in testcorrs:
                        corrs.append(np.mean(np.abs(item)))
                    pred_r = np.mean(corrs)

                    data_dict = {
                        'fold': [fold],
                        'n_components': [n_components],
                        'cca_reg': [reg],
                        'cutoff': [cutoff],
                        'metric': ['pred_r'],
                        'value': [pred_r],
                    }
                    results = pd.concat(
                        [results, pd.DataFrame.from_dict(data_dict)])
            save_obj(obj=results,
                     file_name=save_file,
                     save_dir='./results',
                     mode='df',
                     verbose=False)

    results = reset_df(results, downcast='none')
    save_obj(obj=results,
             file_name=save_file,
             save_dir='./results',
             mode='df',
             verbose=True)
    # TODO: reimplement extract best hyperparams so that it works for this too
    return results, default_args
Example 16
    for index in range(SET_PARAMS.Number_of_multiple_orbits):
        Y, Y_buffer, X, X_buffer, Orbit = Dataset_order(
            index,
            direction,
            binary_set,
            buffer,
            categorical_num,
            use_previously_saved_models,
            columns_compare=["Earth x", "Earth y", "Earth z"],
            columns_compare_to=[
                "Angular momentum of wheels x", "Angular momentum of wheels y",
                "Angular momentum of wheels z"
            ])
        All_orbits.append(Orbit)

        # Split each dataset into two halves: training set and test set
        train1 = Y[:int(nSamples / 2)]
        train2 = X[:int(nSamples / 2)]
        test1 = Y[int(nSamples / 2):]
        test2 = X[int(nSamples / 2):]

        # Create a CCA object.
        cca = rcca.CCA(kernelcca=False, reg=0., numCC=2)

        # Use the train() method to find a CCA mapping between the two training sets.
        cca.train([train1, train2])

        # Use the validate() method to test how well the CCA mapping generalizes to the test data.
        # For each dimension in the test data, correlations between predicted and actual data are computed.
        testcorrs = cca.validate([test1, test2])
        print(testcorrs)
Example 17
    a_matrix = preprocessing.normalize(a_matrix)

    train_v = np.array(v_matrix)
    train_a = np.array(a_matrix)

    print("Number of examples: ", len(train_v))

    max_numCCs = min(train_v.shape[1], train_a.shape[1])
    numCCs = list(range(3, max_numCCs + 1))
    reg_coeffs = [100, 10, 1, 0, 0.1, 0.01, 0.001, 0.0001, 0.00001]

    print("START TRAINING")
    for kernel in [True, False]:
        ktypes = ["poly", "gaussian"] if kernel else [None]
        for ktype in ktypes:
            for coeff in reg_coeffs:
                for numCC in numCCs:

                    cca = rcca.CCA(kernelcca=kernel,
                                   numCC=numCC,
                                   reg=coeff,
                                   ktype=ktype)
                    cca.train([train_v, train_a])

                    cca.save("TrainingModels/" + feat_type + "_" +
                             str(kernel) + "_" + str(ktype) + "_" +
                             str(numCC) + "_" + str(coeff) + ".hdf5")
                    # print("Model: "+f_type+"_"+str(kernel)+"_"+str(ktype)+"_"+str(numCC)+"_"+str(coeff))
                    # print("Cancorrs:", cca.cancorrs)
                    # print("----------------------------------")