Example #1
def main(params_file, output_dir, output_code, datasets, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    
    # NOTE: the input H5 file and sPCA parameter file are hardcoded here rather
    # than taken from the function's params_file/datasets arguments.
    data_for_spca, specimen_ids = ld.load_h5_data("C:\\Users\\SMest\\fv_ nmmouse_human.h5",
                                                  metadata_file=None,
                                                  limit_to_cortical_layers=None,
                                                  id_file=None,
                                                  params_file="C:\\Users\\SMest\\source\\repos\\drcme\\drcme\\bin\\default_spca_params.json")
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)
    
    for l, m in data_for_spca.items():
        if isinstance(m, np.ndarray):
            # Replace NaNs, drop trailing all-zero columns, then impute
            # remaining zeros with column means
            nu_m = np.nan_to_num(m)
            p = max(np.nonzero(nu_m)[1])
            nu_m = nu_m[:, :p]
            print(l)
            print(p)
            data_for_spca[l] = imp.fit_transform(nu_m)
    data_objects.append(data_for_spca)
    specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results, os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(os.path.join(output_dir, "spca_components_used_{:s}.json".format(output_code)), "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, use_noise, **kwargs):
    spca_zht_params, _ = ld.define_spca_parameters(params_file)

    spca_results = joblib.load(orig_transform_file)

    # These arguments should be parameterized
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_organized_data(
            project=ds["project"],
            base_dir=ds["data_dir"],
            use_noise=use_noise,
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            params_file=params_file)
        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)
    orig_data_for_spca = []
    for i, do in enumerate(orig_data_objects):
        for j, data_item in enumerate(do):
            if i == 0:
                orig_data_for_spca.append({
                    "data": data_item["data"].copy(),
                    "part_keys": data_item["part_keys"],
                })
            else:
                orig_data_for_spca[j]["data"] = np.vstack(
                    [orig_data_for_spca[j]["data"], data_item["data"]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(
        len(orig_specimen_ids)))
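    # Reference mean/std of the original data's sPCA projections, used below to
    # z-score the transform of the new data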
    orig_mean, orig_std = orig_mean_and_std_for_zscore(spca_results,
                                                       orig_data_for_spca,
                                                       spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_organized_data(
            project=ds["project"],
            base_dir=ds["data_dir"],
            use_noise=use_noise,
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            params_file=params_file)
        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)
    data_for_spca = []
    for i, do in enumerate(new_data_objects):
        for j, data_item in enumerate(do):
            if i == 0:
                data_for_spca.append({
                    "data": data_item["data"].copy(),
                    "part_keys": data_item["part_keys"],
                })
            else:
                data_for_spca[j]["data"] = np.vstack(
                    [data_for_spca[j]["data"], data_item["data"]])
    new_ids = np.hstack(new_specimen_ids_list)
    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data(spca_results, data_for_spca,
                                        spca_zht_params, orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)
Example #3
def main(params_file, output_dir, output_code, datasets, norm_type, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)

    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                nu_m = m
                p = max(np.nonzero(nu_m)[1])
                print(l)
                print(p)
                data_for_spca[l] = nu_m
                # Invert values for files whose names do not contain 'EXTRA'
                if 'EXTRA' not in ds["fv_h5_file"]:
                    data_for_spca[l] = nu_m * -1

        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                do[k] = normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(
                    data_for_spca[k], do[k], k, i)

                do[k] = normalize_ds(do[k], norm_type)

                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            # Save each dataset's matrix and its column means/stds for inspection
            np.savetxt(os.path.join(output_dir, k + str(i) + '.csv'),
                       do[k],
                       delimiter=",",
                       fmt='%12.5f')
            np.savetxt(os.path.join(output_dir, k + str(i) + 'mean.csv'),
                       np.vstack((np.nanmean(do[k], axis=0),
                                  np.nanstd(do[k], axis=0))),
                       delimiter=",",
                       fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)
    ### Now run through again and impute missing:
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        data_for_spca[l] = nu_m
    ##Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(
        spca_results,
        os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir,
                     "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
def main(params_file, output_dir, output_code, datasets, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = SimpleImputer(
        missing_values=0,
        strategy='mean',
        copy=False,
    )

    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data_SC(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                # Replace NaNs with zeros before merging
                data_for_spca[l] = np.nan_to_num(m)
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(
                    data_for_spca[k], do[k], 0)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)

    spca_results = sf.spca_on_all_data_SC(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(
        spca_results,
        os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir,
                     "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
Example #5
def main(params_file, output_dir, output_code, datasets, norm_type, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)
    dataset_no = []

    for i, ds in enumerate(datasets):
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                nu_m = m
                p = max(np.nonzero(nu_m)[1])
                filename = ds["fv_h5_file"]
                # Trim leading columns of non-'INTRA' recordings
                if 'INTRA' not in filename:
                    nu_m = nu_m[:, 30:-1]
                # else:
                #     nu_m = nu_m * -1
                print(l)
                print(p)
                data_for_spca[l] = nu_m[:, 94:]

        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)
        dataset_no = np.hstack((dataset_no, np.full(specimen_ids.shape[0], i)))
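    # For each feature key in the first dataset, collect the argmin of every
    # normalized dataset (neither `argmin` nor `truncate` is used later on)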
    truncate = []
    for i, do in enumerate(data_objects[0]):
        ###FIND THE ARGMIN
        argmin = []
        for l in np.arange(len(data_objects)):
            argmin = np.hstack(
                (argmin,
                 np.nanargmin(norm.normalize_ds(data_objects[l][do], 4))))

    data_for_spca = {}
    data_for_spca_nonorm = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca_nonorm[k] = do[k]
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                #data_for_spca_nonorm[k], do[k] = norm.center_on_m(data_for_spca_nonorm[k], do[k])
                #data_for_spca[k], do[k] = norm.equal_ar_size(data_for_spca[k], do[k])
                #data_for_spca_nonorm[k], do[k] = equal_ar_size(data_for_spca_nonorm[k], do[k], k, i)

                #data_for_spca[k] = norm.normalize_ds(data_for_spca[k], norm_type)
                #_, do[k] = norm.shift_means(data_for_spca_nonorm[k], do[k])
                #data_for_spca_nonorm[k] = np.vstack([data_for_spca_nonorm[k], do[k]])
                do[k] = norm.normalize_ds(do[k], norm_type)

                #
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            # Save each dataset's matrix and its column means/stds for inspection
            np.savetxt(os.path.join(output_dir, k + str(i) + '.csv'),
                       do[k],
                       delimiter=",",
                       fmt='%12.5f')
            np.savetxt(os.path.join(output_dir, k + str(i) + 'mean.csv'),
                       np.vstack((np.nanmean(do[k], axis=0),
                                  np.nanstd(do[k], axis=0))),
                       delimiter=",",
                       fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)
    ### Now run through again and impute missing:
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        nu_m = norm.normalize_ds(nu_m, 1)
        data_for_spca[l] = nu_m
    ##Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(
        spca_results,
        os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir,
                     "sparse_pca_components_{:s}.csv".format(output_code)))
    row = int((len(combo_df.index) / 2))
    df_2 = combo_df.iloc[row:]
    df_1 = combo_df.iloc[:row]
    _df = umap.combined_umap(df_1, df_2)

    cmap = cm.get_cmap('tab10')
    _df.plot.scatter(x='x', y='y', c=dataset_no, cmap=cmap)
    plt.show()
    _df.to_csv(os.path.join(output_dir, 'umap_' + output_code + '.csv'))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, use_noise, **kwargs):
    spca_zht_params, _ = ld.define_spca_parameters(params_file)

    spca_results = joblib.load(orig_transform_file)
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)
    # These arguments should be parameterized
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                # Replace NaNs, drop trailing all-zero columns, impute zeros
                nu_m = np.nan_to_num(m)
                p = max(np.nonzero(nu_m)[1])
                nu_m = nu_m[:, :p]
                print(l)
                print(p)
                data_for_spca[l] = imp.fit_transform(nu_m)

        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)
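    # Merge original-dataset matrices across datasets, keyed by feature name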
    orig_data_for_spca = {}
    for i, do in enumerate(orig_data_objects):
        for k in do:
            if k not in orig_data_for_spca:
                orig_data_for_spca[k] = do[k]
            else:
                orig_data_for_spca[k] = np.vstack([orig_data_for_spca[k], do[k]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(len(orig_specimen_ids)))
    orig_mean, orig_std = orig_mean_and_std_for_zscore_h5(spca_results, orig_data_for_spca, spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                # Replace NaNs, drop trailing all-zero columns, impute zeros
                nu_m = np.nan_to_num(m)
                p = max(np.nonzero(nu_m)[1])
                nu_m = nu_m[:, :p]
                print(l)
                print(p)
                data_for_spca[l] = imp.fit_transform(nu_m)

        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)
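    # Merge new-dataset matrices per feature key, aligning each with the
    # original data via equal_ar_size before stacking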
    data_for_spca = {}
    for i, do in enumerate(new_data_objects):
        for k in do:
            if k not in data_for_spca:
                _, do[k] = equal_ar_size(orig_data_for_spca[k], do[k], k, i)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    new_ids = np.hstack(new_specimen_ids_list)
    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data_h5(spca_results,
                                        data_for_spca,
                                        spca_zht_params,
                                        orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)
def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, **kwargs):
    """ Main runner function for script.

    See :class:`SpcaTransformParameters` for argument descriptions.
    """

    spca_zht_params, _ = ld.define_spca_parameters(params_file)

    spca_results = joblib.load(orig_transform_file)

    # Load original data sets
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)
    orig_data_for_spca = {}
    for i, do in enumerate(orig_data_objects):
        for k in do:
            if k not in orig_data_for_spca:
                orig_data_for_spca[k] = do[k]
            else:
                orig_data_for_spca[k] = np.vstack(
                    [orig_data_for_spca[k], do[k]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(
        len(orig_specimen_ids)))
    orig_mean, orig_std = orig_mean_and_std_for_zscore(spca_results,
                                                       orig_data_for_spca,
                                                       spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)
    data_for_spca = {}
    for i, do in enumerate(new_data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])

    new_ids = np.hstack(new_specimen_ids_list)
    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data(spca_results, data_for_spca,
                                        spca_zht_params, orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)
Example #8
def main(params_file, output_dir, output_code, datasets, **kwargs):
    """ Main runner function for script.

    See argschema input parameters for argument descriptions.
    """

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = spca.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = spca.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = spca.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(
        spca_results,
        os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir,
                     "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")