def main(params_file, output_dir, output_code, datasets, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    # NOTE: the input HDF5 and sPCA parameter file are hard-coded here rather than
    # taken from the `datasets` / `params_file` arguments
    data_for_spca, specimen_ids = ld.load_h5_data(
        "C:\\Users\\SMest\\fv_ nmmouse_human.h5",
        metadata_file=None,
        limit_to_cortical_layers=None,
        id_file=None,
        params_file="C:\\Users\\SMest\\source\\repos\\drcme\\drcme\\bin\\default_spca_params.json")
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)
    for l, m in data_for_spca.items():
        if type(m) == np.ndarray:
            # Restrict to columns before the last nonzero column,
            # then impute remaining zeros with the column mean
            nu_m = np.nan_to_num(m)
            p = np.nonzero(nu_m[:, :])[1]
            p = max(p)
            nu_m = nu_m[:, :p]
            logging.debug("%s: last nonzero column %d", l, p)
            data_for_spca[l] = imp.fit_transform(nu_m)
    data_objects.append(data_for_spca)
    specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
                len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")

def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, use_noise, **kwargs):
    spca_zht_params, _ = ld.define_spca_parameters(params_file)
    spca_results = joblib.load(orig_transform_file)

    # These arguments should be parameterized
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_organized_data(
            project=ds["project"],
            base_dir=ds["data_dir"],
            use_noise=use_noise,
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            params_file=params_file)
        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)

    orig_data_for_spca = []
    for i, do in enumerate(orig_data_objects):
        for j, data_item in enumerate(do):
            if i == 0:
                orig_data_for_spca.append({
                    "data": data_item["data"].copy(),
                    "part_keys": data_item["part_keys"],
                })
            else:
                orig_data_for_spca[j]["data"] = np.vstack(
                    [orig_data_for_spca[j]["data"], data_item["data"]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(len(orig_specimen_ids)))

    orig_mean, orig_std = orig_mean_and_std_for_zscore(spca_results,
                                                       orig_data_for_spca,
                                                       spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_organized_data(
            project=ds["project"],
            base_dir=ds["data_dir"],
            use_noise=use_noise,
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            params_file=params_file)
        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)

    data_for_spca = []
    for i, do in enumerate(new_data_objects):
        for j, data_item in enumerate(do):
            if i == 0:
                data_for_spca.append({
                    "data": data_item["data"].copy(),
                    "part_keys": data_item["part_keys"],
                })
            else:
                data_for_spca[j]["data"] = np.vstack(
                    [data_for_spca[j]["data"], data_item["data"]])
    new_ids = np.hstack(new_specimen_ids_list)

    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data(spca_results, data_for_spca,
                                        spca_zht_params, orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)

def main(params_file, output_dir, output_code, datasets, norm_type, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)

        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = m
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                logging.debug("%s: last nonzero column %d", l, p)
                data_for_spca[l] = nu_m
                # Datasets without 'EXTRA' in the filename are sign-flipped
                if 'EXTRA' not in ds["fv_h5_file"]:
                    data_for_spca[l] = nu_m * -1
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                do[k] = normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(data_for_spca[k], do[k], k, i)
                do[k] = normalize_ds(do[k], norm_type)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            # NOTE: output_fld is assumed to be defined at module level
            np.savetxt(output_fld + k + str(i) + '.csv', do[k],
                       delimiter=",", fmt='%12.5f')
            np.savetxt(output_fld + k + str(i) + 'mean.csv',
                       np.vstack((np.nanmean(do[k], axis=0), np.nanstd(do[k], axis=0))),
                       delimiter=",", fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)

    # Now run through again and impute missing values
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        data_for_spca[l] = nu_m

    # Outlier elimination?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
                len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")

def main(params_file, output_dir, output_code, datasets, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data_SC(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)

        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = np.nan_to_num(m)
                data_for_spca[l] = nu_m
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(data_for_spca[k], do[k], 0)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
                len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data_SC(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")

def main(params_file, output_dir, output_code, datasets, norm_type, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)
    dataset_no = []
    for i, ds in enumerate(datasets):
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)

        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = m
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                filename = ds["fv_h5_file"]
                if 'INTRA' not in filename:
                    nu_m = nu_m[:, 30:-1]
                #else:
                #    nu_m = nu_m * -1
                logging.debug("%s: last nonzero column %d", l, p)
                data_for_spca[l] = nu_m[:, 94:]
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)
        dataset_no = np.hstack((dataset_no, np.full(specimen_ids.shape[0], i)))

    truncate = []
    for i, do in enumerate(data_objects[0]):
        # Find the argmin across datasets (result currently unused)
        argmin = []
        for l in np.arange(len(data_objects)):
            argmin = np.hstack(
                (argmin, np.nanargmin(norm.normalize_ds(data_objects[l][do], 4))))

    data_for_spca = {}
    data_for_spca_nonorm = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca_nonorm[k] = do[k]
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                #data_for_spca_nonorm[k], do[k] = norm.center_on_m(data_for_spca_nonorm[k], do[k])
                #data_for_spca[k], do[k] = norm.equal_ar_size(data_for_spca[k], do[k])
                #data_for_spca_nonorm[k], do[k] = equal_ar_size(data_for_spca_nonorm[k], do[k], k, i)
                #data_for_spca[k] = norm.normalize_ds(data_for_spca[k], norm_type)
                #_, do[k] = norm.shift_means(data_for_spca_nonorm[k], do[k])
                #data_for_spca_nonorm[k] = np.vstack([data_for_spca_nonorm[k], do[k]])
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            # NOTE: output_fld is assumed to be defined at module level
            np.savetxt(output_fld + k + str(i) + '.csv', do[k],
                       delimiter=",", fmt='%12.5f')
            np.savetxt(output_fld + k + str(i) + 'mean.csv',
                       np.vstack((np.nanmean(do[k], axis=0), np.nanstd(do[k], axis=0))),
                       delimiter=",", fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)

    # Now run through again and impute missing values
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        nu_m = norm.normalize_ds(nu_m, 1)
        data_for_spca[l] = nu_m

    # Outlier elimination?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
                len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))

    # Embed the sPCA components with UMAP and plot, colored by dataset of origin
    row = int(len(combo_df.index) / 2)
    df_2 = combo_df.iloc[row:]
    df_1 = combo_df.iloc[:row]
    _df = umap.combined_umap(df_1, df_2)
    cmap = cm.get_cmap('tab10')
    _df.plot.scatter(x='x', y='y', c=dataset_no, cmap=cmap)
    plt.show()
    _df.to_csv(os.path.join(output_dir, 'umap_' + output_code + '.csv'))

    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")

def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, use_noise, **kwargs):
    spca_zht_params, _ = ld.define_spca_parameters(params_file)
    spca_results = joblib.load(orig_transform_file)
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)

    # These arguments should be parameterized
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)

        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                # Restrict to columns before the last nonzero column, then impute zeros
                nu_m = np.nan_to_num(m)
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                nu_m = nu_m[:, :p]
                logging.debug("%s: last nonzero column %d", l, p)
                nu_m = imp.fit_transform(nu_m)
                data_for_spca[l] = nu_m
        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)

    orig_data_for_spca = {}
    for i, do in enumerate(orig_data_objects):
        for k in do:
            if k not in orig_data_for_spca:
                orig_data_for_spca[k] = do[k]
            else:
                orig_data_for_spca[k] = np.vstack([orig_data_for_spca[k], do[k]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(len(orig_specimen_ids)))

    orig_mean, orig_std = orig_mean_and_std_for_zscore_h5(spca_results,
                                                          orig_data_for_spca,
                                                          spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)

        for l, m in data_for_spca.items():
            if type(m) == np.ndarray:
                nu_m = np.nan_to_num(m)
                p = np.nonzero(nu_m[:, :])[1]
                p = max(p)
                nu_m = nu_m[:, :p]
                logging.debug("%s: last nonzero column %d", l, p)
                nu_m = imp.fit_transform(nu_m)
                data_for_spca[l] = nu_m
        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(new_data_objects):
        for k in do:
            if k not in data_for_spca:
                # Match the new data's column count to the original data before transforming
                _, do[k] = equal_ar_size(orig_data_for_spca[k], do[k], k, i)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    new_ids = np.hstack(new_specimen_ids_list)

    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data_h5(spca_results, data_for_spca,
                                           spca_zht_params, orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)

def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, **kwargs):
    """ Main runner function for script.

        See :class:`SpcaTransformParameters` for argument descriptions.
    """
    spca_zht_params, _ = ld.define_spca_parameters(params_file)
    spca_results = joblib.load(orig_transform_file)

    # Load original data sets
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)

    # Combined data must be a dict keyed by data type (was incorrectly a list)
    orig_data_for_spca = {}
    for i, do in enumerate(orig_data_objects):
        for k in do:
            if k not in orig_data_for_spca:
                orig_data_for_spca[k] = do[k]
            else:
                orig_data_for_spca[k] = np.vstack([orig_data_for_spca[k], do[k]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(len(orig_specimen_ids)))

    orig_mean, orig_std = orig_mean_and_std_for_zscore(spca_results,
                                                       orig_data_for_spca,
                                                       spca_zht_params)

    # Load new data sets and apply the original transform
    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(new_data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    new_ids = np.hstack(new_specimen_ids_list)

    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data(spca_results, data_for_spca,
                                        spca_zht_params, orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)

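# Usage sketch for the transform runner above (illustrative only, wrapped in a
# helper so it does not run on import). The dictionary keys mirror the fields
# read inside the function; the pickled loadings file, HDF5 paths, and field
# values are placeholders/assumptions, not values from the source.
def _example_transform_call():
    example_orig_datasets = [{
        "fv_h5_file": "fv_orig.h5",          # placeholder path to the fitted feature vectors
        "metadata_file": None,
        "dendrite_type": "all",
        "allow_missing_structure": False,
        "allow_missing_dendrite": False,
        "need_ramp_spike": True,
        "limit_to_cortical_layers": [],       # empty list -> no layer restriction
        "id_file": None,
    }]
    example_new_datasets = [dict(example_orig_datasets[0], fv_h5_file="fv_new.h5")]
    main(orig_transform_file="spca_loadings_example.pkl",
         orig_datasets=example_orig_datasets,
         new_datasets=example_new_datasets,
         params_file="default_spca_params.json",
         output_file="new_sparse_pca_components.csv")
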
def main(params_file, output_dir, output_code, datasets, **kwargs):
    """ Main runner function for script.

        See argschema input parameters for argument descriptions.
    """
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(
                len(specimen_ids), data_for_spca[first_key].shape[0]))
    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = spca.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = spca.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = spca.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results,
                os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
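# Usage sketch for the sPCA fit runner above (illustrative only, not taken from
# the source). The runners assume module-level imports along these lines; the
# drcme module paths and aliases (ld, sf/spca) are assumptions about how the
# helpers are imported, and the paths/values below are placeholders.
#
#   import json
#   import logging
#   import os
#
#   import joblib
#   import numpy as np
#   import pandas as pd
#   from sklearn.impute import KNNImputer, SimpleImputer
#
#   import drcme.load_data as ld   # assumed home of load_h5_data / define_spca_parameters
#   import drcme.spca as spca      # assumed home of the sPCA helpers
#
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Each dataset entry carries the fields the loading loop reads above.
    example_datasets = [{
        "fv_h5_file": "fv_example.h5",        # placeholder feature-vector HDF5 path
        "metadata_file": None,
        "dendrite_type": "all",
        "allow_missing_structure": False,
        "allow_missing_dendrite": False,
        "need_ramp_spike": True,
        "limit_to_cortical_layers": [],        # empty list -> no layer restriction
        "id_file": None,
    }]
    main(params_file="default_spca_params.json",
         output_dir="./output",
         output_code="example",
         datasets=example_datasets)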