Example no. 1
def main(params_file, output_dir, output_code, datasets, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    
    # NOTE: this example hard-codes a local feature-vector file and sPCA parameter
    # file instead of using the `datasets` / `params_file` arguments.
    data_for_spca, specimen_ids = ld.load_h5_data(
        "C:\\Users\\SMest\\fv_ nmmouse_human.h5",
        metadata_file=None,
        limit_to_cortical_layers=None,
        id_file=None,
        params_file="C:\\Users\\SMest\\source\\repos\\drcme\\drcme\\bin\\default_spca_params.json")
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)

    for l, m in data_for_spca.items():
        if isinstance(m, np.ndarray):
            # Replace NaNs with zeros, then trim the trailing all-zero columns
            # (keep everything up to and including the last nonzero column).
            nu_m = np.nan_to_num(m)
            p = np.nonzero(nu_m)[1].max()
            nu_m = nu_m[:, :p + 1]
            print(l)
            print(p)
            # Impute the remaining zeros (treated as missing values) with column means
            data_for_spca[l] = imp.fit_transform(nu_m)
            #data_for_spca[l] = nu_m
    data_objects.append(data_for_spca)
    specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error("Mismatch of specimen id dimension ({:d}) and data dimension ({:d})".format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(spca_results, os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(os.path.join(output_dir, "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(os.path.join(output_dir, "spca_components_used_{:s}.json".format(output_code)), "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
Example no. 2
def main(params_file, output_dir, output_code, datasets, norm_type, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)

    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                nu_m = m
                # Index of the last nonzero column (printed for diagnostics only)
                p = np.nonzero(nu_m)[1].max()
                print(l)
                print(p)
                data_for_spca[l] = nu_m
                # Invert the sign for datasets whose file name does not contain 'EXTRA'
                if 'EXTRA' not in ds["fv_h5_file"]:
                    data_for_spca[l] = nu_m * -1

        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                do[k] = normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(
                    data_for_spca[k], do[k], k, i)

                do[k] = normalize_ds(do[k], norm_type)

                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            # `output_fld` is assumed to be a module-level output folder path
            # (ending in a path separator) defined outside this excerpt.
            np.savetxt(output_fld + k + str(i) + '.csv',
                       do[k],
                       delimiter=",",
                       fmt='%12.5f')
            # Per-feature mean (row 0) and std (row 1) of this dataset's block
            np.savetxt(output_fld + k + str(i) + 'mean.csv',
                       np.vstack((np.nanmean(do[k], axis=0),
                                  np.nanstd(do[k], axis=0))),
                       delimiter=",",
                       fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)
    ### Now run through again and impute missing:
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        data_for_spca[l] = nu_m
    ##Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(
        spca_results,
        os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir,
                     "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
Example no. 3
def main(params_file, output_dir, output_code, datasets, norm_type,
         labels_file, spca_file, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)
    pad_len = 0
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                nu_m = m
                # Track the largest last-nonzero-column index across all datasets;
                # it sets the common padded width used further below.
                p = np.nonzero(nu_m)[1].max()
                print(l)
                print(p)
                if p > pad_len:
                    pad_len = p
                data_for_spca[l] = nu_m

        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)
    HPARAMS = HParams()

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(
                    data_for_spca[k], do[k], k, i)

                do[k] = norm.normalize_ds(do[k], norm_type)

                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            np.savetxt(output_fld + k + str(i) + '.csv',
                       do[k],
                       delimiter=",",
                       fmt='%12.5f')
            np.savetxt(output_fld + k + str(i) + 'mean.csv',
                       np.vstack((np.nanmean(do[k], axis=0),
                                  np.nanstd(do[k], axis=0))),
                       delimiter=",",
                       fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)
    labels = pd.read_csv(labels_file, index_col=0)

    df_s = pd.read_csv(spca_file, index_col=0)

    #labels['0'] = labelnoise(df_s, labels)
    # Rows labeled -1 are treated as unlabeled: all other rows are used for
    # training, and the -1 rows only receive predictions.
    train_ind = np.where(labels['0'] > -1)[0]
    pred_ind = np.where(labels['0'] == -1)[0]
    train_id = specimen_ids[train_ind]
    pred_id = specimen_ids[pred_ind]
    train_label = labels.iloc[train_ind]
    pred_label = labels.iloc[pred_ind]
    ### Now run through again and impute missing:
    train_data = {}
    pred_data = {}

    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        if nu_m.shape[1] < pad_len:
            pad_wid = (pad_len - nu_m.shape[1]) + 1
            nu_m = np.hstack((nu_m, np.zeros((nu_m.shape[0], pad_wid))))
        train_data[l] = nu_m[train_ind]
        pred_data[l] = nu_m[pred_ind]
        data_for_spca[l] = nu_m
    ##Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)
    ## Form our datasets for training
    HPARAMS.input_shape = [pad_len + 1, 1, len(data_for_spca)]
    # np.hstack expects a sequence (newer NumPy rejects bare generators)
    full_data = np.hstack(
        [data_for_spca[i] for i in sorted(data_for_spca.keys())])
    train_data = np.hstack([train_data[i] for i in sorted(train_data.keys())])
    pred_data = np.hstack([pred_data[i] for i in sorted(pred_data.keys())])

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    ## Write the data to TFRecord files for use with 'graph params'
    with tf.io.TFRecordWriter(output_dir + 'train_data.tfr') as writer:
        for spec_id, data, label in zip(train_id, train_data, train_label.values):
            example = nsl_tools.create_example(data, label, spec_id)
            writer.write(example.SerializeToString())
    with tf.io.TFRecordWriter(output_dir + 'pred_data.tfr') as writer:
        for spec_id, data, label in zip(specimen_ids, full_data, labels.values):
            example = nsl_tools.create_example(data, label, spec_id)
            writer.write(example.SerializeToString())

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Define the Keras TensorBoard callback.
    logdir = os.path.join(
        "logs",
        "fit",
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
    )
    full_labels = np.where(labels.values != -1, labels.values,
                           (np.unique(labels.values)[-1] + 1))

    # Split into validation / test / train datasets.
    # NOTE: shuffle() defaults to reshuffle_each_iteration=True, so the
    # take()/skip() subsets below are only disjoint within a single epoch
    # (see the sketch after this example).
    train_dataset = tf.data.Dataset.from_tensor_slices({
        'waves': train_data,
        'label': np.ravel(train_label.values)
    }).shuffle(2000).batch(HPARAMS.batch_size)
    train_size = train_data.shape[0] // HPARAMS.batch_size
    test_fraction = 0.3
    test_size = int(test_fraction * train_size)

    test_dataset = train_dataset.take(test_size)
    train_dataset = train_dataset.skip(test_size)
    train_size = train_size - test_size
    validation_fraction = 0.6
    validation_size = int(validation_fraction * train_size)
    train_size = train_size - validation_size
    print('taking val: ' + str(validation_size) + ' test: ' + str(test_size) +
          ' train: ' + str(train_size))
    validation_dataset = train_dataset.take(validation_size)
    train_dataset = train_dataset.skip(validation_size)

    nsl_tools.HPARAMS.max_seq_length = train_data.shape[1]
    base_model = nsl_tools.build_base_model()

    # Wrap the model with adversarial regularization.
    adv_config = nsl.configs.make_adv_reg_config(multiplier=0.2,
                                                 adv_step_size=0.05)
    adv_model = nsl.keras.AdversarialRegularization(base_model,
                                                    adv_config=adv_config)
    # Compile, train, and evaluate.

    adv_model.compile(optimizer='adam',
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                      metrics=['accuracy'])
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
    history = adv_model.fit(train_dataset,
                            validation_data=validation_dataset,
                            epochs=15)
    print('### FIT COMPLETE ### TESTING')
    acc = adv_model.evaluate(test_dataset, verbose=1)

    np.savetxt('full_acc.csv', acc, delimiter=",", fmt='%12.5f')
    pred_labels_prob = adv_model.predict({
        'waves': full_data,
        'label': full_labels
    })
    pred_labels = np.argmax(pred_labels_prob, axis=1)
    logging.info("Saving results...")
    labels['0'] = pred_labels

    labels.to_csv(output_code + '_NSL_pred_adv_learn.csv')

    #####GRAPH NETWORK
    ###nsl_tools.save_for_gam(full_data, full_labels)

    #nsl_tools.build_graph(df_s, output_dir + 'embed.tsv')
    #spca_pack_nbrs.pack_nbrs(
    #   output_dir + '/train_data.tfr',
    #    output_dir + '/pred_data.tfr',
    #    output_dir + 'embed.tsv',
    #    output_dir + '/nsl_train_data.tfr',
    #add_undirected_edges=True,
    #max_nbrs=6)
    #predictions = nsl_tools.graph_nsl(output_dir + '/nsl_train_data.tfr', output_dir + '/pred_data.tfr', train_data)
    #pred_labels = np.argmax(predictions, axis=1)
    #logging.info("Saving results...")
    #labels['0'] = pred_labels

    #labels.to_csv(output_code + '_NSL_pred_graph_learn.csv')
    logging.info("Done.")
Example no. 4
def main(params_file, output_dir, output_code, datasets, norm_type,
         labels_file, spca_file, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False)

    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                # Replace NaNs with zeros and trim the trailing all-zero columns
                # (keep everything up to and including the last nonzero column).
                nu_m = np.nan_to_num(m)
                p = np.nonzero(nu_m)[1].max()
                nu_m = nu_m[:, :p + 1]
                print(l)
                print(p)
                # Impute the remaining zeros (treated as missing) and normalize
                nu_m = imp.fit_transform(nu_m)
                data_for_spca[l] = normalize_ds(nu_m, norm_type)

        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)
    specimen_ids = np.hstack(specimen_ids_list)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k], do[k] = equal_ar_size(
                    data_for_spca[k], do[k], k, i)
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])

    ##Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)
    df_s = pd.read_csv(spca_file, index_col=0)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))
    labels = pd.read_csv(labels_file, index_col=0)
    print(labels)
    print(labels.values)
    uni_labels = np.unique(labels.values)
    ids_list = labels.index.values

    # Sanity check: the label table should cover every loaded specimen
    if labels.shape[0] == len(specimen_ids):
        print("Same Ids loaded... Proceeding")
        logging.info("Proceeding with %d cells", len(specimen_ids))
        for p in data_for_spca:
            labels_means = pd.DataFrame()
            arr_data = data_for_spca[p]
            for x in uni_labels:
                indx = np.where(labels['0'] == x)[0]
                row, col = arr_data[indx].shape
                n_co = np.full(col, row)
                mean = pd.Series(data=np.mean(arr_data[indx], axis=0),
                                 name=('Cluster ' + str(x) + ' mean'))
                std = pd.Series(data=np.std(arr_data[indx], axis=0),
                                name=('Cluster ' + str(x) + ' std'))
                n = pd.Series(data=n_co, name=('Cluster ' + str(x) + ' n'))
                # DataFrame.append was removed in pandas 2.0; concatenate instead
                labels_means = pd.concat(
                    [labels_means, pd.DataFrame([mean, std, n])],
                    ignore_index=True)
            labels_means.to_csv(output_fld + p + '_cluster_mean.csv')

        train_df, test_df, labels_2, _ = train_test_split(df_s, labels)

        rf = RandomForestClassifier(n_estimators=500,
                                    oob_score=True,
                                    random_state=0)
        #per = multiclass.OneVsOneClassifier(RandomForestClassifier(n_estimators=500, oob_score=True,
        # random_state=0), n_jobs=-1).fit(train_df.values, labels.to_numpy().flatten())
        rf.fit(train_df.values, labels_2.to_numpy().flatten())

        logging.info("OOB score: {:f}".format(rf.oob_score_))
        pred_labels = rf.predict(test_df.values)
        feat_import = rf.feature_importances_
        print(rf.oob_score_)
        logging.debug("Saving results")
        #pd.DataFrame(pred_labels, index=test_df.index.values).to_csv('rf_predictions.csv')
        pd.DataFrame(feat_import).to_csv('rf_feat_importance.csv')
        ### Now compute for labeled data
        train_ind = np.where(labels['0'] > -1)[0]
        labeled = labels.iloc[train_ind]
        labeled_df_s = df_s.iloc[train_ind]
        train_df, test_df, labels_2, labels_3 = train_test_split(
            labeled_df_s, labeled)

        clf1 = LogisticRegression(random_state=1, max_iter=1000)
        clf2 = RandomForestClassifier(n_estimators=500, random_state=1)
        clf3 = GaussianNB()
        eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                            ('gnb', clf3)],
                                voting='soft',
                                n_jobs=-1)
        eclf.fit(train_df, np.ravel(labels_2.values))
        fit_score = eclf.score(test_df, np.ravel(labels_3.values))
        print(fit_score)
        params = {
            'lr__C': np.linspace(1.0, 1000.0, 10),
            'rf__n_estimators': np.linspace(20, 1000, 10, dtype=np.int64)
        }

        grid = GridSearchCV(estimator=eclf,
                            param_grid=params,
                            cv=5,
                            n_jobs=-1,
                            verbose=1)
        grid.fit(train_df, np.ravel(labels_2.values))
        fit_score = grid.score(test_df, np.ravel(labels_3.values))
        print("grid search params")
        print(fit_score)
        grid_CV = grid.best_estimator_
        # Score the ensemble on 15 fresh random splits to estimate accuracy spread
        full_acc = np.zeros(15, dtype=np.float64)
        PARAMS = grid.best_estimator_
        for i in range(full_acc.shape[0]):
            train_df, test_df, labels_2, labels_3 = train_test_split(
                labeled_df_s, labeled, test_size=0.6, train_size=0.28)
            clf1 = LogisticRegression(random_state=1, max_iter=1000)
            clf2 = RandomForestClassifier(n_estimators=500, random_state=1)
            clf3 = GaussianNB()
            eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                                ('gnb', clf3)],
                                    voting='soft',
                                    n_jobs=-1)
            eclf.fit(train_df, np.ravel(labels_2.values))
            full_acc[i] = eclf.score(test_df, np.ravel(labels_3.values))

        np.savetxt('full_acc.csv', full_acc, delimiter=",", fmt='%12.5f')
        _, _, pvalue = permutation_test_score(grid_CV,
                                              train_df,
                                              np.ravel(labels_2.values),
                                              n_jobs=-1)
        print("pvalue: " + str(pvalue))
        fclf = OneVsOneClassifier(grid, n_jobs=-1)
        fclf.fit(train_df, np.ravel(labels_2.values))
        fit_score = fclf.score(test_df, np.ravel(labels_3.values))
        y_pred = fclf.predict(test_df)
        print(fit_score)
        print(metrics.classification_report(np.ravel(labels_3.values), y_pred))
        pred_labels = fclf.predict(df_s.values)
        pd.DataFrame(pred_labels,
                     index=df_s.index.values).to_csv('full_predictions.csv')

    # Pairwise cluster comparisons: start with a header row (a zero followed by
    # NaN placeholders) so the per-pair feature-importance rows stack beneath it.
    feat_import_by_label = np.hstack((0, np.full(feat_import.shape[0], np.nan)))
    for i in permutations(uni_labels, 2):
        indx_1 = np.where((labels['0'] == i[0]))[0]
        indx_2 = np.where((labels['0'] == i[1]))[0]
        indx = np.hstack((indx_1, indx_2))
        if indx.shape[0] >= 100:
            print(indx.shape[0])
            df_s_temp = df_s.iloc[indx]
            labels_s_temp = labels.iloc[indx]
            train_df, test_df, labels_2, _ = train_test_split(
                df_s_temp, labels_s_temp)

            rf = RandomForestClassifier(n_estimators=500,
                                        oob_score=True,
                                        random_state=0)
            #per = multiclass.OneVsOneClassifier(RandomForestClassifier(n_estimators=500, oob_score=True,
            # random_state=0), n_jobs=-1).fit(train_df.values, labels.to_numpy().flatten())
            rf.fit(train_df.values, labels_2.to_numpy().flatten())
            logging.info("OOB score: {:f}".format(rf.oob_score_))
            pred_labels = rf.predict(test_df.values)
            feat_import = rf.feature_importances_
            print(str(i) + ' ' + str(rf.oob_score_))
            logging.debug("Saving results")
            feat_import_by_label = np.vstack(
                (feat_import_by_label,
                 np.hstack((str(i), np.ravel(feat_import)))))
            del rf
    pd.DataFrame(feat_import_by_label).to_csv(output_fld +
                                              'label_rf_feat_importance.csv')

    logging.info("Done.")
Example no. 5
def main(params_file, output_dir, output_code, datasets, norm_type, **kwargs):

    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    imp = KNNImputer(copy=False)
    dataset_no = []

    for i, ds in enumerate(datasets):
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                nu_m = m
                # Index of the last nonzero column (printed for diagnostics only)
                p = np.nonzero(nu_m)[1].max()
                filename = ds["fv_h5_file"]
                # Non-'INTRA' files: drop the first 30 and the last column
                if 'INTRA' not in filename:
                    nu_m = nu_m[:, 30:-1]
                #else:
                #    nu_m = nu_m * -1
                print(l)
                print(p)
                # Keep only columns from index 94 onward as the sPCA input
                data_for_spca[l] = nu_m[:, 94:]

        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)
        dataset_no = np.hstack((dataset_no, np.full(specimen_ids.shape[0], i)))
    truncate = []
    for i, do in enumerate(data_objects[0]):
        ### FIND THE ARGMIN
        # Collect, per feature key, the flattened index of each dataset's minimum
        # after normalization; the result is not used further in this example.
        argmin = []
        for l in np.arange(len(data_objects)):
            argmin = np.hstack(
                (argmin,
                 np.nanargmin(norm.normalize_ds(data_objects[l][do], 4))))

    data_for_spca = {}
    data_for_spca_nonorm = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca_nonorm[k] = do[k]
                do[k] = norm.normalize_ds(do[k], norm_type)
                data_for_spca[k] = do[k]
            else:
                #data_for_spca_nonorm[k], do[k] = norm.center_on_m(data_for_spca_nonorm[k], do[k])
                #data_for_spca[k], do[k] = norm.equal_ar_size(data_for_spca[k], do[k])
                #data_for_spca_nonorm[k], do[k] = equal_ar_size(data_for_spca_nonorm[k], do[k], k, i)

                #data_for_spca[k] = norm.normalize_ds(data_for_spca[k], norm_type)
                #_, do[k] = norm.shift_means(data_for_spca_nonorm[k], do[k])
                #data_for_spca_nonorm[k] = np.vstack([data_for_spca_nonorm[k], do[k]])
                do[k] = norm.normalize_ds(do[k], norm_type)

                #
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
            np.savetxt(output_fld + k + str(i) + '.csv',
                       do[k],
                       delimiter=",",
                       fmt='%12.5f')
            np.savetxt(output_fld + k + str(i) + 'mean.csv',
                       np.vstack((np.nanmean(do[k], axis=0),
                                  np.nanstd(do[k], axis=0))),
                       delimiter=",",
                       fmt='%12.5f')
    specimen_ids = np.hstack(specimen_ids_list)
    ### Now run through again and impute missing:
    for l in data_for_spca:
        nu_m = data_for_spca[l]
        nu_m = imp.fit_transform(nu_m)
        nu_m = norm.normalize_ds(nu_m, 1)
        data_for_spca[l] = nu_m
    ##Outlier Elim?
    #specimen_ids, data_for_spca = outlierElim(specimen_ids, data_for_spca)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(
        spca_results,
        os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir,
                     "sparse_pca_components_{:s}.csv".format(output_code)))
    row = int((len(combo_df.index) / 2))
    df_2 = combo_df.iloc[row:]
    df_1 = combo_df.iloc[:row]
    _df = umap.combined_umap(df_1, df_2)

    # matplotlib.cm.get_cmap is deprecated (removed in matplotlib 3.9);
    # plt.get_cmap / matplotlib.colormaps['tab10'] keeps this working
    cmap = plt.get_cmap('tab10')
    _df.plot.scatter(x='x', y='y', c=dataset_no, cmap=cmap)
    plt.show()
    _df.to_csv(output_dir + 'umap_' + output_code + '.csv')
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, use_noise, **kwargs):
    spca_zht_params, _ = ld.define_spca_parameters(params_file)

    spca_results = joblib.load(orig_transform_file)
    imp = SimpleImputer(missing_values=0, strategy='mean', copy=False,)
    # These arguments should be parameterized
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(h5_fv_file=ds["fv_h5_file"],
                                            metadata_file=ds["metadata_file"],
                                            dendrite_type=ds["dendrite_type"],
                                            need_structure=not ds["allow_missing_structure"],
                                            include_dend_type_null=ds["allow_missing_dendrite"],
                                            limit_to_cortical_layers=limit_to_cortical_layers,
                                            id_file=ds["id_file"],
                                            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                # Replace NaNs with zeros, trim trailing all-zero columns
                # (keeping the last nonzero column), then impute remaining zeros.
                nu_m = np.nan_to_num(m)
                p = np.nonzero(nu_m)[1].max()
                nu_m = nu_m[:, :p + 1]
                print(l)
                print(p)
                nu_m = imp.fit_transform(nu_m)
                data_for_spca[l] = nu_m

        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)
    orig_data_for_spca = {}
    for i, do in enumerate(orig_data_objects):
        for k in do:
            if k not in orig_data_for_spca:
                orig_data_for_spca[k] = do[k]
            else:
                orig_data_for_spca[k] = np.vstack([orig_data_for_spca[k], do[k]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(len(orig_specimen_ids)))
    orig_mean, orig_std = orig_mean_and_std_for_zscore_h5(spca_results, orig_data_for_spca, spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(h5_fv_file=ds["fv_h5_file"],
                                            metadata_file=ds["metadata_file"],
                                            dendrite_type=ds["dendrite_type"],
                                            need_structure=not ds["allow_missing_structure"],
                                            include_dend_type_null=ds["allow_missing_dendrite"],
                                            limit_to_cortical_layers=limit_to_cortical_layers,
                                            id_file=ds["id_file"],
                                            params_file=params_file)
        for l, m in data_for_spca.items():
            if isinstance(m, np.ndarray):
                # Same preprocessing as for the original datasets above
                nu_m = np.nan_to_num(m)
                p = np.nonzero(nu_m)[1].max()
                nu_m = nu_m[:, :p + 1]
                print(l)
                print(p)
                nu_m = imp.fit_transform(nu_m)
                data_for_spca[l] = nu_m

        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)
    data_for_spca = {}
    for i, do in enumerate(new_data_objects):
        for k in do:
            if k not in data_for_spca:
                _, do[k] = equal_ar_size(orig_data_for_spca[k], do[k], k, i)
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    new_ids = np.hstack(new_specimen_ids_list)
    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data_h5(spca_results, data_for_spca,
                                           spca_zht_params, orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)
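The function above re-applies sPCA loadings that were fit on the original datasets to new cells, z-scoring the new projections with the original cohort's mean and standard deviation so both cohorts share one component space. A conceptual sketch of that step (function and variable names are illustrative, not the drcme API):

import numpy as np

def project_with_orig_zscore(loadings, new_data, orig_mean, orig_std):
    # Project new cells onto the previously fit sparse components, then
    # z-score with the ORIGINAL cohort's statistics.
    projected = new_data @ loadings
    return (projected - orig_mean) / orig_std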
Example no. 7
def main(params_file, output_dir, output_code, datasets, **kwargs):
    # Load data from each dataset
    data_objects = []
    specimen_ids_list = []
    for ds in datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        data_objects.append(data_for_spca)
        specimen_ids_list.append(specimen_ids)

    data_for_spca = {}
    for i, do in enumerate(data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])
    specimen_ids = np.hstack(specimen_ids_list)

    first_key = list(data_for_spca.keys())[0]
    if len(specimen_ids) != data_for_spca[first_key].shape[0]:
        logging.error(
            "Mismatch of specimen id dimension ({:d}) and data dimension ({:d})"
            .format(len(specimen_ids), data_for_spca[first_key].shape[0]))

    logging.info("Proceeding with %d cells", len(specimen_ids))

    # Load parameters
    spca_zht_params, _ = ld.define_spca_parameters(filename=params_file)

    # Run sPCA
    subset_for_spca = sf.select_data_subset(data_for_spca, spca_zht_params)
    spca_results = sf.spca_on_all_data(subset_for_spca, spca_zht_params)
    combo, component_record = sf.consolidate_spca(spca_results)

    logging.info("Saving results...")
    joblib.dump(
        spca_results,
        os.path.join(output_dir, "spca_loadings_{:s}.pkl".format(output_code)))
    combo_df = pd.DataFrame(combo, index=specimen_ids)
    combo_df.to_csv(
        os.path.join(output_dir,
                     "sparse_pca_components_{:s}.csv".format(output_code)))
    with open(
            os.path.join(output_dir,
                         "spca_components_used_{:s}.json".format(output_code)),
            "w") as f:
        json.dump(component_record, f, indent=4)
    logging.info("Done.")
def main(orig_transform_file, orig_datasets, new_datasets, params_file,
         output_file, **kwargs):
    """ Main runner function for script.

    See :class:`SpcaTransformParameters` for argument descriptions.
    """

    spca_zht_params, _ = ld.define_spca_parameters(params_file)

    spca_results = joblib.load(orig_transform_file)

    # Load original data sets
    orig_data_objects = []
    orig_specimen_ids_list = []
    for ds in orig_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        orig_data_objects.append(data_for_spca)
        orig_specimen_ids_list.append(specimen_ids)
    orig_data_for_spca = {}
    for i, do in enumerate(orig_data_objects):
        for k in do:
            if k not in orig_data_for_spca:
                orig_data_for_spca[k] = do[k]
            else:
                orig_data_for_spca[k] = np.vstack(
                    [orig_data_for_spca[k], do[k]])
    orig_specimen_ids = np.hstack(orig_specimen_ids_list)
    logging.info("Original datasets had {:d} cells".format(
        len(orig_specimen_ids)))
    orig_mean, orig_std = orig_mean_and_std_for_zscore(spca_results,
                                                       orig_data_for_spca,
                                                       spca_zht_params)

    new_data_objects = []
    new_specimen_ids_list = []
    for ds in new_datasets:
        if len(ds["limit_to_cortical_layers"]) == 0:
            limit_to_cortical_layers = None
        else:
            limit_to_cortical_layers = ds["limit_to_cortical_layers"]

        data_for_spca, specimen_ids = ld.load_h5_data(
            h5_fv_file=ds["fv_h5_file"],
            metadata_file=ds["metadata_file"],
            dendrite_type=ds["dendrite_type"],
            need_structure=not ds["allow_missing_structure"],
            need_ramp_spike=ds["need_ramp_spike"],
            include_dend_type_null=ds["allow_missing_dendrite"],
            limit_to_cortical_layers=limit_to_cortical_layers,
            id_file=ds["id_file"],
            params_file=params_file)
        new_data_objects.append(data_for_spca)
        new_specimen_ids_list.append(specimen_ids)
    data_for_spca = {}
    for i, do in enumerate(new_data_objects):
        for k in do:
            if k not in data_for_spca:
                data_for_spca[k] = do[k]
            else:
                data_for_spca[k] = np.vstack([data_for_spca[k], do[k]])

    new_ids = np.hstack(new_specimen_ids_list)
    logging.info("Applying transform to {:d} new cells".format(len(new_ids)))
    new_combo = spca_transform_new_data(spca_results, data_for_spca,
                                        spca_zht_params, orig_mean, orig_std)
    new_combo_df = pd.DataFrame(new_combo, index=new_ids)
    new_combo_df.to_csv(output_file)
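The docstring of the last function points to an argschema parameter class (SpcaTransformParameters), so these scripts are presumably driven by argschema. A hedged sketch of a typical entry point (the schema class itself is defined elsewhere and assumed here):

import argschema as ags

if __name__ == "__main__":
    # SpcaTransformParameters is assumed to be an argschema schema defined in the
    # original module; module.args is the validated argument dictionary.
    module = ags.ArgSchemaParser(schema_type=SpcaTransformParameters)
    main(**module.args)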