def beta_dist(data_sets, kmer_size, n_factor):
    """Load one data set and return its normalised, shuffled features and labels.

    For the first entry of ``data_sets`` the k-mer counts are loaded through
    ``load_kmer_cnts_jf.load_kmers`` (restricted to labels ``'0'`` and
    ``'1'``), row-normalised with ``normalize(..., axis=1, norm='l1')``,
    optionally passed through ``stats_utils_AEB.NMF_factor`` and finally
    shuffled together with the labels using ``random_state=0``.

    Args:
        data_sets: Iterable of entries; element ``[0]`` of each entry is what
            gets handed to ``load_kmers``.
        kmer_size: k-mer length forwarded to ``load_kmers`` and ``NMF_factor``.
        n_factor: Number of NMF components. ``0`` skips the factorisation and
            keeps the normalised counts as they are.

    Returns:
        A tuple ``(x, y)`` of shuffled features and integer labels for the
        first data set, or ``None`` when ``data_sets`` is empty.

    NOTE(review): the ``return`` sits inside the loop, so only the first
    entry of ``data_sets`` is ever processed. This matches the historical
    behaviour and is kept for callers' sake -- confirm whether it is intended.
    """
    # Loop over all data sets (see NOTE above: returns after the first one)
    for data_set in data_sets:
        data_set = data_set[0]

        # Retrieve the samples carrying one of the allowed labels
        allowed_labels = ['0', '1']
        kmer_cnts, _accessions, labelz, _domain_labels = load_kmer_cnts_jf.load_kmers(
            kmer_size, data_set, allowed_labels)
        print("LOADED DATASET " + str(data_set[0]) + ": " +
              str(len(kmer_cnts)) + " SAMPLES")
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy (1.24+); the builtin gives the same dtype.
        labelz = np.asarray(labelz).astype(int)

        # Normalisation is common to both paths; NMF is applied on top of it
        # only when a non-zero number of factors is requested.
        data_normalized = normalize(kmer_cnts, axis=1, norm='l1')
        if n_factor != 0:
            data_normalized = stats_utils_AEB.NMF_factor(
                data_normalized,
                kmer_size,
                n_components=int(n_factor),
                title=(str(data_set) + str(kmer_size) + "mers" +
                       str(n_factor) + "factors"))

        x, y = shuffle(data_normalized, labelz, random_state=0)
        return x, y

    # Nothing to load: same (implicit) result as before, now spelled out.
    return None
# Example no. 2
# 0
              'w',
              newline='') as csvfile:
        # Column names of the results CSV; the header row is written once,
        # before any data set is processed.
        fieldnames = [
            'dataset', 'kmer_size', 'n_splits', 'n_repeats', 'acc', 'auc',
            'precision', 'recall', 'f1', 'model', 'NMF_factors', 'params'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Loop over all data sets
        for data_set in data_sets_to_use:
            # Each entry is unwrapped once; its first element is what gets
            # passed to load_kmers below.
            data_set = data_set[0]

            # Load k-mer counts restricted to samples labelled '0' or '1'
            allowed_labels = ['0', '1']
            kmer_cnts, accessions, labelz, domain_labels = load_kmer_cnts_jf.load_kmers(
                kmer_size, data_set, allowed_labels)
            # NOTE(review): data_set was already unwrapped above, so this
            # prints element [0] of the unwrapped entry -- confirm intended.
            print("LOADED DATASET " + str(data_set[0]) + ": " +
                  str(len(kmer_cnts)) + " SAMPLES")
            labelz = np.asarray(labelz)
            # NOTE(review): np.int was deprecated in NumPy 1.20 and removed
            # in 1.24; on current NumPy this raises AttributeError. The
            # builtin int is the drop-in replacement.
            labelz = labelz.astype(np.int)

            # Conduct NMF and resave to data_normalized
            # Candidate numbers of factors; for n == 0 the data is only
            # normalised and shuffled (the other cases are not visible here).
            for n in [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
                if n == 0:
                    # Per-row L1 normalisation, then a shuffle of samples and
                    # labels together with a fixed seed (presumably
                    # scikit-learn's normalize/shuffle -- confirm imports).
                    data_normalized = normalize(kmer_cnts, axis=1, norm='l1')
                    data_normalized, labels = shuffle(data_normalized,
                                                      labelz,
                                                      random_state=0)
                    x = data_normalized
                    y = labels
# Example no. 3
# 0
# Keep whatever K.backend() reports (K is presumably keras.backend, imported
# outside this excerpt -- TODO confirm).
backend = K.backend()

import load_kmer_cnts_jf
import deep_learning_models

#################
# Load the data #
#################

# Length of the k-mers whose counts are loaded below.
kmer_size = 7

# Other data sets that can be listed here:
# 'HMP', 'Qin_et_al', 'RA', 'MetaHIT', 'Feng', 'Karlsson_2013',
# 'LiverCirrhosis', 'Zeller_2014'

# Samples labelled '0' are loaded into the "healthy" variables.
data_sets_healthy = ['MetaHIT']
allowed_labels = ['0']
kmer_cnts_healthy, accessions_healthy, labels_healthy, domain_labels = load_kmer_cnts_jf.load_kmers(
    kmer_size, data_sets_healthy, allowed_labels)

# Samples labelled '1' are loaded into the "diseased" variables. Note that
# allowed_labels and domain_labels are overwritten by this second load.
data_sets_diseased = ['MetaHIT']
allowed_labels = ['1']
kmer_cnts_diseased, accessions_diseased, labels_diseased, domain_labels = load_kmer_cnts_jf.load_kmers(
    kmer_size, data_sets_diseased, allowed_labels)

# Stack both groups, healthy rows first.
kmer_cnts = np.concatenate((kmer_cnts_healthy, kmer_cnts_diseased))
accessions = np.concatenate((accessions_healthy, accessions_diseased))
labels = np.concatenate((labels_healthy, labels_diseased))

# np.concatenate already yields an ndarray, so only the cast is needed.
# ``np.int`` (a plain alias of the builtin) was removed in NumPy 1.24, hence
# the builtin ``int`` is used for the conversion.
labels = labels.astype(int)

# Index tuples (as returned by np.where) of the rows in each class.
healthy = np.where(labels == 0)
disease = np.where(labels == 1)
# ----------------------------------------------------------------------
# Load the data
# ----------------------------------------------------------------------

# k-mer length in use (3 and 10 are the alternatives that were tried).
kmer_size = 5

# Data set selection; 'Qin_et_al', 'RA' and 'MetaHIT' are the alternatives.
data_set = 'HMP'
data_sets = ['HMP']

kmer_cnts, accessions, labels = load_kmer_cnts_jf.load_kmers(
    kmer_size, data_sets)

# Labels are compared as strings here: '0' and '1' select the two groups.
labels = np.asarray(labels)
healthy = np.where(labels == '0')
disease = np.where(labels == '1')

# Row-wise L1 normalisation of the count table.
data = pd.DataFrame(kmer_cnts)
data_normalized = normalize(data, axis=1, norm='l1')

# ----------------------------------------------------------------------
# Set up a model (autoencoder)
# ----------------------------------------------------------------------

# Size of the encoded representation.
encoding_dim = 10
# Number of input k-mers, i.e. the length of one normalised row.
input_dim = len(data_normalized[0])