Example #1
def generate_1000_genomes_bag_of_genes(
        transpose=False,
        label_splits=None,
        feature_splits=[0.8],
        fold=0,
        path='/home/einarbmag/data/1000Genome/'):

    train, valid, test, _ = du.load_1000_genomes(transpose,
                                                 label_splits,
                                                 feature_splits,
                                                 fold,
                                                 norm=False)

    # Generate no_label: fuse train and valid sets
    nolabel_orig = np.vstack([train[0], valid[0]])

    if not os.path.isdir(path):
        os.makedirs(path)

    filename = 'unsupervised_bag_of_genes'
    filename += '_fold' + str(fold) + '.npy'

    # Bag-of-genes encoding: each SNP j maps to two binary features.
    # Feature 2*j + 1 is set when at least one alternate allele is present
    # (genotype > 0) and feature 2*j is set when the genotype is homozygous
    # alternate (genotype == 2).
    nolabel_x = np.zeros((nolabel_orig.shape[0], nolabel_orig.shape[1] * 2))

    for i in range(nolabel_x.shape[0]):
        idx_present = np.where(nolabel_orig[i, :] > 0)[0] * 2 + 1
        idx_homozygous = np.where(nolabel_orig[i, :] == 2)[0] * 2
        nolabel_x[i, idx_present] += 1
        nolabel_x[i, idx_homozygous] += 1

    # Persist the encoding, mirroring the other generators in this file
    np.save(os.path.join(path, filename), nolabel_x.astype('float32'))
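
To make the encoding concrete, here is a minimal self-contained sketch of the same bag-of-genes transform on a toy genotype matrix (the values below are made up for illustration):

import numpy as np

# 2 samples x 3 SNPs; entries count alternate alleles, so they lie in {0, 1, 2}
genotypes = np.array([[0, 1, 2],
                      [2, 0, 1]])

encoded = np.zeros((genotypes.shape[0], genotypes.shape[1] * 2))
for i in range(genotypes.shape[0]):
    # odd feature 2*j + 1: at least one alternate allele at SNP j
    encoded[i, np.where(genotypes[i, :] > 0)[0] * 2 + 1] += 1
    # even feature 2*j: homozygous alternate at SNP j
    encoded[i, np.where(genotypes[i, :] == 2)[0] * 2] += 1

print(encoded)
# [[0. 0. 0. 1. 1. 1.]
#  [1. 1. 0. 0. 0. 1.]]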
Example #2
def load_data(dataset,
              dataset_path,
              embedding_source,
              which_fold=0,
              keep_labels=1.,
              missing_labels_val=1.,
              embedding_input='raw',
              transpose=False,
              norm=True):

    # Load data from specified dataset
    splits = [.6, .2]  # this will split the data into [60%, 20%, 20%]
    if dataset == '1000_genomes':
        # This splits the training data into 75% train / 25% valid, which
        # corresponds to 60% / 20% of the whole data (0.75 * 0.8 and
        # 0.25 * 0.8); the test set is handled elsewhere as the remaining 20%
        splits = [.75]
        data = du.load_1000_genomes(transpose=transpose,
                                    label_splits=splits,
                                    feature_splits=[.8],
                                    fold=which_fold,
                                    nolabels=embedding_input,
                                    norm=norm,
                                    path=dataset_path)
    else:
        print("Unknown dataset")
        return

    if not transpose:
        (x_train, y_train), (x_valid, y_valid), (x_test, y_test),\
            x_nolabel = data
    else:
        return data

    if not embedding_source:
        if x_nolabel is None:
            x_unsup = x_train.transpose()
        else:
            x_unsup = x_nolabel
    else:
        x_unsup = None

    # If needed, remove some of the training labels
    if keep_labels <= 1.0:
        training_labels = y_train.copy()
        random.seed(23)
        nb_train = len(training_labels)

        indices = list(range(nb_train))
        random.shuffle(indices)

        indices_discard = indices[:int(nb_train * (1 - keep_labels))]
        for idx in indices_discard:
            training_labels[idx] = missing_labels_val
    else:
        training_labels = y_train

    return x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels
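
The only non-obvious step above is the label masking. The sketch below isolates that logic on a made-up label vector (`y_train` and the parameter values here are hypothetical):

import random
import numpy as np

y_train = np.arange(10, dtype='float32') + 10.  # made-up labels
keep_labels, missing_labels_val = 0.6, 1.

training_labels = y_train.copy()
random.seed(23)
indices = list(range(len(training_labels)))
random.shuffle(indices)

# discard (1 - keep_labels) of the labels by overwriting them
for idx in indices[:int(len(training_labels) * (1 - keep_labels))]:
    training_labels[idx] = missing_labels_val

print((training_labels == missing_labels_val).sum())  # 4 of 10 labels masked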
Example #3
def generate_1000_genomes_hist(
        transpose=False,
        label_splits=None,
        feature_splits=None,
        fold=0,
        perclass=False,
        path='/data/lisatmp4/romerosa/datasets/1000_Genome_project/'):
    """
    train, valid, test, _ = du.load_1000_genomes(transpose, label_splits,
                                                 feature_splits, fold,
                                                 norm=False)
    """
    train, valid, test, _ = du.load_1000_genomes(transpose=transpose,
                                                 label_splits=label_splits,
                                                 feature_splits=feature_splits,
                                                 fold=fold,
                                                 norm=False,
                                                 nolabels='raw')

    # Generate no_label: fuse train and valid sets
    nolabel_orig = (np.vstack([train[0], valid[0]])).transpose()
    nolabel_y = np.vstack([train[1], valid[1]])

    nolabel_y = nolabel_y.argmax(axis=1)

    filename = 'histo3x26' if perclass else 'histo3'
    filename += '_fold' + str(fold) + '.npy'

    if perclass:
        # the first dimension of the following is length 'number of snps'
        nolabel_x = np.zeros((nolabel_orig.shape[0], 3 * 26))
        for i in range(nolabel_x.shape[0]):
            if i % 5000 == 0:
                print "processing snp no: ", i
            for j in range(26):
                nolabel_x[i, j*3:j*3+3] += \
                    np.bincount(nolabel_orig[i, nolabel_y == j].astype('int32'),
                                minlength=3)
                nolabel_x[i, j*3:j*3+3] /= \
                    nolabel_x[i, j*3:j*3+3].sum()
    else:
        nolabel_x = np.zeros((nolabel_orig.shape[0], 3))
        for i in range(nolabel_x.shape[0]):
            nolabel_x[i, :] += np.bincount(nolabel_orig[i, :].astype('int32'),
                                           minlength=3)
            nolabel_x[i, :] /= nolabel_x[i, :].sum()

    nolabel_x = nolabel_x.astype('float32')

    np.save(os.path.join(path, filename), nolabel_x)
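
The non-perclass branch boils down to one np.bincount per SNP. A tiny standalone example of that computation, on toy data:

import numpy as np

# one SNP observed in 6 individuals; values are alternate-allele counts
snp = np.array([0, 0, 1, 2, 1, 0])

hist = np.bincount(snp.astype('int32'), minlength=3).astype('float32')
hist /= hist.sum()
print(hist)  # approx. [0.5 0.33 0.17] -> frequencies of genotypes 0, 1, 2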
Example #4
def generate_1000_genomes_hist(path,
                               fold,
                               perclass,
                               transpose=False,
                               label_splits=None,
                               feature_splits=None,
                               sum_to_one=True,
                               is_asw_dataset=False):
    """
    Can generate embeddings based on populations using genotypic and allelic
    frequencies

    Parameters:
        perclass: If this is true we will compute the frequency_type for each
            population
        sum_to_one: By default (the default is false), the function will return
            the miminum amount of information per SNPs. For exemple, if we have
            a SNP with a major allelic frequency of 0.8, then we will only
            store and use 0.8 (since 0.2, can be infered easily). At the time
            of writing this, it remains to be shown that putting this parameter
            to false doesn't harm the results.
    """
    if is_asw_dataset:
        train, valid, test, _, label_names = du.load_1000G_ASW(
            path=path, fold=fold, norm=False, return_label_names=True)
    else:
        train, valid, test, _, label_names = du.load_1000_genomes(
            transpose=transpose,
            label_splits=label_splits,
            feature_splits=feature_splits,
            fold=fold,
            norm=False,
            nolabels='raw',
            path=path,
            return_label_names=True)
    """
    train, valid, test, _, label_names = du.load_1000G_ASW(
            path=path, fold=fold, norm=False, return_label_names=True)
    """

    # Generate no_label: fuse train and valid sets
    nolabel_orig = (np.vstack([train[0], valid[0]])).transpose()
    nolabel_y = np.vstack([train[1], valid[1]])
    nolabel_y = nolabel_y.argmax(axis=1)

    filename_suffix = ('_perclass' if perclass else '') + \
                      ('' if sum_to_one else '_SumToOne') + \
                      '_fold' + str(fold) + '.npy'
    filename_genotypic = 'histo_' + 'GenotypicFrequency' + filename_suffix
    filename_allelic = 'histo_' + 'AllelicFrequency' + filename_suffix

    generate_snp_hist(nolabel_orig, nolabel_y, label_names, perclass,
                      sum_to_one, os.path.join(path, filename_genotypic),
                      os.path.join(path, filename_allelic))
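
generate_snp_hist is defined elsewhere, so as a sketch only, this is what the two frequency types named in the docstring look like for a single toy SNP (all numbers made up):

import numpy as np

snp = np.array([0, 1, 1, 2, 0])  # alternate-allele counts for 5 individuals

# genotypic frequency: proportion of genotypes 0, 1 and 2
genotypic = np.bincount(snp, minlength=3) / float(len(snp))

# allelic frequency: each individual carries two alleles
alt = snp.sum() / (2.0 * len(snp))
allelic = np.array([1.0 - alt, alt])

print(genotypic)  # [0.4 0.4 0.2]
print(allelic)    # [0.6 0.4]
# with sum_to_one=False only one value per pair (e.g. 0.6) would be kept,
# since the other is its complement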
Example #5
def load_data(data_path, raw_path, emb_path, fold):
    # norm defaults to True because the samples are normalized for training
    print('Load 1000 genome data')
    data = du.load_1000_genomes(data_path, raw_path, fold=fold, norm=True)
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = data

    feat_emb_val = []
    if emb_path:
        print('Load embedding data')
        feat_emb_val = du.load_embedding_mat(data_path, emb_path, fold=fold,
                                             transpose=False)

    training_labels = y_train

    return x_train, y_train, x_valid, y_valid, x_test, y_test, \
           feat_emb_val, training_labels
def load_data(dataset, dataset_path, embedding_source,
              which_fold=0, keep_labels=1., missing_labels_val=1.,
              embedding_input='raw', transpose=False, norm=True):

    # Load data from specified dataset
    splits = [.6, .2]  # this will split the data into [60%, 20%, 20%]
    if dataset == '1000_genomes':
        # This splits the training data into 75% train / 25% valid, which
        # corresponds to 60% / 20% of the whole data (0.75 * 0.8 and
        # 0.25 * 0.8); the test set is handled elsewhere as the remaining 20%
        splits = [.75]
        data = du.load_1000_genomes(dataset_path,
                                    transpose=transpose,
                                    label_splits=splits,
                                    feature_splits=[.8],
                                    fold=which_fold,
                                    nolabels=embedding_input,
                                    norm=norm, return_subject_ids=True,
                                    return_snp_names=True,
                                    return_label_names=True)

        if transpose:
            return data
        else:
            ((x_train, y_train, exmpl_ids_train), 
             (x_valid, y_valid, exmpl_ids_valid),
             (x_test, y_test, exmpl_ids_test),
             x_nolabel, feature_names, label_names) = data

    elif dataset in ['amikacin__pseudomonas_aeruginosa',
                      'beta-lactam__streptococcus_pneumoniae',
                      'carbapenem__acinetobacter_baumannii',
                      'gentamicin__staphylococcus_aureus',
                      'isoniazid__mycobacterium_tuberculosis']:
        data = du.load_antibiotic_resistance(dataset_path, dataset,
                                             transpose=transpose,
                                             label_splits=[.8],
                                             feature_splits=[.8],
                                             fold=which_fold,
                                             nolabels=embedding_input,
                                             norm=False)

        if transpose:
            return data
        else:
            (x_train, y_train), (x_valid, y_valid), (x_test, y_test), x_nolabel = data
            exmpl_ids_train = ["" for i in range(x_train.shape[0])]
            exmpl_ids_valid = ["" for i in range(x_valid.shape[0])]
            exmpl_ids_test = ["" for i in range(x_test.shape[0])]
            feature_names = ["" for i in range(x_train.shape[1])]
            label_names = None

    elif dataset == "1000G_WTCCC":
        data = du.load_1000G_WTCCC(dataset_path, fold=which_fold, norm=norm)
        ((x_train, y_train, exmpl_ids_train), 
         (x_valid, y_valid, exmpl_ids_valid),
         (x_test, y_test, exmpl_ids_test),
         x_nolabel, feature_names, label_names) = data
    elif dataset == "1000G_ASW":
        data = du.load_1000G_ASW(dataset_path, fold=which_fold, norm=norm,
                                 return_subject_ids=True, return_snp_names=True,
                                 return_label_names=True)
        ((x_train, y_train, exmpl_ids_train), 
         (x_valid, y_valid, exmpl_ids_valid),
         (x_test, y_test, exmpl_ids_test),
         x_nolabel, feature_names, label_names) = data
    else:
        print("Unknown dataset")
        return

    if not embedding_source:
        if x_nolabel is None:
            x_unsup = x_train.transpose()
        else:
            x_unsup = x_nolabel
    else:
        x_unsup = None

    # If needed, remove some of the training labels
    if keep_labels <= 1.0:
        training_labels = y_train.copy()
        random.seed(23)
        nb_train = len(training_labels)

        indices = list(range(nb_train))
        random.shuffle(indices)

        indices_discard = indices[:int(nb_train * (1 - keep_labels))]
        for idx in indices_discard:
            training_labels[idx] = missing_labels_val
    else:
        training_labels = y_train

    return (x_train, y_train, exmpl_ids_train,
            x_valid, y_valid, exmpl_ids_valid,
            x_test, y_test, exmpl_ids_test,
            x_unsup, training_labels, feature_names, label_names)
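
A hypothetical call to this extended load_data (the dataset path is a placeholder, and the `du` module and dataset files must be available):

outputs = load_data(dataset='1000_genomes',
                    dataset_path='/path/to/1000_genomes/',
                    embedding_source=None,
                    which_fold=0)
(x_train, y_train, exmpl_ids_train,
 x_valid, y_valid, exmpl_ids_valid,
 x_test, y_test, exmpl_ids_test,
 x_unsup, training_labels, feature_names, label_names) = outputs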