import os
import random

import numpy as np

# `du` refers to the project's data-utilities module (providing
# load_1000_genomes, load_1000G_ASW, etc.), imported elsewhere in the repo.


def generate_1000_genomes_bag_of_genes(
        transpose=False, label_splits=None, feature_splits=[0.8], fold=0,
        path='/home/einarbmag/data/1000Genome/'):
    train, valid, test, _ = du.load_1000_genomes(transpose, label_splits,
                                                 feature_splits, fold,
                                                 norm=False)

    # Fuse train and valid sets into one unlabelled matrix (samples x SNPs)
    nolabel_orig = np.vstack([train[0], valid[0]])

    if not os.path.isdir(path):
        os.makedirs(path)

    filename = 'unsupervised_bag_of_genes'
    filename += '_fold' + str(fold) + '.npy'

    # Two columns per SNP: column 2*j counts homozygous-minor genotypes (2),
    # column 2*j + 1 counts genotypes carrying at least one minor allele (>0).
    nolabel_x = np.zeros((nolabel_orig.shape[0], nolabel_orig.shape[1] * 2))
    for i in range(nolabel_x.shape[0]):
        idx1 = np.where(nolabel_orig[i, :] > 0)[0] * 2 + 1
        idx2 = np.where(nolabel_orig[i, :] == 2)[0] * 2
        nolabel_x[i, idx1] += 1
        nolabel_x[i, idx2] += 1

    np.save(os.path.join(path, filename), nolabel_x.astype('float32'))
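
# A minimal, runnable sketch of the bag-of-genes encoding above, applied to a
# toy genotype matrix with values in {0, 1, 2}; the toy array is an
# illustrative assumption, not data from the project.
def _demo_bag_of_genes_encoding():
    import numpy as np

    toy = np.array([[0, 1, 2],
                    [2, 0, 1]])
    bag = np.zeros((toy.shape[0], toy.shape[1] * 2))
    for i in range(toy.shape[0]):
        bag[i, np.where(toy[i, :] > 0)[0] * 2 + 1] += 1  # carries a minor allele
        bag[i, np.where(toy[i, :] == 2)[0] * 2] += 1     # homozygous minor
    # bag[0] == [0, 0, 0, 1, 1, 1]: SNP 1 is heterozygous, SNP 2 homozygous minor
    return bag
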
def load_data(dataset, dataset_path, embedding_source,
              which_fold=0, keep_labels=1., missing_labels_val=1.,
              embedding_input='raw', transpose=False, norm=True):

    # Load data from the specified dataset
    splits = [.6, .2]  # this will split the data into [60%, 20%, 20%]

    if dataset == '1000_genomes':
        # Split the labelled data into 75% train / 25% valid; this
        # corresponds to 60/20 of the whole data. The test set is handled
        # elsewhere as an extra 20% of the whole data.
        splits = [.75]
        data = du.load_1000_genomes(transpose=transpose,
                                    label_splits=splits,
                                    feature_splits=[.8],
                                    fold=which_fold,
                                    nolabels=embedding_input,
                                    norm=norm, path=dataset_path)
    else:
        print("Unknown dataset")
        return

    if not transpose:
        (x_train, y_train), (x_valid, y_valid), (x_test, y_test), \
            x_nolabel = data
    else:
        return data

    if not embedding_source:
        if x_nolabel is None:
            # No separate unlabelled set: use the transposed training set
            x_unsup = x_train.transpose()
        else:
            x_unsup = x_nolabel
    else:
        x_unsup = None

    # If needed, remove some of the training labels
    if keep_labels <= 1.0:
        training_labels = y_train.copy()
        random.seed(23)
        nb_train = len(training_labels)

        indices = list(range(nb_train))  # list() so shuffle works on Python 3
        random.shuffle(indices)

        indices_discard = indices[:int(nb_train * (1 - keep_labels))]
        for idx in indices_discard:
            training_labels[idx] = missing_labels_val
    else:
        training_labels = y_train

    return x_train, y_train, x_valid, y_valid, x_test, y_test, \
        x_unsup, training_labels
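
# Runnable sketch of the `keep_labels` masking step used in load_data above,
# on made-up labels; the seed and missing-label value mirror the function's
# defaults, while the toy label vector and keep ratio are assumptions.
def _demo_discard_training_labels(keep_labels=0.5, missing_labels_val=1.):
    import random
    import numpy as np

    y_train = np.arange(10, dtype='float32')  # toy labels
    training_labels = y_train.copy()

    random.seed(23)
    indices = list(range(len(training_labels)))
    random.shuffle(indices)

    # Overwrite a (1 - keep_labels) fraction of the labels
    for idx in indices[:int(len(training_labels) * (1 - keep_labels))]:
        training_labels[idx] = missing_labels_val
    return training_labels
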
def generate_1000_genomes_hist(
        transpose=False, label_splits=None, feature_splits=None, fold=0,
        perclass=False,
        path='/data/lisatmp4/romerosa/datasets/1000_Genome_project/'):
    train, valid, test, _ = du.load_1000_genomes(transpose=transpose,
                                                 label_splits=label_splits,
                                                 feature_splits=feature_splits,
                                                 fold=fold, norm=False,
                                                 nolabels='raw')

    # Generate no_label: fuse train and valid sets
    nolabel_orig = (np.vstack([train[0], valid[0]])).transpose()
    nolabel_y = np.vstack([train[1], valid[1]])
    nolabel_y = nolabel_y.argmax(axis=1)

    filename = 'histo3x26' if perclass else 'histo3'
    filename += '_fold' + str(fold) + '.npy'

    if perclass:
        # First dimension has length 'number of SNPs'; each SNP gets a 3-bin
        # genotype histogram per population (26 populations).
        nolabel_x = np.zeros((nolabel_orig.shape[0], 3 * 26))
        for i in range(nolabel_x.shape[0]):
            if i % 5000 == 0:
                print("processing snp no:", i)
            for j in range(26):
                nolabel_x[i, j*3:j*3+3] += \
                    np.bincount(nolabel_orig[i, nolabel_y == j].astype('int32'),
                                minlength=3)
                nolabel_x[i, j*3:j*3+3] /= nolabel_x[i, j*3:j*3+3].sum()
    else:
        # One 3-bin genotype histogram per SNP over all samples
        nolabel_x = np.zeros((nolabel_orig.shape[0], 3))
        for i in range(nolabel_x.shape[0]):
            nolabel_x[i, :] += np.bincount(nolabel_orig[i, :].astype('int32'),
                                           minlength=3)
            nolabel_x[i, :] /= nolabel_x[i, :].sum()

    nolabel_x = nolabel_x.astype('float32')
    np.save(os.path.join(path, filename), nolabel_x)
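
# Minimal sketch of the per-SNP genotype histogram computed above: for one
# SNP (a row of genotypes in {0, 1, 2}), count each genotype with
# np.bincount(minlength=3) and normalise to frequencies. The genotype row is
# a made-up example.
def _demo_genotype_histogram():
    import numpy as np

    genotypes = np.array([0, 0, 1, 2, 1, 0], dtype='int32')
    hist = np.bincount(genotypes, minlength=3).astype('float32')
    hist /= hist.sum()
    # hist == [0.5, 0.333..., 0.166...]: frequencies of genotypes 0, 1 and 2
    return hist
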
def generate_1000_genomes_hist(path, fold, perclass, transpose=False,
                               label_splits=None, feature_splits=None,
                               sum_to_one=True, is_asw_dataset=False):
    """
    Generate per-SNP embeddings based on populations, using genotypic and
    allelic frequencies.

    Parameters:
        perclass: If True, compute the frequencies separately for each
            population.
        sum_to_one: When disabled, the function stores only the minimum
            amount of information per SNP. For example, if a SNP has a major
            allelic frequency of 0.8, only 0.8 is stored and used (0.2 can be
            inferred). At the time of writing, it remains to be shown that
            disabling this parameter does not harm the results.
    """
    if is_asw_dataset:
        train, valid, test, _, label_names = du.load_1000G_ASW(
            path=path, fold=fold, norm=False, return_label_names=True)
    else:
        train, valid, test, _, label_names = du.load_1000_genomes(
            transpose=transpose, label_splits=label_splits,
            feature_splits=feature_splits, fold=fold, norm=False,
            nolabels='raw', path=path, return_label_names=True)

    # Generate no_label: fuse train and valid sets
    nolabel_orig = (np.vstack([train[0], valid[0]])).transpose()
    nolabel_y = np.vstack([train[1], valid[1]])
    nolabel_y = nolabel_y.argmax(axis=1)

    filename_suffix = ('_perclass' if perclass else '') + \
                      ('' if sum_to_one else '_SumToOne') + \
                      '_fold' + str(fold) + '.npy'
    filename_genotypic = 'histo_' + 'GenotypicFrequency' + filename_suffix
    filename_allelic = 'histo_' + 'AllelicFrequency' + filename_suffix

    generate_snp_hist(nolabel_orig, nolabel_y, label_names, perclass,
                      sum_to_one,
                      os.path.join(path, filename_genotypic),
                      os.path.join(path, filename_allelic))
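
# Small runnable sketch of the filename suffix built above, useful for
# checking which .npy files a given (perclass, sum_to_one, fold) setting will
# produce; the example argument values are assumptions.
def _demo_hist_filenames(perclass=True, sum_to_one=True, fold=0):
    suffix = ('_perclass' if perclass else '') + \
             ('' if sum_to_one else '_SumToOne') + \
             '_fold' + str(fold) + '.npy'
    # With the defaults above this returns
    # ('histo_GenotypicFrequency_perclass_fold0.npy',
    #  'histo_AllelicFrequency_perclass_fold0.npy')
    return ('histo_GenotypicFrequency' + suffix,
            'histo_AllelicFrequency' + suffix)
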
def load_data(data_path, raw_path, emb_path, fold):
    # norm is True because the samples are normalized for training
    print('Load 1000 genome data')
    data = du.load_1000_genomes(data_path, raw_path, fold=fold, norm=True)

    (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = data

    feat_emb_val = []
    if emb_path:
        print('Load embedding data')
        feat_emb_val = du.load_embedding_mat(data_path, emb_path, fold=fold,
                                             transpose=False)

    training_labels = y_train

    return x_train, y_train, x_valid, y_valid, x_test, y_test, \
        feat_emb_val, training_labels
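
# Hedged usage sketch for the loader above; the paths are placeholder
# assumptions that must point at an existing dataset, and `emb_path` may be
# empty to skip loading the embedding matrix.
def _demo_load_data_with_embedding(data_path='/path/to/1000_genomes/',
                                   raw_path='raw_data.npy',
                                   emb_path='histo3x26_fold0.npy'):
    (x_train, y_train, x_valid, y_valid,
     x_test, y_test, feat_emb_val, training_labels) = load_data(
        data_path, raw_path, emb_path, fold=0)
    return x_train.shape, len(feat_emb_val)
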
def load_data(dataset, dataset_path, embedding_source,
              which_fold=0, keep_labels=1., missing_labels_val=1.,
              embedding_input='raw', transpose=False, norm=True):

    # Load data from the specified dataset
    splits = [.6, .2]  # this will split the data into [60%, 20%, 20%]

    if dataset == '1000_genomes':
        # Split the labelled data into 75% train / 25% valid; this
        # corresponds to 60/20 of the whole data. The test set is handled
        # elsewhere as an extra 20% of the whole data.
        splits = [.75]
        data = du.load_1000_genomes(dataset_path,
                                    transpose=transpose,
                                    label_splits=splits,
                                    feature_splits=[.8],
                                    fold=which_fold,
                                    nolabels=embedding_input,
                                    norm=norm,
                                    return_subject_ids=True,
                                    return_snp_names=True,
                                    return_label_names=True)
        if transpose:
            return data
        else:
            ((x_train, y_train, exmpl_ids_train),
             (x_valid, y_valid, exmpl_ids_valid),
             (x_test, y_test, exmpl_ids_test),
             x_nolabel, feature_names, label_names) = data
    elif dataset in ['amikacin__pseudomonas_aeruginosa',
                     'beta-lactam__streptococcus_pneumoniae',
                     'carbapenem__acinetobacter_baumannii',
                     'gentamicin__staphylococcus_aureus',
                     'isoniazid__mycobacterium_tuberculosis']:
        data = du.load_antibiotic_resistance(dataset_path, dataset,
                                             transpose=transpose,
                                             label_splits=[.8],
                                             feature_splits=[.8],
                                             fold=which_fold,
                                             nolabels=embedding_input,
                                             norm=False)
        if transpose:
            return data
        else:
            (x_train, y_train), (x_valid, y_valid), (x_test, y_test), \
                x_nolabel = data
            exmpl_ids_train = ["" for i in range(x_train.shape[0])]
            exmpl_ids_valid = ["" for i in range(x_valid.shape[0])]
            exmpl_ids_test = ["" for i in range(x_test.shape[0])]
            feature_names = ["" for i in range(x_train.shape[1])]
            label_names = None
    elif dataset == "1000G_WTCCC":
        data = du.load_1000G_WTCCC(dataset_path, fold=which_fold, norm=norm)
        ((x_train, y_train, exmpl_ids_train),
         (x_valid, y_valid, exmpl_ids_valid),
         (x_test, y_test, exmpl_ids_test),
         x_nolabel, feature_names, label_names) = data
    elif dataset == "1000G_ASW":
        data = du.load_1000G_ASW(dataset_path, fold=which_fold, norm=norm,
                                 return_subject_ids=True,
                                 return_snp_names=True,
                                 return_label_names=True)
        ((x_train, y_train, exmpl_ids_train),
         (x_valid, y_valid, exmpl_ids_valid),
         (x_test, y_test, exmpl_ids_test),
         x_nolabel, feature_names, label_names) = data
    else:
        print("Unknown dataset")
        return

    if not embedding_source:
        if x_nolabel is None:
            # No separate unlabelled set: use the transposed training set
            x_unsup = x_train.transpose()
        else:
            x_unsup = x_nolabel
    else:
        x_unsup = None

    # If needed, remove some of the training labels
    if keep_labels <= 1.0:
        training_labels = y_train.copy()
        random.seed(23)
        nb_train = len(training_labels)

        indices = list(range(nb_train))  # list() so shuffle works on Python 3
        random.shuffle(indices)

        indices_discard = indices[:int(nb_train * (1 - keep_labels))]
        for idx in indices_discard:
            training_labels[idx] = missing_labels_val
    else:
        training_labels = y_train

    return (x_train, y_train, exmpl_ids_train,
            x_valid, y_valid, exmpl_ids_valid,
            x_test, y_test, exmpl_ids_test,
            x_unsup, training_labels, feature_names, label_names)
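
# Hedged usage sketch of the extended loader above for the '1000G_ASW'
# dataset; the dataset path is a placeholder assumption. The thirteen
# returned values are unpacked in the order the function returns them.
def _demo_load_data_1000G_ASW(dataset_path='/path/to/1000G_ASW/'):
    (x_train, y_train, exmpl_ids_train,
     x_valid, y_valid, exmpl_ids_valid,
     x_test, y_test, exmpl_ids_test,
     x_unsup, training_labels,
     feature_names, label_names) = load_data('1000G_ASW', dataset_path,
                                             embedding_source=None,
                                             which_fold=0)
    return x_train.shape, label_names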