def __init__(self, pd, study_names, protocol=None):
    """Collect per-study PD objects and align phenotype/covariate names.

    :param pd: iterable of per-study objects, each exposing ``.folder``
    :param study_names: one study name per element of ``pd`` (same order)
    :param protocol: optional protocol object; parsed here when not yet enabled
    """

    # @timing
    def _common_order(mapper_obj, study_keys):
        # Each value in mapper_obj.dic is a per-study index vector for one
        # name; -1 marks "absent in that study".
        rows = np.array(list(mapper_obj.dic.values()))
        missing = (rows == -1).any(axis=1)
        if np.sum(missing) == rows.shape[0]:
            raise ValueError('There is no common names between studies')
        order = {}
        for col, key in enumerate(study_keys):
            order[key] = rows[~missing, col]
        return order

    self.name = None
    self.study_names = study_names
    self.phen_mapper = Mapper()
    self.cov_mapper = Mapper()
    self.covariates = OrderedDict()
    self.pd = OrderedDict()
    for study in pd:
        self.pd[study.folder.name] = study

    keys = []
    if protocol is None:
        for idx, study in enumerate(pd):
            # Strip the '<study name>_' prefix from each covariate name.
            prefix = self.study_names[idx] + '_'
            self.covariates[study] = [
                n.split(prefix)[1]
                for n in study.folder._data.metadata['names']
            ]
            if idx == 0:
                # First study initializes both mappers ...
                self.phen_mapper.fill(study.folder._data.metadata['phenotype'],
                                      study.folder.name,
                                      reference=False)
                self.cov_mapper.fill(self.covariates[study],
                                     study.folder.name,
                                     reference=False)
            else:
                # ... later studies are merged into them.
                self.phen_mapper.push(study.folder._data.metadata['phenotype'],
                                      name=study.folder.name,
                                      new_id=False)
                self.cov_mapper.push(self.covariates[study],
                                     name=study.folder.name,
                                     new_id=False)
            keys.append(study.folder.name)
        self.phen_order = _common_order(self.phen_mapper, keys)
        self.cov_order = _common_order(self.cov_mapper, keys)

    if protocol is not None:
        if not protocol.enable:
            protocol.parse()
class MetaParData(object):
    """Aggregates partial-derivative (PD) data from several studies.

    Holds one PD object per study in ``self.pd`` (keyed by folder name) and
    per-study index orders (``phen_order``/``cov_order``) for the phenotype
    and covariate names that are present in every study.
    """

    def __init__(self, pd, study_names, protocol=None):
        """Index the per-study PD objects and align common names.

        :param pd: iterable of per-study objects, each exposing ``.folder``
        :param study_names: one study name per element of ``pd`` (same order)
        :param protocol: optional protocol object; parsed if not yet enabled
        :raises ValueError: if no phenotype/covariate name is shared by all studies
        """

        # @timing
        def _check(mapper, keys):
            # mapper.dic maps a name -> per-study index vector; -1 means the
            # name is absent in that study.  Keep only rows with no -1.
            values = np.array(list(mapper.dic.values()))
            result = {}
            r = (values == -1).any(axis=1)
            if np.sum(r) == values.shape[0]:
                raise ValueError('There is no common names between studies')
            for i, k in enumerate(keys):
                result[k] = values[~r, i]
            return result

        self.name = None
        self.study_names = study_names
        self.phen_mapper = Mapper()
        self.cov_mapper = Mapper()
        self.covariates = OrderedDict()
        self.pd = OrderedDict()
        for i in pd:
            self.pd[i.folder.name] = i
        keys = []
        if protocol is None:
            for i, k in enumerate(pd):
                # Covariate names are stored as '<study name>_<covariate>';
                # strip the study prefix.
                self.covariates[k] = [
                    n.split(self.study_names[i] + '_')[1]
                    for n in k.folder._data.metadata['names']
                ]
                if i == 0:
                    # First study seeds the mappers; later studies are pushed in.
                    self.phen_mapper.fill(k.folder._data.metadata['phenotype'],
                                          k.folder.name,
                                          reference=False)
                    self.cov_mapper.fill(self.covariates[k],
                                         k.folder.name,
                                         reference=False)
                else:
                    self.phen_mapper.push(k.folder._data.metadata['phenotype'],
                                          name=k.folder.name,
                                          new_id=False)
                    self.cov_mapper.push(self.covariates[k],
                                         name=k.folder.name,
                                         new_id=False)
                keys.append(k.folder.name)
            self.phen_order = _check(self.phen_mapper, keys)
            self.cov_order = _check(self.cov_mapper, keys)
        if protocol is not None:
            if not protocol.enable:
                protocol.parse()

    def check_pd(self, old, new):
        """Print diagnostic ratios comparing accumulated PD arrays with a new study's.

        ``old``/``new`` are 4-element sequences of arrays (a_test, b_cov, C, a_cov)
        as produced by :meth:`get`.  Output is printed, nothing is returned.
        """
        np.set_printoptions(precision=3, suppress=True)
        print("*******PD CHECK*********")
        print('A covariates...')
        print(old[3] / old[3][0, 0])
        print(new[3] / new[3][0, 0])
        print('FREQ TESTs')
        # Sample 10 random rows for a spot check.
        N = np.random.randint(0, np.min((old[0].shape[0], new[0].shape[0])), 10)
        print(np.array(old[0][:, 0] / old[3][0, 0] / 2)[N])
        print(np.array(new[0][:, 0] / new[3][0, 0] / 2)[N])
        print('FREQ PHENO')
        M = np.random.randint(0, np.min((old[1].shape[1], new[1].shape[1])), 10)
        print(np.array(old[1][0, :] / old[3][0, 0])[M])
        print(np.array(new[1][0, :] / new[3][0, 0])[M])
        print("****************")

    # @timing
    def check_maf(self, SNPs_index):
        """Verify that per-SNP 'MAF' metadata agrees across studies.

        :param SNPs_index: one index array per study, selecting the same SNPs
        :raises ValueError: if the std across studies exceeds 0.1 for any SNP
        """
        maf = np.zeros((SNPs_index[0].shape[0], len(self.pd)))
        for i, j in enumerate(self.pd):
            maf[:, i] = np.array(
                self.pd[j].folder._data.metadata['MAF'])[SNPs_index[i].astype(
                    np.int64)]
        if (np.std(maf, axis=1) > 0.1).any():
            raise ValueError('MAF is not consistent between PD data!')

    def get(self,
            SNPs_index=None,
            B4=False,
            regression_model=None,
            random_effect_intercept=False):
        """Sum the per-study PD arrays for the selected SNPs.

        :param SNPs_index: list with one SNP index array per study (required)
        :param B4: also accumulate and return the studies' ``b4`` arrays
        :param regression_model: unused here; kept for interface compatibility
        :param random_effect_intercept: additionally stack per-study intercept
            columns/rows so a study random effect can be modeled downstream
        :return: (a_test, b_cov, C, a_cov) summed over studies, plus ``b4``
            when ``B4`` is True
        :raises ValueError: if data or indexes are missing/mismatched
        """
        if self.pd is None:
            raise ValueError('Data not defined!')
        k = list(self.pd.keys())
        if SNPs_index is not None:
            if len(self.pd) != len(SNPs_index):
                raise ValueError(
                    'There are not equal number of PD and SNPs indexes {}!={}'.
                    format(len(self.pd), len(SNPs_index)))
            a_test, b_cov, C, a_cov = self.pd[k[0]].get(
                gen_order=SNPs_index[0],
                phen_order=self.phen_order[k[0]],
                cov_order=self.cov_order[k[0]])
            if random_effect_intercept and len(self.pd) > 1:  # TODO (high)
                # Keep the first study's intercept slices to build the
                # random-effect blocks below.
                a_test_effect = a_test[:, 0:1]
                b_cov_effect = b_cov[0:1, :]
                a_cov_effect = a_cov[0:1, :]
        else:
            raise ValueError('Indexes are not defined!')
        if B4:
            b4 = self.pd[k[0]].folder._data.b4[SNPs_index[0], :]
            b4 = b4[:, self.phen_order[k[0]]]
        # self.check_maf(SNPs_index)
        for i in range(1, len(self.pd)):
            a, b, c, a_c = self.pd[k[i]].get(gen_order=SNPs_index[i],
                                             phen_order=self.phen_order[k[i]],
                                             cov_order=self.cov_order[k[i]])
            self.check_pd([a_test, b_cov, C, a_cov], [a, b, c, a_c])
            a_test = a_test + a
            b_cov = b_cov + b
            C = C + c
            a_cov = a_cov + a_c
            # The last study's intercept is not stacked (it is absorbed by the
            # fixed intercept), hence i < len(self.pd) - 1.
            if random_effect_intercept and i < (len(self.pd) - 1):
                a_test_effect = np.hstack((a_test_effect, a[:, 0:1]))
                b_cov_effect = np.vstack((b_cov_effect, b[0:1, :]))
                a_cov_effect = np.vstack((a_cov_effect, a_c[0:1, :]))
            if B4:
                b4_tmp = self.pd[k[i]].folder._data.b4[SNPs_index[i], :]
                b4 = b4 + b4_tmp[:, self.phen_order[k[i]]]
        if random_effect_intercept and len(self.pd) > 1:
            # Prepend the per-study intercept blocks to the summed arrays.
            a_cov_I = np.zeros((len(self.pd) - 1, len(self.pd) - 1))
            np.fill_diagonal(a_cov_I, a_cov_effect[:, 0])
            a_effect = np.hstack((a_cov_I, a_cov_effect))
            a_test = np.hstack((a_test_effect, a_test))
            b_cov = np.vstack((b_cov_effect, b_cov))
            a_cov = np.vstack((a_cov_effect, a_cov))
            a_cov = np.hstack((a_effect.T, a_cov))
        if B4:
            return a_test, b_cov, C, a_cov, b4
        else:
            return a_test, b_cov, C, a_cov

    def maf_pard(self, SNPs_index=None):
        """Return the sample-size-weighted average of the studies' 'MAF' values.

        :param SNPs_index: one index array per study, selecting the same SNPs
        :return: 1-D array, weighted mean per selected SNP
        """
        samples = 0
        maf = np.zeros(len(SNPs_index[0]))
        for j, i in enumerate(self.pd):
            n = len(self.pd[i].folder._data.metadata['id'])
            samples += n
            maf = maf + n * np.array(
                self.pd[i].folder._data.metadata['MAF'])[SNPs_index[j]]
        # FIX: np.float was removed in NumPy >= 1.24; the builtin is equivalent.
        maf = maf / float(samples)
        return maf

    def get_n_id(self):
        """Return the total number of sample ids across all studies."""
        return np.sum(
            [len(self.pd[i].folder._data.metadata['id']) for i in self.pd])

    def get_phenotype_names(self):
        """Return {study name: phenotype name list} from each study's metadata."""
        n = {}
        for i in self.pd:
            n[i] = self.pd[i].folder._data.metadata['phenotype']
        # FIX: the dict was built but never returned (method returned None).
        return n
class MetaPhenotype(object):
    """Pools phenotype data from several studies and serves it in column chunks.

    Builds a name mapper over all studies' phenotype files, optionally filters
    names with include/exclude ID tables, and exposes :meth:`get` to iterate
    over the common phenotypes ``chunk_size`` columns at a time.
    """

    def __init__(self, phen, protocol=None, include=None, exclude=None):
        """
        :param phen: iterable of per-study objects, each exposing ``.folder``
        :param protocol: optional protocol object; parsed if not yet enabled
        :param include: optional CSV path with an 'ID' column of names to keep
        :param exclude: optional CSV path with an 'ID' column of names to drop
        :raises ValueError: if a table lacks 'ID' or no common names remain
        """

        # @timing
        def _check(mapper, keys, index=None):
            # mapper.dic maps a name -> per-study index vector; -1 means the
            # name is absent in that study.
            values = np.array(list(mapper.dic.values()))
            result = {}
            r = (values == -1).any(axis=1)
            if index is not None:
                # Also drop names rejected by the include/exclude filter.
                r[np.where(index == False)[0]] = True
            if np.sum(r) == values.shape[0]:
                raise ValueError('There is no common names between studies')
            for i, k in enumerate(keys):
                result[k] = values[~r, i]
            return result, np.array(list(mapper.dic.keys()))[~r]

        self.chunk_size = 10000
        self.exclude = None
        self.include = None
        self.name = None
        self.mapper = Mapper()
        self.keys = []
        self.keep_index = None
        if include is not None:
            # FIX: pd.DataFrame.from_csv was removed in pandas 1.0; read_csv is
            # the documented replacement (index_col=None was already requested).
            self.include = pd.read_csv(include)
            print('Include:')
            print(self.include.head())
            if 'ID' not in self.include.columns:
                raise ValueError(
                    '{} table does not have ID column for phenotypes'.format(
                        include))
        if exclude is not None:
            self.exclude = pd.read_csv(exclude)
            print('Exclude:')
            print(self.exclude.head())
            if 'ID' not in self.exclude.columns:
                raise ValueError(
                    '{} table does not have ID column for phenotypes'.format(
                        exclude))
        if protocol is None:
            for i, k in enumerate(phen):
                phen_names = []
                if i == 0:
                    # First study seeds the mapper; later studies are pushed in.
                    for j in k.folder.files:
                        if j != 'info_dic.npy':
                            phen_names = phen_names + list(
                                k.folder.data_info[j])
                    self.mapper.fill(phen_names, i, reference=False)
                else:
                    for j in k.folder.files:
                        if j != 'info_dic.npy':
                            phen_names = phen_names + list(
                                k.folder.data_info[j])
                    self.mapper.push(phen_names, name=i, new_id=False)
                self.keys.append(i)
            if self.exclude is not None or self.include is not None:
                phen_names = pd.Series(list(self.mapper.dic.keys()))
                phen_names = phen_names[phen_names.isin(
                    self.include.ID
                )] if self.include is not None else phen_names
                phen_names = phen_names[~phen_names.isin(
                    self.exclude.ID
                )] if self.exclude is not None else phen_names
                self.keep_index = pd.Series(list(
                    self.mapper.dic.keys())).isin(phen_names)
                print(np.sum(self.keep_index))
            self.order, self.phen_names = _check(self.mapper,
                                                 self.keys,
                                                 index=self.keep_index)
            self.n_phenotypes = len(self.order[self.keys[0]])
            print(('Loaded {} common phenotypes for meta-analysis'.format(
                self.n_phenotypes)))
            self.processed = 0
        else:
            if not protocol.enable:
                protocol.parse()
        self.pool = {i: PhenPool(j.folder) for i, j in enumerate(phen)}

    def get(self):
        """Return the next chunk of phenotype columns, stacked across studies.

        :return: (phenotype matrix of shape (total samples, chunk columns),
            names of those columns), or (None, None) when exhausted
        """
        if self.processed == self.n_phenotypes:
            return None, None
        else:
            start = self.processed
            # Clamp the chunk to the number of remaining phenotypes.
            finish = self.processed + self.chunk_size if (
                self.processed +
                self.chunk_size) <= self.n_phenotypes else self.n_phenotypes
            self.processed = finish
            phenotype = np.zeros(
                (np.sum([len(self.pool[i].folder._data.id)
                         for i in self.pool]), len(list(range(start,
                                                              finish)))))
            for i, j in enumerate(self.keys):
                if i == 0:
                    a, b = 0, self.pool[j].folder._data.id.shape[0]
                    phenotype[a:b, :] = self.pool[j].get_chunk(
                        self.order[j][start:finish])
                    a = b
                    # Spot-check: print means of 10 random columns.
                    N = np.random.randint(0, phenotype.shape[1], 10)
                    print(phenotype.mean(axis=0)[N])
                else:
                    ph_tmp = self.pool[j].get_chunk(self.order[j][start:finish])
                    print(self.pool[j].folder.path)
                    print(ph_tmp.mean(axis=0)[N])
                    b += ph_tmp.shape[0]
                    phenotype[a:b, :] = ph_tmp
                    a = b
            return phenotype, self.phen_names[start:finish]
def __init__(self, phen, protocol=None, include=None, exclude=None):
    """Build the cross-study phenotype name mapper and per-study pools.

    :param phen: iterable of per-study objects, each exposing ``.folder``
    :param protocol: optional protocol object; parsed if not yet enabled
    :param include: optional CSV path with an 'ID' column of names to keep
    :param exclude: optional CSV path with an 'ID' column of names to drop
    :raises ValueError: if a table lacks 'ID' or no common names remain
    """

    # @timing
    def _check(mapper, keys, index=None):
        # mapper.dic maps a name -> per-study index vector; -1 means the
        # name is absent in that study.
        values = np.array(list(mapper.dic.values()))
        result = {}
        r = (values == -1).any(axis=1)
        if index is not None:
            # Also drop names rejected by the include/exclude filter.
            r[np.where(index == False)[0]] = True
        if np.sum(r) == values.shape[0]:
            raise ValueError('There is no common names between studies')
        for i, k in enumerate(keys):
            result[k] = values[~r, i]
        return result, np.array(list(mapper.dic.keys()))[~r]

    self.chunk_size = 10000
    self.exclude = None
    self.include = None
    self.name = None
    self.mapper = Mapper()
    self.keys = []
    self.keep_index = None
    if include is not None:
        # FIX: pd.DataFrame.from_csv was removed in pandas 1.0; read_csv is
        # the documented replacement (index_col=None was already requested).
        self.include = pd.read_csv(include)
        print('Include:')
        print(self.include.head())
        if 'ID' not in self.include.columns:
            raise ValueError(
                '{} table does not have ID column for phenotypes'.format(
                    include))
    if exclude is not None:
        self.exclude = pd.read_csv(exclude)
        print('Exclude:')
        print(self.exclude.head())
        if 'ID' not in self.exclude.columns:
            raise ValueError(
                '{} table does not have ID column for phenotypes'.format(
                    exclude))
    if protocol is None:
        for i, k in enumerate(phen):
            phen_names = []
            if i == 0:
                # First study seeds the mapper; later studies are pushed in.
                for j in k.folder.files:
                    if j != 'info_dic.npy':
                        phen_names = phen_names + list(
                            k.folder.data_info[j])
                self.mapper.fill(phen_names, i, reference=False)
            else:
                for j in k.folder.files:
                    if j != 'info_dic.npy':
                        phen_names = phen_names + list(
                            k.folder.data_info[j])
                self.mapper.push(phen_names, name=i, new_id=False)
            self.keys.append(i)
        if self.exclude is not None or self.include is not None:
            phen_names = pd.Series(list(self.mapper.dic.keys()))
            phen_names = phen_names[phen_names.isin(
                self.include.ID
            )] if self.include is not None else phen_names
            phen_names = phen_names[~phen_names.isin(
                self.exclude.ID
            )] if self.exclude is not None else phen_names
            self.keep_index = pd.Series(list(
                self.mapper.dic.keys())).isin(phen_names)
            print(np.sum(self.keep_index))
        self.order, self.phen_names = _check(self.mapper,
                                             self.keys,
                                             index=self.keep_index)
        self.n_phenotypes = len(self.order[self.keys[0]])
        print(('Loaded {} common phenotypes for meta-analysis'.format(
            self.n_phenotypes)))
        self.processed = 0
    else:
        if not protocol.enable:
            protocol.parse()
    self.pool = {i: PhenPool(j.folder) for i, j in enumerate(phen)}
G.split_size = CONVERTER_SPLIT_SIZE G.VCF2hdf5(args.out) else: raise ValueError( 'Genotype data should be in PLINK/MINIMAC/VCF format and alone in folder' ) check_converter(args.out, args.study_name[0]) print(('Time to convert all data: {} sec'.format(t.secs))) ################################### ENCODING ############################## elif args.mode == 'encoding': # ARG_CHECKER.check(args,mode='encoding') mapper = Mapper() mapper.genotype_names = args.study_name mapper.chunk_size = MAPPER_CHUNK_SIZE mapper.reference_name = args.ref_name mapper.load_flip(args.mapper) mapper.load(args.mapper) phen = Reader('phenotype') phen.start(args.phenotype[0]) gen = Reader('genotype') gen.start(args.genotype[0], hdf5=args.hdf5, study_name=args.study_name[0], ID=False)