def convert_genotypes(self):
    """Write the genotype source to HDF5, one file per chunk of SNPs.

    Chunks of ``self.split_size`` SNPs are pulled from
    ``self.reader.folder.get_bed`` until it returns ``None``. For each chunk
    ``self.write_data('gen')`` is called first (presumably this is what opens
    ``self.h5_gen_file`` for the chunk -- confirm against ``write_data``),
    then the chunk is stored as an Int8 carray named ``genotype`` and the
    file is closed.

    Raises:
        ValueError: if ``self.split_size`` is ``None``.
    """
    chunk_size = self.split_size
    if chunk_size is None:
        raise ValueError(
            'CONVERTER_SPLIT_SIZE does not define in config file!')

    while True:
        with Timer() as t:
            G = self.reader.folder.get_bed(chunk_size)
        if G is None:
            # The reader signals exhaustion with None.
            break
        print(('Time to read {} SNPs is {} s'.format(G.shape[0], t.secs)))

        self.write_data('gen')
        try:
            self.genotype = self.h5_gen_file.create_carray(
                self.h5_gen_file.root,
                'genotype',
                tables.Int8Atom(),
                G.shape,
                title='Genotype',
                filters=self.pytable_filters)
            with Timer() as t:
                self.genotype[:] = G
            print(('Time to write {} SNPs is {} s'.format(
                G.shape[0], t.secs)))
        finally:
            # Close the chunk file even when creating/writing the array
            # fails, so a failed conversion does not leak an open handle.
            self.h5_gen_file.close()

        # Drop the chunk before reading the next one to keep peak memory low.
        G = None
        gc.collect()
def hase_convert(args):
    """Convert the genotype data described by ``args`` to HASE's HDF5 layout.

    The source format is taken from the reader (``PLINK``, ``MINIMAC`` or
    ``VCF``); the matching converter is built for ``args.study_name[0]`` and
    writes into ``args.out``. ``check_converter`` is run on the output and
    the conversion time is printed.

    Raises:
        ValueError: if the reader reports any other format.
    """
    reader = Reader('genotype')
    reader.start(args.genotype[0], vcf=args.vcf)

    with Timer() as timer:
        source_format = reader.format
        if source_format not in ('PLINK', 'MINIMAC', 'VCF'):
            raise ValueError(
                'Genotype data should be in PLINK/MINIMAC/VCF format and alone in folder'
            )

        study = args.study_name[0]
        if source_format == 'PLINK':
            converter = GenotypePLINK(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.plink2hdf5(out=args.out)
        elif source_format == 'MINIMAC':
            converter = GenotypeMINIMAC(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.MACH2hdf5(args.out, id=args.id)
        else:
            converter = GenotypeVCF(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.VCF2hdf5(args.out)

    check_converter(args.out, args.study_name[0])
    print(('Time to convert all data: {} sec'.format(timer.secs)))
def hase_convert(args):
    """Convert genotype data to HASE format unless it already looks converted.

    If ``probes``, ``genotype`` and ``individuals`` all exist under
    ``args.outfolder`` the function prints a notice and returns without
    converting. Otherwise the reader's format (``PLINK``, ``MINIMAC`` or
    ``VCF``) selects the converter, output goes to ``args.out``,
    ``check_converter`` is run, and ``args.outfolder`` is reassigned.

    Raises:
        ValueError: if the reader reports any other format.
    """
    hase_subfolders = ('/probes/', '/genotype/', '/individuals/')
    if all(os.path.exists(args.outfolder + sub) for sub in hase_subfolders):
        print(
            "The folders: probes, genotype and individuals already exist. Data seems already in HASE format. Delete "
            "the folders if the files are not converted properly. Continuing with the current files:"
        )
        return

    print('using', args.outfolder)

    reader = Reader('genotype')
    reader.start(args.genotype[0], vcf=args.vcf)

    with Timer() as timer:
        source_format = reader.format
        if source_format not in ('PLINK', 'MINIMAC', 'VCF'):
            raise ValueError(
                'Genotype data should be in PLINK/MINIMAC/VCF format and alone in folder'
            )

        study = args.study_name[0]
        if source_format == 'PLINK':
            converter = GenotypePLINK(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.plink2hdf5(out=args.out)
        elif source_format == 'MINIMAC':
            converter = GenotypeMINIMAC(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.MACH2hdf5(args.out, id=args.id)
        else:
            converter = GenotypeVCF(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.VCF2hdf5(args.out)

    check_converter(args.out, args.study_name[0])
    # NOTE(review): this stores the whole ``args.genotype`` value (it is
    # indexed with [0] above), not a single path -- confirm this is intended.
    args.outfolder = args.genotype
    print(('Time to convert all data: {} sec'.format(timer.secs)))
def HASE(b4, A_inverse, b_cov, C, N_con, DF):
    """Compute t-statistics and standard errors for the tested term.

    Args:
        b4: 2-D array; its first axis is paired with the first axis of
            ``A_inverse`` and its second axis is reported as the number of
            phenotypes.
        A_inverse: 3-D array; its first axis is reported as the number of
            SNPs. Index ``N_con`` on the last two axes selects the tested
            term, indices ``0..N_con-1`` the constant terms.
        b_cov: 2-D array contracted against the constant columns of
            ``A_inverse``.
        C: array flattened to a single row and subtracted from the
            quadratic form.
        N_con: number of constant terms.
        DF: degrees of freedom used to scale both outputs.

    Returns:
        Tuple ``(t_stat, SE)``.
    """
    constant_terms = slice(0, N_con)
    tested_term = slice(N_con, N_con + 1)

    with Timer() as timer:
        # A^-1 * B, split into the constant part and the tested part.
        a1b_constant = np.tensordot(
            A_inverse[:, :, constant_terms], b_cov, axes=([2], [0]))
        a1b_tested = np.einsum(
            'ijk,il->ijl', A_inverse[:, :, tested_term], b4)
        a1b = a1b_constant + a1b_tested

        # B^T * A^-1 * B, again assembled from the two parts.
        quad_constant = np.einsum(
            'ij,lji->li', b_cov.T, a1b[:, constant_terms, :])
        quad_tested = np.einsum(
            'ijk,ijk->ijk', b4[:, None, :], a1b[:, tested_term, :])
        quad_form = quad_constant[:, None, :] + quad_tested

        # |B^T A^-1 B - C| scaled by the tested diagonal entry of A^-1.
        residual = np.abs(quad_form - C.reshape(1, -1))
        scale = np.sqrt(residual * A_inverse[:, tested_term, tested_term])

        root_df = np.sqrt(DF)
        t_stat = root_df * (a1b[:, tested_term, :] / scale)
        SE = scale / root_df

    print("time to compute GWAS for {} phenotypes and {} SNPs .... {} sec".
          format(b4.shape[1], A_inverse.shape[0], timer.secs))
    return t_stat, SE
def haseregression(phen, gen, cov, mapper, Analyser, maf, intercept=True, interaction=None):
    """Run the HASE regression over all genotype and phenotype chunks.

    For every genotype chunk (selected through ``mapper`` when one is given)
    the SNPs are optionally filtered on minor allele frequency, and for every
    phenotype chunk the t-statistics and standard errors from ``HASE`` are
    handed to ``Analyser.save_result``.

    Args:
        phen: phenotype reader; ``get_next``, ``folder`` and ``permutation``
            are used.
        gen: iterable of genotype readers (``folder._data`` of each is used).
        cov: covariate reader; read once with ``get_next``.
        mapper: SNP mapper or ``None``. With ``cluster == 'n'`` chunks come
            from ``mapper.get()``; otherwise from ``mapper.chunk_pop()``.
        Analyser: result sink; ``rsid``, ``MAF``, ``t_stat``, ``SE`` (and the
            cluster / permutation flags) are set on it before each save.
        maf: MAF threshold; ``0`` disables the filter.
        intercept: forwarded to the matrix-building helpers.
        interaction: accepted for interface compatibility; not used here.
    """
    genotype_data = tuple([i.folder._data for i in gen])
    row_index, ids = study_indexes(phenotype=phen.folder._data,
                                   genotype=genotype_data,
                                   covariates=cov.folder._data)

    # Progress counters: [SNPs read, SNPs kept after MAF filter, total].
    if mapper is not None:
        SNP = [0, 0, mapper.n_keys]
    else:
        SNP = [0, 0, 'unknown']

    covariates = cov.get_next(index=row_index[2])
    a_cov = A_covariates(covariates, intercept=intercept)

    while True:
        gc.collect()
        if mapper is not None:
            if mapper.cluster == 'n':
                SNPs_index, keys = mapper.get()
            else:
                ch = mapper.chunk_pop()
                if ch is None:
                    break
                SNPs_index, keys = mapper.get(chunk_number=ch)
            if SNPs_index is None:
                break
            Analyser.rsid = keys
        else:
            SNPs_index = None

        with Timer() as t:
            genotype = merge_genotype(gen, SNPs_index, mapper)
        print(('time to read and merge genotype {}s'.format(t.secs)))
        gc.collect()
        if genotype is None:
            print('All genotype processed!')
            break

        SNP[0] += genotype.shape[0]
        genotype = genotype[:, row_index[0]]

        if mapper is None:
            # Without a mapper there are no SNP keys; use positional ids.
            Analyser.rsid = np.array(list(range(genotype.shape[0])))

        MAF = np.mean(genotype, axis=1) / 2

        if maf != 0:
            maf_mask = (MAF > maf) & (MAF < 1 - maf) & (MAF != 0.5)
            genotype = genotype[maf_mask, :]
            Analyser.MAF = MAF[maf_mask]
            Analyser.rsid = Analyser.rsid[maf_mask]
            if genotype.shape[0] == 0:
                print('NO SNPs > MAF')
                continue
        else:
            Analyser.MAF = MAF

        SNP[1] += genotype.shape[0]

        while True:
            phenotype = phen.get_next(index=row_index[1])
            if phenotype is None:
                # Rewind the phenotype reader for the next genotype chunk.
                phen.folder.processed = 0
                print('All phenotypes processed!')
                break

            if phen.permutation:
                np.random.shuffle(phenotype)

            b_cov = B_covariates(covariates, phenotype, intercept=intercept)
            C = C_matrix(phenotype)

            # NOTE(review): a_test / a_inv do not take the phenotype as
            # input, yet are rebuilt for every phenotype chunk -- presumably
            # so they can be freed below; confirm before hoisting.
            a_test = A_tests(covariates, genotype, intercept=intercept)
            a_inv = A_inverse(a_cov, a_test)
            N_con = a_inv.shape[1] - 1
            DF = (phenotype.shape[0] - a_inv.shape[1])
            b4 = B4(phenotype, genotype)

            t_stat, SE = HASE(b4, a_inv, b_cov, C, N_con, DF)
            print(('Read {}, processed {}, total {}'.format(SNP[0], SNP[1], SNP[2])))

            Analyser.t_stat = t_stat
            Analyser.SE = SE
            if mapper is not None and mapper.cluster == 'y':
                Analyser.cluster = True
                Analyser.chunk = ch
                Analyser.node = mapper.node[1]
            if phen.permutation:
                Analyser.permutation = True
            Analyser.save_result(
                phen.folder._data.names[phen.folder._data.start:phen.folder._data.finish])

            # Release the large intermediates before the next chunk.
            Analyser.t_stat = None
            del b4, C, b_cov, a_inv, a_test, t_stat
            gc.collect()

    if Analyser.cluster:
        np.save(os.path.join(Analyser.out, str(Analyser.node) + '_node_RSID.npy'),
                Analyser.rsid_dic)
print('********************************') print('r', r) if p == 0: ID = np.append(ID, b.ID) b['counter_ref'] = np.arange(counter_ref, counter_ref + b.shape[0], dtype='int32') counter_ref += b.shape[0] if len(match_index) or len(flip_index): print('matched {}'.format(match_index.shape[0])) print('flipped {}'.format(flip_index.shape[0])) if del_counter_ref.get(r) is not None: with Timer() as t: b = b[~b.counter_ref.isin(del_counter_ref[r])] print('time {}'.format(t.secs)) match_df = pd.merge(b, a, left_on=merge['straight'], right_on=merge['straight']) flip_df = pd.merge(b[~b.counter_ref.isin(match_df.counter_ref)], a, left_on=merge['reverse'], right_on=merge['straight']) if len(match_df): match_key = np.append(match_key, match_df.counter_ref) match_index = np.append(match_index, match_df.counter_prob)
def partial_derivatives(save_path=None, COV=None, PHEN=None, GEN=None, MAP=None, MAF=None, R2=None, B4_flag=False, study_name=None, intercept=True):
    """Compute and save the partial-derivative matrices for one study.

    Writes ``.npy`` files into ``save_path``, all prefixed with
    ``study_name``: ``_a_cov``, ``_b_cov`` and ``_C`` (only when not running
    in cluster mode, or on node 1), then ``_a_test``, ``_metadata`` and,
    when ``B4_flag`` is set, ``_b4``. In cluster mode (``MAP.cluster == 'y'``)
    the genotype-derived files are additionally prefixed with
    ``node_<MAP.node[1]>_`` and the last node calls ``merge_PD``.

    Args:
        save_path: directory the ``.npy`` files are written to.
        COV: covariate reader; read once with ``get_next``.
        PHEN: phenotype reader; iterated with ``get_next`` until ``None``.
        GEN: genotype reader; read chunk by chunk (or file by file in
            cluster mode).
        MAP: mapper providing ``cluster``, ``node`` and ``flip``.
        MAF: not referenced in this function.
        R2: not referenced in this function.
        B4_flag: when true, also compute and save the ``b4`` matrix.
        study_name: prefix for metadata names and output files.
        intercept: forwarded to the matrix-building helpers.
    """
    row_index, ids = study_indexes(phenotype=PHEN.folder._data, genotype=GEN.folder._data, covariates=COV.folder._data)
    metadata = {}
    # TODO (mid) add parameter to compute PD only for new phenotypes or cov
    metadata['id'] = ids
    metadata['MAF'] = []
    metadata['filter'] = []
    metadata['names'] = []  # TODO (low) change to cov_names
    metadata['phenotype'] = []
    # Per-chunk results are accumulated in lists and concatenated on save.
    b_cov = []
    C = []
    a_test = []
    b4 = []
    covariates = COV.get_next(index=row_index[2])
    # The covariate/phenotype part is computed only once: always outside
    # cluster mode, and only on node 1 inside it.
    if MAP.cluster == 'n' or MAP.node[1] == 1:
        if intercept:
            metadata['names'].append(study_name + '_intercept')
        metadata['names'] = metadata['names'] + [
            study_name + '_' + i for i in COV.folder._data.get_names()
        ]
        a_cov = A_covariates(covariates, intercept=intercept)
        np.save(os.path.join(save_path, study_name + '_a_cov.npy'), a_cov)
        with Timer() as t_phen:
            while True:
                phenotype = PHEN.get_next(index=row_index[1])
                if isinstance(phenotype, type(None)):
                    # All phenotype chunks read: concatenate and save.
                    b_cov = np.concatenate(b_cov, axis=1)
                    C = np.concatenate(C, axis=0)
                    np.save(os.path.join(save_path, study_name + '_b_cov.npy'), b_cov)
                    np.save(os.path.join(save_path, study_name + '_C.npy'), C)
                    break
                metadata['phenotype'] = metadata['phenotype'] + list(
                    PHEN.folder._data.get_names())
                b_cov.append(
                    B_covariates(covariates, phenotype, intercept=intercept))
                C.append(C_matrix(phenotype))
        print(('Time to PD phenotype {} is {} s'.format(
            np.array(C).shape, t_phen.secs)))
    if MAP.cluster == 'y':
        # Genotype files are named '<index>_<study_name>.h5'; the index range
        # is split into MAP.node[0] parts and this node takes part
        # MAP.node[1] - 1.
        f_max = np.max([int(f.split('_')[0]) for f in GEN.folder.files])
        # Reversed so that list.pop() below yields the files in ascending
        # order.
        files2read = [
            '{}_{}.h5'.format(i, study_name) for i in
            np.array_split(list(range(f_max + 1)), MAP.node[0])[MAP.node[1] - 1]
        ][::-1]
        # Files handled by the preceding nodes; their SNP counts give this
        # node's starting offset into MAP.flip.
        filesdone = []
        for i in range(MAP.node[1] - 1):
            # The subscript [i] uses the loop variable; the comprehension's
            # own ``i`` only exists inside the comprehension.
            filesdone = filesdone + [
                '{}_{}.h5'.format(i, study_name) for i in
                np.array_split(list(range(f_max + 1)), MAP.node[0])[i]
            ]
        N_snps_read = 0
        for f in filesdone:
            file = os.path.join(GEN.folder.path, 'genotype', f)
            N_snps_read += GEN.folder.get_info(file)['shape'][0]
    else:
        N_snps_read = 0
    while True:
        with Timer() as t_gen:
            if MAP.cluster == 'y':
                if len(files2read) != 0:
                    file = os.path.join(GEN.folder.path, 'genotype', files2read.pop())
                    genotype = GEN.folder.read(file)
                else:
                    genotype = None
            else:
                genotype = GEN.get_next()
            if isinstance(genotype, type(None)):
                # No more genotype data: save the accumulated results and
                # stop.
                if MAP.cluster == 'y':
                    np.save(
                        os.path.join(
                            save_path, 'node_{}_'.format(MAP.node[1]) +
                            study_name + '_a_test.npy'),
                        np.concatenate(a_test).astype(np.float64))
                    np.save(
                        os.path.join(
                            save_path, 'node_{}_'.format(MAP.node[1]) +
                            study_name + '_metadata.npy'), metadata)
                    if B4_flag:
                        b4 = np.concatenate(b4, axis=0)
                        np.save(
                            os.path.join(
                                save_path, 'node_{}_'.format(MAP.node[1]) +
                                study_name + '_b4.npy'),
                            b4.astype(np.float64))
                    # Only the last node triggers the merge of the per-node
                    # files.
                    if MAP.node[1] == MAP.node[0]:
                        merge_PD(save_path, MAP.node[0], study_name)
                else:
                    np.save(
                        os.path.join(save_path, study_name + '_a_test.npy'),
                        np.concatenate(a_test))
                    np.save(
                        os.path.join(save_path, study_name + '_metadata.npy'),
                        metadata)
                    if B4_flag:
                        b4 = np.concatenate(b4, axis=0)
                        np.save(
                            os.path.join(save_path, study_name + '_b4.npy'),
                            b4)
                break
            # Slice of the flip vector that corresponds to this chunk's SNPs.
            flip = MAP.flip[GEN.folder.name][N_snps_read:N_snps_read + genotype.shape[0]]
            N_snps_read += genotype.shape[0]
            flip_index = (flip == -1)
            # Where flip == -1 a value x becomes 2 - x; elsewhere it is
            # multiplied by flip (presumably +1 there -- TODO confirm).
            genotype = np.apply_along_axis(
                lambda x: flip * (x - 2 * flip_index), 0, genotype)
            genotype = genotype[:, row_index[0]]
            maf = np.mean(genotype, axis=1) / 2
            metadata['MAF'] = metadata['MAF'] + list(maf)
            # TODO (low) add interaction
            a_test.append(A_tests(covariates, genotype, intercept=intercept))
            if B4_flag:
                # Works only when all phenotypes are in one chunk; if not, do
                # not use this option! It would use too much disk space
                # anyway.
                if len([f for f in PHEN.folder.files if f != 'info_dic.npy'
                        ]) > 1:
                    print('pd_full flag disabled!')
                    B4_flag = False
                    continue
                # Rewind the phenotype reader and re-read the single chunk.
                PHEN.folder.processed = 0
                phenotype = PHEN.get_next(index=row_index[1])
                b4.append(B4(phenotype, genotype))
        print(('Time to PD genotype {} is {} s'.format(genotype.shape, t_gen.secs)))
if not os.path.isdir(args.out): print("Creating output folder {}".format(args.out)) os.mkdir(args.out) if args.np: check_np() ################################### CONVERTING ############################## if args.mode == 'converting': # ARG_CHECKER.check(args,mode='converting') R = Reader('genotype') R.start(args.genotype[0], vcf=args.vcf) with Timer() as t: if R.format == 'PLINK': G = GenotypePLINK(args.study_name[0], reader=R) G.split_size = CONVERTER_SPLIT_SIZE G.plink2hdf5(out=args.out) elif R.format == 'MINIMAC': G = GenotypeMINIMAC(args.study_name[0], reader=R) if args.cluster == 'y': G.cluster = True G.split_size = CONVERTER_SPLIT_SIZE G.MACH2hdf5(args.out, id=args.id) elif R.format == 'VCF': G = GenotypeVCF(args.study_name[0], reader=R) if args.cluster == 'y':