def __init__(self, covar_numbers, npcs, client_config, env, threshold=.005):
    """Populate the shared LogisticAdmm instance the first time it is built.

    When an instance is already registered on the class this returns
    immediately and leaves ``self`` untouched. Otherwise it opens the HDF5
    store derived from the configured plink file in append mode, stores the
    run parameters, calls ``load_Y`` and ``load_covar_mat``, resets the
    per-run bookkeeping attributes and registers ``self`` on the class.

    Args:
        covar_numbers: Sequence; its last entry is passed to ``load_Y`` and
            everything before it is passed to ``load_covar_mat``.
        npcs: Forwarded unchanged to ``load_covar_mat``.
        client_config: Mapping that provides at least ``"plinkfile"``.
        env: Kept on the instance as ``self.env``.
        threshold: Kept on the instance as ``self.threshold``.
    """
    if LogisticAdmm.__instance is not None:
        # Already initialised once; nothing to do for this object.
        return

    plink_prefix = client_config["plinkfile"]
    self.store = h5py.File(shared.get_plink_store(plink_prefix), 'a')

    # Run parameters are set before the loaders are called, as they may
    # read them from the instance.
    self.threshold = threshold
    self.client_config = client_config
    self.include_mask = []
    self.env = env

    self.load_Y(covar_numbers[-1], plink_prefix)
    leading_covariates = covar_numbers[:-1]
    self.load_covar_mat(npcs, leading_covariates, plink_prefix)

    # Bookkeeping that starts out empty for a fresh instance.
    self.warm_start = 0
    self.previous_estimates = {}
    self.prev_cov_estimate = None
    self.previous_Us = {}
    self.flipped_covar = None
    self.baseline_likelihood = {}

    LogisticAdmm.__instance = self
def store_filtered(message, client_config):
    """Apply a received per-chromosome filter to the stored ``PCA_mask``.

    ``message`` is unpickled into a mapping ``{chrom: val}``. For each
    chromosome the stored ``PCA_mask`` is read, the positions that are
    currently selected by the mask are overwritten with ``val``
    (``mask[mask] = val``) and the updated mask is written back.

    Args:
        message: Pickled mapping from chromosome name to the replacement
            values for the currently-selected mask entries.
        client_config: Mapping that provides at least ``"plinkfile"``.
    """
    pfile = client_config["plinkfile"]
    # NOTE(security): pickle.loads can execute arbitrary code. This is only
    # acceptable if ``message`` always comes from a trusted peer -- confirm.
    msg = pickle.loads(message)
    with h5py.File(shared.get_plink_store(pfile), 'a') as store:
        for chrom, val in msg.items():
            group = store[f"{chrom}"]
            # ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the
            # whole dataset on old and new h5py alike.
            mask = group["PCA_mask"][()]
            # Boolean self-indexing: only entries that are still True are
            # replaced (presumably the mask is boolean -- it is used as an
            # index into itself).
            mask[mask] = val
            write_or_replace(group, 'PCA_mask', val=mask)
def init_store(client_config, env):
    """Make sure the HDF5 store for the plink file exists, then report counts.

    If the store file is already on disk its consistency flag is cleared and
    its file info is reported; otherwise the store is built from the plink
    file with ``plinkToH5``. In both cases the counts are reported afterwards.

    Args:
        client_config: Mapping that provides at least ``'plinkfile'``.
        env: Passed through to the reporting helpers.
    """
    store_path = shared.get_plink_store(client_config['plinkfile'])

    if not os.path.isfile(store_path):
        # No store yet: build it from the plink file.
        plinkToH5(client_config, env)
    else:
        clear_consistency_flag(store_path)
        report_file_info(store_path, client_config, env)
        logger.info(f"HDF5 file {store_path} already exists.")

    logger.info("Preparing counts.")
    report_counts(client_config, env)
    logger.info('Finished reporting counts.')
def __init__(self, win_size, client_config):
    """Populate the shared LdReporter instance the first time it is built.

    When an instance is already registered on the class this returns
    immediately. Otherwise it sets the window parameters, opens the HDF5
    store derived from the configured plink file in append mode, records the
    chromosome group names (every top-level key except ``"meta"``) and
    registers ``self`` on the class.

    Args:
        win_size: Window size; stored as ``self.r1``, with ``self.r2`` set to
            ``int(win_size / 2)``.
        client_config: Mapping that provides at least ``"plinkfile"``.
    """
    if LdReporter.__instance is not None:
        return

    self.r3 = 0
    self.print_int = 1000
    self.r1, self.r2 = win_size, int(win_size/2)

    pfile = client_config["plinkfile"]
    store_name = shared.get_plink_store(pfile)
    self.store = h5py.File(store_name, 'a')
    # Iterating an h5py group yields its member names.
    self.chroms = [key for key in self.store if key != "meta"]

    # Register the singleton only once it is fully built. Registering before
    # the store is opened would leave a half-initialised object (no
    # ``store``/``chroms``) registered if opening the file raised, and every
    # later construction would return early without fixing it.
    LdReporter.__instance = self
def report_cov(client_config, env):
    """Send the chromosome-by-chromosome blocks of the genotype cross-product.

    For every chromosome ``ch1`` (in numeric order) the SNPs selected by its
    ``PCA_mask`` are loaded into a (snps x n) matrix and standardized. That
    matrix is then multiplied with the standardized (n x snps) matrix of every
    chromosome ``ch2`` up to and including ``ch1``, and each product is
    pickled and POSTed to ``api/tasks/PCA/COV``. The last block (last
    chromosome against itself) carries ``"E": True``.

    Args:
        client_config: Mapping that provides ``"plinkfile"`` and ``'name'``.
        env: Passed through to ``networking.respond_to_server``.
    """

    def standardize_mat(mat, af, sd):
        """Center rows by ``2 * af`` and scale rows with ``sd > 0``, in place.

        Rows whose ``sd`` is not positive are only centered. NaNs left in the
        matrix are replaced by 0. Returns the same array object.
        """
        mat -= 2 * af.reshape(af.shape[0], 1)
        ind = sd > 0
        mat[ind, :] /= sd[ind].reshape(np.sum(ind), 1)
        mat[np.isnan(mat)] = 0
        return mat

    pfile = shared.get_plink_store(client_config["plinkfile"])
    with h5py.File(pfile, 'r') as store:
        n = store.attrs["n"]
        # Chromosome groups are expected to have integer-like names.
        chroms = sorted([ch for ch in store if ch != "meta"], key=int)
        size = 0
        for chi, ch1 in enumerate(chroms):
            group = store[ch1]
            # ``Dataset.value`` was removed in h5py 3.0; ``[()]`` is the
            # equivalent full read.
            tokeep = group['PCA_mask'][()]
            pos = group["positions"][()][tokeep]
            af1 = group["MAF"][()][tokeep]
            sd1 = np.sqrt(group["VAR"][()][tokeep])
            g1 = np.empty((len(pos), n))
            for i, snp1 in enumerate(pos):
                g1[i, :] = group[str(snp1)][()]
            g1 = standardize_mat(g1, af1, sd1)
            # Use the number of kept SNPs directly. The loop index is
            # undefined (first chromosome) or stale (later ones) when a
            # chromosome has no SNP left in its mask.
            size += len(pos)

            # Only the lower triangle (ch2 up to and including ch1) is sent.
            for ch2 in chroms[:chi + 1]:
                group = store[ch2]
                tokeep = group['PCA_mask'][()]
                af2 = group["MAF"][()][tokeep]
                sd2 = np.sqrt(group["VAR"][()][tokeep])
                pos = group["positions"][()][tokeep]
                g2 = np.empty((n, len(pos)))
                for i, snp2 in enumerate(pos):
                    g2[:, i] = group[str(snp2)][()]
                # standardize_mat works row-wise, so standardize the
                # transposed view and flip back.
                g2 = standardize_mat(g2.transpose(), af2, sd2).transpose()

                msg = {"CH1": ch1, "CH2": ch2}
                logger.info(f"Reporting cov: {ch1}_{ch2}: {g1.shape} x {g2.shape}")
                msg["MAT"] = g1.dot(g2).astype(np.float32)
                if ch1 == chroms[-1] and ch2 == chroms[-1]:
                    # Marks the final block of the whole transfer.
                    msg["E"] = True
                payload = pickle.dumps(msg)
                networking.respond_to_server('api/tasks/PCA/COV', 'POST',
                                             payload, client_config['name'],
                                             env)
        logger.info(f"Final size will be {size}")
def init_stats(msg_dict, client_config, env):
    """Store the per-chromosome QC statistics received in ``msg_dict``.

    Waits until no ``tasks.init_store`` task is active on this client's
    celery worker, then writes whichever of the statistics are present in
    each chromosome's message into that chromosome's group of the HDF5 store:

    * ``"MISS"`` -> dataset ``not_missing_per_snp`` (stored as ``1 - value``)
    * ``"AF"``   -> dataset ``MAF``
    * ``"HWE"``  -> dataset ``hwe``
    * ``"VAR"``  -> dataset ``VAR``

    Finally reports the status ``Finished with init stats.`` to the server.

    Args:
        msg_dict: Mapping from chromosome name to a mapping of statistics.
        client_config: Mapping that provides ``'name'`` and ``'plinkfile'``.
        env: Passed through to ``networking.respond_to_server``.
    """
    print(msg_dict.keys())
    client_name = client_config['name']

    # Wait on previous tasks to finish.
    inspector = current_app.control.inspect()
    worker_key = f'celery@{client_name}'
    while True:
        # Query once per iteration: asking twice lets the second answer
        # differ from the first (e.g. come back as None) and fail the lookup.
        active = inspector.active()
        if active is None:
            break
        store_is_initialising = any(
            task['name'] == 'tasks.init_store' for task in active[worker_key])
        if not store_is_initialising:
            break
        logger.info('Waiting on tasks.init_store to finish.')
        time.sleep(.1)

    # (key in the incoming message, dataset name in the store)
    stat_datasets = (
        ("MISS", "not_missing_per_snp"),
        ("AF", "MAF"),
        ("HWE", "hwe"),
        ("VAR", "VAR"),
    )

    pfile = client_config['plinkfile']
    with h5py.File(shared.get_plink_store(pfile), 'a') as store:
        for chrom, message in msg_dict.items():
            logger.info(f'Computing statistics for Chrom: {chrom}.')
            chrom_group = store[chrom]
            for key, dset_name in stat_datasets:
                if key not in message:
                    continue
                vals = np.array(message[key])
                if key == "MISS":
                    # Missingness is stored as its complement.
                    vals = 1 - vals
                write_or_replace(chrom_group, dset_name, val=vals)
            # Use the module logger like the rest of this function, not the
            # root logger.
            logger.info(
                f'Finished initializing QC statistics for chrom {chrom}.')

    status = 'Finished with init stats.'
    networking.respond_to_server(
        f'api/clients/{client_name}/report?status={status}', 'POST', env=env)
def report_counts(client_config, env):
    """Report the genotype counts (het, homozygous alt, missing) per chromosome.

    Reads the ``counts`` dataset of every chromosome group in the HDF5 store
    and sends one message per chromosome through ``send_counts_to_server``.
    Each message carries ``"n"``, ``"CHROM"`` and ``"COUNTS"``; the message
    for the last chromosome additionally carries ``"END": True``.

    Args:
        client_config: Mapping that provides at least ``'plinkfile'``.
        env: Passed through to ``send_counts_to_server``.
    """
    pfile = client_config['plinkfile']
    store_name = shared.get_plink_store(pfile)
    with h5py.File(store_name, 'r') as store:
        n = int(store.attrs["n"])
        keys = [key for key in store.keys() if key != 'meta']
        # One dictionary is reused for every send, so ``"START": True`` is
        # present in every message, not just the first.
        # NOTE(review): confirm the server expects that.
        countDict = {"START": True}
        for chrom in keys:
            countDict["n"] = n
            countDict["CHROM"] = chrom
            # ``Dataset.value`` was removed in h5py 3.0; ``[()]`` is the
            # equivalent full read.
            countDict["COUNTS"] = store["{}/counts".format(chrom)][()]
            if chrom == keys[-1]:
                countDict["END"] = True
            logger.info(f'Sending counts from chrom {chrom}.')
            send_counts_to_server(countDict, client_config, env)
def pca_projection(data, client_config):
    """Project the local standardized genotypes onto the received components.

    ``data`` is unpickled into a message with keys ``"ISIG"``, ``"V"`` and
    ``"CHROMS"``. The SNPs selected by each listed chromosome's ``PCA_mask``
    are standardized as ``(genotype - 2 * MAF) / sqrt(VAR)`` (NaNs set to 0)
    and laid out side by side in an (individuals x snps) matrix ``arr``. The
    projection ``u = arr . V^T . diag(ISIG)`` is sign-normalised together with
    ``V`` by ``svd_flip`` and the results are written to the ``pca`` group of
    the store as ``pca_sigma``, ``pca_v.T`` and ``pca_u``.

    Args:
        data: Pickled message as described above.
        client_config: Mapping that provides at least ``"plinkfile"``.
    """
    # NOTE(security): pickle.loads can execute arbitrary code. This is only
    # acceptable if ``data`` always comes from a trusted peer -- confirm.
    message = pickle.loads(data)
    inv_sigma = message["ISIG"]
    v = message["V"]
    chroms = message["CHROMS"]
    pfile = shared.get_plink_store(client_config["plinkfile"])
    with h5py.File(pfile, 'a') as store:
        # ``Dataset.value`` was removed in h5py 3.0; ``[()]`` is the
        # equivalent full read and is used throughout.
        n = 0
        for chrom in chroms:
            n += np.sum(store[f"{chrom}/PCA_mask"][()])
        num_inds = store.attrs["n"]
        arr = np.empty((num_inds, n), dtype=np.float32)

        offset = 0
        for chrom in chroms:
            group = store[str(chrom)]
            tokeep = group["PCA_mask"][()]
            af = group["MAF"][()][tokeep]
            sd = np.sqrt(group["VAR"][()][tokeep])
            positions = group["positions"][()][tokeep]
            for i, position in enumerate(positions):
                val = (group[str(position)][()] - 2 * af[i])/sd[i]
                val[np.isnan(val)] = 0
                arr[:, offset+i] = val
            # Advance by the number of columns actually filled. The loop
            # index is undefined or stale when a chromosome contributes no
            # SNPs, which would shift every following column.
            offset += len(positions)

        u = arr.dot(v.T).dot(np.diag(inv_sigma))
        u, v = svd_flip(u, v, u_based_decision=False)
        dset = store.require_group("pca")
        write_or_replace(dset, 'pca_sigma', val=inv_sigma)
        write_or_replace(dset, 'pca_v.T', val=v)
        write_or_replace(dset, 'pca_u', val=u)
    logger.info("Done with projection!")
def plinkToH5(client_config, env):
    """Gets plink prefix, produces an HDF file with the same prefix.

    The store is created from scratch (mode ``'w'``). It receives:

    * attributes ``n`` and the ``has_*`` flags (all False),
    * ``meta/Status`` (phenotype, from ``<prefix>.pheno`` column 2 when that
      file exists, otherwise from the plink samples), ``meta/id`` and
      ``meta/regions`` (third tab-separated column of ``<prefix>.ind``),
    * one group per chromosome holding one dataset per SNP (named by its
      base-pair position) plus ``positions``, ``rsids`` and ``counts``.

    After each chromosome is written its positions are sent to the server.
    Reading stops at the first locus on chromosome 23.

    Args:
        client_config: Mapping that provides at least ``'plinkfile'``.
        env: Passed through to ``send_positions_to_server``.

    Raises:
        MemoryError: Re-raised (after logging) if the plink file cannot be
            opened for lack of memory.
        ValueError: If the phenotype has more than two distinct values.
        SystemExit: If the plink file is not stored one locus per row.
    """
    pfile = client_config['plinkfile']
    store_name = shared.get_plink_store(pfile)
    logger.info(f'Opening plinkfile: {pfile}')
    try:
        plink_file = plinkfile.open(pfile)
    except MemoryError as e:
        logger.error('MemoryError!')
        logger.error(e)
        # Without re-raising, execution would continue with ``plink_file``
        # unbound and fail with an unrelated NameError on the next line.
        raise

    # From here on the plink file is open; always close it, also on errors.
    try:
        if not plink_file.one_locus_per_row():
            logger.error("""This script requires that snps are rows and samples columns.""")
            sys.exit(1)
        sample_list = plink_file.get_samples()
        locus_list = plink_file.get_loci()
        n_tot = len(sample_list)
        logger.info(f'Opening h5py file:{store_name}')
        with h5py.File(store_name, 'w', libver='latest') as store:
            store.attrs['n'] = len(sample_list)
            store.attrs['has_local_AF'] = False
            store.attrs['has_global_AF'] = False
            store.attrs['has_centering'] = False
            store.attrs['has_normalization'] = False

            potential_pheno_file = pfile + ".pheno"
            if os.path.isfile(potential_pheno_file):
                affection = np.loadtxt(potential_pheno_file, dtype=int,
                                       usecols=2)
            else:
                affection = [sample.affection for sample in sample_list]
            if len(np.unique(affection)) > 2:
                raise ValueError(
                    "phenotype is not binary. "
                    "We only support binary for now")
            write_or_replace(store, 'meta/Status', affection, np.int8)
            ids = [sample.iid for sample in sample_list]
            write_or_replace(store, 'meta/id', ids, 'S11')
            del ids, affection

            # Read Demographic file
            logger.info(f'Reading demographic file at {pfile}.ind')
            logger.info(f'File exists: {os.path.isfile(pfile + ".ind")}')
            with open(pfile + ".ind", 'r') as dem_f:
                dem = [(row.split("\t")[2]).encode("UTF8") for row in dem_f]
            write_or_replace(store, 'meta/regions', dem)

            # Read chromosome data
            current_chr = 1
            positions = []
            rsids = []
            all_counts = []
            current_group = store.require_group(str(current_chr))
            genotypes = np.zeros(n_tot, dtype=np.float32)
            for locus, row in zip(locus_list, plink_file):
                if locus.chromosome != current_chr:
                    # Chromosome boundary: flush what was collected so far.
                    if len(positions) == 0:
                        # Nothing was read for this chromosome; drop the
                        # empty group.
                        del store[str(current_chr)]
                    else:
                        # NOTE(review): positions are written as np.uint
                        # here but as np.uint32 for the last chromosome
                        # below -- confirm which dtype is intended.
                        write_or_replace(current_group, 'positions',
                                         positions, dtype=np.uint)
                        write_or_replace(current_group, 'rsids', rsids)
                        write_or_replace(current_group, 'counts', all_counts,
                                         np.uint32)
                        send_positions_to_server(positions, current_chr,
                                                 client_config, env)
                        positions = []
                        # Reset together with positions and counts;
                        # otherwise each chromosome's ``rsids`` also holds
                        # the ids of all chromosomes written before it.
                        rsids = []
                        all_counts = []
                    current_chr = locus.chromosome
                    if current_chr == 23:
                        break
                    current_group = store.require_group(str(current_chr))
                pos = str(locus.bp_position)
                counts, geno = process_plink_row(row, genotypes)
                try:
                    current_group.create_dataset(pos, data=geno)
                except Exception:
                    # Best effort: a SNP that cannot be written is logged
                    # and the import continues.
                    logger.error(
                        f"Cannot write position: chr{locus.chromosome} {pos}")
                rsids.append(locus.name.encode('utf8'))
                positions.append(pos)
                all_counts.append(counts)
            # Flush the last chromosome unless reading stopped at 23.
            if locus.chromosome != 23:
                write_or_replace(current_group, 'positions', positions,
                                 np.uint32)
                write_or_replace(current_group, 'rsids', rsids)
                write_or_replace(current_group, 'counts', all_counts,
                                 np.uint32)
                send_positions_to_server(positions, current_chr,
                                         client_config, env)
    finally:
        plink_file.close()
    logger.info('Finished writing plink to hdf5.')
def run_QC(filters, client_config, prefix, remove=True, env="production"):
    """Apply the HWE / MAF / missingness filters to every chromosome.

    For each chromosome group a boolean keep-mask is built (starting from
    ``QC_mask`` when present, otherwise all True) and narrowed by each filter
    named in ``filters``:

    * ``QC_HWE``: keep SNPs whose ``hwe`` value exceeds the threshold.
    * ``QC_MAF``: keep SNPs whose ``MAF`` value ``v`` satisfies both
      ``v > t - eps`` and ``1 - v > t - eps``.
    * ``QC_MPS``: keep SNPs whose ``not_missing_per_snp`` value exceeds
      ``1 - threshold``.

    With ``remove=True`` the failing SNPs are deleted from the store: the
    per-SNP statistics and ``positions`` are trimmed and the per-SNP genotype
    datasets are removed. With ``remove=False`` the mask and the surviving
    positions are stored as ``<prefix>_mask`` / ``<prefix>_positions``; for
    ``prefix == "PCA"`` an all-True ``PCA_passed`` is written and any existing
    ``non_ld_mask`` is dropped. Finally completion is reported to
    ``api/tasks/QC/FIN`` (``prefix == "QC"``) or ``api/tasks/PCA/FIN``.

    Args:
        filters: Mapping from filter name to threshold. It is not modified.
        client_config: Mapping that provides ``"plinkfile"`` and ``'name'``.
        prefix: ``"QC"`` or another tag (e.g. ``"PCA"``) used for the dataset
            names and for choosing the completion endpoint.
        remove: Delete failing SNPs (True) or only record the mask (False).
        env: Passed through to ``networking.respond_to_server``.
    """
    # Resolve the thresholds once, on a private copy. Converting the
    # missingness threshold inside the per-chromosome loop (and in the
    # caller's dict) flips it back and forth on every other chromosome.
    thresholds = {}
    for qc_name in (QCFilterNames.QC_HWE, QCFilterNames.QC_MAF,
                    QCFilterNames.QC_MPS):
        if qc_name in filters:
            thresholds[qc_name] = float(filters[qc_name])
    if QCFilterNames.QC_MPS in thresholds:
        # The store holds the non-missing rate, so the missingness threshold
        # is converted to its complement.
        thresholds[QCFilterNames.QC_MPS] = (
            1 - thresholds[QCFilterNames.QC_MPS])

    def find_what_passes(group, qc_name, dset_name, tokeep, doubleSided=False):
        """Narrow ``tokeep`` by one filter; unchanged if it was not requested."""
        if qc_name not in thresholds:
            return tokeep
        # ``Dataset.value`` was removed in h5py 3.0; ``[()]`` is the
        # equivalent full read.
        vals = group[dset_name][()]
        thresh = thresholds[qc_name]
        if doubleSided:
            eps = Settings.kSmallEpsilon
            passes = np.logical_and(vals > thresh - eps,
                                    (1.0 - vals) > thresh - eps)
        else:
            passes = vals > thresh
        return np.logical_and(tokeep, passes)

    def replace_dataset(group, tokeep, dset_name, return_deleted=False):
        """Keep only the ``tokeep`` entries of a dataset; optionally return the rest."""
        vals = group[dset_name][()]
        write_or_replace(group, dset_name, vals[tokeep])
        if return_deleted:
            return vals[np.logical_not(tokeep)]
        return None

    pfile = client_config["plinkfile"]
    store_name = shared.get_plink_store(pfile)
    with h5py.File(store_name, 'a') as store:
        for chrom in store.keys():
            if chrom == "meta":
                continue
            group = store[chrom]
            positions = group['positions'][()]
            if "QC_mask" in group:
                tokeep = group["QC_mask"][()]
            else:
                tokeep = np.ones_like(positions, dtype=bool)

            tokeep = find_what_passes(group, QCFilterNames.QC_HWE, "hwe",
                                      tokeep)
            tokeep = find_what_passes(group, QCFilterNames.QC_MAF, "MAF",
                                      tokeep, doubleSided=True)
            tokeep = find_what_passes(group, QCFilterNames.QC_MPS,
                                      "not_missing_per_snp", tokeep)
            logger.info(
                f"After filtering {chrom}, {np.sum(tokeep)} snps remain")

            if remove:
                # Delete what doesn't pass
                replace_dataset(group, tokeep, 'hwe')
                replace_dataset(group, tokeep, 'VAR')
                replace_dataset(group, tokeep, 'MAF')
                replace_dataset(group, tokeep, 'not_missing_per_snp')
                deleted = replace_dataset(group, tokeep, 'positions',
                                          return_deleted=True)
                for snp in deleted:
                    snp = str(snp)
                    if snp in group:
                        del group[snp]
            else:
                # Store what has been tagged
                pass_mask = prefix + "_mask"
                pos_mask = prefix + "_positions"
                if pass_mask in group:
                    del group[pass_mask]
                if pos_mask in group:
                    del group[pos_mask]
                write_or_replace(group, pass_mask, val=tokeep, dtype=bool)
                positions = group['positions'][()][tokeep]
                write_or_replace(group, pos_mask, val=positions)
                if prefix == "PCA":
                    write_or_replace(group, "PCA_passed",
                                     val=np.ones(np.sum(tokeep), dtype=bool))
                    if 'non_ld_mask' in group:
                        del group['non_ld_mask']

    client_name = client_config['name']
    if prefix == "QC":
        networking.respond_to_server('api/tasks/QC/FIN', "POST", b'',
                                     client_name, env)
    else:
        networking.respond_to_server('api/tasks/PCA/FIN', "POST", b'',
                                     client_name, env)