def send_positions_to_server(positions, chrom, client_config, env):
    client_name = client_config['name']
    data = pickle.dumps({'CHROM': chrom, 'POS': positions})
    networking.respond_to_server('api/tasks/INIT/POS', 'POST', data,
                                 client_name, env)
def run_covar_regression(self, warm_start=None, rho=250.0, alpha=1.0):
    # Instead of making many function calls, separate the covariate-only
    # regression from the per-chromosome regressions.
    model = "Small"
    ncov = self.covariates.shape[1]
    estimates = np.zeros((ncov - 1, 1))
    # Column 1 is reserved for the SNP, so drop it for the covariate-only fit.
    idx = [i for i in range(ncov) if i != 1]
    covariates = self.covariates[:, idx]
    if self.prev_cov_estimate is not None:
        z_hat = self.prev_cov_estimate
        all_Us = self.previous_Us[model] + z_hat - warm_start
    else:
        all_Us = 0
    if warm_start is None:  # first round
        estimates[:, 0] = other_newton(covariates, np.zeros((ncov - 1)),
                                       np.zeros((ncov - 1,)), rho,
                                       estimates[:, 0], ncov - 1)
        z_hat = estimates
    else:  # later rounds: warm-start from the consensus estimate
        estimates[:, 0] = other_newton(covariates, all_Us[:, 0],
                                       warm_start[:, 0], rho, z_hat[:, 0],
                                       ncov - 1)
        z_hat = alpha * estimates + (1 - alpha) * warm_start
    self.prev_cov_estimate = estimates
    self.previous_Us[model] = all_Us
    est = z_hat + all_Us
    msg = pickle.dumps({"VALS": est, "Estimated": "Small"})
    networking.respond_to_server('api/tasks/ASSO/estimate', 'POST', msg,
                                 self.client_config['name'], self.env)
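# A note on other_newton: the update above follows the ADMM consensus
# pattern, where each silo minimizes its local logistic loss plus a proximal
# term rho/2 * ||beta - z + u||^2 and ships z_hat + u back to the server.
# The sketch below is a hypothetical stand-in for what
# other_newton(X, u, z, rho, beta0, p) is assumed to compute; the real
# routine is compiled and its signature may differ.
import numpy as np

def admm_logistic_x_update(X, u, z, rho, beta0, iters=20):
    # Newton iterations on
    #   f(beta) = sum_i log(1 + exp(x_i . beta)) + rho/2 * ||beta - z + u||^2,
    # where rows of X were pre-multiplied by -y (as in the calling code), so
    # the first term is the logistic negative log-likelihood.
    beta = beta0.copy()
    for _ in range(iters):
        s = 1.0 / (1.0 + np.exp(-X @ beta))            # sigmoid(X beta)
        grad = X.T @ s + rho * (beta - z + u)
        H = (X * (s * (1 - s))[:, None]).T @ X + rho * np.eye(X.shape[1])
        beta = beta - np.linalg.solve(H, grad)
    return beta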
def cost(self, data):
    msg = pickle.loads(data)
    chrom = msg["Estimated"]
    mask = msg["conv"]
    x0 = msg["x0"]
    estimates = self.evaluate_estimate(chrom, mask, x0)
    estimates -= self.baseline_likelihood[chrom][mask[:, 0]]
    msg = pickle.dumps({'estimated': chrom, 'v': estimates})
    networking.respond_to_server('api/tasks/ASSO/valback', 'POST', msg,
                                 self.client_config['name'], self.env)
def send_likelihood(self, message):
    # TODO Important: if we are excluding missing values, we should
    # recompute the baseline every time.
    message = pickle.loads(message)
    include_mask = self.include_mask
    model = message["Estimated"]
    coef = message["Coef"]
    coef = coef.T
    covariates = self.covariates
    # n = int(np.sum(self.include_mask))
    y = self.Ys.copy()
    ell = None
    if self.flipped_covar:
        # Undo the sign flip applied for the regression
        # (covariates were multiplied by -y).
        self.covariates *= -y
        self.flipped_covar = False
    # Map labels from {-1, 1} to {0, 1} for log_loss.
    y += 1
    y /= 2
    if model == "Small":
        indx = [i for i in range(covariates.shape[1]) if i != 1]
        y_model = 1.0 / (1 + np.exp(-covariates[:, indx].dot(coef.T)))
        self.base_y_pred = y_model
        # ell = log_loss((y+1)/2, y_model, normalize=False, labels=[0, 1])
    else:
        group = self.store[model]
        af = group["MAF"].value
        tokeep = np.logical_and(af > self.threshold, 1 - af > self.threshold)
        positions = group["QC_positions"].value
        ell = np.zeros((1, positions.shape[0]))
        for i, position in enumerate(positions):
            if not tokeep[i]:
                ell[0, i] = np.nan
            else:
                val = group[str(position)].value[include_mask]
                ind = ~np.isnan(val)  # TODO impute or something?
                covariates[:, 1] = val
                y_model = 1.0 / (
                    1 + np.exp(-covariates[ind, :].dot(coef[:, i].T)))
                # Per-SNP negative log-likelihood minus the baseline
                # (covariates-only) negative log-likelihood.
                ell[0, i] = log_loss(y[ind], y_model, normalize=False,
                                     labels=[0, 1])
                ell[0, i] -= log_loss(y[ind], self.base_y_pred[ind],
                                      normalize=False, labels=[0, 1])
    msg = pickle.dumps({"Estimated": model, "estimate": ell})
    networking.respond_to_server('api/tasks/ASSO/pval', 'POST', msg,
                                 self.client_config['name'], self.env)
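# Server-side interpretation (sketch): each silo's `ell` entry is
# NLL_full - NLL_base for one SNP, so the pooled deviance difference
# -2 * sum(ell) is chi-square with 1 degree of freedom under the null
# (the SNP coefficient is the single extra parameter). The aggregation
# below is a hypothetical reconstruction; the names are not from this module.
import numpy as np
from scipy.stats import chi2

def lrt_pvalues(ell_per_client):
    # ell_per_client: list of (1, n_snps) arrays, one per silo.
    pooled = np.sum(np.vstack(ell_per_client), axis=0)
    stat = -2.0 * pooled
    return chi2.sf(stat, df=1)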
def report_cov(client_config, env):
    def standardize_mat(mat, af, sd):
        # Center by twice the allele frequency (the expected genotype) and
        # scale rows with nonzero standard deviation.
        af = 2 * af.reshape(af.shape[0], 1)
        mat -= af
        ind = sd > 0
        mat[ind, :] /= sd[ind].reshape(np.sum(ind), 1)
        mat[np.isnan(mat)] = 0
        return mat

    pfile = shared.get_plink_store(client_config["plinkfile"])
    with h5py.File(pfile, 'r') as store:
        n = store.attrs["n"]
        chroms = sorted([ch for ch in store if ch != "meta"], key=int)
        size = 0
        for chi, ch1 in enumerate(chroms):
            group = store[ch1]
            tokeep = group['PCA_mask'].value
            pos = group["positions"].value[tokeep]
            af1 = group["MAF"].value[tokeep]
            sd1 = np.sqrt(group["VAR"].value[tokeep])
            g1 = np.empty((len(pos), n))
            for i, snp1 in enumerate(pos):
                g1[i, :] = group[str(snp1)].value
            g1 = standardize_mat(g1, af1, sd1)
            size += i + 1  # i.e. len(pos), the number of SNPs kept for ch1
            for j, ch2 in enumerate(chroms):
                if j > chi:  # only the lower-triangular chromosome pairs
                    continue
                msg = {}
                group = store[ch2]
                tokeep = group['PCA_mask'].value
                af2 = group["MAF"].value[tokeep]
                sd2 = np.sqrt(group["VAR"].value[tokeep])
                pos = group["positions"].value[tokeep]
                g2 = np.empty((n, len(pos)))
                for i, snp2 in enumerate(pos):
                    g2[:, i] = group[str(snp2)].value
                g2 = standardize_mat(g2.transpose(), af2, sd2).transpose()
                msg["CH1"] = ch1
                msg["CH2"] = ch2
                logger.info(
                    f"Reporting cov: {ch1}_{ch2}: {g1.shape} x {g2.shape}")
                msg["MAT"] = g1.dot(g2).astype(np.float32)
                if ch1 == chroms[-1] and ch2 == chroms[-1]:
                    msg["E"] = True  # signals the final block
                msg = pickle.dumps(msg)
                networking.respond_to_server('api/tasks/PCA/COV', 'POST', msg,
                                             client_config['name'], env)
        logger.info(f"Final size will be {size}")
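# Because g1 and g2 are standardized to zero mean and unit variance,
# g1.dot(g2) is an (unscaled) SNP-by-SNP covariance block, and only the
# lower-triangular chromosome pairs are sent. A minimal sketch of how the
# server could assemble the full matrix from these blocks; the bookkeeping
# names here are hypothetical, not the server's actual ones.
import numpy as np

def assemble_cov(blocks, sizes, chroms):
    # blocks[(ch1, ch2)]: the MAT payload for a pair with ch2 <= ch1;
    # sizes[ch]: number of PCA-passing SNPs per chromosome.
    offsets, total = {}, 0
    for ch in chroms:
        offsets[ch] = total
        total += sizes[ch]
    cov = np.zeros((total, total), dtype=np.float32)
    for (ch1, ch2), mat in blocks.items():
        r, c = offsets[ch1], offsets[ch2]
        cov[r:r + mat.shape[0], c:c + mat.shape[1]] = mat
        if ch1 != ch2:
            cov[c:c + mat.shape[1], r:r + mat.shape[0]] = mat.T  # mirror
    return cov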
def init_stats(msg_dict, client_config, env):
    print(msg_dict.keys())
    # Wait on previous tasks to finish
    i = current_app.control.inspect()
    client_name = client_config['name']
    while i.active() is not None:
        active_tasks = i.active()[f'celery@{client_name}']
        dependent_tasks = list(
            filter(lambda x: x['name'] == 'tasks.init_store', active_tasks))
        if len(dependent_tasks) > 0:
            logger.info('Waiting on tasks.init_store to finish.')
            time.sleep(.1)
        else:
            break
    # message = pickle.loads(message)
    pfile = client_config['plinkfile']
    # chrom = message["CHROM"]
    with h5py.File(shared.get_plink_store(pfile), 'a') as store:
        for chrom, message in msg_dict.items():
            logger.info(f'Computing statistics for Chrom: {chrom}.')
            chrom_group = store[chrom]
            if "MISS" in message:
                vals = np.array(message["MISS"])
                write_or_replace(chrom_group, "not_missing_per_snp",
                                 val=1 - vals)
            if "AF" in message:
                vals = np.array(message["AF"])
                write_or_replace(chrom_group, "MAF", val=vals)
            if "HWE" in message:
                vals = np.array(message["HWE"])
                write_or_replace(chrom_group, "hwe", val=vals)
            if "VAR" in message:
                vals = np.array(message["VAR"])
                write_or_replace(chrom_group, "VAR", val=vals)
            logger.info(
                f'Finished initializing QC statistics for chrom {chrom}.')
    status = 'Finished with init stats.'
    networking.respond_to_server(
        f'api/clients/{client_name}/report?status={status}', 'POST', env=env)
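# write_or_replace is used throughout but not defined in this section. The
# call sites imply "delete the dataset if present, then create it", since
# h5py cannot resize a dataset in place on assignment. A minimal sketch
# consistent with that usage; the project's actual helper may differ (e.g.
# in chunking or dtype handling).
import numpy as np

def write_or_replace(group, name, val, dtype=None):
    # group: an open h5py.Group; name: dataset name; val: array-like.
    if name in group:
        del group[name]
    data = np.asarray(val, dtype=dtype) if dtype is not None else np.asarray(val)
    group.create_dataset(name, data=data)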
def send_summary_to_standardize(self):
    # This is a rough sketch: the assumption is that a quantitative
    # covariate exhibits more than 2 distinct values within each silo.
    # This is never actually verified in the current version, so that
    # would be a good TODO for future implementations.
    quant_covars = [
        i for i in range(2, self.covariates.shape[1])
        if len(np.unique(self.covariates[:, i])) > 2
    ]
    sums = np.sum(self.covariates[:, quant_covars], axis=0)
    sumsq = np.sum(self.covariates[:, quant_covars]**2, axis=0)
    msg = {
        "Indx": quant_covars,
        "Sums": sums,
        "SS": sumsq,
        "N": self.covariates.shape[0]
    }
    msg = pickle.dumps(msg)
    networking.respond_to_server('api/tasks/ASSO/adjust', 'POST', msg,
                                 self.client_config['name'], self.env)
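# Server-side reduction (sketch): pooling Sums, SS, and N across silos gives
# the global mean and standard deviation of each quantitative covariate
# without sharing raw values: mean = sum(Sums)/sum(N) and
# var = sum(SS)/sum(N) - mean^2. The names below are hypothetical.
import numpy as np

def pool_standardization(messages):
    # messages: decoded 'adjust' payloads, each {"Indx", "Sums", "SS", "N"}.
    n = sum(m["N"] for m in messages)
    sums = np.sum([m["Sums"] for m in messages], axis=0)
    sumsq = np.sum([m["SS"] for m in messages], axis=0)
    mean = sums / n
    sd = np.sqrt(sumsq / n - mean ** 2)  # population variance from raw moments
    return mean, sd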
def update(self, data, client_config, env):
    data = pickle.loads(data)
    n = self.store.attrs['n']
    msg = {}
    if self.r3 == 0:
        # Fake-start the first round. If the length is less than r1,
        # you deserve an error. No apologies.
        for chrom in self.chroms:
            tags = self.store["{}/PCA_passed".format(chrom)]
            data[chrom] = tags[0:self.r1]
    for key, state in data.items():
        if key == "TASK" or key == "SUBTASK":
            continue
        chrom = key
        tags = self.store["{}/PCA_passed".format(chrom)]
        if state[0] == "E":  # Finished with this chrom
            if len(data) == 1:  # Done with everything
                msg = pickle.dumps({})
                networking.respond_to_server('api/tasks/PCA/PCAPOS', 'POST',
                                             msg, client_config['name'], env)
                self.store.close()
                logger.info("Done with LD pruning.")
                return
            continue
        else:
            tokeep = state
            end = self.r3 + len(tokeep)
            pos = self.store["{}/PCA_positions".format(chrom)]
            positions = pos[self.r3:end]
            positions = positions[tokeep]
            genotypes = np.empty((n, len(positions)), dtype=np.float32)
            for i, snp in enumerate(positions):
                genotypes[:, i] = self.store["{}/{}".format(chrom, snp)].value
            corr = nancorr(genotypes)
            msg[chrom] = corr
    msg = pickle.dumps(msg)
    networking.respond_to_server('api/tasks/PCA/LD', 'POST', msg,
                                 client_config['name'], env)
    self.r3 += self.r2
    if self.r3 > self.print_int:
        logger.info(f"pruning at {self.r3}")
        self.print_int += 1000
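# nancorr is assumed to compute pairwise SNP-SNP correlations while
# tolerating missing genotypes. A minimal numpy sketch under that
# assumption; the project's version is likely compiled and may return
# additional bookkeeping.
import numpy as np

def nancorr(genotypes):
    # genotypes: (n_samples, n_snps) with np.nan marking missing calls.
    # Masked arrays yield pairwise-complete correlations.
    masked = np.ma.masked_invalid(genotypes)
    return np.ma.corrcoef(masked, rowvar=False).filled(0.0)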
def send_counts_to_server(data, client_config, env):
    client_name = client_config['name']
    data = pickle.dumps(data)
    networking.respond_to_server('api/tasks/INIT/COUNT', 'POST', data,
                                 client_name, env)
def echo(client_config, env):
    # Pass an empty payload and the client name so the argument order
    # matches respond_to_server's other call sites.
    networking.respond_to_server('api/tasks/Echo/itr', 'POST', b'',
                                 client_config['name'], env)
def run_logistic_regression(self, y, chrom=None, warm_start=None, rho=250.0,
                            alpha=1.00):
    store = self.store
    include_mask = self.include_mask
    n = int(np.sum(self.include_mask))
    y = y.reshape(n)
    covariates = self.covariates.copy()
    group = store[chrom]
    positions = group["QC_positions"]
    ncov = self.covariates.shape[1]
    estimates = np.zeros((len(positions), ncov))
    if warm_start is None:
        # Boundary condition for the loop: seed the SNP coefficient with 0
        # and the remaining entries with the covariate-only fit.
        est = np.zeros(ncov)
        covar_estimates = self.prev_cov_estimate
        est[1] = 0
        est[0] = covar_estimates[0]
        est[2:] = covar_estimates[1:].ravel()
    af = group["MAF"]
    if chrom in self.previous_estimates:
        z_hat = self.previous_estimates[chrom]
        all_Us = self.previous_Us[chrom] + z_hat - warm_start
    else:
        all_Us = np.zeros((ncov))
        all_Us[0] = self.previous_Us["Small"][0]
        all_Us[2:] = self.previous_Us["Small"][1:].ravel()
    count = 0
    t = time.time()
    for i, position in enumerate(positions):
        if not i % 100:
            logger.info(f"{time.time()-t}")
            t = time.time()
            logger.info(f"{float(i/len(positions))}")
        if af[i] < self.threshold or (1 - af[i]) < self.threshold:
            estimates[i, :] = np.nan
            continue
        else:
            val = group[str(position)].value[include_mask]
            ind = ~np.isnan(val)
            # Fold the label into the design matrix: the SNP column is -y * x.
            covariates[ind, 1] = val[ind] * -y[ind]
            count += 1
        if warm_start is None:
            # Alternatives tried here: minimize_lbfgs and bfgs_more_gutted.
            estimates[i, :] = other_newton(covariates[ind, :], all_Us, est,
                                           rho, est, ncov)
            z_hat = alpha * estimates + (1 - alpha) * est
        else:
            estimates[i, :] = other_newton(covariates[ind, :], all_Us[i, :],
                                           warm_start[i, :], rho,
                                           z_hat[i, :], ncov)
            z_hat = alpha * estimates + (1 - alpha) * warm_start
    self.previous_estimates[chrom] = estimates
    self.previous_Us[chrom] = all_Us
    msg = pickle.dumps({"Estimated": chrom,
                        "VALS": z_hat + all_Us})  # , 'cov': covariates})
    networking.respond_to_server('api/tasks/ASSO/estimate', 'POST', msg,
                                 self.client_config['name'], self.env)
def run_newton_lr(self, y, chrom=None, warm_start=None, unconverged=None):
    store = self.store
    logger.info("starting with newtons")
    include_mask = self.include_mask
    n = np.sum(include_mask)
    y = y.reshape(n)
    covariates = self.covariates.copy()
    group = store[chrom]
    positions = group["QC_positions"]
    mask = group["QC_mask"].value
    af = group["MAF"].value[mask]
    ncov = self.covariates.shape[1]
    baselikelihood = self.baseline_likelihood[chrom]
    if unconverged is None:
        L = len(positions)
    else:
        L = np.sum(unconverged)
        positions = positions[unconverged[:, 0]]
        af = af[unconverged[:, 0]]
        baselikelihood = baselikelihood[unconverged[:, 0]]
    # Two lower-triangular Hessians are packed into each ncov x ncov slot:
    # even-indexed SNPs in the lower triangle, odd-indexed (transposed) in
    # the upper; diagonals are shipped separately.
    hessians = np.zeros((int(np.ceil(L / 2)), ncov, ncov))
    diagonals = np.zeros((L, ncov))
    gradients = np.zeros((L, ncov))
    vals = np.zeros((L, 1))
    count = 0
    t = time.time()
    for i, position in enumerate(positions):
        if not i % 5000:
            logger.info(
                f"After {time.time()-t:.1f}s done with "
                f"{float(i/L)*100:.1f}% of iteration.")
        if af[i] < self.threshold or (1 - af[i]) < self.threshold:
            continue
        val = group[str(position)].value[include_mask]
        # Dumb imputation. Hopefully your data is already imputed and this
        # doesn't happen.
        val[np.isnan(val)] = 0
        covariates[:, 1] = val * -y
        count += 1
        h, diagonals[i], gradients[i], vals[i, 0] = ltri_Hessians(
            covariates, warm_start[i, :, 0], ncov, covariates.shape[0],
            0)  # rho set to zero
        if i % 2:
            hessians[i // 2, :, :] += h.T
        else:
            hessians[i // 2, :, :] += h
    vals -= baselikelihood
    msg = pickle.dumps({
        "Estimated": chrom,
        "H": hessians,
        'g': gradients,
        'd': diagonals,
        'v': vals,
        "covar": covariates
    })
    networking.respond_to_server('api/tasks/ASSO/hessians', 'POST', msg,
                                 self.client_config['name'], self.env)
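# Server-side Newton step (sketch): under the packing described above (two
# strict lower triangles per slot, diagonals shipped separately), one global
# Newton update per SNP could look like the following. This is a hypothetical
# reconstruction of the consumer of the 'hessians' payload, not the server's
# actual code.
import numpy as np

def newton_step(hessians, diagonals, gradients, beta):
    # beta: (n_snps, ncov) current coefficient estimates; returns a copy
    # advanced by one Newton step per SNP.
    new = beta.copy()
    for i in range(gradients.shape[0]):
        slot = hessians[i // 2]
        tri = np.tril(slot, -1) if i % 2 == 0 else np.triu(slot, 1).T
        H = tri + tri.T + np.diag(diagonals[i])
        new[i] -= np.linalg.solve(H, gradients[i])
    return new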
def run_QC(filters, client_config, prefix, remove=True, env="production"):
    def find_what_passes(qc_name, dset_name, tokeep, doubleSided=False):
        vals = group[dset_name].value
        if qc_name in filters:
            thresh = float(filters[qc_name])
            if not doubleSided:
                tokeep = np.logical_and(tokeep, vals > thresh)
            else:
                tokeep = np.logical_and(
                    tokeep,
                    np.logical_and(
                        vals > thresh - Settings.kSmallEpsilon,
                        (1.0 - vals) > thresh - Settings.kSmallEpsilon))
        return tokeep

    def replace_dataset(tokeep, dset_name, return_deleted=False):
        vals = group[dset_name].value
        remaining = vals[tokeep]
        deleted = vals[np.logical_not(tokeep)]
        write_or_replace(group, dset_name, remaining)
        if return_deleted:
            return deleted

    pfile = client_config["plinkfile"]
    store_name = shared.get_plink_store(pfile)
    with h5py.File(store_name, 'a') as store:
        for chrom in store.keys():
            if chrom == "meta":
                continue
            group = store[chrom]
            positions = group['positions'].value
            if "QC_mask" in group:
                tokeep = group["QC_mask"].value
            else:
                tokeep = np.ones_like(positions, dtype=bool)
            tokeep = find_what_passes(QCFilterNames.QC_HWE, "hwe", tokeep)
            tokeep = find_what_passes(QCFilterNames.QC_MAF, "MAF", tokeep,
                                      doubleSided=True)
            if QCFilterNames.QC_MPS in filters:
                # The filter is specified as max missingness, but the stored
                # dataset is the not-missing rate.
                filters[QCFilterNames.QC_MPS] = \
                    1 - filters[QCFilterNames.QC_MPS]
            tokeep = find_what_passes(QCFilterNames.QC_MPS,
                                      "not_missing_per_snp", tokeep)
            logger.info(
                f"After filtering {chrom}, {np.sum(tokeep)} snps remain")
            if remove:  # Delete what doesn't pass
                replace_dataset(tokeep, 'hwe')
                replace_dataset(tokeep, 'VAR')
                replace_dataset(tokeep, 'MAF')
                replace_dataset(tokeep, 'not_missing_per_snp')
                deleted = replace_dataset(tokeep, 'positions',
                                          return_deleted=True)
                for snp in deleted:
                    snp = str(snp)
                    if snp in group:
                        del group[snp]
            else:  # Store what has been tagged
                pass_mask = prefix + "_mask"
                pos_mask = prefix + "_positions"
                if pass_mask in group:
                    del group[pass_mask]
                if pos_mask in group:
                    del group[pos_mask]
                write_or_replace(group, pass_mask, val=tokeep, dtype=bool)
                positions = group['positions'].value[tokeep]
                write_or_replace(group, pos_mask, val=positions)
                if prefix == "PCA":
                    write_or_replace(group, "PCA_passed",
                                     val=np.ones(np.sum(tokeep), dtype=bool))
                    if 'non_ld_mask' in group:
                        del group['non_ld_mask']
    client_name = client_config['name']
    if prefix == "QC":
        networking.respond_to_server('api/tasks/QC/FIN', "POST", b'',
                                     client_name, env)
    else:
        networking.respond_to_server('api/tasks/PCA/FIN', "POST", b'',
                                     client_name, env)
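# A hedged usage example of run_QC. The QCFilterNames constants are defined
# elsewhere in the package, and the threshold values here are illustrative
# only, not recommended defaults.
def example_run_qc(client_config):
    filters = {
        QCFilterNames.QC_HWE: 1e-6,   # keep SNPs whose HWE value exceeds this
        QCFilterNames.QC_MAF: 0.01,   # double-sided: keep MAF in (0.01, 0.99)
        QCFilterNames.QC_MPS: 0.05,   # max fraction of missing calls per SNP
    }
    run_QC(filters, client_config, prefix="QC", remove=True, env="production")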