Beispiel #1
0
 def __init__(self, covar_numbers, npcs, client_config, env,
              threshold=0.005):
     """Initialise the shared ``LogisticAdmm`` instance.

     If an instance is already registered in ``LogisticAdmm.__instance``
     the call returns immediately and ``self`` is left untouched.

     Args:
         covar_numbers: Sequence of column numbers. The last element is
             handed to ``load_Y``; all preceding elements are handed to
             ``load_covar_mat``.
         npcs: Forwarded unchanged to ``load_covar_mat``.
         client_config: Mapping; its ``"plinkfile"`` entry is used to
             locate the HDF5 store.
         env: Stored on the instance as ``self.env``.
         threshold: Stored on the instance as ``self.threshold``.
     """
     # Singleton guard: only the first construction does any work.
     if LogisticAdmm.__instance is not None:
         return

     plink_prefix = client_config["plinkfile"]
     store_path = shared.get_plink_store(plink_prefix)
     self.store = h5py.File(store_path, 'a')

     # State that is in place before the loaders run.
     self.threshold = threshold
     self.client_config = client_config
     self.include_mask = []
     self.env = env

     # Phenotype column first, then the covariate matrix.
     self.load_Y(covar_numbers[-1], plink_prefix)
     self.load_covar_mat(npcs, covar_numbers[:-1], plink_prefix)

     # Solver bookkeeping, (re)set after loading.
     self.warm_start = 0
     self.previous_estimates = {}
     self.prev_cov_estimate = None
     self.previous_Us = {}
     self.flipped_covar = None
     self.baseline_likelihood = {}

     LogisticAdmm.__instance = self
Beispiel #2
0
def store_filtered(message, client_config):
    """Fold a per-chromosome filter result into the stored ``PCA_mask``.

    For every chromosome in the unpickled message, the entries of the
    existing ``PCA_mask`` that are currently set are overwritten with the
    received values, and the updated mask is written back to the store.

    Args:
        message: Pickled mapping of chromosome name -> values to assign to
            the currently selected mask entries.
        client_config: Mapping; its ``"plinkfile"`` entry is used to locate
            the HDF5 store.
    """
    pfile = client_config["plinkfile"]
    # NOTE(security): pickle.loads can execute arbitrary code; this is only
    # safe if `message` comes from a trusted, authenticated peer.
    msg = pickle.loads(message)
    with h5py.File(shared.get_plink_store(pfile), 'a') as store:
        for chrom, val in msg.items():
            group = store[f"{chrom}"]
            # `Dataset.value` was removed in h5py 3.0; `[()]` reads the whole
            # dataset on every h5py version.
            mask = group["PCA_mask"][()]
            # Boolean self-indexing: only the positions that are currently
            # True receive the new values (assumes a boolean mask).
            mask[mask] = val
            write_or_replace(group, 'PCA_mask', val=mask)
Beispiel #3
0
def init_store(client_config, env):
    """Make sure the HDF5 store exists, then report its counts.

    When the store file is missing it is built via ``plinkToH5``. When it
    is already on disk, its consistency flag is cleared and its file info
    is reported instead. In both cases ``report_counts`` runs afterwards.

    Args:
        client_config: Mapping; its ``'plinkfile'`` entry is used to locate
            the HDF5 store.
        env: Passed through to the reporting helpers.
    """
    store_name = shared.get_plink_store(client_config['plinkfile'])

    if not os.path.isfile(store_name):
        plinkToH5(client_config, env)
    else:
        clear_consistency_flag(store_name)
        report_file_info(store_name, client_config, env)
        logger.info(f"HDF5 file {store_name} already exists.")

    logger.info("Preparing counts.")
    report_counts(client_config, env)
    logger.info('Finished reporting counts.')
Beispiel #4
0
 def __init__(self, win_size, client_config):
     """Initialise the shared ``LdReporter`` instance.

     If an instance is already registered in ``LdReporter.__instance`` the
     call returns immediately and ``self`` is left untouched.

     Args:
         win_size: Window size; stored as ``self.r1``, with
             ``int(win_size/2)`` stored as ``self.r2``.
         client_config: Mapping; its ``"plinkfile"`` entry is used to
             locate the HDF5 store.
     """
     if LdReporter.__instance is not None:
         return
     self.r3 = 0
     self.print_int = 1000
     self.r1, self.r2 = win_size, int(win_size/2)
     pfile = client_config["plinkfile"]
     store_name = shared.get_plink_store(pfile)
     self.store = h5py.File(store_name, 'a')
     # Top-level groups that do not hold chromosome data: "meta", and the
     # "pca" group that pca_projection adds to the same store.
     self.chroms = [key for key in self.store.keys()
                    if key not in ("meta", "pca")]
     # Register the singleton only once it is fully initialised, so that a
     # failure while opening the store does not leave a half-built instance
     # registered.
     LdReporter.__instance = self
Beispiel #5
0
def report_cov(client_config, env):
    """Send the blocks of the standardized genotype cross-product matrix.

    For every ordered pair of chromosomes ``(ch1, ch2)`` with ``ch2`` not
    after ``ch1`` in numeric order, the SNPs selected by each chromosome's
    ``PCA_mask`` are loaded, standardized, multiplied, and the resulting
    block is pickled and POSTed to ``api/tasks/PCA/COV``. The message for
    the last pair additionally carries ``"E": True``.

    Args:
        client_config: Mapping; ``"plinkfile"`` locates the HDF5 store and
            ``'name'`` is passed to the server call.
        env: Passed through to ``networking.respond_to_server``.
    """
    def standardize_mat(mat, af, sd):
        """Center each row by ``2 * af`` and scale rows with ``sd > 0``.

        Operates in place on ``mat`` (one SNP per row) and returns it.
        Rows with ``sd == 0`` are centered but not scaled; NaNs become 0.
        """
        af = 2 * af.reshape(af.shape[0], 1)
        mat -= af
        ind = sd > 0
        mat[ind, :] /= sd[ind].reshape(np.sum(ind), 1)
        mat[np.isnan(mat)] = 0
        return mat

    def load_kept_snps(group):
        """Return (positions, allele freqs, std devs) kept by ``PCA_mask``."""
        # `Dataset.value` was removed in h5py 3.0; `[()]` is the portable
        # way to read a whole dataset.
        tokeep = group['PCA_mask'][()]
        pos = group["positions"][()][tokeep]
        af = group["MAF"][()][tokeep]
        sd = np.sqrt(group["VAR"][()][tokeep])
        return pos, af, sd

    pfile = shared.get_plink_store(client_config["plinkfile"])
    with h5py.File(pfile, 'r') as store:
        n = store.attrs["n"]
        # Skip the non-chromosome groups: "meta", and the "pca" group that
        # pca_projection adds to the same store (int("pca") would raise).
        chroms = sorted([ch for ch in store if ch not in ("meta", "pca")],
                        key=int)
        size = 0
        for chi, ch1 in enumerate(chroms):
            group1 = store[ch1]
            pos1, af1, sd1 = load_kept_snps(group1)
            # One SNP per row.
            g1 = np.empty((len(pos1), n))
            for row, snp1 in enumerate(pos1):
                g1[row, :] = group1[str(snp1)][()]
            g1 = standardize_mat(g1, af1, sd1)
            # Count SNPs directly: a loop index would be undefined (or
            # stale) for a chromosome with no SNPs kept.
            size += len(pos1)
            # Only pairs up to and including ch1 itself are reported.
            for ch2 in chroms[:chi + 1]:
                msg = {}
                group2 = store[ch2]
                pos2, af2, sd2 = load_kept_snps(group2)
                # One SNP per column, so that g1.dot(g2) is SNP x SNP.
                g2 = np.empty((n, len(pos2)))
                for col, snp2 in enumerate(pos2):
                    g2[:, col] = group2[str(snp2)][()]
                g2 = standardize_mat(g2.transpose(), af2, sd2).transpose()
                msg["CH1"] = ch1
                msg["CH2"] = ch2
                logger.info(f"Reporting cov: {ch1}_{ch2}: {g1.shape} x {g2.shape}")
                msg["MAT"] = g1.dot(g2).astype(np.float32)
                if ch1 == chroms[-1] and ch2 == chroms[-1]:
                    # Marks the final block of the transfer.
                    msg["E"] = True
                msg = pickle.dumps(msg)
                networking.respond_to_server('api/tasks/PCA/COV', 'POST', msg, client_config['name'], env)
        logger.info(f"Final size will be {size}")
Beispiel #6
0
def init_stats(msg_dict, client_config, env):
    """Store the received per-chromosome QC statistics in the HDF5 store.

    Blocks while the Celery worker ``celery@<client name>`` reports an
    active ``tasks.init_store`` task, then writes the statistics found in
    each chromosome's message and reports completion to the server.

    Recognised message keys and the dataset each is written to:
    ``"MISS"`` -> ``not_missing_per_snp`` (stored as ``1 - value``),
    ``"AF"`` -> ``MAF``, ``"HWE"`` -> ``hwe``, ``"VAR"`` -> ``VAR``.

    Args:
        msg_dict: Mapping of chromosome name -> mapping of statistics.
        client_config: Mapping; ``'name'`` identifies the worker and
            ``'plinkfile'`` locates the HDF5 store.
        env: Passed through to ``networking.respond_to_server``.
    """
    logger.debug("init_stats received chromosomes: %s", list(msg_dict))
    # Wait on previous tasks to finish
    inspector = current_app.control.inspect()
    client_name = client_config['name']
    while True:
        # Query once per iteration: a second call could return None (or a
        # different snapshot) between the check and the lookup.
        active = inspector.active()
        if active is None:
            break
        active_tasks = active[f'celery@{client_name}']
        if not any(task['name'] == 'tasks.init_store'
                   for task in active_tasks):
            break
        logger.info('Waiting on tasks.init_store to finish.')
        time.sleep(.1)

    # (message key, dataset name), written in this order.
    stat_datasets = (
        ("MISS", "not_missing_per_snp"),
        ("AF", "MAF"),
        ("HWE", "hwe"),
        ("VAR", "VAR"),
    )
    pfile = client_config['plinkfile']
    with h5py.File(shared.get_plink_store(pfile), 'a') as store:
        for chrom, message in msg_dict.items():
            logger.info(f'Computing statistics for Chrom: {chrom}.')
            chrom_group = store[chrom]
            for key, dset_name in stat_datasets:
                if key not in message:
                    continue
                vals = np.array(message[key])
                if key == "MISS":
                    # Missingness is stored as its complement.
                    vals = 1 - vals
                write_or_replace(chrom_group, dset_name, val=vals)
            # Logged per chromosome; logging after the loop would only
            # mention the last one and fail on an empty msg_dict.
            logger.info(
                f'Finished initializing QC statistics for chrom {chrom}.')

    status = 'Finished with init stats.'
    networking.respond_to_server(
        f'api/clients/{client_name}/report?status={status}', 'POST', env=env)
Beispiel #7
0
def report_counts(client_config, env):
    """
    Report the counts (Het, Hom Alt, missing)

    Sends one message per chromosome group via ``send_counts_to_server``.
    Each message carries ``"n"`` (the store's ``n`` attribute), ``"CHROM"``
    and ``"COUNTS"``; the message for the last chromosome also carries
    ``"END": True``.

    Args:
        client_config: Mapping; ``'plinkfile'`` locates the HDF5 store.
        env: Passed through to ``send_counts_to_server``.
    """
    pfile = client_config['plinkfile']
    store_name = shared.get_plink_store(pfile)
    with h5py.File(store_name, 'r') as store:
        # Skip the non-chromosome groups: "meta", and the "pca" group that
        # pca_projection adds to the same store (it has no "counts").
        keys = [i for i in store.keys() if i not in ('meta', 'pca')]
        # NOTE(review): the same dict is reused for every chromosome, so
        # "START" is present in every message, not only the first one.
        # Confirm the server expects that.
        countDict = {"START": True, "n": int(store.attrs["n"])}
        for chrom in keys:
            countDict["CHROM"] = chrom
            # `Dataset.value` was removed in h5py 3.0; `[()]` is portable.
            countDict["COUNTS"] = store["{}/counts".format(chrom)][()]
            if chrom == keys[-1]:
                countDict["END"] = True
            logger.info(f'Sending counts from chrom {chrom}.')
            send_counts_to_server(countDict, client_config, env)
Beispiel #8
0
def pca_projection(data, client_config):
    """Project the local standardized genotypes and store the result.

    Builds the individuals x SNPs matrix of standardized genotypes for the
    SNPs selected by each chromosome's ``PCA_mask``, computes
    ``u = arr . v^T . diag(inv_sigma)``, sign-normalises ``u``/``v`` with
    ``svd_flip`` and writes ``pca_sigma``, ``pca_v.T`` and ``pca_u`` into
    the store's ``pca`` group.

    Args:
        data: Pickled mapping with keys ``"ISIG"``, ``"V"`` and
            ``"CHROMS"``.
        client_config: Mapping; ``"plinkfile"`` locates the HDF5 store.
    """
    # NOTE(security): pickle.loads can execute arbitrary code; this is only
    # safe if `data` comes from a trusted, authenticated peer.
    message = pickle.loads(data)
    inv_sigma = message["ISIG"]
    v = message["V"]
    chroms = message["CHROMS"]
    pfile = shared.get_plink_store(client_config["plinkfile"])
    with h5py.File(pfile, 'a') as store:
        # Read every mask once; `Dataset.value` was removed in h5py 3.0,
        # `[()]` is the portable way to read a whole dataset.
        masks = [store[f"{chrom}/PCA_mask"][()] for chrom in chroms]
        n = sum(int(np.sum(mask)) for mask in masks)
        num_inds = store.attrs["n"]
        arr = np.empty((num_inds, n), dtype=np.float32)
        offset = 0
        for chrom, tokeep in zip(chroms, masks):
            group = store[str(chrom)]
            af = group["MAF"][()][tokeep]
            sd = np.sqrt(group["VAR"][()][tokeep])
            positions = group["positions"][()][tokeep]
            for i, position in enumerate(positions):
                val = (group[str(position)][()] - 2 * af[i])/sd[i]
                val[np.isnan(val)] = 0
                arr[:, offset+i] = val
            # Advance by the number of SNPs written; the loop index would
            # be undefined (or stale) for a chromosome with no SNPs kept.
            offset += len(positions)
        u = arr.dot(v.T).dot(np.diag(inv_sigma))
        u, v = svd_flip(u, v, u_based_decision=False)
        dset = store.require_group("pca")
        write_or_replace(dset, 'pca_sigma', val=inv_sigma)
        write_or_replace(dset, 'pca_v.T', val=v)
        write_or_replace(dset, 'pca_u', val=u)
    logger.info("Done with projection!")
Beispiel #9
0
def plinkToH5(client_config, env):
    """Gets plink prefix, produces an HDF file with the same prefix

    Reads the plink file set named by ``client_config['plinkfile']`` and
    writes, into a freshly created HDF5 store:

    * ``meta/Status``, ``meta/id`` and ``meta/regions``;
    * one group per chromosome holding one dataset per locus (named by its
      base-pair position) plus ``positions``, ``rsids`` and ``counts``.

    After each chromosome is written its positions are sent to the server.
    Processing stops when chromosome 23 is reached.

    Args:
        client_config: Mapping; ``'plinkfile'`` is the plink prefix.
        env: Passed through to ``send_positions_to_server``.

    Raises:
        MemoryError: If the plink file cannot be opened for lack of memory.
        ValueError: If the phenotype has more than two distinct values.
        SystemExit: If the plink file is not one-locus-per-row.
    """
    def flush_chromosome(store, chrom, positions, rsids, counts):
        """Write a chromosome's index datasets and announce its positions.

        A chromosome group that received no loci is deleted instead.
        """
        if not positions:
            del store[str(chrom)]
            return
        group = store[str(chrom)]
        # uint32 for every chromosome, so the stored dtype does not depend
        # on which chromosome happens to be last in the file.
        write_or_replace(group, 'positions', positions, dtype=np.uint32)
        write_or_replace(group, 'rsids', rsids)
        write_or_replace(group, 'counts', counts, np.uint32)
        send_positions_to_server(positions, chrom, client_config, env)

    pfile = client_config['plinkfile']
    store_name = shared.get_plink_store(pfile)
    logger.info(f'Opening plinkfile: {pfile}')
    try:
        plink_file = plinkfile.open(pfile)
    except MemoryError as e:
        logger.error('MemoryError!')
        logger.error(e)
        # Nothing below can work without an open plink file.
        raise
    try:
        if not plink_file.one_locus_per_row():
            logger.error("""This script requires that snps are
            rows and samples columns.""")
            sys.exit(1)
        sample_list = plink_file.get_samples()
        locus_list = plink_file.get_loci()
        n_tot = len(sample_list)
        logger.info(f'Opening h5py file:{store_name}')
        with h5py.File(store_name, 'w', libver='latest') as store:
            store.attrs['n'] = len(sample_list)
            store.attrs['has_local_AF'] = False
            store.attrs['has_global_AF'] = False
            store.attrs['has_centering'] = False
            store.attrs['has_normalization'] = False
            # A side-car "<prefix>.pheno" file, when present, overrides the
            # affection status stored in the plink samples.
            potential_pheno_file = pfile + ".pheno"
            if os.path.isfile(potential_pheno_file):
                affection = np.loadtxt(potential_pheno_file, dtype=int,
                                       usecols=2)
            else:
                affection = [sample.affection for sample in sample_list]
            if len(np.unique(affection)) > 2:
                raise ValueError(
                    "phenotype is not binary. We only support binary for now")
            write_or_replace(store, 'meta/Status', affection, np.int8)
            ids = [sample.iid for sample in sample_list]
            write_or_replace(store, 'meta/id', ids, 'S11')
            del ids, affection
            # Read Demographic file
            logger.info(f'Reading demographic file at {pfile}.ind')
            logger.info(f'File exists: {os.path.isfile(pfile + ".ind")}')
            with open(pfile + ".ind", 'r') as dem_f:
                # Third tab-separated column of every row.
                dem = [(row.split("\t")[2]).encode("UTF8") for row in dem_f]
                write_or_replace(store, 'meta/regions', dem)
            # Read chromosome data
            current_chr = 1
            positions, rsids, all_counts = [], [], []
            current_group = store.require_group(str(current_chr))
            genotypes = np.zeros(n_tot, dtype=np.float32)
            for locus, row in zip(locus_list, plink_file):
                if locus.chromosome != current_chr:
                    flush_chromosome(store, current_chr, positions, rsids,
                                     all_counts)
                    # Start the next chromosome from scratch. rsids must be
                    # reset together with positions, otherwise every
                    # chromosome would also carry the rsids of all earlier
                    # ones.
                    positions, rsids, all_counts = [], [], []
                    current_chr = locus.chromosome
                    if current_chr == 23:
                        break
                    current_group = store.require_group(str(current_chr))
                pos = str(locus.bp_position)
                counts, geno = process_plink_row(row, genotypes)
                try:
                    current_group.create_dataset(pos, data=geno)
                except Exception:
                    # Best effort: log and keep going.
                    # NOTE(review): the locus is still appended to the index
                    # lists below even though its dataset was not written.
                    logger.error(
                        f"Cannot write position: chr{locus.chromosome} {pos}")
                rsids.append(locus.name.encode('utf8'))
                positions.append(pos)
                all_counts.append(counts)
            # Flush the chromosome still being accumulated. Testing
            # current_chr rather than the loop variable also works when the
            # plink file contains no loci at all.
            if current_chr != 23:
                flush_chromosome(store, current_chr, positions, rsids,
                                 all_counts)
    finally:
        # Release the plink handle on success, error and sys.exit alike.
        plink_file.close()
    logger.info('Finished writing plink to hdf5.')
Beispiel #10
0
def run_QC(filters, client_config, prefix, remove=True, env="production"):
    """Apply the requested QC filters to every chromosome in the store.

    Starting from the chromosome's existing ``QC_mask`` (or an all-True
    mask), SNPs are kept only if they pass each filter present in
    ``filters``: HWE, MAF (two-sided) and missingness per SNP.

    With ``remove=True`` the failing SNPs are deleted from ``hwe``,
    ``VAR``, ``MAF``, ``not_missing_per_snp`` and ``positions``, along with
    their genotype datasets. Otherwise the result is stored as
    ``<prefix>_mask`` / ``<prefix>_positions``, and for ``prefix == "PCA"``
    ``PCA_passed`` is written and any ``non_ld_mask`` is dropped.

    Finally the server is notified on ``api/tasks/QC/FIN`` when
    ``prefix == "QC"`` and on ``api/tasks/PCA/FIN`` otherwise.

    Args:
        filters: Mapping of ``QCFilterNames`` entry -> threshold. It is
            not modified.
        client_config: Mapping; ``"plinkfile"`` locates the HDF5 store and
            ``'name'`` is passed to the server call.
        prefix: Name prefix for the stored mask datasets; also selects the
            completion endpoint.
        remove: Delete failing SNPs instead of storing a mask.
        env: Passed through to ``networking.respond_to_server``.
    """
    def find_what_passes(group, qc_name, dset_name, tokeep,
                         doubleSided=False):
        """AND ``tokeep`` with the SNPs that pass filter ``qc_name``."""
        if qc_name not in filters:
            # Filter not requested: the dataset is not needed at all.
            return tokeep
        # `Dataset.value` was removed in h5py 3.0; `[()]` is portable.
        vals = group[dset_name][()]
        thresh = float(filters[qc_name])
        if doubleSided:
            # Both the value and its complement must clear the threshold.
            passes = np.logical_and(
                vals > thresh - Settings.kSmallEpsilon,
                (1.0 - vals) > thresh - Settings.kSmallEpsilon)
        else:
            passes = vals > thresh
        return np.logical_and(tokeep, passes)

    def replace_dataset(group, tokeep, dset_name):
        """Keep only the ``tokeep`` entries; return the dropped ones."""
        vals = group[dset_name][()]
        write_or_replace(group, dset_name, vals[tokeep])
        return vals[np.logical_not(tokeep)]

    # Work on a copy and convert the missingness threshold exactly once.
    # The stored statistic is "not missing", hence the complement. Doing
    # this inside the chromosome loop would flip the threshold back and
    # forth between chromosomes and leak the change to the caller.
    filters = dict(filters)
    if QCFilterNames.QC_MPS in filters:
        filters[QCFilterNames.QC_MPS] = 1 - float(
            filters[QCFilterNames.QC_MPS])

    pfile = client_config["plinkfile"]
    store_name = shared.get_plink_store(pfile)
    with h5py.File(store_name, 'a') as store:
        for chrom in store.keys():
            # Skip the non-chromosome groups: "meta", and the "pca" group
            # that pca_projection adds to the same store.
            if chrom in ("meta", "pca"):
                continue
            group = store[chrom]
            positions = group['positions'][()]
            if "QC_mask" in group:
                tokeep = group["QC_mask"][()]
            else:
                tokeep = np.ones_like(positions, dtype=bool)

            tokeep = find_what_passes(group, QCFilterNames.QC_HWE, "hwe",
                                      tokeep)
            tokeep = find_what_passes(group, QCFilterNames.QC_MAF, "MAF",
                                      tokeep, doubleSided=True)
            tokeep = find_what_passes(group, QCFilterNames.QC_MPS,
                                      "not_missing_per_snp", tokeep)
            logger.info(
                f"After filtering {chrom}, {np.sum(tokeep)} snps remain")
            if remove:  # Delete what doesn't pass
                for dset_name in ('hwe', 'VAR', 'MAF',
                                  'not_missing_per_snp'):
                    replace_dataset(group, tokeep, dset_name)
                deleted = replace_dataset(group, tokeep, 'positions')
                for snp in deleted:
                    snp = str(snp)
                    if snp in group:
                        del group[snp]
            else:  # Store what has been tagged
                pass_mask = prefix + "_mask"
                pos_mask = prefix + "_positions"
                if pass_mask in group:
                    del group[pass_mask]
                if pos_mask in group:
                    del group[pos_mask]
                write_or_replace(group, pass_mask, val=tokeep, dtype=bool)
                positions = group['positions'][()][tokeep]
                write_or_replace(group, pos_mask, val=positions)
                if prefix == "PCA":
                    write_or_replace(group,
                                     "PCA_passed",
                                     val=np.ones(np.sum(tokeep), dtype=bool))
                    if 'non_ld_mask' in group:
                        del group['non_ld_mask']
    client_name = client_config['name']
    endpoint = 'api/tasks/QC/FIN' if prefix == "QC" else 'api/tasks/PCA/FIN'
    networking.respond_to_server(endpoint, "POST", b'', client_name, env)