Exemple #1
0
def compute_kinship(control,impl,scale,loco):
    """Dispatch GRM/Kinship computation to the selected implementation.

    control -- control structure, handed unchanged to the backend
    impl    -- "gemma1" selects gemma1grm; any other value selects gemma2grm
    scale   -- container or string; the matrix is standardized when it
               contains "standardized"
    loco    -- accepted but not used by this function
    """
    opts = get_options_ns()  # result is not used in this function
    standardized = "standardized" in scale
    backend = gemma1grm if impl == "gemma1" else gemma2grm
    backend.compute_kinship(control, standardized)
Exemple #2
0
def write_new_control(control: dict, transformation: dict):
    """Write an updated control file that records the current command.

    The space-joined command line arguments (opts.args) are stored under
    transformation['command'], the transformation is appended to
    control['transformations'] (both dicts are modified in place), and
    the control is then dumped as indented JSON through
    safe.control_write_open().
    """
    opts = get_options_ns()
    transformation['command'] = " ".join(opts.args)
    history = control['transformations']
    history.append(transformation)
    with safe.control_write_open() as controlf:
        json.dump(control, controlf, indent=4)
Exemple #3
0
def memory_usage(msg: str = None):
    """Print the memory used by this process when RAM debugging is on.

    Nothing is printed unless options.debug_ram is set or
    options.verbose is above 2.  The figure is the first field of
    psutil's memory_full_info() (the resident set size according to the
    psutil documentation).  When msg is given it is printed first,
    followed by a colon and a tab, on the same line.
    """
    options = get_options_ns()
    if not (options.debug_ram or options.verbose > 2):
        return

    mem = psutil.Process(os.getpid()).memory_full_info()[0]
    if msg:
        print(msg, end=":\t")

    mib = mem / float(2 ** 20)
    if mem > 1024**3:
        # MiB / 102.4, rounded, then / 10 yields GiB with one decimal
        print(f"{round(mib / 102.4) / 10}Gb RAM used")
    else:
        print(f"{round(mib)}Mb RAM used")
 def __init__(self, file_type: str, postfix: str = None):
     """Set up the output file name and compression settings.

     file_type is stored as given (the original note reads "file_type
     is control").  The file name is opts.out_prefix with postfix
     appended when a non-empty postfix is passed; is_gzip is True only
     when that postfix ends in ".gz".
     """
     self.file_type = file_type
     options = get_options_ns()
     self.opts = options
     has_postfix = bool(postfix)
     self.file_name = (options.out_prefix + postfix
                       if has_postfix else options.out_prefix)
     self.is_gzip = has_postfix and postfix.endswith(".gz")
     self.compression_level = options.compression_level
Exemple #5
0
def compute_kinship(control, standardized):
    """Compute a GRM by calling the external GEMMA1 binary.

    The data set named by control['name'] is first exported to
    intermediate BIMBAM files with write_bimbam(); GEMMA
    (opts.gemma1_bin) is then run with '-gk' on those files.  Output
    goes to the directory and base name derived from opts.out_prefix
    ("." when the prefix has no directory part).

    standardized -- truthy requests the standardized relatedness matrix
                    ('-gk 2'); falsy requests the centered one ('-gk 1')

    Raises subprocess.CalledProcessError when GEMMA exits non-zero
    (run(..., check=True)) -- assuming `run` is subprocess.run.
    """
    opts = get_options_ns()
    output_path = dirname(opts.out_prefix) or "."
    output_basename = basename(opts.out_prefix)
    logging.info('Computing GRM with GEMMA1')
    logging.info('Convert to intermediate BIMBAM')
    genofn, phenofn = write_bimbam(control['name'])
    logging.info(f"Call gemma with {genofn}")
    # GEMMA manual: '-gk 1' = centered matrix (written to *.cXX.txt),
    # '-gk 2' = standardized matrix (written to *.sXX.txt).  The mapping
    # used to be inverted, so a standardized request produced the
    # centered matrix and vice versa.
    if standardized:
        k_type, suffix = "2", "sXX"
    else:
        k_type, suffix = "1", "cXX"
    args1 = [
        opts.gemma1_bin, '-debug', '-debug-data', '-outdir', output_path, '-o',
        output_basename, '-gk', k_type, '-g', genofn, '-p', phenofn
    ]
    logging.warning("Calling: " + " ".join(args1))
    # Argument list, no shell: file names are never shell-interpreted
    run(args1, check=True)
    logging.info(f"Writing to {output_path}/{output_basename}.{suffix}.txt")
Exemple #6
0
def convert_bimbam(genofn: str, phenofn: str, annofn: str):
    """Read/convert/import BIMBAM and output to Rqtl2 format

    genofn  -- gzipped BIMBAM genotype file; fields are tab separated,
               or ", " separated when a line contains no tab.  The first
               field is the marker name, the next two are skipped and
               the rest are genotype values.
    phenofn -- plain-text phenotype file, one tab-separated row per
               individual
    annofn  -- plain-text marker annotation with exactly three fields
               per line (marker, pos, chr) separated by commas and/or
               whitespace

    The three inputs are rewritten through the `safe` writers and
    write_control() is then called with the individual, marker and
    phenotype counts and the names of the files written.

    Raises ValueError when an annotation line does not have exactly
    three fields, KeyError when a genotype value is not in the
    (hard-coded) translation table, and AssertionError when the number
    of individuals in the genotype and phenotype files differ.
    """
    logging.info(f"Reading BIMBAM marker/SNP {annofn}")
    # Raw string: the previous '[,\t\s]+' literal relied on the invalid
    # escape '\s'.  \s already covers tabs, so this matches the same text.
    field_sep = re.compile(r'[,\s]+')
    with open(annofn, "r") as f:
        with safe.gmap_write_open() as out:
            outgmapfn = out.name
            # NOTE(review): header is comma separated while the rows
            # below are tab separated -- kept as is, confirm with reader
            out.write("marker,chr,pos\n".encode())
            for line in f:
                marker, pos, chrom = field_sep.split(line.strip())
                out.write(f"{marker}\t{chrom}\t{pos}\n".encode())

    logging.info(f"Reading BIMBAM phenofile {phenofn}")
    p_inds = 0      # individuals (rows) seen in the phenotype file
    phenos = None   # number of phenotype columns, taken from the first row
    with open(phenofn, "r") as f:
        with safe.pheno_write_open("_pheno_bimbam.txt.gz") as out:
            outphenofn = out.name
            for line in f:
                ps = line.strip().split("\t")
                if p_inds == 0:
                    # First row: emit a header numbering the phenotypes
                    phenos = len(ps)
                    header = "\t".join(str(i + 1) for i in range(phenos))
                    out.write(f"id\t{header}\n".encode())
                p_inds += 1
                out.write((f"{p_inds}\t" + "\t".join(ps) + "\n").encode())

    inds = None
    markers = 0
    translate = {"1": "A", "0": "B", "0.5": "H"}  # FIXME hard coded

    with safe.geno_write_open("_geno_bimbam.txt.gz") as out:
        outgenofn = out.name
        with gzip.open(genofn, mode='r') as f:
            for line in f:
                markers += 1
                text = line.decode().strip()
                gs = text.split("\t")
                if len(gs) == 1:
                    # No tab found: fall back to ", " separated fields
                    gs = text.split(", ")
                genos = len(gs) - 3  # first three fields are not genotypes
                assert genos != 1
                if markers == 1:
                    # First row: emit a header numbering the individuals
                    header = "\t".join(str(i + 1) for i in range(genos))
                    out.write(f"marker\t{header}\n".encode())
                calls = "".join(translate[item] for item in gs[3:])
                out.write(f"{gs[0]}\t{calls}\n".encode())
                if not inds:
                    inds = genos
                    logging.info(f"{inds} individuals")

    logging.info(f"{markers} markers")
    logging.info(f"{phenos} phenotypes")
    assert inds == p_inds, f"Individuals not matching {inds} != {p_inds}"
    # This function converts BIMBAM into Rqtl2; the metadata previously
    # described the opposite direction (an rqtl2 -> bimbam export).
    transformation = {
        "type": "convert",
        "original": "bimbam",
        "format": "rqtl2"
    }
    write_control(None, inds, markers, phenos, outgenofn, outphenofn,
                  outgmapfn, transformation)
def compute_kinship(control, standardized):
    """Compute a kinship matrix from the genotypes and print it.

    Development version: every step prints debug output, and the
    resulting matrix K is only printed -- it is neither returned nor
    written to a file here.

    Genotypes are loaded through load_geno() with a MAF/missingness
    filter.  Each row of G has its NaNs replaced by the row mean, is
    centered on that mean and, when `standardized` is truthy and the
    row variance is non-zero, divided by its standard deviation.
    K is then G.T . G divided by (ctrl.markers - 1).
    """
    # FIXME: these values are hard coded to develop the algorithm
    miss = 0.05
    maf = 0.01

    def passes_filters(marker: str, gs: List[float]) -> bool:
        # 1. [X] Always apply the MAF filter when reading genotypes
        # 2. [X] Apply missiness filter
        return maf_num_filter(marker, gs, miss, maf)

    opts = get_options_ns()  # result is not used in this function
    G, markerlist = load_geno(control, passes_filters)
    ctrl = data.methodize(control)

    for row_idx, row in enumerate(G):
        debug_row = row_idx == 1  # only the second row is traced in detail
        mean = np.mean(row[~np.isnan(row)])  # skip NAN
        print(row)

        if debug_row:
            print("orig", row)
            print("mean", mean)
        # 3. [X] Always impute missing data (injecting the row mean) FIXME
        imputed = [mean if np.isnan(g) else g for g in row]
        # 4. [X] Always subtract the row mean serves to "center" the data
        # (list -= numpy scalar yields a numpy array)
        imputed -= mean
        if debug_row:
            print("mean", imputed)
        # 5. [X] Center the data by row (which is the default option
        # ~-gk 1~) std is sqrt(var) to normalize each feature value to
        # a z-score.
        if standardized:
            genovar = np.var(imputed)
            if genovar != 0:
                imputed /= math.sqrt(genovar)
        G[row_idx, :] = imputed
        if debug_row:
            print("z-scored", imputed)

    print("G", G)
    n_markers = ctrl.markers
    K = np.dot(G.T, G)
    print("raw K", K)
    # 6. Always scale the matrix dividing by # of SNPs
    # NOTE(review): the divisor is markers - 1, taken from the control,
    # not the number of rows left in G after filtering -- confirm
    print("scale", n_markers)
    K /= float(n_markers - 1)
    print("G dim", G.shape)
    print("K dim", K.shape)
    print(K)
    memory_usage()
def convert_plink(path: str, annofn: str):
    """Convert PLINK format to GEMMA2

    path   -- PLINK file prefix handed to read_plink()
    annofn -- optional tab-separated marker annotation with exactly four
              fields per line (marker, pos, chr and one ignored field);
              when falsy no gmap file is written and "NONE" is recorded

    Writes a phenotype file through safe.pheno_write_open(), a gzipped
    genotype file at <out_prefix>_geno.txt.gz, and finally calls
    write_control() with the counts and output file names.

    Raises AssertionError when the shapes of the bim, fam and bed tables
    disagree, and KeyError when a genotype value is not in the
    translation table.
    """
    def as_text(v):
        # NaN is the only value that is not equal to itself
        return "NA" if v != v else (str(v))

    options = get_options_ns()
    compression_level = options.compression_level
    verbose = options.verbose
    memory_usage("plink before load")

    logging.info(f"Reading PLINK files {path}")
    (bim, fam, bed) = read_plink(path, verbose=verbose > 1)
    m = bed.compute()
    if options.debug_data:
        print("Debug view of PLINK\n")
        print("===> BIM alleles/markers")
        print(bim.head())
        print(bim.info())
        print("===> FAM samples/phenotypes")
        print(fam.head())
        print(fam.info())
        print("===> BED genotypes")
        print(m)
        print([x for x in m[0]])
        print(bim.shape)

    # Cross-check the dimensions of the three tables against each other
    bim_rows, bim_cols = bim.shape
    fam_rows, phenos = fam.shape
    markers, inds = m.shape
    assert inds == fam_rows, "Number of individuals not matching in fam and bed files"
    assert markers == bim_rows, "Number of markers not matching in bim and bed files"
    assert phenos == bim_cols, "Number of phenotypes not matching in bim and fam files"

    basefn = options.out_prefix
    memory_usage("plink pandas")

    outgmapfn = "NONE"
    if annofn:
        logging.info(f"Reading BIMBAM marker/SNP {annofn}")
        with open(annofn, "r") as anno, safe.gmap_write_open() as out:
            outgmapfn = out.name
            out.write("marker,chr,pos\n".encode())
            for line in anno:
                marker, pos, chrom, _rest = line.strip().split("\t")
                out.write(f"{marker}\t{chrom}\t{pos}\n".encode())

    phenofn = basefn + "_pheno.tsv"  # computed but not used below
    fam_values = fam.to_numpy()
    with safe.pheno_write_open() as f:
        outphenofn = f.name
        # we skip the last i column
        columns = [c for c in fam.columns.values if c != "i"]
        f.write(("id" + "".join(f"\t{c}" for c in columns) + "\n").encode())
        for j in range(inds):
            # except for i column
            row = "\t".join([as_text(v) for v in fam_values[j, :-1]])
            f.write(f"{j + 1}\t{row}\n".encode())

    memory_usage("plink pheno")

    genofn = basefn + "_geno.txt.gz"
    logging.info(f"Writing GEMMA2 geno file {genofn}")
    translate = {1.0: "A", 2.0: "B", 0.0: "H", -9.0: "-"}

    import gzip
    with gzip.open(genofn, mode='wb', compresslevel=compression_level) as f:
        f.write("marker".encode())
        for i in range(inds):
            f.write(f"\t{i+1}".encode())
        # Each marker row starts with a newline, so the file has no
        # trailing newline after the last row
        for j in range(markers):
            markername = bim.snp[j]
            f.write(f"\n{markername}\t".encode())
            # Missing calls (NaN) are mapped to -9.0, i.e. "-"
            values = [-9.0 if np.isnan(x) else x for x in m[j]]
            if options.low_mem:  # shaves 20%
                # Write one call at a time instead of building the row
                for i in range(inds):
                    f.write(f"{translate[values[i]]}".encode())
            else:
                f.write("".join([translate[item] for item in values]).encode())
        outgenofn = genofn

    transformation = {
        "type": "convert",
        "original": "plink",
        "format": "rqtl2"
    }
    write_control(None, inds, markers, phenos, outgenofn, outphenofn,
                  outgmapfn, transformation)

    memory_usage("plink geno")