def compute_kinship(control, impl, scale, loco):
    """Dispatch GRM/kinship computation to the selected implementation.

    control: control structure, handed unchanged to the backend.
    impl: "gemma1" selects gemma1grm; any other value selects gemma2grm.
    scale: string or container; the kinship is treated as standardized
        when "standardized" is found in it.
    loco: accepted but not used by this function.
    """
    get_options_ns()  # result is not used here; the call is kept as-is
    use_standardized = "standardized" in scale
    backend = gemma1grm if impl == "gemma1" else gemma2grm
    backend.compute_kinship(control, use_standardized)
def write_new_control(control: dict, transformation: dict):
    """Write a new control file based on the old one plus the last command.

    The command line (options `args`, space-joined) is stored under
    transformation['command'], the transformation is appended to
    control['transformations'], and the resulting control is dumped as
    JSON (indent 4) to the handle from safe.control_write_open().
    Both dictionaries are modified in place.
    """
    command_line = " ".join(get_options_ns().args)
    transformation['command'] = command_line
    control['transformations'].append(transformation)
    with safe.control_write_open() as control_handle:
        json.dump(control, control_handle, indent=4)
def memory_usage(msg: str = None):
    """Print the memory use of the current process when RAM debugging is on.

    Nothing is printed unless options.debug_ram is set or options.verbose
    is greater than 2. The figure reported is the first field of psutil's
    memory_full_info() for this process. Above 1024**3 bytes it is shown
    in Gb with one decimal, otherwise in whole Mb.

    msg: optional label printed before the figure, followed by ":\t".
    """
    options = get_options_ns()
    if not (options.debug_ram or options.verbose > 2):
        return

    mem = psutil.Process(os.getpid()).memory_full_info()[0]
    if msg:
        print(msg, end=":\t")

    mem_mb = mem / float(2 ** 20)
    if mem > 1024**3:
        # Round to tenths of a Gb (1024 Mb / 102.4 == 10), then scale back.
        print(f"{round(mem_mb / 102.4) / 10}Gb RAM used")
    else:
        print(f"{round(mem_mb)}Mb RAM used")
def __init__(self, file_type: str, postfix: str = None):
    """Set up the output file name and compression settings.

    file_type: kind of file being handled (e.g. "control"); stored as-is.
    postfix: optional suffix appended to the configured out_prefix; a
        suffix ending in ".gz" marks the file as gzip.
    """
    self.file_type = file_type
    opts = get_options_ns()
    self.opts = opts
    has_postfix = bool(postfix)
    self.is_gzip = has_postfix and postfix.endswith(".gz")
    self.file_name = opts.out_prefix + postfix if has_postfix else opts.out_prefix
    self.compression_level = opts.compression_level
def compute_kinship(control, standardized):
    """Compute the GRM by calling the external GEMMA1 binary.

    The data named by control['name'] is first written to intermediate
    BIMBAM files with write_bimbam(); the binary configured as
    opts.gemma1_bin is then run with `-gk 1` when standardized is truthy
    and `-gk 2` otherwise. Output directory and base name are derived
    from opts.out_prefix. A non-zero exit status raises, because the
    subprocess is run with check=True.
    """
    opts = get_options_ns()
    output_path = dirname(opts.out_prefix) or "."
    output_basename = basename(opts.out_prefix)

    logging.info('Computing GRM with GEMMA1')
    logging.info('Convert to intermediate BIMBAM')
    genofn, phenofn = write_bimbam(control['name'])
    logging.info(f"Call gemma with {genofn}")

    k_type = "1" if standardized else "2"
    gemma_args = [
        opts.gemma1_bin,
        '-debug',
        '-debug-data',
        '-outdir', output_path,
        '-o', output_basename,
        '-gk', k_type,
        '-g', genofn,
        '-p', phenofn,
    ]
    logging.warning("Calling: " + " ".join(gemma_args))
    run(gemma_args, check=True)
    logging.info(f"Writing to {output_path}/{output_basename}.cXX.txt")
def convert_bimbam(genofn: str, phenofn: str, annofn: str):
    """Read/convert/import BIMBAM and output to Rqtl2 format.

    genofn: gzip-compressed BIMBAM genotype file; fields are split on
        tabs, falling back to ", " when a line contains no tab. The first
        three fields are skipped when translating genotypes.
    phenofn: plain-text, tab-separated phenotype file, one individual per
        line.
    annofn: marker annotation file with exactly three fields per line
        (marker, position, chromosome) separated by commas and/or
        whitespace.

    Writes a gmap, a phenotype and a genotype file through the `safe`
    write helpers and finishes by calling write_control() with their
    names and the counted individuals/markers/phenotypes.

    Raises AssertionError when the number of individuals in the genotype
    and phenotype files differ, KeyError on a genotype value missing from
    the hard-coded translation table, and ValueError when an annotation
    line does not have exactly three fields.
    """
    # Raw string: in a normal string literal '\s' is an invalid escape
    # sequence (a warning on current Python). Compiled once, not per line.
    field_sep = re.compile(r'[,\t\s]+')

    logging.info(f"Reading BIMBAM marker/SNP {annofn}")
    with open(annofn, "r") as f, safe.gmap_write_open() as out:
        outgmapfn = out.name
        # NOTE(review): the header is comma separated while the rows below
        # are tab separated; kept as-is, confirm against the gmap reader.
        out.write("marker,chr,pos\n".encode())
        for line in f:
            marker, pos, chrom = field_sep.split(line.strip())
            out.write(f"{marker}\t{chrom}\t{pos}\n".encode())

    logging.info(f"Reading BIMBAM phenofile {phenofn}")
    p_inds = 0
    phenos = None
    with open(phenofn, "r") as f, \
            safe.pheno_write_open("_pheno_bimbam.txt.gz") as out:
        outphenofn = out.name
        for line in f:
            ps = line.strip().split("\t")
            if phenos is None:
                # First line fixes the phenotype count and emits the header.
                phenos = len(ps)
                header = "\t".join(str(i + 1) for i in range(phenos))
                out.write(f"id\t{header}\n".encode())
            p_inds += 1
            row = "\t".join(ps)
            out.write(f"{p_inds}\t{row}\n".encode())

    inds = None
    markers = 0
    translate = {"1": "A", "0": "B", "0.5": "H"}  # FIXME hard coded
    header_pending = True
    with safe.geno_write_open("_geno_bimbam.txt.gz") as out:
        outgenofn = out.name
        with gzip.open(genofn, mode='r') as f:
            for raw in f:
                markers += 1
                text = raw.decode().strip()
                gs = text.split("\t")
                if len(gs) == 1:
                    gs = text.split(", ")
                genos = len(gs) - 3
                assert genos != 1
                if header_pending:
                    header = "\t".join(str(i + 1) for i in range(genos))
                    out.write(f"marker\t{header}\n".encode())
                    header_pending = False
                encoded = "".join(translate[item] for item in gs[3:])
                out.write(f"{gs[0]}\t{encoded}\n".encode())
                if not inds:
                    inds = genos

    logging.info(f"{inds} individuals")
    logging.info(f"{markers} markers")
    logging.info(f"{phenos} phenotypes")
    assert inds == p_inds, f"Individuals not matching {inds} != {p_inds}"
    # NOTE(review): these labels describe an rqtl2 -> bimbam export, while
    # this function imports BIMBAM; kept unchanged, confirm intent.
    transformation = {
        "type": "export",
        "original": "rqtl2",
        "format": "bimbam"
    }
    write_control(None, inds, markers, phenos, outgenofn, outphenofn,
                  outgmapfn, transformation)
def compute_kinship(control, standardized):
    """Compute a kinship matrix from the genotypes named by `control`.

    Work-in-progress implementation: it loads the genotype matrix through
    load_geno() with a MAF/missingness filter, mean-imputes and centers
    every row, optionally divides each row by the square root of its
    variance (when `standardized` is truthy), and forms
    K = G.T . G / (markers - 1).

    The result is only printed, together with a lot of debugging output;
    nothing is returned or written by this function.
    """
    # FIXME: these values are hard coded to develop the algorithm
    miss = 0.05
    maf = 0.01

    def filter_gs_ok(marker: str, gs: List[float]) -> bool:
        # 1. [X] Always apply the MAF filter when reading genotypes
        # 2. [X] Apply missingness filter
        return maf_num_filter(marker, gs, miss, maf)

    opts = get_options_ns()  # not used further in this function
    G, markerlist = load_geno(control, filter_gs_ok)
    # print(G)
    ctrl = data.methodize(control)
    # print(type(G))
    # Each `gs` is one row of G — presumably one marker across all
    # individuals; TODO confirm against load_geno.
    for idx, gs in enumerate(G):
        values = gs[~np.isnan(gs)]
        mean = np.mean(values) # skip NAN
        print(gs)
        if idx == 1:
            print("orig", gs)
            print("mean", mean)
        # print(mean,variance)
        # 3. [X] Always impute missing data (injecting the row mean) FIXME
        def f(value: float):
            if np.isnan(value):
                return mean
            return value
        # NOTE(review): gs becomes a plain Python list here; the in-place
        # subtraction below appears to rely on NumPy's reflected operator
        # to turn it back into an array.
        gs = [f(g) for g in gs]
        # 4. [X] Always subtract the row mean serves to "center" the data
        gs -= mean
        if idx == 1:
            print("mean", gs)
        # 5. [X] Center the data by row (which is the default option
        #   ~-gk 1~) std is sqrt(var) to normalize each feature value to
        #   a z-score.
        # gs /= np.std(gs) # std is always about 1.0 with BXD. assert std != 0.0:
        # genovar = np.sum(gs**2)/len(gs)-(mean**2)
        # genovar = np.var(gs)
        if standardized:
            genovar = np.var(gs)
            # Rows without variance are left centered but unscaled.
            if genovar != 0:
                gs /= math.sqrt(genovar)
        # gs /= math.sqrt(genovar)
        G[idx, :] = gs
        if idx == 1:
            print("z-scored", gs)
    # G = G[:-1, :]
    print("G", G)
    # NOTE(review): the marker count comes from the control file, not from
    # the filtered G — confirm this is the intended divisor.
    markers = ctrl.markers
    K = np.dot(G.T, G)
    print("raw K", K)
    # 6. Always scale the matrix dividing by # of SNPs
    # (the code divides by markers - 1, not by markers)
    print("scale", markers)
    K /= float(markers - 1)
    # print(G)
    print("G dim", G.shape)
    print("K dim", K.shape)
    print(K)
    memory_usage()
def convert_plink(path: str, annofn: str):
    """Convert PLINK format to GEMMA2.

    path: PLINK file prefix handed to read_plink().
    annofn: optional tab-separated marker annotation file whose first
        three columns are marker, position and chromosome; any further
        columns are ignored. When falsy no gmap file is written and the
        gmap name passed on is the string "NONE".

    Writes a phenotype file (via safe.pheno_write_open) and a gzipped
    genotype file named `<out_prefix>_geno.txt.gz`, then calls
    write_control() with the resulting names and dimensions.

    Raises AssertionError when the bim/fam/bed dimensions disagree,
    KeyError for a genotype value missing from the translation table, and
    ValueError when an annotation line has fewer than three columns.
    """
    import gzip

    def mknum(v):
        # NaN is the only value that differs from itself.
        if v != v:
            return "NA"
        return str(v)

    options = get_options_ns()
    compression_level = options.compression_level
    verbose = options.verbose

    memory_usage("plink before load")
    logging.info(f"Reading PLINK files {path}")
    (bim, fam, bed) = read_plink(path, verbose=verbose > 1)
    m = bed.compute()

    if options.debug_data:
        print("Debug view of PLINK\n")
        print("===> BIM alleles/markers")
        print(bim.head())
        print(bim.info())
        print("===> FAM samples/phenotypes")
        print(fam.head())
        print(fam.info())
        print("===> BED genotypes")
        print(m)
        print(list(m[0]))
        print(bim.shape)

    markers2, phenos2 = bim.shape
    inds2, phenos = fam.shape
    markers, inds = m.shape
    assert inds == inds2, "Number of individuals not matching in fam and bed files"
    assert markers == markers2, "Number of markers not matching in bim and bed files"
    assert phenos == phenos2, "Number of phenotypes not matching in bim and fam files"
    basefn = options.out_prefix
    memory_usage("plink pandas")

    outgmapfn = "NONE"
    if annofn:
        logging.info(f"Reading BIMBAM marker/SNP {annofn}")
        with open(annofn, "r") as f, safe.gmap_write_open() as out:
            outgmapfn = out.name
            # NOTE(review): comma-separated header above tab-separated
            # rows; kept as-is, confirm against the gmap reader.
            out.write("marker,chr,pos\n".encode())
            for line in f:
                # Only the first three columns are used; tolerate files
                # with or without extra trailing columns.
                marker, pos, chrom, *_ = line.strip().split("\t")
                out.write(f"{marker}\t{chrom}\t{pos}\n".encode())

    p = fam.to_numpy()
    with safe.pheno_write_open() as f:
        outphenofn = f.name
        # we skip the last i column
        columns = "".join(f"\t{c}" for c in fam.columns.values if c != "i")
        f.write(f"id{columns}\n".encode())
        for j in range(inds):
            # except for i column
            row = "\t".join(mknum(v) for v in p[j, :-1])
            f.write(f"{j + 1}\t{row}\n".encode())
    memory_usage("plink pheno")

    genofn = basefn + "_geno.txt.gz"
    logging.info(f"Writing GEMMA2 geno file {genofn}")
    translate = {1.0: "A", 2.0: "B", 0.0: "H", -9.0: "-"}
    with gzip.open(genofn, mode='wb', compresslevel=compression_level) as f:
        header = "".join(f"\t{i + 1}" for i in range(inds))
        f.write(f"marker{header}".encode())
        for j in range(markers):
            markername = bim.snp[j]
            # Each row starts with a newline, so the file has no trailing one.
            f.write(f"\n{markername}\t".encode())
            # Missing calls (NaN) are mapped to the -9.0 key of `translate`.
            values = [-9.0 if np.isnan(x) else x for x in m[j]]
            if options.low_mem:  # shaves 20%
                for value in values:
                    f.write(translate[value].encode())
            else:
                f.write("".join([translate[item] for item in values]).encode())
    outgenofn = genofn

    transformation = {
        "type": "convert",
        "original": "plink",
        "format": "rqtl2"
    }
    write_control(None, inds, markers, phenos, outgenofn, outphenofn,
                  outgmapfn, transformation)
    memory_usage("plink geno")