def set_snp_ids(memory_location, snps_to_id, gen_path, write_dir, file_name):
    """
    Look up the index of each named snp within a bgen file and write those indexes to a csv.

    The snp names are read from the csv at snps_to_id. Each name that can be resolved in the genetic file
    contributes one row to the output; names that cannot be resolved are skipped without error.

    :param memory_location: Location of bgen memory file
    :type memory_location: Path | str

    :param snps_to_id: Location of the csv holding the snp names to look up
    :type snps_to_id: Path | str

    :param gen_path: The path to the genetic file
    :type gen_path: Path | str

    :param write_dir: The directory to write the snp index csv file to
    :type write_dir: Path | str

    :param file_name: The name of the snp index file
    :type file_name: str

    :return: Nothing, write the indexes to a csv, print a summary, then stop
    :rtype: None
    """
    # Point the bgen memory files at the custom location before the genetic reference is opened
    custom_meta_path(validate_path(memory_location))
    genetic_ref = Bgen(str(validate_path(gen_path).absolute()))

    # Each sid is a comma separated string; map its second field to its first so that the full sid can be rebuilt
    # from the second field alone (variant_id-rsid lookup).
    v_dict = {
        fields[1]: fields[0]
        for fields in (sid.split(",") for sid in genetic_ref.sid)
    }

    # NOTE(review): [0] presumably selects the first column of the csv as the snp names - confirm against CsvObject
    requested_snps = CsvObject(validate_path(snps_to_id), set_columns=True)[0]

    # Resolve every requested snp to its index; a KeyError from either lookup means the snp is not present
    snp_indexes = []
    for snp in requested_snps:
        try:
            located = genetic_ref.sid_to_index([f"{v_dict[snp]},{snp}"]).tolist()
        except KeyError:
            continue
        else:
            snp_indexes.append(located)

    # Write the snp indexes out
    write_csv(write_dir, f"{file_name}", ["Snp"], snp_indexes)
    print(
        f"Constructed snp id list of length {len(snp_indexes)} for {gen_path} at {terminal_time()}"
    )
file = ResOut(validate_path(output_directory), "ResidualsOVERIDE") # Construct IO stream to write out to and write the header of Snp + [IID1, IID2, ... IID(N)]. # Bgen files store [variant id, rs_id], we just want the rs_id hence the [1]; see https://bit.ly/2J0C1kC # Loader will utilise load_variants of the same file as referenced in load gen line 158 which makes it bgen, bed # and non .mmm file compliant. file.write_from_list(["Snp"] + [iid for fid, iid in gen.iid]) # For each snp in the current chromosome file for i in range(gen.sid_count): if i % 100 == 0: print(f"{i}/{gen.sid_count}: {terminal_time()}") # Instance the memory for all individuals (:) for snp i current_snp = gen[:, gen.sid_to_index([gen.sid[i]])] # Transform bgen dosage of [0, 1, 0] -> 0, 1, or 2 respectively. dosage = sum( np.array([ snp * i for i, snp in enumerate(current_snp.read(dtype=np.int8).val.T) ], dtype=np.int8)) # Set the dosage data into an array with covariant's df = pd.DataFrame([dosage[0]]).T df.columns = ["Dosage"] df["Gender"] = gender df["District"] = district