def pgs_infinitesimal(self):
    """
    This will weight the beta values conditional on LD using infinitesimal shrinkage
    """
    # Check to see if the ld radius is sufficient (sufficiency inferred from LDpred)
    t0 = time.time()
    if self.genomic_data[self.genome][self.avg_ld] > self.ld_radius / 10.0:
        raise ValueError("WARNING: LD radius appears small compared to the genome-wide average LD estimate\n"
                         "Increase the LD radius or use fewer snps")

    # Reload the filtered snps, LD information for this chromosome, and genome-wide data into memory
    sm_dict = self.sm_dict_from_csv(self.filter_directory, f"Filtered_{self.target_chromosome}.csv")
    ld_data = load_pickle(self.ld_directory, f"LD{self.target_chromosome}")
    gen_data = self.genomic_data[self.target_chromosome]

    # Isolate the information we need to construct the infinitesimal weights for this chromosome
    sid_count, iid_count, herit = gen_data[self.sid_count], gen_data[self.iid_count], gen_data[self.herit]
    normalised_snps, snp_stds = ld_data[self.norm_snps], ld_data[self.raw_stds]

    # Update the betas via infinitesimal shrinkage, weight by standard errors
    updated_betas = self._infinitesimal_betas(herit, iid_count, normalised_snps, sid_count, sm_dict)
    infinitesimal = updated_betas / snp_stds.flatten()

    # Write the snp name - constructed betas to a csv
    write_out = flip_list([self.snp_names(sm_dict), infinitesimal])
    write_csv(self.inf_directory, f"Inf_{self.target_chromosome}", [self.snp_id, self.inf_beta], write_out)
    print(f"Constructed infinitesimal weights in {time.time() - t0} at {terminal_time()}")
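The _infinitesimal_betas helper is not shown in this excerpt. As a rough guide to what it presumably computes, the sketch below implements the LDpred-inf posterior mean, (M / (N * h2) * I + D)^-1 * beta_hat, over sliding LD windows. The window layout, the shape of normalised_snps (snps by individuals), and passing the marginal betas as a plain array rather than via sm_dict are all assumptions made for illustration.

import numpy as np

def infinitesimal_betas_sketch(herit, iid_count, normalised_snps, sid_count, betas, ld_radius):
    """Shrink marginal betas towards the LDpred-inf posterior mean within sliding LD windows (sketch only)."""
    updated_betas = np.empty(sid_count)

    # Prior term M / (N * h2): stronger shrinkage when heritability is spread thinly over many snps
    shrink = sid_count / (herit * iid_count)

    for start in range(0, sid_count, ld_radius * 2):
        stop = min(start + ld_radius * 2, sid_count)
        window = normalised_snps[start:stop]                 # assumed shape: snps x individuals
        d_matrix = np.dot(window, window.T) / iid_count      # local LD (correlation) matrix
        a_matrix = shrink * np.eye(stop - start) + d_matrix
        updated_betas[start:stop] = np.linalg.solve(a_matrix, betas[start:stop])

    return updated_betas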
def compile_and_write(self, merger="Maximum"):
    """
    This will compile all the data you have collected into a single dataset based on a merge type

    :param merger: This takes the value of Maximum or Constant to determine the merge type. If Maximum, then if a 1
        is present in any of the datasets this individual will be assigned a 1. If Constant, then individuals must
        be assigned a 1 in every dataset to be assigned a 1
    :type merger: str

    :return: Nothing, write out the output to a file then stop
    :rtype: None
    """
    assert merger in ("Maximum", "Constant"), f"Merger takes the values of Maximum or Constant yet was given " \
                                              f"{merger}"

    # Isolate the unique ids
    unique_ids = {i: [] for i in sorted(list(set(self.ids_list)))}

    # For each dataset that has been appended
    for data_set in self.data_set_values:
        # Iterate the rows
        for row in data_set:
            # Add the row values to the id
            unique_ids[row[self._id_column]].append([r for i, r in enumerate(row) if i != self._id_column])

    # For each id - value set
    output_values = []
    for ids, value_list in unique_ids.items():
        id_row = [ids]

        # If we isolated an entry in every dataset
        if len(value_list) == len(self.data_set_values):
            # Then isolate the row values based on one of the rules
            for row in flip_list(value_list):
                # If the merger type is Maximum, then if a 1 is present in any of the datasets this individual
                # will be assigned a 1
                if merger == "Maximum":
                    id_row.append(max(row))

                # If Constant, then individuals must be assigned a 1 in every dataset to be assigned a 1
                elif merger == "Constant":
                    if sum(row) == len(row):
                        id_row.append(1)
                    else:
                        id_row.append(0)
                else:
                    raise TypeError("Critical error: Reached non-set merger value")

            output_values.append(id_row)

    write_csv(self._write_directory, self._write_name, ["ID"] + list(self._icd_9_lookup.keys()), output_values)
    print(f"Constructed {self._write_name} at {terminal_time()}")
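To make the two merge rules concrete, here is a small toy example; the three-dataset values and the single phenotype column are invented purely for illustration, and flip_list is assumed to transpose the per-dataset rows into per-column lists, as it is used above.

# One individual who appears in all three appended datasets, each contributing one 0/1 phenotype column
value_list = [[1], [0], [1]]
columns = flip_list(value_list)        # -> [[1, 0, 1]]: one list per phenotype column

maximum_row = [max(col) for col in columns]                           # [1] - a 1 anywhere wins
constant_row = [1 if sum(col) == len(col) else 0 for col in columns]  # [0] - needs a 1 in every dataset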
if __name__ == '__main__':
    # Setup
    sample_size = 483
    pc_count = 10
    example_snp = 10
    output_directory = r"C:\Users\Samuel\PycharmProjects\SR-GWAS\Data"

    # Setup IID
    iid = [f"sample_{i}" for i in range(sample_size)]

    # Setup basic identifiers
    gender = [randint(0, 1) for _ in range(sample_size)]
    yob = [randint(1934, 1971) for _ in range(sample_size)]

    # Phenotype of BMI
    bmi = [uniform(14.4, 30.2) for _ in range(sample_size)]
    output = [iid, bmi, gender, yob]

    # Add PCs then write as covariant file
    for i in range(pc_count):
        output.append(z_scores([randint(0, 1000) for _ in range(sample_size)]))
    headers = ["IID", "BMI", "Gender", "Age"] + [f"PC{i}" for i in range(1, pc_count + 1)]
    write_csv(output_directory, "Covariant", headers, flip_list(output))

    # Add example snps then write as covariant + snp file
    for i in range(example_snp):
        output.append([randint(0, 2) for _ in range(sample_size)])
    headers = headers + [f"rs{i}{i+1}{i+2}" for i in range(example_snp)]
    write_csv(output_directory, "CovariantSnp", headers, flip_list(output))
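z_scores is not defined in this excerpt; the minimal sketch below shows what it is assumed to do here, namely standardise each random principal-component column to mean 0 and standard deviation 1.

from statistics import mean, pstdev

def z_scores(values):
    """Standardise a list of numbers to mean 0 and standard deviation 1 (assumed behaviour of the project helper)."""
    mu, sd = mean(values), pstdev(values)
    return [(v - mu) / sd for v in values]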
# Place of birth
pob = [randint(0, 5000) for _ in range(sample_size)]

# Age
yob = [randint(65, 90) for _ in range(sample_size)]

# Gender
gender = [randint(0, 1) for _ in range(sample_size)]

# Ever smoked
smoke = [randint(0, 1) for _ in range(sample_size)]

# Units drank per week
alcohol = [randint(0, 30) for _ in range(sample_size)]

# Average daily intake of calories
calories = [randint(1000, 3500) for _ in range(sample_size)]

# Asthmatic
asthma = [randint(0, 1) for _ in range(sample_size)]

# Ever used drugs
drug_user = [randint(0, 1) for _ in range(sample_size)]

# 5 random snp dosages
dosages = [[randint(0, 2) for _ in range(sample_size)] for _ in range(random_snps)]

out_rows = [iid, bmi, pob, yob, gender, smoke, alcohol, calories, asthma, drug_user] + dosages
headers = ["IID", "BMI", "PoB", "YoB", "Gender", "Smoke", "Alcohol", "Calories", "Asthma", "Drug_user"] + \
          [f"rs{i}{i+1}{i+2}" for i in range(random_snps)]
write_csv(Path(__file__).parent, "ExampleData", headers, flip_list(out_rows))
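Both example scripts build their data column by column and pass the result through flip_list before write_csv, so flip_list is assumed to transpose a list of equal-length columns into the row-per-individual layout a CSV needs. A minimal sketch of that assumed behaviour:

def flip_list(columns):
    """Transpose a list of equal-length columns into a list of rows (assumed behaviour of the project helper)."""
    return [list(row) for row in zip(*columns)]

With this, flip_list([iid, bmi]) yields one [iid_value, bmi_value] row per individual.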
def link_places_across_time(self, lowest_level, other_shapefile_levels, record_indexes, base_gid=0):
    """
    This will link two geo-levels together. Files must have a numeric component, and each sub_unit must be matched
    with a match-unit file with the same numeric component

    Example
    --------
    District and county shapefiles must have the dates within the names, and there must be a matching shapefile
    for each district. So if you have 1931 districts you must have a 1931 county. The actual names don't matter
    as long as the dates match.

    Whilst the default column indexes for gid, name and type for districts and name for county may work, you
    should check them against the column indexes you have in your download.

    :param lowest_level: A directory of shapefiles that are of the lowest level in terms of geo-levels
    :type lowest_level: str

    :param other_shapefile_levels: A list of lists, where each sub list is a list of shapefiles at a geo-level
    :type other_shapefile_levels: list[str]

    :param record_indexes: Indexes to slice names out of each level, base must be the first
    :type record_indexes: list[list[int]]

    :param base_gid: The gid index in the base shapefile, defaults to zero
    :type base_gid: int

    :return: Nothing, write the relations, and the ambiguity file if it exists, to file then stop
    :rtype: None
    """
    # Load the shapefiles, determine we have sufficient names
    base_shapefiles, other_shapefiles = self._setup_shapefiles(lowest_level, other_shapefile_levels)
    assert len(other_shapefiles) == len(self._level_names), "Not all other shapefile levels have a name provided"

    # Get the name indexes from the list of record_indexes
    base_indexes = record_indexes[0]
    other_level_indexes = record_indexes[1:]

    ambiguous = []
    for base_file in base_shapefiles:
        print(f"\nProcessing {base_file}")

        # Determine the current year for this base unit
        year = re.sub(r"[\D]", "", base_file.file_name)

        # Determine the relations within this base file and set the headers of the output file
        relation_list, headers = self._determine_relations_to_base(
            ambiguous, base_file, base_gid, base_indexes, other_level_indexes, other_shapefiles, year)

        # Extract the base names from the first set of relations
        base_shape_names = [relation[:2] for relation in relation_list[0]]

        # Extract the relation names from all relations, then flip them so they are combined
        relation_names = [[relation[2:] for relation in relation_level] for relation_level in relation_list]
        relation_names = flip_list(relation_names)

        # Join the base names and relations together then write it out
        relation_data = [base + flatten(relation) for base, relation in zip(base_shape_names, relation_names)]
        write_csv(self._working_dir, f"{year}_relation", ["GID", self._base_name] + headers, relation_data)

    if len(ambiguous) > 0:
        write_csv(self._working_dir, "Ambiguous_Relations", [], ambiguous)
        print(f"Please validate the {len(ambiguous)} ambiguous relations before proceeding by creating a file "
              f"called 'SetAmbiguous.csv' where there is now only a single relation for each ambiguous link")
    else:
        print("No problems detected, please move to _write_linked_unique next but set ambiguity to False")
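The pairing of a base (district) file with its matching higher-level (county) files hinges entirely on the numeric component of the file names: re.sub(r"[\D]", "", file_name) strips everything that is not a digit, and files yielding the same year are linked. A small illustration with hypothetical file names:

import re

def year_of(file_name):
    """Strip every non-digit character, leaving the numeric (date) component used to match levels."""
    return re.sub(r"[\D]", "", file_name)

# Hypothetical names for illustration only: these two would be linked because the years agree
assert year_of("Districts_1931") == year_of("Counties_1931") == "1931"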