Example #1
    def pgs_infinitesimal(self):
        """
        This will weight the beta values conditional on LD using infinitesimal shrinkage
        """
        # Check to see if the ld radius is sufficient (sufficiency inferred from LD-pred)
        t0 = time.time()
        if self.genomic_data[self.genome][self.avg_ld] > self.ld_radius / 10.0:
            raise ValueError("WARNING: LD radius appears small compared to the genome-wide average LD estimate\n"
                             "Increase the LD radius or use fewer SNPs")

        # Reload the filtered SNPs, the LD information for this chromosome, and the genome-wide data into memory
        sm_dict = self.sm_dict_from_csv(self.filter_directory, f"Filtered_{self.target_chromosome}.csv")
        ld_data = load_pickle(self.ld_directory, f"LD{self.target_chromosome}")
        gen_data = self.genomic_data[self.target_chromosome]

        # Isolate the information we need to construct the infinitesimal weights for this chromosome
        sid_count, iid_count, herit = gen_data[self.sid_count], gen_data[self.iid_count], gen_data[self.herit]
        normalised_snps, snp_stds = ld_data[self.norm_snps], ld_data[self.raw_stds]

        # Update the betas via infinitesimal shrinkage, weight by standard errors
        updated_betas = self._infinitesimal_betas(herit, iid_count, normalised_snps, sid_count, sm_dict)
        infinitesimal = updated_betas / snp_stds.flatten()

        # Write the snp name - constructed betas to a csv
        write_out = flip_list([self.snp_names(sm_dict), infinitesimal])
        write_csv(self.inf_directory, f"Inf_{self.target_chromosome}", [self.snp_id, self.inf_beta], write_out)
        print(f"Constructed infinitesimal weights in {time.time() - t0} at {terminal_time()}")
Example #2
    def compile_and_write(self, merger="Maximum"):
        """
        This will compile all the data you have collected into a single dataset based on a merge type

        :param merger: This takes the value of Maximum or Constant to determine the merge type. If Maximum, an
            individual is assigned a 1 if a 1 is present in any of the datasets. If Constant, individuals must be
            assigned a 1 in every dataset to be assigned a 1
        :type merger: str

        :return: Nothing, writes the output to a file then stops
        :rtype: None
        """

        assert merger in ("Maximum", "Constant"), f"Merger takes the values of Maximum or Constant yet was given " \
                                                  f"{merger}"

        # Isolate the unique ids
        unique_ids = {i: [] for i in sorted(set(self.ids_list))}

        # For each dataset that has been appended
        for data_set in self.data_set_values:
            # Iterate the rows
            for row in data_set:
                # Add the row values to the id
                unique_ids[row[self._id_column]].append([r for i, r in enumerate(row) if i != self._id_column])

        # For id - value set
        output_values = []
        for ids, value_list in unique_ids.items():
            id_row = [ids]

            # If we isolated an entry in every dataset
            if len(value_list) == len(self.data_set_values):

                # Then isolate the row values based on one of the rules
                for row in flip_list(value_list):
                    # If the merger type is Maximum, the individual is assigned a 1 if a 1 is present in any of
                    # the datasets
                    if merger == "Maximum":
                        id_row.append(max(row))

                    # If Constant, then individuals must be assigned a 1 in every dataset to be assigned a 1
                    elif merger == "Constant":
                        if sum(row) == len(row):
                            id_row.append(1)
                        else:
                            id_row.append(0)

                    else:
                        raise TypeError("Critical error: Reached Non-set merger value")

            output_values.append(id_row)

        write_csv(self._write_directory, self._write_name, ["ID"] + list(self._icd_9_lookup.keys()), output_values)
        print(f"Constructed {self._write_name} at {terminal_time()}")
Example #3
# Standard-library imports used below; write_csv, flip_list and z_scores are assumed to come from the
# project's own utility module
from pathlib import Path
from random import randint, uniform

if __name__ == '__main__':

    # Setup
    sample_size = 483
    pc_count = 10
    example_snp = 10
    random_snps = 5
    output_directory = r"C:\Users\Samuel\PycharmProjects\SR-GWAS\Data"

    # Setup IID
    iid = [f"sample_{i}" for i in range(sample_size)]

    # Setup basic Identifiers
    gender = [randint(0, 1) for _ in range(sample_size)]
    yob = [randint(1934, 1971) for _ in range(sample_size)]

    # Phenotype of BMI
    bmi = [uniform(14.4, 30.2) for _ in range(sample_size)]
    output = [iid, bmi, gender, yob]

    # Add PCs then write as covariant file
    for i in range(pc_count):
        output.append(z_scores([randint(0, 1000) for _ in range(sample_size)]))
    headers = ["IID", "BMI", "Gender", "Age"] + [f"PC{i}" for i in range(1, pc_count + 1)]
    write_csv(output_directory, "Covariant", headers, flip_list(output))

    # Add Example Snps then write as covariant + snp file
    for i in range(example_snp):
        output.append([randint(0, 2) for _ in range(sample_size)])
    headers = headers + [f"rs{i}{i+1}{i+2}" for i in range(example_snp)]
    write_csv(output_directory, "CovariantSnp", headers, flip_list(output))

    # Place of birth (PoB)
    pob = [randint(0, 5000) for _ in range(sample_size)]

    # Age
    yob = [randint(65, 90) for _ in range(sample_size)]

    # Gender
    gender = [randint(0, 1) for _ in range(sample_size)]

    # Ever Smoked
    smoke = [randint(0, 1) for _ in range(sample_size)]

    # Units drank per week
    alcohol = [randint(0, 30) for _ in range(sample_size)]

    # Average daily intake of calories
    calories = [randint(1000, 3500) for _ in range(sample_size)]

    # asthmatic
    asthma = [randint(0, 1) for _ in range(sample_size)]

    # Ever used drugs
    drug_user = [randint(0, 1) for _ in range(sample_size)]

    # 5 random snp dosages
    dosages = [[randint(0, 2) for _ in range(sample_size)] for _ in range(random_snps)]

    out_rows = [iid, bmi, pob, yob, gender, smoke, alcohol, calories, asthma, drug_user] + dosages
    headers = ["IID", "BMI", "PoB", "YoB", "Gender", "Smoke", "Alcohol", "Calories", "Asthma", "Drug_user"] + \
              [f"rs{i}{i+1}{i+2}" for i in range(random_snps)]
    write_csv(Path(__file__).parent, "ExampleData", headers, flip_list(out_rows))
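
# flip_list and z_scores are project helpers that are not shown in these examples. The sketches below are
# consistent with how they are used above (flip_list transposes column lists into per-individual rows for
# write_csv; z_scores standardises a list of values), but their exact signatures are assumptions.
def flip_list_sketch(list_of_lists):
    """Transpose a list of equal-length lists, e.g. column-wise data into row-wise data."""
    return [list(row) for row in zip(*list_of_lists)]

def z_scores_sketch(values):
    """Standardise values to mean zero and unit (population) standard deviation."""
    mean = sum(values) / len(values)
    std = (sum((v - mean) ** 2 for v in values) / len(values)) ** 0.5
    return [(v - mean) / std for v in values]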
Example #5
    def link_places_across_time(self,
                                lowest_level,
                                other_shapefile_levels,
                                record_indexes,
                                base_gid=0):
        """
        This will link two geo-levels together. Files must have a numeric component, and each sub-unit file must be
        matched with a match-unit file that shares the same numeric component

        Example
        --------
        District and county shapefiles must have the dates within their names, and there must be a matching county
        shapefile for each district shapefile. So if you have a 1931 districts file you must also have a 1931
        counties file. The actual names don't matter as long as the dates match. Whilst the default column indexes
        (gid, name and type for districts, and name for counties) may work, you should check them against the column
        indexes in your own download.

        :param lowest_level: A directory of shapefiles that are of the lowest level in terms of geo-levels
        :type lowest_level: str

        :param other_shapefile_levels: A list of lists, where each sub list is a list of shapefiles at a geo-level
        :type other_shapefile_levels: list[str]

        :param record_indexes: Indexes to slice names out of each level, base must be the first
        :type record_indexes: list[list[int]]

        :param base_gid: The gid index in the base shapefile, defaults to zero
        :type base_gid: int

        :return: Nothing, writes the relations file, and the ambiguity file if one exists, then stops
        :rtype: None
        """

        # Load the shapefiles and check that we have sufficient names
        base_shapefiles, other_shapefiles = self._setup_shapefiles(
            lowest_level, other_shapefile_levels)
        assert len(other_shapefiles) == len(
            self._level_names
        ), "Not all other shapefile levels have a name provided"

        # Get the name indexes from the list of record_indexes
        base_indexes = record_indexes[0]
        other_level_indexes = record_indexes[1:]

        ambiguous = []
        for base_file in base_shapefiles:
            print(f"\nProcessing {base_file}")

            # Determine the current year for this base unit
            year = re.sub(r"[\D]", "", base_file.file_name)

            # Determine the relations within this base file and set the headers of the output file
            relation_list, headers = self._determine_relations_to_base(
                ambiguous, base_file, base_gid, base_indexes,
                other_level_indexes, other_shapefiles, year)

            # Extract the base names from the first set of relations
            base_shape_names = [relation[:2] for relation in relation_list[0]]

            # Extract the relation names from all relations, then flip them so they are combined
            relation_names = [[relation[2:] for relation in relation_level]
                              for relation_level in relation_list]
            relation_names = flip_list(relation_names)

            # Join the base names and relations together, then write them out
            relation_data = [
                base + flatten(relation)
                for base, relation in zip(base_shape_names, relation_names)
            ]
            write_csv(self._working_dir, f"{year}_relation",
                      ["GID", self._base_name] + headers, relation_data)

        if len(ambiguous) > 0:
            write_csv(self._working_dir, "Ambiguous_Relations", [], ambiguous)
            print(
                f"Please validate the {len(ambiguous)} ambiguous relations before proceeding by creating a file "
                "called 'SetAmbiguous.csv' where there is now only a single relation for each ambiguous link"
            )
        else:
            print(
                "No problems detected, please move to _write_linked_unique next but set ambiguity to False"
            )
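
# The docstring above requires every base (district) file's date to have a matching file at each other
# level. A small standalone sketch of that check, using the same digit-stripping approach as the method;
# the helper name and the example file names are illustrative and not part of the class above.
import re

def check_years_match(base_file_names, other_level_file_names):
    """Return the base-file years that are missing from any of the other levels."""
    base_years = {re.sub(r"\D", "", name) for name in base_file_names}
    missing = set()
    for level in other_level_file_names:
        level_years = {re.sub(r"\D", "", name) for name in level}
        missing |= base_years - level_years
    return sorted(missing)

# e.g. check_years_match(["District_1931.shp", "District_1951.shp"], [["County_1931.shp"]]) -> ["1951"]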