Example #1
    def places_into_dates(self, cleaned_data, write_directory, file_gid=0):
        """
        Sometimes you may have data that is not missing in dates, but just wasn't recorded. This places every place in
        into ever date.
        """

        # Format the reference into lower case
        formatted = [[r.lower() for r in row]
                     for row in self._reference.row_data]

        for file in directory_iterator(cleaned_data):
            # Load the file as a csv object
            loaded_file = CsvObject(Path(cleaned_data, file))

            # Isolate the GID: Row relation from the file
            gid = {row[file_gid]: row for row in loaded_file.row_data}

            # If the place exists in our file, use the file row, otherwise set the values to zeros
            all_places = []
            for row in formatted:
                if row[file_gid] in gid:
                    all_places.append(gid[row[file_gid]])
                else:
                    all_places.append([row[i]
                                       for i in self.isolates] + [0, 0, 0, 0])

            write_csv(write_directory,
                      Path(cleaned_data, file).stem, loaded_file.headers,
                      all_places)
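
CsvObject, directory_iterator and write_csv are project helpers not shown here; a minimal standalone sketch of the padding rule, with hypothetical reference rows and a hypothetical value-column count, could look like this:

# A minimal sketch of padding unrecorded places with zero rows; the reference rows,
# the recorded row, and value_columns are all hypothetical.
reference = [["gid1", "placea"], ["gid2", "placeb"], ["gid3", "placec"]]
recorded = {"gid1": ["gid1", "placea", 5, 2, 1, 0]}  # only one place recorded on this date

value_columns = 4
padded = [recorded.get(gid, [gid, name] + [0] * value_columns) for gid, name in reference]
print(padded)  # unrecorded places appear with zeroed values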
Example #2
    def pgs_infinitesimal(self):
        """
        This will weight the beta values conditional on LD using infinitesimal shrinkage
        """
        # Check to see if the ld radius is sufficient (sufficiency inferred from LD-pred)
        t0 = time.time()
        if self.genomic_data[self.genome][self.avg_ld] > self.ld_radius / 10.0:
            raise ValueError("WARNING LD Radius appears small compare to the genome wide average LD estimate\n"
                             "Increase the LD radius or use less snps")

        # Reload the filtered snps, the LD information for this chromosome, and the genome-wide data into memory
        sm_dict = self.sm_dict_from_csv(self.filter_directory, f"Filtered_{self.target_chromosome}.csv")
        ld_data = load_pickle(self.ld_directory, f"LD{self.target_chromosome}")
        gen_data = self.genomic_data[self.target_chromosome]

        # Isolate the information we need to construct the infinitesimal weights for this chromosome
        sid_count, iid_count, herit = gen_data[self.sid_count], gen_data[self.iid_count], gen_data[self.herit]
        normalised_snps, snp_stds = ld_data[self.norm_snps], ld_data[self.raw_stds]

        # Update the betas via infinitesimal shrinkage, weight by standard errors
        updated_betas = self._infinitesimal_betas(herit, iid_count, normalised_snps, sid_count, sm_dict)
        infinitesimal = updated_betas / snp_stds.flatten()

        # Write the snp name - constructed betas to a csv
        write_out = flip_list([self.snp_names(sm_dict), infinitesimal])
        write_csv(self.inf_directory, f"Inf_{self.target_chromosome}", [self.snp_id, self.inf_beta], write_out)
        print(f"Constructed infinitesimal weights in {time.time() - t0} at {terminal_time()}")
Example #3
    def construct_lookup(self, write_directory, write_name):
        """
        This will construct a geo-relation csv from a base shapefile relative to a list of other shapefiles based on
        intersection of geometry. For this to work your base shapefile must be the lowest geographic level, otherwise
        you will end up with a large amount of ambiguity.

        :param write_directory: Where to save this csv
        :param write_name: What to call this csv
        :return: Nothing, write file then stop
        :rtype: None
        """
        for i, (place,
                record) in enumerate(zip(self.base.polygons,
                                         self.base.records)):
            if i % 100 == 0:
                print(f"{i}/{len(self.base.records)}")

            # Set the place records via the first index as well as the area for the lowest order shape
            name_base = self._index_record(record, self.base_index, place)

            # Then do the same for all the other shapes that intersect with this shape
            match_names = [
                self._find_matches(place, match_shape,
                                   indexes) for match_shape, indexes in zip(
                                       self.others, self.other_indexes)
            ]

            self._place_data.append(flatten([name_base] + match_names))

        write_csv(write_directory, write_name, self.headers, self._place_data)
        print(f"Constructed GeoRelations {terminal_time()}")
Example #4
    def standardise_names(self, data_directory, write_directory):
        """
        Standardise each place name to a single name if it has multiple

        When working with time series data, places may change their names over time, which can lead to merge errors or
        difficulty in linking data. This will standardise all names to a single entry, ensuring that, regardless of the
        actual name of the place in a given year, all data from that place is grouped under a single entry.

        :param data_directory: Directory containing csv files named in a yyyymmdd format
        :type data_directory: Path | str

        :param write_directory: Output directory
        :type write_directory: Path | str

        :return: Nothing, write out the data for each file found in the data_directory and then stop
        :rtype: None
        """
        for file in directory_iterator(data_directory):
            print(file)

            # Load the data into memory.
            data = CsvObject(Path(data_directory, file), set_columns=True)

            # Standardise the name via the matcher
            rows = []
            for i, name in enumerate(data.column_data[self._name_index], 0):
                reformatted = self._convert_names(name, i, data)
                if reformatted:
                    rows.append(reformatted)

            # Set the headers of the output file then write the file of the same name to the write_directory
            headers = self._reference_types + data.headers[1:]
            write_csv(write_directory, data.file_path.stem, headers, rows)
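
The _convert_names matcher is not shown; the standardisation idea reduces to mapping every alternate spelling to one canonical entry, sketched here with a hypothetical matcher and hypothetical rows:

# Every alternate spelling maps to a single canonical name (hypothetical matcher)
matcher = {"kingston upon hull": "Hull", "hull": "Hull", "leeds": "Leeds"}

raw_rows = [["Kingston upon Hull", 120], ["Hull", 95], ["Leeds", 300]]
standardised = [[matcher[name.lower()]] + rest for name, *rest in raw_rows]
print(standardised)  # [['Hull', 120], ['Hull', 95], ['Leeds', 300]]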
Example #5
    def write_linked_unique(self,
                            ambiguity=True,
                            ambiguity_file_name="SetAmbiguous.csv"):
        """
        Construct a base lookup-file to append alternative names to as well as lists of unique name files

        :param ambiguity: If there is ambiguity in the file system
        :type ambiguity: bool

        :param ambiguity_file_name: The name of the fix file, defaults to SetAmbiguous.csv
        :type ambiguity_file_name: str

        :return: Nothing, construct files then stop
        :rtype: None
        """

        # Load the files for each shapefile that were written by link_districts_counties, as well as the user ambiguity
        # file named ambiguity_file_name
        ambiguity_file = self._ambiguity_setter(ambiguity, ambiguity_file_name)
        relation_files = [
            CsvObject(f"{self._working_dir}/{file}")
            for file in directory_iterator(self._working_dir)
            if "_relation" in file
        ]

        # Construct a list of all the names without any ambiguity
        name_list = [
            self._fix_row_ambiguity(row, ambiguity_file,
                                    re.sub(r"[\D]", "", file.file_name))
            for file in relation_files for row in file.row_data
        ]

        # Write out the reference base
        unique_relations = [
            list(relation) for relation in list({tuple(i)
                                                 for i in name_list})
        ]

        if not Path(self._working_dir, "LookupBase.csv").exists():
            write_csv(self._working_dir, "LookupBase", ["GID"] + self._headers,
                      unique_relations)
        else:
            print("Lookup already written, passing")

        # For each level, write out a list of unique names
        for index, level in enumerate(self._headers, 1):

            # Isolate the unique places for a given level
            unique_places = list(
                set([level_relation[index] for level_relation in name_list]))

            # Write it out if it doesn't already exist
            if not Path(self._working_dir, f"Unique_{level}.csv").exists():
                write_csv(self._working_dir, f"Unique_{level}", [level],
                          unique_places)
            else:
                print(f"Unique_{level} Already exists, skipping")
Example #6
    def solve_ambiguity(self, standardised_directory, write_directory):
        """
        Remove perfect duplicates and combine non-perfect duplicates so that all GIDs are unique.

        Some places may end up duplicated, either in the raw data or after standardisation. This method removes perfect
        duplicates and combines non-perfect duplicates into a single entry. If this behaviour is not desirable, note
        that the system prints out each non-perfect duplicate merge it performs, so you may wish to alter your original
        data set or change your place reference to avoid it.

        :param standardised_directory: The data directory of the output from standardise_names
        :type standardised_directory: str | Path

        :param write_directory: The output directory
        :type write_directory: str | Path
        """
        for file in directory_iterator(standardised_directory):
            print(file)

            # Load the original file and look for duplicate GIDs, which should be unique
            data = CsvObject(Path(standardised_directory, file),
                             set_columns=True)
            duplicate_list = find_duplicates(data.column_data[0])

            # Isolate any row that does not suffer from duplication as the base of the write return
            reset_row = [
                row for row in data.row_data if row[0] not in duplicate_list
            ]

            for dup in duplicate_list:
                # Isolate the row names
                row_names = data.row_data[data.column_data[0].index(
                    dup)][:len(self._reference_types)]

                # Isolate the values for each duplicate name
                sub_list = [[
                    parse_as_numeric(rr, float)
                    for rr in r[len(self._reference_types):]
                ] for r in data.row_data if dup == r[0]]

                # Isolate unique lists, to remove duplicates
                unique_sub_lists = [
                    list(x) for x in set(tuple(x) for x in sub_list)
                ]

                # Warn the user that some values have been combined.
                if len(unique_sub_lists) > 1:
                    print(
                        f"Found and combined multiple entries that where not perfect duplicates for {row_names}"
                    )

                # Add the combined values or singular entry of duplicate values to the reset list
                reset_row.append(row_names +
                                 [sum(i) for i in zip(*unique_sub_lists)])

            write_csv(write_directory, data.file_path.stem, data.headers,
                      reset_row)
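
Stripped of the project helpers, the duplicate rule reduces to collapsing perfect duplicates and summing the value columns of non-perfect ones; a small standalone sketch with hypothetical rows:

# Perfect duplicates collapse to one row, non-perfect duplicates are summed column-wise
rows = [["gid1", "placea", 3, 1],
        ["gid1", "placea", 3, 1],   # perfect duplicate: collapses away
        ["gid2", "placeb", 2, 0],
        ["gid2", "placeb", 5, 1]]   # non-perfect duplicate: summed with the row above

name_cols = 2
combined = {}
for row in rows:
    names, values = tuple(row[:name_cols]), tuple(row[name_cols:])
    combined.setdefault(names, set()).add(values)

reset = [list(names) + [sum(col) for col in zip(*values)] for names, values in combined.items()]
print(reset)  # [['gid1', 'placea', 3, 1], ['gid2', 'placeb', 7, 1]]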
Example #7
    def compile_and_write(self, merger="Maximum"):
        """
        This will compile all the data you have collected into a single dataset based on a merge type

        :param merger: This takes the value of Maximum or Constant to determine the merge type. If Maximum, then an
            individual is assigned a 1 if a 1 is present in any of the datasets. If Constant, then individuals must be
            assigned a 1 in every dataset to be assigned a 1
        :type merger: str

        :return: Nothing, write out the output to a file then stop
        :rtype: None
        """

        assert merger in ("Maximum", "Constant"), f"Merger takes the values of Maximum or Constant yet was given " \
                                                  f"{merger}"

        # Isolate the unique ids
        unique_ids = {i: [] for i in sorted(list(set(self.ids_list)))}

        # For each dataset that has been appended
        for data_set in self.data_set_values:
            # Iterate the rows
            for row in data_set:
                # Add the row values to the id
                unique_ids[row[self._id_column]].append([r for i, r in enumerate(row) if i != self._id_column])

        # For id - value set
        output_values = []
        for ids, value_list in unique_ids.items():
            id_row = [ids]

            # If we isolated an entry in every dataset
            if len(value_list) == len(self.data_set_values):

                # Then isolate the row values based on one of the rules
                for row in flip_list(value_list):
                    # If the merger type is Maximum, then the individual is assigned a 1 if a 1 is present in any of
                    # the datasets
                    if merger == "Maximum":
                        id_row.append(max(row))

                    # If constant, then individuals must be assigned a 1 in every dataset to be assigned a 1
                    elif merger == "Constant":
                        if sum(row) == len(row):
                            id_row.append(1)
                        else:
                            id_row.append(0)

                    else:
                        raise TypeError("Critical error: Reached Non-set merger value")

            output_values.append(id_row)

        write_csv(self._write_directory, self._write_name, ["ID"] + list(self._icd_9_lookup.keys()), output_values)
        print(f"Constructed {self._write_name} at {terminal_time()}")
Example #8
    def reformat_raw_names(self,
                           raw_csv,
                           raw_name_i,
                           date_i,
                           data_start,
                           out_directory,
                           date_type="yyyymmdd",
                           date_delimiter="/"):
        """
        This will attempt to reformat names that are in a different style to the required weightGIS format

        :param raw_csv: The path of the csv of data you want to standardise
        :type raw_csv: Path | str

        :param raw_name_i: The place name index in the raw file
        :type raw_name_i: int

        :param date_i: The date index in the raw file
        :type date_i: int

        :param data_start: The column index from which the data starts
        :type data_start: int

        :param out_directory: Where you want this file to be written to
        :type out_directory: str | Path

        :param date_type: The type of date, takes the values of yyyy, yyyymmdd, or ddmmyyyy.
        :type date_type: str

        :param date_delimiter: Delimiter used when dates are in a delimited format such as dd/mm/yyyy
        :type date_delimiter: str

        :return: Nothing, write a csv of place rows for each unique date then stop
        :rtype: None
        """

        raw_csv = CsvObject(raw_csv, set_columns=True)
        headers = ["Place"] + raw_csv.headers[data_start:]

        place_dict = self._create_place_dict(raw_csv, raw_name_i)

        unique_dates = self._set_name_dates(date_delimiter, date_i, date_type,
                                            raw_csv)

        for row_date, date in unique_dates.items():
            place_rows = []
            for row in raw_csv.row_data:
                if row[date_i] == row_date:
                    place_rows.append(
                        [place_dict[self._simplify_string(row[raw_name_i])]] +
                        row[data_start:])

            write_csv(out_directory, date, headers, place_rows)
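
_set_name_dates is not shown; a rough sketch of normalising the documented date styles (yyyy, yyyymmdd, or delimited ddmmyyyy) to a yyyymmdd output name, under assumed conventions:

# Assumed normalisation of the documented date styles to yyyymmdd (the project's
# _set_name_dates may differ, e.g. in how year-only data is padded)
def normalise_date(raw, date_type="yyyymmdd", delimiter="/"):
    if date_type == "yyyy":
        return f"{raw}0101"                      # assume 1st of January for year-only data
    if date_type == "yyyymmdd":
        return raw.replace(delimiter, "")
    if date_type == "ddmmyyyy":
        day, month, year = raw.split(delimiter)
        return f"{year}{month}{day}"
    raise ValueError(f"Unknown date_type {date_type}")

print(normalise_date("01/02/1931", "ddmmyyyy"))  # 19310201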
Example #9
def set_snp_ids(memory_location, snps_to_id, gen_path, write_dir, file_name):
    """
    Isolate a subset of snps, either from a csv of pre-defined snp names (when snps_to_id is a path) or as a random
    set of snps of a given size (when snps_to_id is an int).

    :param memory_location: Location of bgen memory file
    :type memory_location: Path | str

    :param snps_to_id: Location of snps csv to id
    :type snps_to_id: Path | str

    :param gen_path: The path to the genetic file
    :type gen_path: Path | str

    :param write_dir: The directory to write the snp index csv file to
    :type write_dir: Path | str

    :param file_name: The name of the snp index file
    :type file_name: str

    :return: Nothing, write the ids to a csv then stop
    :rtype: None

    :raise TypeError: If a str / int is not passed
    """

    # Set the custom write location for the bgen file memory files, then load the genetic reference
    custom_meta_path(validate_path(memory_location))
    gen = Bgen(str(validate_path(gen_path).absolute()))

    # Construct a lookup dict for variant_id-rsid
    v_dict = {snp[1]: snp[0] for snp in [snp.split(",") for snp in gen.sid]}

    # Load the list of snps to validate
    snps_list = CsvObject(validate_path(snps_to_id), set_columns=True)[0]

    # Get the index of each snp that is present
    snp_indexes = []
    for snp in snps_list:
        try:
            snp_indexes.append(
                gen.sid_to_index([f"{v_dict[snp]},{snp}"]).tolist())
        except KeyError:
            pass

    # Write the snp indexes out
    write_csv(write_dir, f"{file_name}", ["Snp"], snp_indexes)
    print(
        f"Constructed snp id list of length {len(snp_indexes)} for {gen_path} at {terminal_time()}"
    )
Example #10
    def write_summary_files(self, sm_dict, write, chromosome, summary_type,
                            directory):
        """Write out the information from sm_dict into a csv"""
        if write:
            rows_out = []
            for v, log_odds, beta, freq in zip(sm_dict[self.sm_variants],
                                               sm_dict[self.log_odds],
                                               sm_dict[self.beta],
                                               sm_dict[self.freq]):
                rows_out.append(v.items() + [log_odds, beta, freq])

            write_csv(directory, f"{summary_type}_{chromosome}",
                      self.summary_headers, rows_out)
        return sm_dict
Example #11
    def construct_reference(self,
                            base_weights_name="LookupBase.csv",
                            alternative_key="Unique"):
        """
        Construct a reference of every name for every place at every level within the Lookup Base

        :param base_weights_name: Name of the base weights file
        :type base_weights_name: str

        :param alternative_key: Key within files that contains alternative names
        :type alternative_key: str

        :return: Nothing, write out place reference csv then stop
        :rtype: None
        """

        # Load the lookup base
        base_relation = CsvObject(Path(self._working_dir, base_weights_name))

        # Load alternative files
        alt_files = [
            CsvObject(Path(self._working_dir, file), set_columns=True)
            for file in directory_iterator(self._working_dir)
            if alternative_key in file
        ]

        # Order them in the same manner as the headers
        order = [
            index for header in self._headers
            for index, file in enumerate(alt_files) if header in file.file_name
        ]
        alt_files = np.array(alt_files)[order].tolist()

        # Link each row to a unique list to create the reference place look up file
        rows = [
            flatten([[row[0]]] + [
                self._match_row(match, match_file)
                for match, match_file in zip(row[1:], alt_files)
            ]) for row in base_relation.row_data
        ]

        write_csv(self._working_dir, "PlaceReference",
                  ["GID"] + flatten([file.headers for file in alt_files]),
                  rows)
Example #12
    def remove_duplicates(raw_csv, write_directory):
        """
        Sometimes we may have known duplicates in a file which are not linked to ambiguity; in this case we can simply
        purge the duplicates and re-write the file

        :param raw_csv: The csv with potential duplicates within them
        :type raw_csv: str | Path

        :param write_directory: The output directory for the file
        :type write_directory: Path | str

        :return: Nothing, write file then stop
        :rtype: None
        """

        csv_obj = CsvObject(validate_path(raw_csv))
        unique_values = [
            list(v) for v in list(
                Counter([tuple(r) for r in csv_obj.row_data]).keys())
        ]
        write_csv(write_directory, csv_obj.file_path.stem, csv_obj.headers,
                  unique_values)
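
Since Python 3.7, Counter (a dict subclass) preserves first-seen key order, so the dedup above keeps the original row order; dict.fromkeys is an equivalent idiom:

from collections import Counter

rows = [["a", 1], ["b", 2], ["a", 1]]  # hypothetical rows with one perfect duplicate

via_counter = [list(r) for r in Counter(tuple(r) for r in rows)]
via_dict = [list(r) for r in dict.fromkeys(tuple(r) for r in rows)]
print(via_counter == via_dict)  # True: both keep ['a', 1], ['b', 2] in first-seen order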
Example #13
    def aggregate_scores(self):
        """
        This will combine the scores found by chromosome into a single file
        """

        combined_array = []
        for index, file in enumerate(directory_iterator(
                self.scores_directory)):
            score_file = CsvObject(Path(self.scores_directory, file),
                                   set_columns=True)

            # If it's the first file, we want to extract the iid and fid values as well
            if index == 0:
                fid = np.array(score_file[self.fid])
                iid = np.array(score_file[self.iid])
                score = np.array(score_file["Scores"]).astype(float)
                fid.shape, iid.shape, score.shape = (len(fid), 1), (len(iid), 1), (len(score), 1)
                combined_array = [fid, iid, score]

            # Else extract the scores and append it to the array
            else:
                score = np.array(score_file["Scores"]).astype(float)
                score.shape = (len(score), 1)
                combined_array.append(score)

        # Combine the (IID_Count, 1) * (chromosome_count + 2) arrays into the FID/IID columns and a summed score column
        iid_array = np.hstack(combined_array[:2])
        score_array = np.sum(np.hstack(combined_array[2:]), axis=1)
        score_array.shape = (len(score_array), 1)

        # Write the scores to the working directory
        write_rows = np.hstack([iid_array, score_array]).tolist()
        write_csv(Path(self.working_dir, "PGS"), "PolyGenicScores",
                  ["FID", "IID", "Scores"], write_rows)
Example #14
    def pgs_scores(self):
        """
        This will construct the pgs for a given weight beta type, such as infinitesimal, within this chromosome
        """

        # Load the reference to the full sample of IDs, and use it to extract genetic phenotype information
        core = self.gen_reference(self.select_file_on_chromosome())
        ph_dict = self.genetic_phenotypes(core)

        # Load the betas based on the weighted beta type specified by the user
        score_file = f"{self.score_type.split('_')[0]}_{self.target_chromosome}.csv"
        weights = CsvObject(Path(self.working_dir, "PGS", self.score_type,
                                 score_file), [str, float],
                            set_columns=True)

        # Chunk the data into memory chunks to be processed
        chunked_snps, chunks = self.chunked_snp_names(weights[self.snp_id],
                                                      True)
        chunked_weights = np.array_split(weights[self.inf_beta], chunks)

        # Weight the dosage data to construct the scores
        scores = self._weight_dosage(chunked_snps, chunked_weights, core,
                                     ph_dict)

        # Combine the FID/IID, genetic phenotype information, and the score for this chromosome
        scores.shape = (len(scores), 1)
        iid_fid = np.array([[v[i] for v in ph_dict.values()]
                            for i in range(core.iid_count)])
        write_out = np.hstack([iid_fid, scores]).tolist()

        # Write this information to a csv
        headers = list(ph_dict.keys()) + ["Scores"]
        write_csv(self.scores_directory, f"Scores_{self.target_chromosome}",
                  headers, write_out)
        print(
            f"Finished Constructing scores for Chromosome {self.target_chromosome} {terminal_time()}"
        )
Example #15
def main_call(out_dir, write_dir, headers):

    output = []
    for file in directory_iterator(out_dir):
        if ".log" not in file:
            csv_file = CsvObject(Path(out_dir, file))

            # Isolate the model values from the aggregated [snp] + [model 1, ... model N]
            for row in csv_file.row_data:
                snp, models = row[0], chunk_list(row[1:], len(headers))
                output.append([snp, models])

    print(f"For {len(output)} Snps")
    model_count = len(output[0][1])

    model_comp = []
    for i in range(model_count):
        print(f"For model {i+1}")

        # Write out the aggregated chromosome model data to a directory
        model_out = []
        for snp, model in output:
            model_out.append([snp] + model[i])
        write_csv(write_dir, f"Model{i + 1}", ["Snp"] + headers, model_out)

        # Append the comparison to a master list of models
        model_comp.append([f"Model {i+1}"] + [
            str(np.mean([float(values[vi]) for values in model_out]))
            for vi in range(1, 3)
        ])

    # Write the model comp out
    write_csv(
        write_dir, "Model Comparision",
        ["Model", "Mean Coefficent", "Mean Standard Error", "Mean P Values"],
        model_comp)
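
chunk_list is a project helper; a rough equivalent of the reshaping it performs on each aggregated row (a snp followed by N models' worth of values) is sketched below with hypothetical data:

# Split a flat row of [snp, model1 values..., model2 values...] into per-model chunks
headers = ["Coefficient", "Standard Error", "P Value"]
row = ["rs012", 0.5, 0.1, 0.01, 0.4, 0.2, 0.05]

snp, values = row[0], row[1:]
models = [values[i:i + len(headers)] for i in range(0, len(values), len(headers))]
print(snp, models)  # rs012 [[0.5, 0.1, 0.01], [0.4, 0.2, 0.05]]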
Example #16
        'rs012~BMI+Gender|PoB|PoB+YoB', 'rs012~BMI+Gender|PoB+YoB|PoB+YoB'
    ]

    row_names = [
        "Cov", "2Cov", "7Cov", "2Cov+FE", "2Cov+FE+Cl", "2Cov+2FE", "2Cov+3FE",
        "2Cov+FE+2CL", "2Cov+2FE+2CL"
    ]

    runs = 100
    run_rows = []
    for name, formula in zip(row_names, formula_list):
        print(formula)

        phenotype, covariant, fixed_effects, clusters = formula_transform(
            formula)
        demeaned = demean(covariant + phenotype, df, fixed_effects, len(df))
        rank = cal_df(df, fixed_effects)

        time_new = []
        for i in range(runs):
            start = time.time()
            HDFE(demeaned, formula).reg_hdfe(rank, False)
            time_new.append(time.time() - start)
        print(np.average(time_new))
        print("")

        run_rows.append([name, np.average(time_new)])

    write_csv(
        Path(__file__).parent, "TimingUpdate", ["Model Name", "New"], run_rows)
Example #17
    def combine(self, unique_id, data_start, root_directory, write_directory,
                write_name):
        """
        weightGIS expects each file to have a single date, so if you have lots of files of the same date that you want
        to process at the same time you will need to combine them

        :param unique_id: The unique id index
        :type unique_id: int

        :param data_start: The index from which the data starts
        :type data_start: int

        :param root_directory: The root directory of the csv files
        :type root_directory: Path | str

        :param write_directory: The output directory for the file
        :type write_directory: Path | str

        :param write_name: Name of the combined file
        :type write_name: str

        :return: Nothing, write file then stop
        :rtype: None
        """

        # Create the unique ID's
        unique_id_list = sorted(
            list(
                set(
                    flatten([
                        CsvObject(Path(root_directory, file),
                                  set_columns=True)[unique_id]
                        for file in directory_iterator(root_directory)
                    ]))))

        # For each unique ID
        out_list = []
        for count_i, ids in enumerate(unique_id_list):

            if count_i % 10 == 0:
                print(f"{count_i} / {len(unique_id_list)}")

            # Check each file for a matching row, and then combine the values for this ID
            ids_list = []
            for index, file in enumerate(directory_iterator(root_directory)):

                # If its the first index, take the full values
                if index == 0:
                    ids_list += self._isolate(root_directory, file, unique_id,
                                              ids)

                # Otherwise only take the values after the data start
                else:
                    ids_list += self._isolate(root_directory, file, unique_id,
                                              ids)[data_start:]

            out_list.append(ids_list)

        headers = []
        for index, file in enumerate(directory_iterator(root_directory)):
            if index == 0:
                headers += CsvObject(Path(root_directory, file)).headers
            else:
                headers += CsvObject(Path(root_directory,
                                          file)).headers[data_start:]

        write_csv(write_directory, write_name, headers, out_list)
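
Reduced to its core, the combine rule keeps the full row from the first file and only the columns from data_start onwards from later files; a minimal sketch with hypothetical rows:

# First file contributes its full row, later files only the columns from data_start onwards
data_start = 2
files = [{"gid1": ["gid1", "placea", 10, 20]},   # first file: full row kept
         {"gid1": ["gid1", "placea", 5]}]        # later file: only data columns kept

combined = []
for ids in sorted(set(key for file_rows in files for key in file_rows)):
    row = []
    for index, file_rows in enumerate(files):
        values = file_rows.get(ids, [])
        row += values if index == 0 else values[data_start:]
    combined.append(row)
print(combined)  # [['gid1', 'placea', 10, 20, 5]]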
Example #18
    def link_places_across_time(self,
                                lowest_level,
                                other_shapefile_levels,
                                record_indexes,
                                base_gid=0):
        """
        This will link two geo-levels together. Files must have a numeric component, and each sub_unit must be matched
        with a match-unit file with the same numeric component

        Example
        --------
        District and county shapefiles must have the dates within the names, and there must be a matching shapefile for
        each district. So if you have 1931 districts you must have a 1931 county. The actual names don't matter as
        long as the dates match. Whilst the default column indexes for gid, name and type for districts and name for
        county may work, you should check them against the column indexes you have in your download.

        :param lowest_level: A directory of shapefiles that are of the lowest level in terms of geo-levels
        :type lowest_level: str

        :param other_shapefile_levels: A list of lists, where each sub list is a list of shapefiles at a geo-level
        :type other_shapefile_levels: list[str]

        :param record_indexes: Indexes to slice names out of each level, base must be the first
        :type record_indexes: list[list[int]]

        :param base_gid: The gid index in the base shapefile, defaults to zero
        :type base_gid: int

        :return: Nothing, write the relations, and the ambiguity file if it exists, to file then stop
        :rtype: None
        """

        # Load the shapefiles, determine we have sufficient names
        base_shapefiles, other_shapefiles = self._setup_shapefiles(
            lowest_level, other_shapefile_levels)
        assert len(other_shapefiles) == len(self._level_names), \
            "Not all other shapefile levels have a name provided"

        # Get the name indexes from the list of record_indexes
        base_indexes = record_indexes[0]
        other_level_indexes = record_indexes[1:]

        ambiguous = []
        for base_file in base_shapefiles:
            print(f"\nProcessing {base_file}")

            # Determine the current year for this base unit
            year = re.sub(r"[\D]", "", base_file.file_name)

            # Determine the relations within this base file and set the headers of the output file
            relation_list, headers = self._determine_relations_to_base(
                ambiguous, base_file, base_gid, base_indexes,
                other_level_indexes, other_shapefiles, year)

            # Extract the base names from the first set of relations
            base_shape_names = [relation[:2] for relation in relation_list[0]]

            # Extract the relation names from all relations, then flip them so they are combined
            relation_names = [[relation[2:] for relation in relation_level]
                              for relation_level in relation_list]
            relation_names = flip_list(relation_names)

            # Join the base names and relations two together then write it out
            relation_data = [
                base + flatten(relation)
                for base, relation in zip(base_shape_names, relation_names)
            ]
            write_csv(self._working_dir, f"{year}_relation",
                      ["GID", self._base_name] + headers, relation_data)

        if len(ambiguous) > 0:
            write_csv(self._working_dir, "Ambiguous_Relations", [], ambiguous)
            print(
                f"Please validate the {len(ambiguous)} ambiguous relations before proceeding by creating a file"
                f"called 'SetAmbiguous.csv' where there is now only a single relation for each ambiguous link"
            )
        else:
            print(
                "No problems detected, please move to _write_linked_unique next but set ambiguity to False"
            )
Example #19
    pob = [randint(0, 5000) for _ in range(sample_size)]

    # Age
    yob = [randint(65, 90) for _ in range(sample_size)]

    # Gender
    gender = [randint(0, 1) for _ in range(sample_size)]

    # Ever Smoked
    smoke = [randint(0, 1) for _ in range(sample_size)]

    # Units drank per week
    alcohol = [randint(0, 30) for _ in range(sample_size)]

    # Average daily intake of calories
    calories = [randint(1000, 3500) for _ in range(sample_size)]

    # asthmatic
    asthma = [randint(0, 1) for _ in range(sample_size)]

    # Ever used drugs
    drug_user = [randint(0, 1) for _ in range(sample_size)]

    # 5 random snp dosages
    dosages = [[randint(0, 2) for i in range(sample_size)] for _ in range(random_snps)]

    out_rows = [iid, bmi, pob, yob, gender, smoke, alcohol, calories, asthma, drug_user] + dosages
    headers = ["IID", "BMI", "PoB", "YoB", "Gender", "Smoke", "Alcohol", "Calories", "Asthma", "Drug_user"] + \
              [f"rs{i}{i+1}{i+2}" for i in range(random_snps)]
    write_csv(Path(__file__).parent, "ExampleData", headers, flip_list(out_rows))
Example #20
if __name__ == '__main__':

    # Setup
    sample_size = 483
    pc_count = 10
    example_snp = 10
    output_directory = r"C:\Users\Samuel\PycharmProjects\SR-GWAS\Data"

    # Setup IID
    iid = [f"sample_{i}" for i in range(sample_size)]

    # Setup basic Identifiers
    gender = [randint(0, 1) for _ in range(sample_size)]
    yob = [randint(1934, 1971) for _ in range(sample_size)]

    # Phenotype of BMI
    bmi = [uniform(14.4, 30.2) for _ in range(sample_size)]
    output = [iid, bmi, gender, yob]

    # Add PC's then write as covariant file
    for i in range(pc_count):
        output.append(z_scores([randint(0, 1000) for _ in range(sample_size)]))
    headers = ["IID", "BMI", "Gender", "Age"] + [f"PC{i}" for i in range(1, 11)]
    write_csv(output_directory, "Covariant", headers, flip_list(output))

    # Add Example Snps then write as covariant + snp file
    for i in range(example_snp):
        output.append([randint(0, 2) for _ in range(sample_size)])
    headers = headers + [f"rs{i}{i+1}{i+2}" for i in range(example_snp)]
    write_csv(output_directory, "CovariantSnp", headers, flip_list(output))
Example #21
def locate_individuals(ids_path, lowest_level_shapefile_path, geo_lookup,
                       east_i, north_i, shape_match_index, write_directory,
                       write_name):
    """
    This will assist you in locating individuals with a geo lookup file

    :param ids_path: The path to a csv file filled with ids with eastings and northings
    :type ids_path: Path | str

    :param lowest_level_shapefile_path: path to the lowest level shapefile you used in your geo reference
    :type lowest_level_shapefile_path: Path | str

    :param geo_lookup: The path to the geo lookup
    :type geo_lookup: Path | str

    :param east_i: Index of the eastings in the id data
    :type east_i: int

    :param north_i: Index of the northings in the id data
    :type north_i: int

    :param shape_match_index: Index of the matching parameter, should be common in both geo reference and shapefile
    :type shape_match_index: int

    :param write_directory: The saved file will be written here
    :type write_directory: Path | str

    :param write_name: The name of the file to write
    :type write_name: str

    :return: Nothing, write the file then stop
    :rtype: None
    """

    id_file = CsvObject(ids_path)
    geo_file = CsvObject(geo_lookup)
    shape_obj = ShapeObject(lowest_level_shapefile_path)

    # Create a list of unique easting_northing coordinates to avoid unnecessary iteration
    unique_places = sorted(
        list(
            set([
                f"{respondent[east_i]}__{respondent[north_i]}"
                for respondent in id_file.row_data
            ])))

    # Create an id: all other rows lookup so we can identify each location from the lowest
    geo_lookup = {row[shape_match_index]: row for row in geo_file.row_data}

    # Link all the geometry
    geo_link = create_geo_link(unique_places, geo_file, geo_lookup, shape_obj,
                               shape_match_index)

    output_rows = []
    for respondent in id_file.row_data:
        # Isolate the rows that are not east/north
        non_location = [
            r for i, r in enumerate(respondent) if i not in (east_i, north_i)
        ]

        # Prepend this along with the geo_link birth location
        birth_location = f"{respondent[east_i]}__{respondent[north_i]}"
        output_rows.append(non_location + geo_link[birth_location])

    headers = [
        h for i, h in enumerate(id_file.headers) if i not in (east_i, north_i)
    ] + geo_file.headers
    write_csv(write_directory, write_name, headers, output_rows)
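
create_geo_link and ShapeObject are not shown; if the lowest-level shapes were shapely geometries, matching an easting/northing pair to its containing shape could be sketched as follows (names and geometry are hypothetical):

from shapely.geometry import Point, Polygon

# Hypothetical lowest-level shapes keyed by the field shared with the geo lookup
shapes = {"district_a": Polygon([(0, 0), (100, 0), (100, 100), (0, 100)]),
          "district_b": Polygon([(100, 0), (200, 0), (200, 100), (100, 100)])}

easting, northing = 150, 50
matches = [name for name, shape in shapes.items() if shape.contains(Point(easting, northing))]
print(matches)  # ['district_b']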