Example 1
    def sm_dict_from_csv(self, directory, name):
        """
        Load a saved cleaned summary statistics file for a given chromosome, found at the directory and name provided,
        and use it to construct the sm_dict we pass between methods
        """
        load_file = CsvObject(Path(directory, name),
                              self.cleaned_types,
                              set_columns=True)

        chromo = load_file.column_data[self.summary_dict[self.chromosome]]
        bp_pos = load_file.column_data[self.summary_dict[self.bp_position]]
        snp_id = load_file.column_data[self.summary_dict[self.snp_id]]
        effect = load_file.column_data[self.summary_dict[self.effect_allele]]
        alt = load_file.column_data[self.summary_dict[self.alt_allele]]
        log = load_file.column_data[self.summary_dict[self.log_odds]]
        beta = load_file.column_data[self.summary_dict[self.beta]]
        freq = load_file.column_data[self.summary_dict[self.freq]]

        sm_variants = [
            Variant(ch, bp, sn, ef, al)
            for ch, bp, sn, ef, al in zip(chromo, bp_pos, snp_id, effect, alt)
        ]
        return {
            self.sm_variants: np.array(sm_variants),
            self.log_odds: np.array(log),
            self.beta: np.array(beta),
            self.freq: np.array(freq)
        }
Example 2
    def places_into_dates(self, cleaned_data, write_directory, file_gid=0):
        """
        Sometimes data is not missing for a date, it simply wasn't recorded. This places every place into every
        date.
        """

        # Format the reference into lower case
        formatted = [[r.lower() for r in row]
                     for row in self._reference.row_data]

        for file in directory_iterator(cleaned_data):
            # Load the file as a csv object
            loaded_file = CsvObject(Path(cleaned_data, file))

            # Isolate the GID: Row relation from the file
            gid = {row[file_gid]: row for row in loaded_file.row_data}

            # If the place exists in our file, use the file row, else set the values to zeros
            all_places = []
            for row in formatted:
                if row[file_gid] in gid:
                    all_places.append(gid[row[file_gid]])
                else:
                    all_places.append([row[i]
                                       for i in self.isolates] + [0, 0, 0, 0])

            write_csv(write_directory,
                      Path(cleaned_data, file).stem, loaded_file.headers,
                      all_places)
Example 3
    def _isolate(root_directory, file, unique_id, ids):
        """
        Isolate the row that matches the ID if it exists, else return an empty list

        Note
        ----
        This assumes no duplicates. If you have duplicates where the values are different you will need to clean
        them yourself; otherwise, run the remove_duplicates command before this method.

        :param root_directory: The root directory of the csv files
        :type root_directory: Path | str

        :param file: The name of the file to search
        :type file: str

        :param unique_id: The unique id index
        :type unique_id: int

        :param ids: The current match id
        :type ids: str

        :return: The row in the data that was matched if it was found, else an empty list the length of a CsvObject row
        :rtype: list
        """

        csv_obj = CsvObject(Path(root_directory, file))

        for row in csv_obj.row_data:
            if row[unique_id] == ids:
                return row
        return ["" for _ in range(csv_obj.row_length)]
Example 4
    def _position_values(self):
        csv_data = CsvObject(self.csv_path, [str, float, float, float])

        if csv_data.row_length != 4:
            msg = f"Csv file should contain phenotype, coefficient, lower bound, upper bound yet found" \
                  f" {csv_data.row_length} rows"
            raise IndexError(msg)

        # Normalise the values for the table plot with 0 added so we know where to draw the axis
        numerical_values = flatten([row[1:] for row in csv_data.row_data])
        normalised_value_list = normalisation_min_max(numerical_values +
                                                      [self.axis_target])

        # Isolate the axis and normal array, then chunk the normal array back into the coefficient, lower bound and
        # upper bound
        x_axis_point = normalised_value_list[-1]
        normal_array = chunk_list(normalised_value_list[:-1], 3)

        # Format the rows so we have actual and positional values for each numeric
        formatted_rows = []
        for row, normalised in zip(csv_data.row_data, normal_array):
            formatted_rows.append(
                flatten([[row[0]]] + [[row[i + 1], normalised[i]]
                                      for i in range(3)]))
        return formatted_rows, x_axis_point
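
The positioning above leans on two external helpers, normalisation_min_max and chunk_list. A minimal, self-contained sketch of how they are assumed to behave, based only on how they are used here (the real implementations may differ):

def normalisation_min_max(values):
    # Scale a list of numbers into the 0-1 range
    low, high = min(values), max(values)
    return [(v - low) / (high - low) for v in values]


def chunk_list(values, chunk_size):
    # Split a flat list into consecutive chunks of chunk_size
    return [values[i:i + chunk_size] for i in range(0, len(values), chunk_size)]


# Hypothetical coefficient / lower bound / upper bound triples plus the axis target of 0
numerical_values = [0.2, 0.1, 0.3, -0.4, -0.6, -0.2]
normalised = normalisation_min_max(numerical_values + [0])
x_axis_point, normal_array = normalised[-1], chunk_list(normalised[:-1], 3)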
Example 5
    def standardise_names(self, data_directory, write_directory):
        """
        Standardise each place name to a single name if it has multiple

        When working with time series data, places may change their names over time, which can lead to merge errors
        or difficulty in linking data. This will standardise all names to a single entry, ensuring that, regardless of
        the actual name of the place in a given year, all data from that place is grouped under a single entry.

        :param data_directory: Directory containing csv files named in a yyyymmdd format
        :type data_directory: Path | str

        :param write_directory: Output directory
        :type write_directory: Path | str

        :return: Nothing, write out the data for each file found in the data_directory and then stop
        :rtype: None
        """
        for file in directory_iterator(data_directory):
            print(file)

            # Load the data into memory.
            data = CsvObject(Path(data_directory, file), set_columns=True)

            # Standardise the name via the matcher
            rows = []
            for i, name in enumerate(data.column_data[self._name_index], 0):
                reformatted = self._convert_names(name, i, data)
                if reformatted:
                    rows.append(reformatted)

            # Set the headers of the output file then write the file of the same name to the write_directory
            headers = self._reference_types + data.headers[1:]
            write_csv(write_directory, data.file_path.stem, headers, rows)
Example 6
    def distribute_heritability_genome_wide(self):
        """If we can't calculate heritability, distribute it from a provided float"""
        total_snps = 0
        config_dict = {}
        for file in directory_iterator(self.summary_directory):
            print(file)
            load_file = CsvObject(Path(self.summary_directory, file),
                                  self.cleaned_types,
                                  set_columns=True)

            # Isolate the generic information
            n_snps, n_iid = self._chromosome_from_load(load_file)
            chromosome_values = {
                self.count_snp: n_snps,
                self.count_iid: n_iid,
                "Description": f"Chromosome {self.target_chromosome}"
            }
            config_dict[self.target_chromosome] = chromosome_values
            total_snps += n_snps

        print(
            f"Suggested LD_Radius based on {total_snps} / 3000 is {total_snps / 3000}"
        )

        for key, value in config_dict.items():
            config_dict[key][self.herit] = self.herit_calculated * (
                config_dict[key][self.count_snp] / total_snps)
        config_dict["Genome"] = {
            f"{self.genome}_{self.herit}": self.herit_calculated
        }
        ArgMaker().write_yaml_group_dict(config_dict, self.working_dir,
                                         "genome_wide_config")
Example 7
    def write_linked_unique(self,
                            ambiguity=True,
                            ambiguity_file_name="SetAmbiguous.csv"):
        """
        Construct a base lookup-file to append alternative names to as well as lists of unique name files

        :param ambiguity: If there is ambiguity in the file system
        :type ambiguity: bool

        :param ambiguity_file_name: The name of the fix file, defaults to SetAmbiguous.csv
        :type ambiguity_file_name: str

        :return: Nothing, construct files then stop
        :rtype: None
        """

        # Load the files for each shapefile that were written by link_districts_counties, as well as the user
        # ambiguity file named ambiguity_file_name
        ambiguity_file = self._ambiguity_setter(ambiguity, ambiguity_file_name)
        relation_files = [
            CsvObject(f"{self._working_dir}/{file}")
            for file in directory_iterator(self._working_dir)
            if "_relation" in file
        ]

        # Construct a list of all the names without any ambiguity
        name_list = [
            self._fix_row_ambiguity(row, ambiguity_file,
                                    re.sub(r"[\D]", "", file.file_name))
            for file in relation_files for row in file.row_data
        ]

        # Write out the reference base
        unique_relations = [
            list(relation) for relation in {tuple(i) for i in name_list}
        ]

        if not Path(self._working_dir, "LookupBase.csv").exists():
            write_csv(self._working_dir, "LookupBase", ["GID"] + self._headers,
                      unique_relations)
        else:
            print("Lookup already written, passing")

        # For each level, write out a list of unique names
        for index, level in enumerate(self._headers, 1):

            # Isolate the unique places for a given level
            unique_places = list(
                set([level_relation[index] for level_relation in name_list]))

            # Write it out if it doesn't already exist
            if not Path(self._working_dir, f"Unique_{level}.csv").exists():
                write_csv(self._working_dir, f"Unique_{level}", [level],
                          unique_places)
            else:
                print(f"Unique_{level} Already exists, skipping")
Example 8
    def construct_reference(self,
                            base_weights_name="LookupBase.csv",
                            alternative_key="Unique"):
        """
        Construct a reference of every name for every place at every level within the Lookup Base

        :param base_weights_name: Name of the base weights file
        :type base_weights_name: str

        :param alternative_key: Key within files that contains alternative names
        :type alternative_key: str

        :return: Nothing, write out place reference csv then stop
        :rtype: None
        """

        # Load the lookup base
        base_relation = CsvObject(Path(self._working_dir, base_weights_name))

        # Load alternative files
        alt_files = [
            CsvObject(Path(self._working_dir, file), set_columns=True)
            for file in directory_iterator(self._working_dir)
            if alternative_key in file
        ]

        # Order them in the same manner as the headers
        order = [
            index for header in self._headers
            for index, file in enumerate(alt_files) if header in file.file_name
        ]
        alt_files = np.array(alt_files)[order].tolist()

        # Link each row to a unique list to create the reference place look up file
        rows = [
            flatten([[row[0]]] + [
                self._match_row(match, match_file)
                for match, match_file in zip(row[1:], alt_files)
            ]) for row in base_relation.row_data
        ]

        write_csv(self._working_dir, "PlaceReference",
                  ["GID"] + flatten([file.headers for file in alt_files]),
                  rows)
Example 9
    def solve_ambiguity(self, standardised_directory, write_directory):
        """
        Remove perfect duplicates and combine non-perfect duplicates so that all GIDs are unique.

        Some places may end up duplicated, either in the raw data or after standardisation. This method removes
        perfect duplicates and combines non-perfect duplicates into a single entry. If this is not desirable, keep in
        mind that the system prints out each non-perfect duplicate merge it has done; you may wish to alter your
        original data set, or change your place reference, to prevent this from happening.

        :param standardised_directory: The data directory of the output from standardise_names
        :type standardised_directory: str | Path

        :param write_directory: The output directory
        :type write_directory: str | Path
        """
        for file in directory_iterator(standardised_directory):
            print(file)

            # Load the original file and look for duplicate GIDs; which should be unique
            data = CsvObject(Path(standardised_directory, file),
                             set_columns=True)
            duplicate_list = find_duplicates(data.column_data[0])

            # Isolate any row that does not suffer from duplication as the base of the write return
            reset_row = [
                row for row in data.row_data if row[0] not in duplicate_list
            ]

            for dup in duplicate_list:
                # Isolate the row names
                row_names = data.row_data[data.column_data[0].index(
                    dup)][:len(self._reference_types)]

                # Isolate the values for each duplicate name
                sub_list = [[
                    parse_as_numeric(rr, float)
                    for rr in r[len(self._reference_types):]
                ] for r in data.row_data if dup == r[0]]

                # Isolate unique lists, to remove duplicates
                unique_sub_lists = [
                    list(x) for x in set(tuple(x) for x in sub_list)
                ]

                # Warn the user that some values have been combined.
                if len(unique_sub_lists) > 1:
                    print(
                        f"Found and combined multiple entries that where not perfect duplicates for {row_names}"
                    )

                # Add the combined values or singular entry of duplicate values to the reset list
                reset_row.append(row_names +
                                 [sum(i) for i in zip(*unique_sub_lists)])

            write_csv(write_directory, data.file_path.stem, data.headers,
                      reset_row)
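
The merge of non-perfect duplicates above is a column-wise sum of the unique value rows. A quick illustration with hypothetical values:

unique_sub_lists = [[1.0, 2.0, 3.0], [0.5, 0.0, 1.0]]  # two non-identical rows sharing one GID
combined = [sum(column) for column in zip(*unique_sub_lists)]
# combined == [1.5, 2.0, 4.0], which is appended after the row's name columns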
Example 10
    def suggest_ld_radius(self):
        """Suggest the size of LD that the user should be using"""
        total_snps = sum([
            CsvObject(Path(self.filter_directory, file)).column_length
            for file in directory_iterator(self.filter_directory)
        ])
        print(
            f"Suggested LD Radius based on total snps found after filtering / 3000 is {total_snps / 3000}"
        )
        return total_snps / 3000
Example 11
    def relational_subprocess(self, index_list, index_of_process,
                              data_directory, write_directory):
        """
        This sub-process is run from relational_database via Process

        Each process is assigned a sub-selection of indexes from the PlaceReference loaded into _matcher. Each process
        will then isolate each name and create an output json database for it by extracting any matching entries'
        attributes from the data directory.

        :param index_list: A list of indexes to load from the PlaceReference for this process
        :type index_list: list[int]

        :param index_of_process: Which process thread this is
        :type index_of_process: int

        :param data_directory: Load directory of the standardised, cleaned, and correct data
        :type data_directory: str | Path

        :param write_directory: Write Directory for the json database
        :type write_directory: str | Path

        :return: Nothing, write a json database for each location that has been indexed from the PlaceReference.
        :rtype: None
        """

        # Currently processed files in the output directory
        current_files = [f for f in directory_iterator(write_directory)]

        for call_index, place_index in enumerate(index_list, 1):
            print(
                f"{call_index} / {len(index_list)} for process {index_of_process}"
            )

            # Create the unique name from the groups and isolate the gid for parsing the csv
            unique_name = "__".join(
                self._set_standardised_place(self._matcher[place_index]))
            gid = self._matcher[place_index][0]

            # Set the output stub for this place's json database
            place_data = {"Place_Name": unique_name, "GID": gid}

            # If the data has not already been processed
            if self._not_processed(unique_name, current_files):
                for file in directory_iterator(data_directory):

                    # Load the data into memory
                    data = CsvObject(Path(data_directory, file),
                                     set_columns=True)

                    # Isolate any data pertaining to this place from this file and add them to the place_data dict
                    self._process_relation_data(data, gid, place_data)

                write_json(place_data, write_directory,
                           f"{unique_name}_{self._data_name}")
Example 12
    def reformat_raw_names(self,
                           raw_csv,
                           raw_name_i,
                           date_i,
                           data_start,
                           out_directory,
                           date_type="yyyymmdd",
                           date_delimiter="/"):
        """
        This will attempt to reformat names that are in a different style to the required weightGIS format

        :param raw_csv: The path of the csv of data you want to standardise
        :type raw_csv: Path | str

        :param raw_name_i: The place name index in the raw file
        :type raw_name_i: int

        :param date_i: The date index in the raw file
        :type date_i: int

        :param data_start: The column index at which the data starts
        :type data_start: int

        :param out_directory: Where you want this file to be written to
        :type out_directory: str | Path

        :param date_type: The type of date, takes the values of yyyy, yyyymmdd, or ddmmyyyy.
        :type date_type: str

        :param date_delimiter: Delimiter used when dates are in the standard dd/mm/yyyy form
        :type date_delimiter: str

        :return: Nothing, write a csv of place rows for each unique date then stop
        :rtype: None
        """

        raw_csv = CsvObject(raw_csv, set_columns=True)
        headers = ["Place"] + raw_csv.headers[data_start:]

        place_dict = self._create_place_dict(raw_csv, raw_name_i)

        unique_dates = self._set_name_dates(date_delimiter, date_i, date_type,
                                            raw_csv)

        for row_date, date in unique_dates.items():
            place_rows = []
            for row in raw_csv.row_data:
                if row[date_i] == row_date:
                    place_rows.append(
                        [place_dict[self._simplify_string(row[raw_name_i])]] +
                        row[data_start:])

            write_csv(out_directory, date, headers, place_rows)
Example 13
    def _select_snps(self):
        """
        We may only want to run a subset of snps. If so, this loads the snp indexes from a csv; otherwise it returns
        the indexes of all snps.

        :return: A list of snp indexes
        :rtype: list[int]
        """
        if self.args["snps"]:
            return CsvObject(validate_path(self.args["snps"]),
                             set_columns=True,
                             column_types=int)[0]
        else:
            return [i for i in range(self.gen.sid_count)]
Example 14
def set_snp_ids(memory_location, snps_to_id, gen_path, write_dir, file_name):
    """
    Isolate a subset of snps based on pre-defined named snps in a csv, whose path is passed as a str to snps_to_id, or
    a random set of snps of a pre-defined total, where that int is passed to snps_to_id.

    :param memory_location: Location of bgen memory file
    :type memory_location: Path | str

    :param snps_to_id: Location of snps csv to id
    :type snps_to_id: Path | str

    :param gen_path: The path to the genetic file
    :type gen_path: Path | str

    :param write_dir: The directory to write the snp index csv file to
    :type write_dir: Path | str

    :param file_name: The name of the snp index file
    :type file_name: str

    :return: Nothing, write the ids to a csv then stop
    :rtype: None

    :raise TypeError: If a str / int is not passed
    """

    # Set the custom write location for the bgen memory files, then load the genetic reference
    custom_meta_path(validate_path(memory_location))
    gen = Bgen(str(validate_path(gen_path).absolute()))

    # Construct a lookup dict for variant_id-rsid
    v_dict = {snp[1]: snp[0] for snp in [snp.split(",") for snp in gen.sid]}

    # Load the list of snps to validate
    snps_list = CsvObject(validate_path(snps_to_id), set_columns=True)[0]

    # Get the index of each snp that is present
    snp_indexes = []
    for snp in snps_list:
        try:
            snp_indexes.append(
                gen.sid_to_index([f"{v_dict[snp]},{snp}"]).tolist())
        except KeyError:
            pass

    # Write the snp indexes out
    write_csv(write_dir, f"{file_name}", ["Snp"], snp_indexes)
    print(
        f"Constructed snp id list of length {len(snp_indexes)} for {gen_path} at {terminal_time()}"
    )
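
The lookup built above assumes each Bgen sid is a "variant_id,rsid" string. A short sketch of that mapping with hypothetical sids:

sids = ["1:123_A_G,rs0001", "1:456_T_C,rs0002"]  # hypothetical sid strings
v_dict = {snp[1]: snp[0] for snp in [sid.split(",") for sid in sids]}
# v_dict["rs0001"] == "1:123_A_G", so the full sid passed to gen.sid_to_index can be
# rebuilt as f"{v_dict['rs0001']},rs0001"; unknown rsids raise KeyError and are skipped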
Example 15
    def _ambiguity_setter(self, ambiguity, ambiguity_file_name):
        """
        If there is ambiguity, load the fix file
        """
        if ambiguity:
            try:
                return CsvObject(f"{self._working_dir}/{ambiguity_file_name}",
                                 file_headers=False)
            except FileNotFoundError:
                raise FileNotFoundError(
                    f"Ambiguity specified but no fix file named {ambiguity_file_name} found"
                )
        else:
            return None
Example 16
    def __init__(self, args):

        read_path, write_path, start_index, name_index = args

        self._setup_camera()

        self.data_path = Path(read_path)

        self.data = CsvObject(self.data_path, set_columns=True)
        self.write_directory = write_path

        self.start_index = int(start_index)
        self.name_i = int(name_index)

        self.create_frames()
Example 17
    def __init__(self, args):
        write_directory, file_path, name_index, isolate, y_scale, border_width, colour, border_colour, write_name = args

        self.write_directory = write_directory
        self.csv_obj = CsvObject(file_path)
        self.name_i = int(name_index)
        self.isolate = int(isolate)
        self.y_scale = float(y_scale)
        self.border_width = float(border_width)
        self.colour = tuple_convert(colour)
        self.border_colour = tuple_convert(border_colour)
        self.write_name = write_name

        bpy.context.tool_settings.mesh_select_mode = (False, False, True)
        self.make_histogram()
Example 18
    def _set_reference_panel(self):
        """
        Many operations will need a reference panel of individuals that are genetically dissimilar / not related to
        each other. If set, this will load a csv or similar text file with two columns of type FID - IID; otherwise it
        will return None.

        Note
        -----
        This operation does NOT allow for headers, so do not set them!
        """
        if self.args["Reference_Panel"]:
            path_to_file = Path(self.args["Reference_Panel"])
            validate_path(path_to_file, False)
            return CsvObject(path_to_file, set_columns=True, file_headers=False).row_data

        else:
            return None
Example 19
    def _set_corrections(self, correction_path):
        """
        Set the correction list for changing names after reformatting

        This is designed to change names that occur via spelling mistakes in the original source material. You could
        clean these in the reformatting stage, but if you have multiple datasets where the spelling errors occur yet
        the formatting differs, this allows for a standardised approach to fixing the error.

        :param correction_path: A path to the correction file, which contains as many columns as twice the number of
            match types, plus 1 for the operator column.
        :type correction_path: str | Path

        :return: A list of [originals, corrections, operator] entries, where the additional operator column tells us
            whether to delete or replace a value.
        """

        # Load the data into memory
        correction_data = CsvObject(validate_path(correction_path))

        # Assert there are as many columns as twice the number of match types, plus 1 for the operator column
        assert (self._match_types * 2) + 1 == correction_data.row_length

        # Create the original, correction rows
        original_i, new_i = [[
            i * self._match_types, (i * self._match_types) + self._match_types
        ] for i in range(2)]

        correction_list = []
        for row in correction_data.row_data:
            # Isolate the original names and the replacements
            originals = [
                self._simplify_string(name)
                for name in row[original_i[0]:original_i[1]]
            ]
            corrections = [
                self._simplify_string(name) for name in row[new_i[0]:new_i[1]]
            ]

            # Append this to a list with the operator column; the last one hence -1
            correction_list.append([originals, corrections, row[-1]])

        return correction_list
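
For reference, a hypothetical correction row for two match types (say district and county): the first half holds the original names, the second half the corrected names, and the final column the operator.

match_types = 2
row = ["westham", "essex", "west ham", "essex", "Replace"]  # hypothetical correction row

assert len(row) == (match_types * 2) + 1
originals = row[0:match_types]     # ["westham", "essex"]
corrections = row[match_types:-1]  # ["west ham", "essex"]
operator = row[-1]                 # "Replace"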
Example 20
    def __init__(self,
                 place_reference,
                 data_name,
                 correction_path=None,
                 cpu_cores=1,
                 splitter="__",
                 name_index=0,
                 place_map=None):
        # Set the standardised name reference from a path to its csv
        self._reference = CsvObject(validate_path(place_reference),
                                    set_columns=True)

        # The name for this particular sub set of data
        self._data_name = data_name

        # Number of cores to use for multi-core enabled methods
        self._cpu_cores = cpu_cores

        # Match lists to standardise names to; set the number of match types (-1 accounts for the GID column)
        self._matcher, self._reference_types, self.isolates = self._construct_match_list(
        )
        self._gid, self._did, self._cid = self.isolates
        self._match_types = len(self._matcher[0]) - 1

        # If there is a correction file, validate it exists, then load it; else None.
        if correction_path:
            self._corrections = self._set_corrections(correction_path)
        else:
            self._corrections = None

        # How to break names into chunks and the column index of names in the reformatted data
        self._splitter = splitter
        self._name_index = name_index

        if place_map:
            # If names need to be remapped then assert there are as many maps as places in the matcher
            assert self._match_types == len(place_map)
            self.order = place_map
        else:
            # Otherwise set the place_map to be just an ordered list of ints of range equal to the place types
            self.order = [i for i in range(len(self._matcher[0]) - 1)]
Example 21
    def remove_duplicates(raw_csv, write_directory):
        """
        Sometimes we may have known duplicates in a file which are not linked to ambiguity; in this case we can just
        purge the duplicates and re-write the file

        :param raw_csv: The csv with potential duplicates within them
        :type raw_csv: str | Path

        :param write_directory: The output directory for the file
        :type write_directory: Path | str

        :return: Nothing, write file then stop
        :rtype: None
        """

        csv_obj = CsvObject(validate_path(raw_csv))
        unique_values = [
            list(v) for v in list(
                Counter([tuple(r) for r in csv_obj.row_data]).keys())
        ]
        write_csv(write_directory, csv_obj.file_path.stem, csv_obj.headers,
                  unique_values)
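
The de-duplication above relies on Counter keys preserving first-seen order (Counter is a dict subclass, and dicts keep insertion order in Python 3.7+), so duplicates are dropped without re-ordering the file. A tiny demonstration:

from collections import Counter

rows = [["a", "1"], ["b", "2"], ["a", "1"]]  # hypothetical row data
unique_rows = [list(v) for v in Counter(tuple(r) for r in rows).keys()]
# unique_rows == [["a", "1"], ["b", "2"]]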
Example 22
    def pie_chart(self, start_angle=90, display_values=None):

        warnings.warn("Deprecated: Will be moved into Seaborn.py soon.tm",
                      DeprecationWarning)

        # Easier to use a csv object rather than pandas for this so recast the data to CsvObject
        labels, amount, explode = CsvObject(self._read_directory,
                                            column_types=[str, int,
                                                          float]).column_data

        # Construct the pie chart from the raw data
        ax = self.seaborn_figure()
        ax.pie(
            amount,
            explode=explode,
            labels=labels,
            startangle=start_angle,
            colors=self.palette(),
            autopct=display_values,
        )
        ax.axis("equal")
        self.write_plot(ax)
        return ax
Example 23
    def _load_file(file_path, column_indexes):
        """
        Load the file containing the ICD codes, and return this and the column indexes of interest based on the type of
        column indexes.

        :param file_path: Path to the icd file
        :type file_path: str | Path

        :param column_indexes: Which columns to use.
            If a str, headers will be checked to see if this str is in the header and kept if it is.
            If a list[int], then these indexes will be used.
            If None, then all indexes will be used.
        :type column_indexes: None | str | list[int]

        :return: The loaded Csv file as well as the min and max indexes to use
        :rtype: (CsvObject, int, int)
        """
        print("Loading file...")
        icd_file = CsvObject(file_path)

        if column_indexes is None:
            indexes = [i for i in range(len(icd_file.headers))]
        elif isinstance(column_indexes, str):
            indexes = [i for i, header in enumerate(icd_file.headers) if column_indexes in header]
        elif isinstance(column_indexes, list):
            indexes = column_indexes
        else:
            sys.exit(f"Unexpected argument for column indexes: Found type {type(column_indexes)} but expected a "
                     f"NoneType, string or list\n"
                     f"If you want to use all columns, leave Column indexes as None\n"
                     f"If you want to use columns containing a string, for example for ICD 10 Primary 41202, then "
                     f"assign 41202 to column indexes\n"
                     f"If you want to use only specific columns, pass a list of the indexes of this columns to "
                     f"column indexes")

        return icd_file, min(indexes), max(indexes)
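
A hedged sketch of the three accepted column_indexes forms, using hypothetical headers; note that _load_file only returns the min and max of the selected indexes:

headers = ["eid", "41202-0.0", "41202-0.1", "41204-0.0"]  # hypothetical header row

def select_indexes(column_indexes):
    if column_indexes is None:
        return list(range(len(headers)))
    if isinstance(column_indexes, str):
        return [i for i, header in enumerate(headers) if column_indexes in header]
    return list(column_indexes)

assert select_indexes("41202") == [1, 2]
assert (min(select_indexes(None)), max(select_indexes(None))) == (0, 3)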
Example 24
    def aggregate_scores(self):
        """
        This will combine the scores found by chromosome into a single file
        """

        combined_array = []
        for index, file in enumerate(directory_iterator(
                self.scores_directory)):
            score_file = CsvObject(Path(self.scores_directory, file),
                                   set_columns=True)

            # If it's the first file we want to extract the iid and fid values as well
            if index == 0:
                fid = np.array(score_file[self.fid])
                iid = np.array(score_file[self.iid])
                score = np.array(score_file["Scores"]).astype(float)
                fid.shape, iid.shape, score.shape = (len(fid), 1), (len(iid), 1), (len(score), 1)
                combined_array = [fid, iid, score]

            # Else extract the scores and append it to the array
            else:
                score = np.array(score_file["Scores"]).astype(float)
                score.shape = (len(score), 1)
                combined_array.append(score)

        # Combine the (IID_Count, 1) * (chromosome count + 2) arrays into a single (IID_Count, chromosome_count)
        iid_array = np.hstack(combined_array[:2])
        score_array = np.sum(np.hstack(combined_array[2:]), axis=1)
        score_array.shape = (len(score_array), 1)

        # Write the scores to the working directory
        write_rows = np.hstack([iid_array, score_array]).tolist()
        write_csv(Path(self.working_dir, "PGS"), "PolyGenicScores",
                  ["FID", "IID", "Scores"], write_rows)
Example 25
    def pgs_scores(self):
        """
        This will construct the pgs for a given weight beta type, such as infinitesimal, within this chromosome
        """

        # Load the reference to the full sample of IDs, and use it to extract genetic phenotype information
        core = self.gen_reference(self.select_file_on_chromosome())
        ph_dict = self.genetic_phenotypes(core)

        # Load the betas based on the weighted beta type specified by the user
        score_file = f"{self.score_type.split('_')[0]}_{self.target_chromosome}.csv"
        weights = CsvObject(Path(self.working_dir, "PGS", self.score_type,
                                 score_file), [str, float],
                            set_columns=True)

        # Chunk the data into memory chunks to be processed
        chunked_snps, chunks = self.chunked_snp_names(weights[self.snp_id],
                                                      True)
        chunked_weights = np.array_split(weights[self.inf_beta], chunks)

        # Weight the dosage data to construct the scores
        scores = self._weight_dosage(chunked_snps, chunked_weights, core,
                                     ph_dict)

        # Combine the FID/IID, genetic phenotype information, and the score for this chromosome
        scores.shape = (len(scores), 1)
        iid_fid = np.array([[v[i] for v in ph_dict.values()]
                            for i in range(core.iid_count)])
        write_out = np.hstack([iid_fid, scores]).tolist()

        # Write this information to a csv
        headers = list(ph_dict.keys()) + ["Scores"]
        write_csv(self.scores_directory, f"Scores_{self.target_chromosome}",
                  headers, write_out)
        print(
            f"Finished Constructing scores for Chromosome {self.target_chromosome} {terminal_time()}"
        )
Example 26
def main_call(out_dir, write_dir, headers):

    output = []
    for file in directory_iterator(out_dir):
        if ".log" not in file:
            csv_file = CsvObject(Path(out_dir, file))

            # Isolate the model values from the aggregated [snp] + [model 1, ... model N]
            for row in csv_file.row_data:
                snp, models = row[0], chunk_list(row[1:], len(headers))
                output.append([snp, models])

    print(f"For {len(output)} Snps")
    model_count = len(output[0][1])

    model_comp = []
    for i in range(model_count):
        print(f"For model {i+1}")

        # Write out the aggregated chromosome model data to a directory
        model_out = []
        for snp, model in output:
            model_out.append([snp] + model[i])
        write_csv(write_dir, f"Model{i + 1}", ["Snp"] + headers, model_out)

        # Append the comparison (mean coefficient, standard error, and p value) to a master list of models
        model_comp.append([f"Model {i+1}"] + [
            str(np.mean([float(values[vi]) for values in model_out]))
            for vi in range(1, 3)
        ])

    # Write the model comp out
    write_csv(
        write_dir, "Model Comparison",
        ["Model", "Mean Coefficient", "Mean Standard Error", "Mean P Values"],
        model_comp)
Example 27
    def _construct_icd9_lookup(self, definition_path):
        """
        For ICD 9, we need to take the first and second columns to get the min and max range to convert these into a
        process similar to our ICD10 codes. Row[0] represents the phenotype name.
        """
        return {row[0]: self._set_icd9_def(row) for row in CsvObject(definition_path).row_data}
Example 28
    def combine(self, unique_id, data_start, root_directory, write_directory,
                write_name):
        """
        weightGIS expects each file to have a single date, so if you have lots of files of the same date that you want
        to process at the same time you will need to combine them

        :param unique_id: The unique id index
        :type unique_id: int

        :param data_start: The index wherein the data starts from
        :type data_start: int

        :param root_directory: The root directory of the csv files
        :type root_directory: Path | str

        :param write_directory: The output directory for the file
        :type write_directory: Path | str

        :param write_name: Name of the combined file
        :type write_name: str

        :return: Nothing, write file then stop
        :rtype: None
        """

        # Create the unique ID's
        unique_id_list = sorted(
            list(
                set(
                    flatten([
                        CsvObject(Path(root_directory, file),
                                  set_columns=True)[unique_id]
                        for file in directory_iterator(root_directory)
                    ]))))

        # For each unique ID
        out_list = []
        for count_i, ids in enumerate(unique_id_list):

            if count_i % 10 == 0:
                print(f"{count_i} / {len(unique_id_list)}")

            # Check each file for a matching row, and then combine the matched values
            ids_list = []
            for index, file in enumerate(directory_iterator(root_directory)):

                # If it's the first index, take the full row
                if index == 0:
                    ids_list += self._isolate(root_directory, file, unique_id,
                                              ids)

                # Otherwise only take the values after the data start
                else:
                    ids_list += self._isolate(root_directory, file, unique_id,
                                              ids)[data_start:]

            out_list.append(ids_list)

        headers = []
        for index, file in enumerate(directory_iterator(root_directory)):
            if index == 0:
                headers += CsvObject(Path(root_directory, file)).headers
            else:
                headers += CsvObject(Path(root_directory,
                                          file)).headers[data_start:]

        write_csv(write_directory, write_name, headers, out_list)
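
A small illustration (with hypothetical rows) of how the stitching works: the first file contributes the full matched row, while later files only contribute the columns from data_start onwards:

data_start = 2
matched_rows = [
    ["GID1", "Place", "10", "20"],  # from the first file: id, name, values
    ["GID1", "Place", "30", "40"],  # from a second file with the same layout
]

ids_list = matched_rows[0] + matched_rows[1][data_start:]
# ids_list == ["GID1", "Place", "10", "20", "30", "40"]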
Example 29
        # Make the District the active object
        bpy.context.view_layer.objects.active = ob
        ob.select_set(True)

        for mat in ob.material_slots:
            mat.material.node_tree.nodes["Emission"].inputs[0].default_value = colour


root = r"I:\Work\DataBases\Adjacent\Months"

not_processed = []
for file in directory_iterator(root):
    print(file)

    a = CsvObject(Path(root, file))

    target = len(a.headers) - 2

    found = 0
    for img in directory_iterator(r"I:\Work\Figures_and_tables\DiseasesOverTime"):
        year = img.split("_")[-1].split(".")[0]

        if year == Path(root, file).stem:
            found += 1

    if found != target:
        not_processed.append(file)

print(not_processed)
print(len(not_processed))
Example 30
def locate_individuals(ids_path, lowest_level_shapefile_path, geo_lookup,
                       east_i, north_i, shape_match_index, write_directory,
                       write_name):
    """
    This will assist you in locating individuals with a geo lookup file

    :param ids_path: The path to a csv file filled with ids with eastings and northings
    :type ids_path: Path | str

    :param lowest_level_shapefile_path: path to the lowest level shapefile you used in your geo reference
    :type lowest_level_shapefile_path: Path | str

    :param geo_lookup: The path to the geo lookup
    :type geo_lookup: Path | str

    :param east_i: Index of the eastings in the id data
    :type east_i: int

    :param north_i: Index of the northings in the id data
    :type north_i: int

    :param shape_match_index: Index of the matching parameter, which should be common to both the geo reference and the shapefile
    :type shape_match_index: int

    :param write_directory: Saved file will be written here
    :type write_directory: Path | str

    :param write_name: The name of the file to write
    :type write_name: str

    :return: Nothing, write the file then stop
    :rtype: None
    """

    id_file = CsvObject(ids_path)
    geo_file = CsvObject(geo_lookup)
    shape_obj = ShapeObject(lowest_level_shapefile_path)

    # Create a list of unique easting__northing coordinates to avoid unnecessary iteration
    unique_places = sorted(
        list(
            set([
                f"{respondent[east_i]}__{respondent[north_i]}"
                for respondent in id_file.row_data
            ])))

    # Create an id: all other rows lookup so we can identify each location from the lowest
    geo_lookup = {row[shape_match_index]: row for row in geo_file.row_data}

    # Link all the geometry
    geo_link = create_geo_link(unique_places, geo_file, geo_lookup, shape_obj,
                               shape_match_index)

    output_rows = []
    for respondent in id_file.row_data:
        # Isolate the rows that are not east/north
        non_location = [
            r for i, r in enumerate(respondent) if i not in (east_i, north_i)
        ]

        # Prepend this along with the geo_link birth location
        birth_location = f"{respondent[east_i]}__{respondent[north_i]}"
        output_rows.append(non_location + geo_link[birth_location])

    headers = [
        h for i, h in enumerate(id_file.headers) if i not in (east_i, north_i)
    ] + geo_file.headers
    write_csv(write_directory, write_name, headers, output_rows)
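
The coordinate key used above folds each easting/northing pair into a single string so that the geometry lookup only runs once per unique location. A short example with hypothetical respondents:

respondents = [["id1", "356000", "171000"], ["id2", "356000", "171000"]]  # hypothetical id rows
east_i, north_i = 1, 2

unique_places = sorted({f"{row[east_i]}__{row[north_i]}" for row in respondents})
# unique_places == ["356000__171000"]: one geometry lookup serves both respondents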