Example 1
    def relational_subprocess(self, index_list, index_of_process,
                              data_directory, write_directory):
        """
        This sub-process is run via a call from relational_database via Process

        Each process is assigned a sub-selection of indexes from the PlaceReference loaded into _matcher. Each process
        will then isolate this name and create an output json database for it by extracting any matching entries'
        attributes from the data directory.

        :param index_list: A list of indexes to load from the PlaceReference for this process
        :type index_list: list[int]

        :param index_of_process: Which process thread this is
        :type index_of_process: int

        :param data_directory: Load directory of the standardised, cleaned, and corrected data
        :type data_directory: str | Path

        :param write_directory: Write Directory for the json database
        :type write_directory: str | Path

        :return: Nothing, write a json database for each location that has been indexed from the PlaceReference.
        :rtype: None
        """

        # Currently processed files in the output directory
        current_files = list(directory_iterator(write_directory))

        for call_index, place_index in enumerate(index_list, 1):
            print(
                f"{call_index} / {len(index_list)} for process {index_of_process}"
            )

            # Create the unique name from the groups and isolate the gid for parsing the csv
            unique_name = "__".join(
                self._set_standardised_place(self._matcher[place_index]))
            gid = self._matcher[place_index][0]

            # Set the output stub for this place's json database
            place_data = {"Place_Name": unique_name, "GID": gid}

            # If the data has not already been processed
            if self._not_processed(unique_name, current_files):
                for file in directory_iterator(data_directory):

                    # Load the data into memory
                    data = CsvObject(Path(data_directory, file),
                                     set_columns=True)

                    # Isolate any data pertaining to this place from this file and add them to the place_data dict
                    self._process_relation_data(data, gid, place_data)

                write_json(place_data, write_directory,
                           f"{unique_name}_{self._data_name}")
Example 2
    def standardise_names(self, data_directory, write_directory):
        """
        Standardise each place name to a single name if it has multiple

        If working with time series data, places may change their names over time, which can lead to merge errors or
        difficulty in linking data. This will standardise all names to a single entry, ensuring that, regardless of the
        actual name of the place in a given year, all data from that place is grouped under a single entry.

        :param data_directory: Directory containing csv files named in a yyyymmdd format
        :type data_directory: Path | str

        :param write_directory: Output directory
        :type write_directory: Path | str

        :return: Nothing, write out the data for each file found in the data_directory and then stop
        :rtype: None
        """
        for file in directory_iterator(data_directory):
            print(file)

            # Load the data into memory.
            data = CsvObject(Path(data_directory, file), set_columns=True)

            # Standardise the name via the matcher
            rows = []
            for i, name in enumerate(data.column_data[self._name_index], 0):
                reformatted = self._convert_names(name, i, data)
                if reformatted:
                    rows.append(reformatted)

            # Set the headers of the output file then write the file of the same name to the write_directory
            headers = self._reference_types + data.headers[1:]
            write_csv(write_directory, data.file_path.stem, headers, rows)
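A hypothetical call, assuming an existing instance of the class that defines standardise_names; the paths are placeholders and the data directory is expected to hold csv files named in yyyymmdd format:

# standardiser: an assumed instance of the class shown above
standardiser.standardise_names(data_directory="Data/Cleaned", write_directory="Data/Standardised")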
Example 3
def create_heat_map_frames(working_directory, point_colour, point_out, gradient_out, gradient_scalar, gradient_divider):
    dates = {"".join([n.replace(".png", "").zfill(2) for n in name.split("_")]): name
             for name in directory_iterator(working_directory)}

    # Format dates to be sorted
    dates = {date: name for date, name in sorted(dates.items(), key=lambda kv: kv[0])}

    # Create the bound to mask on, then isolate the first frame
    base = _base_image(dates, working_directory)
    bound_min = (max(point_colour[0]-5, 0), max(point_colour[1]-5, 0), max(point_colour[2]-5, 0))
    base.mask_on_colour_range(bound_min, point_colour)

    # Point frames
    for index, (date, name) in enumerate(dates.items()):
        if index % 10 == 0:
            print(f"Frame {index}: {len(dates.items())}")

        if index > 0:
            current_date = ImageObject(load_image(str(Path(working_directory, name).absolute())))
            current_date.mask_on_colour_range(bound_min, point_colour)

            current_date.write_to_file(point_out, index)

    # Gradient frames
    for index, (date, name) in enumerate(dates.items()):
        if index % 10 == 0:
            print(f"Frame {index}: {len(dates.items())}")

        if index > 0:
            difference = _create_difference(working_directory, name, bound_min, point_colour, base,
                                            gradient_scalar, gradient_divider)
            difference.write_to_file(gradient_out, index)
            base = difference
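An illustrative call of create_heat_map_frames; the BGR colour, scalar, divider, and directory paths are placeholder assumptions:

create_heat_map_frames(
    working_directory="Frames/Raw",
    point_colour=(0, 0, 255),        # pure red in BGR
    point_out="Frames/Points",
    gradient_out="Frames/Gradient",
    gradient_scalar=1.5,
    gradient_divider=2,
)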
Example 4
    def manhattan_plot(self, colours, output_directory):
        """
        This will take the images written to the working directory by manhattan_points and compile them into the final
        plot images

        :param colours: A list of 0-255 BGR colours, must be of equal length to the chromosome subdivision
        :type colours: list[(int, int, int)]

        :param output_directory: Where you want to save the images. It is STRONGLY recommended that you put this
            somewhere other than the working_directory, as re-running the script with the output placed in the
            working_directory may raise index errors for missing AXIS files for the output images
        :type output_directory: str | Path

        :return: Nothing, compile images then stop
        :rtype: None
        """

        # Isolate the unique image names
        unique_names = list(
            set([
                file.split("__")[0]
                for file in directory_iterator(self._working_dir)
                if (".log" not in file) and (".blend" not in file)
            ]))

        # For each plot, compile the images
        for name in unique_names:
            create_manhattan_plot(name, self._working_dir, colours,
                                  output_directory)
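A hypothetical call; the plotter instance, the BGR colours, and the output path are placeholders, and the colour list must match the chromosome subdivision used when the point images were rendered:

# plotter: an assumed instance of the class shown above
plotter.manhattan_plot(
    colours=[(180, 119, 31), (14, 127, 255), (44, 160, 44)],   # BGR triples
    output_directory="Plots/Compiled",   # keep this outside the working directory
)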
Example 5
    def distribute_heritability_genome_wide(self):
        """If we can't calculate heritability, distribute it from a provided float"""
        total_snps = 0
        config_dict = {}
        for file in directory_iterator(self.summary_directory):
            print(file)
            load_file = CsvObject(Path(self.summary_directory, file),
                                  self.cleaned_types,
                                  set_columns=True)

            # Isolate the generic information
            n_snps, n_iid = self._chromosome_from_load(load_file)
            chromosome_values = {
                self.count_snp: n_snps,
                self.count_iid: n_iid,
                "Description": f"Chromosome {self.target_chromosome}"
            }
            config_dict[self.target_chromosome] = chromosome_values
            total_snps += n_snps

        print(
            f"Suggested LD_Radius based on {total_snps} / 3000 is {total_snps / 3000}"
        )

        for key, value in config_dict.items():
            config_dict[key][self.herit] = self.herit_calculated * (
                config_dict[key][self.count_snp] / total_snps)
        config_dict["Genome"] = {
            f"{self.genome}_{self.herit}": self.herit_calculated
        }
        ArgMaker().write_yaml_group_dict(config_dict, self.working_dir,
                                         "genome_wide_config")
Example 6
    def places_into_dates(self, cleaned_data, write_directory, file_gid=0):
        """
        Sometimes you may have data that is not missing in dates, but simply wasn't recorded. This places every place
        into every date.
        """

        # Format the reference into lower case
        formatted = [[r.lower() for r in row]
                     for row in self._reference.row_data]

        for file in directory_iterator(cleaned_data):
            # Load the file as a csv object
            loaded_file = CsvObject(Path(cleaned_data, file))

            # Isolate the GID: Row relation from the file
            gid = {row[file_gid]: row for row in loaded_file.row_data}

            # If the place exists in our file, use the file row, else set the values to zeros
            all_places = []
            for row in formatted:
                if row[file_gid] in gid:
                    all_places.append(gid[row[file_gid]])
                else:
                    all_places.append([row[i]
                                       for i in self.isolates] + [0, 0, 0, 0])

            write_csv(write_directory,
                      Path(cleaned_data, file).stem, loaded_file.headers,
                      all_places)
Example 7
 def _load_shapefiles(path):
     """
     Load the shapefiles into memory
     """
     return [
         ShapeObject(f"{path}/{file}") for file in directory_iterator(path)
         if Path(path, file).suffix == ".shp"
     ]
Example 8
    def write_linked_unique(self,
                            ambiguity=True,
                            ambiguity_file_name="SetAmbiguous.csv"):
        """
        Construct a base lookup-file to append alternative names to as well as lists of unique name files

        :param ambiguity: If there is ambiguity in the file system
        :type ambiguity: bool

        :param ambiguity_file_name: The name of the fix file, defaults to SetAmbiguous.csv
        :type ambiguity_file_name: str

        :return: Nothing, construct files then stop
        :rtype: None
        """

        # Load the files for each shapefile that were written by link_districts_counties, as well as the user's
        # ambiguity file named ambiguity_file_name
        ambiguity_file = self._ambiguity_setter(ambiguity, ambiguity_file_name)
        relation_files = [
            CsvObject(f"{self._working_dir}/{file}")
            for file in directory_iterator(self._working_dir)
            if "_relation" in file
        ]

        # Construct a list of all the names without any ambiguity
        name_list = [
            self._fix_row_ambiguity(row, ambiguity_file,
                                    re.sub(r"[\D]", "", file.file_name))
            for file in relation_files for row in file.row_data
        ]

        # Write out the reference base
        unique_relations = [
            list(relation) for relation in {tuple(i) for i in name_list}
        ]

        if not Path(self._working_dir, "LookupBase.csv").exists():
            write_csv(self._working_dir, "LookupBase", ["GID"] + self._headers,
                      unique_relations)
        else:
            print("Lookup already written, passing")

        # For each level, write out a list of unique names
        for index, level in enumerate(self._headers, 1):

            # Isolate the unique places for a given level
            unique_places = list(
                set([level_relation[index] for level_relation in name_list]))

            # Write it out if it doesn't already exist
            if not Path(self._working_dir, f"Unique_{level}.csv").exists():
                write_csv(self._working_dir, f"Unique_{level}", [level],
                          unique_places)
            else:
                print(f"Unique_{level} Already exists, skipping")
Example 9
    def solve_ambiguity(self, standardised_directory, write_directory):
        """
        Remove perfect duplicates and combine non perfect duplicates so that all GIDs are unique.

        Some places may end up being duplicated, either in the raw data or after standardisation. This method will
        remove perfect duplicates and combine non-perfect duplicates into a single entry. Keep in mind that, if this is
        not desirable, the system will print out each non-perfect duplication merge it has done; you may wish to alter
        your original data set, or change your place reference, to avoid this.

        :param standardised_directory: The data directory of the output from standardise_names
        :type standardised_directory: str | Path

        :param write_directory: The output directory
        :type write_directory: str | Path
        """
        for file in directory_iterator(standardised_directory):
            print(file)

            # Load the original file and look for duplicate GIDs, which should be unique
            data = CsvObject(Path(standardised_directory, file),
                             set_columns=True)
            duplicate_list = find_duplicates(data.column_data[0])

            # Isolate any row that does not suffer from duplication as the base of the write return
            reset_row = [
                row for row in data.row_data if row[0] not in duplicate_list
            ]

            for dup in duplicate_list:
                # Isolate the row names
                row_names = data.row_data[data.column_data[0].index(
                    dup)][:len(self._reference_types)]

                # Isolate the values for each duplicate name
                sub_list = [[
                    parse_as_numeric(rr, float)
                    for rr in r[len(self._reference_types):]
                ] for r in data.row_data if dup == r[0]]

                # Isolate unique lists, to remove duplicates
                unique_sub_lists = [
                    list(x) for x in set(tuple(x) for x in sub_list)
                ]

                # Warn the user that some values have been combined.
                if len(unique_sub_lists) > 1:
                    print(
                        f"Found and combined multiple entries that where not perfect duplicates for {row_names}"
                    )

                # Add the combined values or singular entry of duplicate values to the reset list
                reset_row.append(row_names +
                                 [sum(i) for i in zip(*unique_sub_lists)])

            write_csv(write_directory, data.file_path.stem, data.headers,
                      reset_row)
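A toy illustration of the merge rule for two non-perfect duplicates sharing a GID, mirroring the zip/sum step above; the values are made up:

duplicate_values = [[5.0, 1.0], [2.0, 0.0]]         # two value rows for the same GID
merged = [sum(i) for i in zip(*duplicate_values)]   # [7.0, 1.0]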
Example 10
 def suggest_ld_radius(self):
     """Suggest the size of LD that the user should be using"""
     total_snps = sum([
         CsvObject(Path(self.filter_directory, file)).column_length
         for file in directory_iterator(self.filter_directory)
     ])
     print(
         f"Suggested LD Radius based on total snps found after filtering / 3000 is {total_snps / 3000}"
     )
     return total_snps / 3000
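A worked example of the heuristic, with an illustrative SNP count:

total_snps = 1_200_000
suggested_radius = total_snps / 3000   # 400.0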
Example 11
    def select_file_on_chromosome(self):
        """
        For a given chromosome, get the respective file from the genetic directory

        :return: Path to the current file as a Path from pathlib
        """
        for file in directory_iterator(self.gen_directory):
            if Path(self.gen_directory, file).suffix == self.gen_type:
                try:
                    if int(re.sub(r'[\D]', "", Path(self.gen_directory, file).stem)) == self.target_chromosome:
                        return str(Path(self.gen_directory, file).absolute())
                except (ValueError, TypeError):
                    continue

        raise Exception(f"Failed to find any relevant file for {self.target_chromosome} in {self.gen_directory}")
Example 12
    def qq_make(self, point_colour, output_directory):
        # Isolate the unique image names
        unique_names = list(
            set([
                file.split("__")[0]
                for file in directory_iterator(self._working_dir)
                if (".log" not in file) and (".blend" not in file)
            ]))

        print(unique_names)
        # For each plot, compile the images
        for name in unique_names:
            create_qq_plot(name, self._working_dir, point_colour,
                           output_directory)

        return
Example 13
    def combine_dataset(self, path_list, write_directory, database_name):
        """
        This will combine all the datasets you have made into a single json database

        This will combine all the regional data from all standardised datasets into a single json database. If you only
        had one database to begin with, then this just merges all the separate json databases into one. It is mostly
        used when you have run this process on multiple datasets and now want all the standardised places to share
        attribute data in a single database.

        :param path_list: A list of paths, where each path goes to a set directory
        :type path_list: list[str | Path]

        :param write_directory: The write directory of the master database
        :type write_directory: str | Path

        :param database_name: The master database name
        :type database_name: str

        :return: Nothing, write the database to file then stop
        :rtype: None
        """

        # Initialise the output database
        master_database = {}

        # Isolate all the paths to all the files we want to load across all the database for this geo-level
        level_data = [
            Path(path, file) for path in path_list
            for file in directory_iterator(path)
        ]

        for index, file in enumerate(level_data):
            if index % 100 == 0:
                print(f"{index}/{len(level_data)}")

            # Load the data for this file into memory, set the master database assign name via Place_Name
            load_data = load_json(file)
            assign_name = load_data["Place_Name"]

            # If the current attribute does not exist within the current database, add it to it
            current_attributes = self._current_attributes(
                master_database, assign_name)
            for attr in load_data.keys():
                if attr not in current_attributes:
                    master_database[assign_name][attr] = load_data[attr]

        write_json(master_database, write_directory, database_name)
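A hypothetical call combining two standardised dataset directories into one master database; the instance, paths, and database name are placeholders:

# combiner: an assumed instance of the class shown above
combiner.combine_dataset(
    path_list=["Databases/Census", "Databases/Mortality"],
    write_directory="Databases/Master",
    database_name="MasterDatabase",
)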
Example 14
    def construct_reference(self,
                            base_weights_name="LookupBase.csv",
                            alternative_key="Unique"):
        """
        Construct a reference of every name for every place for every level within the Lookup Base

        :param base_weights_name: Name of the base weights file
        :type base_weights_name: str

        :param alternative_key: Key within files that contains alternative names
        :type alternative_key: str

        :return: Nothing, write out place reference csv then stop
        :rtype: None
        """

        # Load the lookup base
        base_relation = CsvObject(Path(self._working_dir, base_weights_name))

        # Load alternative files
        alt_files = [
            CsvObject(Path(self._working_dir, file), set_columns=True)
            for file in directory_iterator(self._working_dir)
            if alternative_key in file
        ]

        # Order them in the same manner as the headers
        order = [
            index for header in self._headers
            for index, file in enumerate(alt_files) if header in file.file_name
        ]
        alt_files = np.array(alt_files)[order].tolist()

        # Link each row to a unique list to create the reference place look up file
        rows = [
            flatten([[row[0]]] + [
                self._match_row(match, match_file)
                for match, match_file in zip(row[1:], alt_files)
            ]) for row in base_relation.row_data
        ]

        write_csv(self._working_dir, "PlaceReference",
                  ["GID"] + flatten([file.headers for file in alt_files]),
                  rows)
Example 15
    def validation_chromosomes(self):
        """
        This will create a dataset of all the chromosomes that we have to work with

        :Note: Mostly used externally to aid multi-core processing

        :return: A list of valid chromosomes
        :rtype: list
        """

        valid_chromosomes = []
        for file in directory_iterator(self.gen_directory):
            if Path(self.gen_directory, file).suffix == self.gen_type:
                valid_chromosomes.append(
                    int(
                        re.sub(r'[\D]', "",
                               Path(self.gen_directory, file).stem)))
        valid_chromosomes.sort()
        return valid_chromosomes
Example 16
def create_manhattan_plot(plot_name, working_dir, colours, output_directory):
    # Isolate the files related to this name
    name_files = []
    for file in directory_iterator(working_dir):
        if (plot_name in file) and (".log" not in file) and (".blend" not in file):
            name_files.append(file)

    # If we find the axis image then we can construct a plot
    if sum([True if "AXIS" in file else False for file in name_files]) == 1:

        # Isolate the axis and the point image file names
        axis = [file for file in name_files if "AXIS" in file][0]
        points = [file for file in name_files if "AXIS" not in file]
        assert len(points) == len(colours), f"Found {len(points)} point images but was only provided {len(colours)} colours"

        # Construct the plot
        _construct_plot_image(working_dir, axis, points, colours, output_directory, plot_name)

    else:
        raise IndexError(f"Failed to find {plot_name}_AXIS.png")
Example 17
def create_qq_plot(plot_name, working_dir, point_colour, output_directory):
    # Isolate the files related to this name
    name_files = []
    for file in directory_iterator(working_dir):
        if (plot_name in file) and (".log" not in file) and (".blend"
                                                             not in file):
            name_files.append(file)

    # If we find the axis image then we can construct a plot
    if sum([True if "AXIS" in file else False for file in name_files]) == 1:
        # Isolate the axis and the point image file names
        axis = [file for file in name_files if "AXIS" in file][0]
        points = [file for file in name_files if "AXIS" not in file][0]

        # Construct the plot
        _plot_image(working_dir, axis, points, point_colour, output_directory,
                    plot_name)

    else:
        raise IndexError(f"Failed to find {plot_name}__AXIS.png")
Example 18
    def _heritability_by_chromosome(self, config_dict):
        """
        This will calculate the heritability for each chromosome, cumulating the values needed for the genome-wide
        calculation
        """
        cumulative_ld = sum_sq_beta = total_snps = 0
        for file in directory_iterator(self.ld_directory):

            # Infer the chromosome of this file by removing the 'LD' prefix
            chromosome = file[2:]
            print(f"Processing Chromosome {chromosome}")

            # Load the ld data from the ld_directory and use it to set sid/iid count
            data = load_pickle(self.ld_directory, file)
            sid_count, iid_count = data[self.norm_snps].shape

            # Isolate the betas from the Filtered to calculate the sum squared betas, also ld scores from ld data
            betas = self.sm_dict_from_csv(
                self.filter_directory, f"Filtered_{chromosome}.csv")[self.beta]
            sum_beta_sq = np.sum(np.array(betas)**2)
            ld_scores = data[self.ld_scores]

            # Calculate the heritability and average LD at a chromosome level
            heritability, average_ld = self._chromosome_heritability(
                chromosome, ld_scores, sum_beta_sq, sid_count)

            # Cumulate ld, snps, and sum square beta
            cumulative_ld += np.sum(ld_scores)
            total_snps += sid_count
            sum_sq_beta += np.sum(sum_beta_sq)

            # Store Values for config file
            chromosome_values = {
                self.herit: heritability,
                self.count_snp: sid_count,
                self.count_iid: iid_count,
                self.avg_ld: average_ld,
                "Description": f"Chromosome {chromosome}"
            }
            config_dict[chromosome] = chromosome_values
        return cumulative_ld, sum_sq_beta, total_snps
Example 19
    def aggregate_scores(self):
        """
        This will combine the scores found by chromosome into a single file
        """

        combined_array = []
        for index, file in enumerate(directory_iterator(
                self.scores_directory)):
            score_file = CsvObject(Path(self.scores_directory, file),
                                   set_columns=True)

            # If it's the first file, we want to extract the iid and fid values as well
            if index == 0:
                fid = np.array(score_file[self.fid]).reshape(-1, 1)
                iid = np.array(score_file[self.iid]).reshape(-1, 1)
                score = np.array(score_file["Scores"]).astype(float).reshape(-1, 1)
                combined_array = [fid, iid, score]

            # Else extract the scores and append them to the array
            else:
                score = np.array(score_file["Scores"]).astype(float).reshape(-1, 1)
                combined_array.append(score)

        # Combine the (IID_Count, 1) arrays: the first two hold the FID/IID columns and the rest are per-chromosome
        # scores, which are summed row-wise into a single score column
        iid_array = np.hstack(combined_array[:2])
        score_array = np.sum(np.hstack(combined_array[2:]), axis=1)
        score_array.shape = (len(score_array), 1)

        # Write the scores to the working directory
        write_rows = np.hstack([iid_array, score_array]).tolist()
        write_csv(Path(self.working_dir, "PGS"), "PolyGenicScores",
                  ["FID", "IID", "Scores"], write_rows)
Example 20
def main_call(out_dir, write_dir, headers):

    output = []
    for file in directory_iterator(out_dir):
        if ".log" not in file:
            csv_file = CsvObject(Path(out_dir, file))

            # Isolate the model values from the aggregated [snp] + [model 1, ... model N]
            for row in csv_file.row_data:
                snp, models = row[0], chunk_list(row[1:], len(headers))
                output.append([snp, models])

    print(f"For {len(output)} Snps")
    model_count = len(output[0][1])

    model_comp = []
    for i in range(model_count):
        print(f"For model {i+1}")

        # Write out the aggregated chromosome model data to a directory
        model_out = []
        for snp, model in output:
            model_out.append([snp] + model[i])
        write_csv(write_dir, f"Model{i + 1}", ["Snp"] + headers, model_out)

        # Append the comparison to a master list of models, one mean per data column so it lines up with the headers
        model_comp.append([f"Model {i+1}"] + [
            str(np.mean([float(values[vi]) for values in model_out]))
            for vi in range(1, len(headers) + 1)
        ])

    # Write the model comp out
    write_csv(
        write_dir, "Model Comparison",
        ["Model", "Mean Coefficient", "Mean Standard Error", "Mean P Values"],
        model_comp)
Example 21
        # Select the current Object that has the same name as place
        ob = bpy.context.scene.objects[place]
        bpy.ops.object.select_all(action='DESELECT')

        # Make the District the active object
        bpy.context.view_layer.objects.active = ob
        ob.select_set(True)

        for mat in ob.material_slots:
            mat.material.node_tree.nodes["Emission"].inputs[0].default_value = colour


root = r"I:\Work\DataBases\Adjacent\Months"

not_processed = []
for file in directory_iterator(root):
    print(file)

    a = CsvObject(Path(root, file))

    target = len(a.headers) - 2

    found = 0
    for img in directory_iterator(r"I:\Work\Figures_and_tables\DiseasesOverTime"):
        year = img.split("_")[-1].split(".")[0]

        if year == Path(root, file).stem:
            found += 1

    if found != target:
        not_processed.append(file)
Example 22
    def combine(self, unique_id, data_start, root_directory, write_directory,
                write_name):
        """
        weightGIS expects each file to have a single date, so if you have lots of files of the same date that you want
        to process at the same time you will need to combine them

        :param unique_id: The unique id index
        :type unique_id: int

        :param data_start: The index at which the data starts
        :type data_start: int

        :param root_directory: The root directory of the csv files
        :type root_directory: Path | str

        :param write_directory: The output directory for the file
        :type write_directory: Path | str

        :param write_name: Name of the combined file
        :type write_name: str

        :return: Nothing, write file then stop
        :rtype: None
        """

        # Create the unique IDs
        unique_id_list = sorted(
            list(
                set(
                    flatten([
                        CsvObject(Path(root_directory, file),
                                  set_columns=True)[unique_id]
                        for file in directory_iterator(root_directory)
                    ]))))

        # For each unique ID
        out_list = []
        for count_i, ids in enumerate(unique_id_list):

            if count_i % 10 == 0:
                print(f"{count_i} / {len(unique_id_list)}")

            # Check each file for a matching row, and then extend this ID's combined row with its values
            ids_list = []
            for index, file in enumerate(directory_iterator(root_directory)):

                # If its the first index, take the full values
                if index == 0:
                    ids_list += self._isolate(root_directory, file, unique_id,
                                              ids)

                # Otherwise only take the values after the data start
                else:
                    ids_list += self._isolate(root_directory, file, unique_id,
                                              ids)[data_start:]

            out_list.append(ids_list)

        headers = []
        for index, file in enumerate(directory_iterator(root_directory)):
            if index == 0:
                headers += CsvObject(Path(root_directory, file)).headers
            else:
                headers += CsvObject(Path(root_directory,
                                          file)).headers[data_start:]

        write_csv(write_directory, write_name, headers, out_list)
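A hypothetical call; the instance, column indexes, and paths are placeholders, with unique_id pointing at the shared identifier column and data_start at the first data column:

# combiner: an assumed instance of the class shown above
combiner.combine(
    unique_id=0,
    data_start=4,
    root_directory="Data/Monthly",
    write_directory="Data/Combined",
    write_name="CombinedMonths",
)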