Example #1
0
    def _set_summary_headers(self):
        """
        Check each expected summary header against the first line of the summary file.

        We may have users using custom headers, or they may be using a format we already have covered. Custom
        headers are read from self.args["Custom_Summary_Header"]; when that is empty the header sets in
        self._config["header_keys"] are used instead.

        Note
        -----
        In GUI we basically want to call the header check to make sure we align columns correctly. If not they can set
        it themselves

        :return: The headers to validate, as a dict holding the result of self._check_header for every key in the
            header sets, or None when no summary file has been set
        """
        if not self.summary_file:
            return None

        custom_headers = self.args["Custom_Summary_Header"]
        if custom_headers:
            # Recast so that the values are in a list so they can be checked by the same method as defaults
            header_sets = {key: [value] for key, value in custom_headers.items()}
        else:
            # Based on known summary statistics from LDPred sum_stats_parsers.py
            header_sets = self._config["header_keys"]

        # Only the first line is needed. It is decoded once rather than once per header, and the with block closes
        # the file on exit so no explicit close is required.
        with open_setter(self.summary_file)(self.summary_file) as file:
            decoded_headers = decode_line(file.readline(), self.zipped)

        return {
            header: self._check_header(header, decoded_headers, header_sets)
            for header in header_sets
        }
    def _line_by_line_summary(self, validation_snps):
        """
        Collect every line of the summary statistics whose snp id is within validation_snps.

        The first line of the file is skipped as a header. Lines whose snp id is not in validation_snps are dropped
        and counted under "Invalid_Snps" in self._sum_error_dict.

        :param validation_snps: The accepted snp ids, tested for membership with ``in``

        :return: The decoded lines that were kept
        :rtype: np.ndarray
        """
        accepted_rows = []
        with open_setter(self.summary_file)(self.summary_file) as summary:
            # The first line is the header rather than data
            summary.readline()

            for raw_line in summary:
                decoded = decode_line(raw_line, self.zipped)

                # Count and skip any line whose snp id is not one we accept
                if decoded[self.sm_snp_id] not in validation_snps:
                    self._sum_error_dict["Invalid_Snps"] += 1
                    continue

                accepted_rows.append(decoded)

        return np.array(accepted_rows)
Example #3
0
    def set_summary_headers(self, chromosome_header, snp_header, base_position_header, p_value_header):
        """
        Validate which columns are the chromosome, snp, base position and p value.

        To do this, each name provided is checked against the decoded first row of the summary file. If all are
        present, the indexes are returned in the order of the header input arguments so they can be assigned and
        used to isolate values later.

        :param chromosome_header: chromosome header name
        :type chromosome_header: str

        :param snp_header: snp header name
        :type snp_header: str

        :param base_position_header: base position header name
        :type base_position_header: str

        :param p_value_header: p value header
        :type p_value_header: str

        :return: A list of [chromosome_header_index, snp_header_index, base_position_index, p_value_index]
        :rtype: list[int, int, int, int]

        :raises KeyError: If a header provided was not found in the decoded headers
        """

        # Decode the headers. Only the first row is read, and the with block closes the file on exit.
        with open_setter(self.summary_file)(self.summary_file) as file:
            decoded_headers = decode_line(file.readline(), self.zipped)

        header_indexes = []
        for header in (chromosome_header, snp_header, base_position_header, p_value_header):
            try:
                # A single lookup both tests for the header and finds its position
                header_indexes.append(decoded_headers.index(header))
            except ValueError:
                # Callers expect a KeyError for a missing header, so translate the lookup failure
                raise KeyError(f"{header} was not found in {decoded_headers}") from None
        return header_indexes
Example #4
0
    def _y_values_from_p(self, p_value_index, log_transform):
        """
        Build the sorted y values from the p value column of the summary file.

        The first line of the file is skipped as a header. A progress message is written to self.logger every
        100000 lines, starting with line zero.

        :param p_value_index: The index of the p value column in the summary stat file
        :type p_value_index: int

        :param log_transform: True to convert each p value to -log 10, False to keep the values exactly as read
        :type log_transform: bool

        :return: The values sorted from smallest to largest
        :rtype: list

        :raises ValueError: If a value cannot be parsed as a float, or if log_transform is set and a p value is zero
            or negative
        """
        y_values = []
        with open_setter(self.summary_file)(self.summary_file) as summary:
            # The first line is the header rather than data
            summary.readline()

            for line_number, raw_line in enumerate(summary):
                # Report progress before the line is parsed
                if not line_number % 100000:
                    self.logger.write(f"Processed {line_number} Lines")

                value = float(decode_line(raw_line, self.zipped)[p_value_index])
                y_values.append(-math.log10(value) if log_transform else value)

        # Order the values from smallest to largest
        y_values.sort()
        return y_values
Example #5
0
 def _isolate_from_line(self, line_byte):
     """
     Decode a line byte, then cast four of its fields into a list.

     The fields are read at self.chr_h, self.snp_h, self.bp_h and self.p_h, in that order.

     :param line_byte: A raw line from the summary file, decoded with decode_line

     :return: The four fields cast to int, str, int and float respectively
     :rtype: list
     """
     fields = decode_line(line_byte, self.zipped)

     # Cast in the same order as the returned list so a bad field fails at the same point
     chr_value = int(fields[self.chr_h])
     snp_value = str(fields[self.snp_h])
     bp_value = int(fields[self.bp_h])
     p_value = float(fields[self.p_h])
     return [chr_value, snp_value, bp_value, p_value]