def _set_summary_headers(self): """ We may have users using custom headers, or they may be using a format we already have covered Note ----- In GUI we basically want to call the header check to make sure we align columns correctly. If not they can set it themselves :return: The headers to validate """ if not self.summary_file: return None custom_headers = self.args["Custom_Summary_Header"] if custom_headers: # Recast so that the values are in a list so they can be checked by the same method as defaults header_sets = { key: [v] for key, v in zip(custom_headers.keys(), custom_headers.values()) } else: # Based on known summary statistics from LDPred sum_stats_parsers.py header_sets = self._config["header_keys"] with open_setter(self.summary_file)(self.summary_file) as file: # Determine if we have custom headers or not via _loaded_sum_headers raw_headers = file.readline() headers = { header: self._check_header(header, decode_line(raw_headers, self.zipped), header_sets) for header in header_sets } file.close() return headers
def _line_by_line_summary(self, validation_snps):
    """
    Keep every decoded row of the summary statistics file whose snp id is found in validation_snps.

    Rows whose snp id is not found are dropped, and each dropped row increments
    ``self._sum_error_dict["Invalid_Snps"]`` by one.

    :param validation_snps: The accepted snp ids; only queried via ``in``
    :return: The kept rows, in file order, as a numpy array
    :rtype: numpy.ndarray
    """
    accepted_rows = []
    with open_setter(self.summary_file)(self.summary_file) as summary:
        # The first line is the header, so discard it
        summary.readline()

        for raw_row in summary:
            row = decode_line(raw_row, self.zipped)

            # Count and drop any row whose snp id is not one of the accepted snps
            if row[self.sm_snp_id] not in validation_snps:
                self._sum_error_dict["Invalid_Snps"] += 1
                continue

            accepted_rows.append(row)

    return np.array(accepted_rows)
def set_summary_headers(self, chromosome_header, snp_header, base_position_header, p_value_header):
    """
    Validate which columns are the chromosome, snp, base position and p value.

    Each name provided is looked up in the decoded first row of the summary file. If all are present, the indexes
    are returned in the order of the header input arguments so they can be assigned and used to isolate values
    later.

    :param chromosome_header: chromosome header name
    :type chromosome_header: str

    :param snp_header: snp header name
    :type snp_header: str

    :param base_position_header: base position header name
    :type base_position_header: str

    :param p_value_header: p value header
    :type p_value_header: str

    :return: A list of [chromosome_header_index, snp_header_index, base_position_index, p_value_index]
    :rtype: list[int]

    :raises KeyError: If a header provided was not found in the decoded headers
    """
    # Decode the headers; leaving the with block closes the file, so no explicit close is needed
    with open_setter(self.summary_file)(self.summary_file) as file:
        decoded_headers = decode_line(file.readline(), self.zipped)

    header_indexes = []
    for header in [chromosome_header, snp_header, base_position_header, p_value_header]:
        try:
            # A single lookup both checks for presence and yields the position
            header_indexes.append(decoded_headers.index(header))
        except ValueError:
            # Callers expect a KeyError for a missing header, so hide the internal ValueError
            raise KeyError(f"{header} was not found in {decoded_headers}") from None
    return header_indexes
def _y_values_from_p(self, p_value_index, log_transform):
    """
    Create the sorted y values from the p value column of the summary file.

    :param p_value_index: The index of the p value column in the summary stat file
    :type p_value_index: int

    :param log_transform: If the p value is not in a -log 10 then it needs to be converted, otherwise this can be
        False
    :type log_transform: bool

    :return: The p values, -log 10 transformed when log_transform is set, sorted from smallest to largest
    :rtype: list

    :raises ValueError: If a p value cannot be parsed as a float, or cannot be log transformed (zero or negative).
        The message names the offending value, its row and the file.
    """
    log_v = []
    with open_setter(self.summary_file)(self.summary_file) as file:
        # Skip the header
        file.readline()

        for index, line_byte in enumerate(file):
            # Progress report every 100000 rows
            if index % 100000 == 0:
                self.logger.write(f"Processed {index} Lines")

            # Extract the p value
            raw_p_value = decode_line(line_byte, self.zipped)[p_value_index]
            try:
                p_value = float(raw_p_value)

                # Log transform the p value if required
                if log_transform:
                    p_value = -math.log10(p_value)
            except ValueError as err:
                # Same exception type as before, but now it says which row of which file is at fault
                raise ValueError(
                    f"Invalid p value {raw_p_value!r} on data row {index + 1} (header excluded) "
                    f"of {self.summary_file}: {err}"
                ) from err

            log_v.append(p_value)

    # Sort the values from smallest to largest
    return sorted(log_v)
def _isolate_from_line(self, line_byte):
    """
    Decode a raw line and pull out the four values used from it.

    :param line_byte: The raw line as read from the summary file
    :return: The values at self.chr_h, self.snp_h, self.bp_h and self.p_h, cast to int, str, int and float
        respectively
    :rtype: list
    """
    fields = decode_line(line_byte, self.zipped)

    # Cast in the same order as the values appear in the returned list
    chromosome = int(fields[self.chr_h])
    snp = str(fields[self.snp_h])
    base_position = int(fields[self.bp_h])
    p_value = float(fields[self.p_h])

    return [chromosome, snp, base_position, p_value]