def calc_metric_full(self):
    """Compute the metric over the whole SNP range and return ``self.metric``.

    Partitions whose successor starts at or before ``self.snp_first`` are
    pre-read first; every remaining partition is then read and processed in
    turn, and ``flat.delete_loci_smaller_than`` is called after each one.

    For every locus visited, each matrix entry whose key lies above the
    current breakpoint contributes its squared correlation coefficient
    (``shrink`` divided by the square root of the product of the two
    diagonal ``shrink`` values) to ``self.metric['sum']`` and is counted in
    ``self.metric['N_nonzero']``.  ``self.metric['N_zero']`` is built up with
    the deferred-sum scheme noted inline.

    Side effects: sets ``self.start_locus``, ``self.start_locus_index``,
    ``self.end_locus``, ``self.end_locus_index`` and
    ``self.calculation_complete``; prints three diagnostic lines to stdout.

    Returns:
        ``self.metric`` after it has been updated.

    Raises:
        Exception: if ``self.dynamic_delete`` is false, if ``locus_list`` is
            empty after a partition has been read, or if no locus
            ``>= self.snp_first`` is found in any partition.
    """
    if not self.dynamic_delete:
        raise Exception('Error: dynamic delete must be True for metric calculation!')

    flat.print_log_msg('Start metric')

    curr_breakpoint_index = 0
    block_width = 0
    total_N_SNPs = decimal.Decimal('0')
    block_width_sum = decimal.Decimal('0')

    # Bound before the loop so that a run which never finds a start locus
    # fails with an explicit error below instead of an UnboundLocalError.
    start_locus = None
    start_locus_index = None
    end_locus = None
    end_locus_index = None

    # Pre-read all relevant partitions at the beginning.
    last_p_num = -1
    for p_num_init in range(0, len(self.partitions) - 1):
        if self.snp_first >= self.partitions[p_num_init + 1][0]:
            flat.print_log_msg('Pre-reading partition: ' + str(self.partitions[p_num_init]))
            flat.read_partition_into_matrix(
                self.partitions, p_num_init, self.matrix, self.locus_list,
                self.name, self.input_config, self.snp_first, self.snp_last)
            last_p_num = p_num_init
        else:
            break

    curr_locus = -1  # -1 means "start locus not found yet"
    for p_num in range(last_p_num + 1, len(self.partitions)):
        p = self.partitions[p_num]

        flat.print_log_msg('Reading partition: ' + str(p))
        flat.read_partition_into_matrix(
            self.partitions, p_num, self.matrix, self.locus_list,
            self.name, self.input_config, self.snp_first, self.snp_last)

        # Determine first locus
        if curr_locus < 0:
            # Either first partition or not found in an earlier partition.
            if len(self.locus_list) > 0:
                # Find first locus >= snp_first
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_first:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        else:
            # curr_locus is carried over from the previous iteration, but its
            # index has changed since part of the matrix (and locus_list)
            # has been deleted.
            try:
                curr_locus_index = self.locus_list.index(curr_locus)
            except ValueError:
                if len(self.locus_list) > 0:
                    curr_locus = self.locus_list[0]
                    curr_locus_index = 0
                else:
                    raise Exception('Error: locus_list seems to be empty')

        if curr_locus < 0:
            flat.print_log_msg('Warning: curr_locus not found! Continuing to next partition.')
            flat.print_log_msg('Comment: This is possibly due to snp_first being very close to end of partition.')
            flat.print_log_msg('Details: ')
            flat.print_log_msg('Partition: ' + repr(p))
            flat.print_log_msg('snp_first: ' + repr(self.snp_first))
            flat.print_log_msg('curr_locus: ' + repr(curr_locus))
            continue  # continue to next partition

        # Determine last locus
        if p_num + 1 < len(self.partitions):
            end_locus = self.partitions[p_num + 1][0]
            end_locus_index = -1
        else:
            # Last partition: find last locus <= snp_last, falling back to
            # the first locus in the list when none qualifies.
            for i in reversed(range(0, len(self.locus_list))):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    break
            else:
                end_locus_index = 0
                end_locus = self.locus_list[end_locus_index]

        flat.print_log_msg('Running metric for partition: ' + str(p))
        # This will not include the very last SNP of the complete range, but
        # that shouldn't be too important since the end of the range
        # shouldn't be a defining location for LD.
        while curr_locus <= end_locus:
            if curr_breakpoint_index < len(self.breakpoints):
                # Breakpoint is the last element of the block!
                if curr_locus > self.breakpoints[curr_breakpoint_index]:
                    # This is in accordance with the formula for deferred
                    # sum calculation (total_N_SNPs * block_width_sum is
                    # added back once at the very end).
                    block_height = 0 - total_N_SNPs
                    self.metric['N_zero'] += block_height * block_width
                    block_width_sum += block_width
                    curr_breakpoint_index += 1
                    block_width = 0

            if curr_breakpoint_index >= len(self.breakpoints):
                break

            try:
                for key, el in self.matrix[curr_locus]['data'].items():
                    # Only add those above the breakpoint!
                    if key > self.breakpoints[curr_breakpoint_index]:
                        corr_coeff = self.matrix[curr_locus]['data'][key]['shrink'] / math.sqrt(
                            self.matrix[curr_locus]['data'][curr_locus]['shrink']
                            * self.matrix[key]['data'][key]['shrink'])
                        self.metric['sum'] += decimal.Decimal(corr_coeff**2)
                        self.metric['N_nonzero'] += 1
            except IndexError as e:
                # Best-effort diagnostics; processing continues afterwards.
                print('Error!')
                print(e)
                print(key, el)
                print(curr_locus)
                print(self.matrix)
                print(self.breakpoints)
                print(curr_breakpoint_index)

            # block_width needs to be increased even if the locus doesn't
            # have values in the outer part of the matrix!
            block_width += 1

            if curr_locus_index + 1 < len(self.locus_list):
                curr_locus_index += 1
                curr_locus = self.locus_list[curr_locus_index]
                total_N_SNPs += 1
            else:
                flat.print_log_msg('curr_locus_index out of bounds')
                break

        # (An earlier, now removed, variant handled an LD block left
        # unfinished at a partition boundary at this point.)
        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            flat.delete_loci_smaller_than(end_locus, self.matrix, self.locus_list, self.locus_list_deleted)

    if start_locus is None:
        raise Exception('Error: no locus >= snp_first found in any partition')

    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index

    # This is in accordance with the formula for deferred sum calculation.
    self.metric['N_zero'] += total_N_SNPs * block_width_sum

    print('total_N_SNPs, block_width', total_N_SNPs, block_width)
    print('total_N_SNPs-block_width', total_N_SNPs-block_width)
    print('block_width_sum', block_width_sum)

    self.calculation_complete = True

    return self.metric
def calc_vert(self, dynamic_delete=True, sum_both_sides=True):
    """Deprecated: always raises before doing any work.

    The first statement raises unconditionally, so everything after it is
    unreachable and is kept only as a reference implementation.

    As written, the unreachable code reads each partition in turn and, for
    every locus up to (but excluding) the partition's end locus, computes the
    correlation coefficient of each matrix entry, passes it to
    ``self.add_corr_coeff`` for the current locus (and for ``key`` as well
    when ``sum_both_sides`` is true) and stores it under ``'corr_coeff'`` in
    the matrix.

    Args:
        dynamic_delete: would be stored on ``self.dynamic_delete``; when
            true, ``flat.delete_loci_smaller_than`` is called after each
            partition.
        sum_both_sides: when true, the coefficient is also added for ``key``.

    Raises:
        Exception: always ('calc_vert is deprecated - check code before
            running!').
    """
    # flat.print_log_msg('Removing existing matrix output file')
    # try:
    #     os.remove(cnst.const['out_matrix_delim'])
    # except OSError:
    #     pass

    raise Exception('calc_vert is deprecated - check code before running!')

    # ---- Unreachable from here on; kept for reference only. ----
    self.dynamic_delete = dynamic_delete

    flat.print_log_msg('Start')

    for p_num, p in enumerate(self.partitions):
        flat.print_log_msg('Reading partition: ' + str(p))
        flat.read_partition_into_matrix(self.partitions, p_num, self.matrix,
                                        self.locus_list, self.name,
                                        self.input_config, self.snp_first,
                                        self.snp_last)

        # Determine first locus
        curr_locus = -1  # reset for every partition; -1 means "not found"
        if p_num == 0:
            if len(self.locus_list) > 0:
                # Find first locus >= snp_first
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_first:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        else:
            # Later partitions start from the first locus still in the list.
            if len(self.locus_list) > 0:
                curr_locus = self.locus_list[0]
                curr_locus_index = 0
            else:
                raise Exception('Error: locus_list seems to be empty')

        if curr_locus < 0:
            raise Exception('Error: curr_locus not found!')

        if p_num + 1 < len(self.partitions):
            # Not the last partition: stop at the start of the next one.
            end_locus = self.partitions[p_num + 1][0]
            end_locus_index = -1
        else:
            # end_locus = partitions[p_num][1]
            # Find last locus <= snp_last
            # NOTE(review): end_locus stays unbound if no locus qualifies.
            for i in reversed(range(0, len(self.locus_list))):
                # for locus in reversed(locus_list):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    break

        flat.print_log_msg('Running for partition: ' + str(p))
        # This will not include the very last SNP of the complete range, but
        # that shouldn't be too important since the end of the range
        # shouldn't be a defining location for LD
        while curr_locus < end_locus:
            for key, el in self.matrix[curr_locus]['data'].items():
                # shrink value normalised by the two diagonal shrink values
                corr_coeff = self.matrix[curr_locus]['data'][key][
                    'shrink'] / math.sqrt(
                        self.matrix[curr_locus]['data'][curr_locus]
                        ['shrink'] * self.matrix[key]['data'][key]['shrink'])
                self.add_corr_coeff(corr_coeff, curr_locus)
                if sum_both_sides:
                    self.add_corr_coeff(corr_coeff, key)

                # Just save it in the matrix ;)
                self.matrix[curr_locus]['data'][key][
                    'corr_coeff'] = corr_coeff

            if curr_locus_index + 1 < len(self.locus_list):
                curr_locus_index += 1
                curr_locus = self.locus_list[curr_locus_index]
            else:
                flat.print_log_msg('curr_locus_index out of bounds')
                break

        # flat.delete_loci_smaller_than_and_output_matrix_to_file(end_locus, self.matrix, locus_list, locus_list_deleted, cnst.const['out_matrix_filename'])
        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            flat.delete_loci_smaller_than(end_locus, self.matrix,
                                          self.locus_list,
                                          self.locus_list_deleted)

    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index

    self.calculation_complete = True
def calc_diag(self, dynamic_delete=True):
    """Accumulate correlation coefficients by expanding outward from each locus.

    Partitions whose successor starts at or before ``self.snp_first`` are
    pre-read first; the remaining partitions are then read and processed in
    turn.  For each current locus at index ``c`` in ``self.locus_list`` the
    inner loop visits the pairs ``(locus_list[c - d], locus_list[c + d])``
    and, for ``d != 0``, ``(locus_list[c - d + 1], locus_list[c + d])``, for
    increasing ``d`` while ``x`` stays at or above the partition start and
    ``y`` at or below the partition end.  For every pair present in
    ``self.matrix`` the correlation coefficient (``shrink`` divided by the
    square root of the product of the two diagonal ``shrink`` values) is
    passed to ``self.add_corr_coeff`` for the current locus and stored under
    ``'corr_coeff'`` in the matrix.

    Side effects: sets ``self.dynamic_delete``, ``self.start_locus``,
    ``self.start_locus_index``, ``self.end_locus``, ``self.end_locus_index``
    and ``self.calculation_complete``.

    Args:
        dynamic_delete: stored on ``self.dynamic_delete``; when true,
            ``flat.delete_loci_smaller_than`` is called after each
            partition, otherwise the size of ``locus_list`` is logged.

    Raises:
        Exception: if ``locus_list`` is empty after a partition has been
            read, or if no locus ``>= self.snp_first`` is found in any
            partition.
    """
    self.dynamic_delete = dynamic_delete

    flat.print_log_msg('Start')

    # Bound before the loop so that a run which never finds a start locus
    # fails with an explicit error below instead of an UnboundLocalError.
    # end_locus_index is only assigned while handling the last partition.
    start_locus = None
    start_locus_index = None
    end_locus = None
    end_locus_index = None

    # Pre-read all relevant partitions at the beginning.
    last_p_num = -1
    for p_num_init in range(0, len(self.partitions) - 1):
        if self.snp_first >= self.partitions[p_num_init + 1][0]:
            flat.print_log_msg('Pre-reading partition: ' + str(self.partitions[p_num_init]))
            flat.read_partition_into_matrix(
                self.partitions, p_num_init, self.matrix, self.locus_list,
                self.name, self.input_config, self.snp_first, self.snp_last)
            last_p_num = p_num_init
        else:
            break

    curr_locus = -1  # -1 means "start locus not found yet"
    for p_num in range(last_p_num + 1, len(self.partitions)):
        p = self.partitions[p_num]

        flat.print_log_msg('Reading partition: ' + str(p))
        flat.read_partition_into_matrix(
            self.partitions, p_num, self.matrix, self.locus_list,
            self.name, self.input_config, self.snp_first, self.snp_last)

        # Determine first locus
        if curr_locus < 0:
            # Either first partition or not found in an earlier partition.
            if len(self.locus_list) > 0:
                # Find first locus >= snp_first
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_first:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        else:
            # curr_locus is carried over from the previous iteration, but its
            # index has changed since part of the matrix (and locus_list)
            # has been deleted.
            try:
                curr_locus_index = self.locus_list.index(curr_locus)
            except ValueError:
                if len(self.locus_list) > 0:
                    curr_locus = self.locus_list[0]
                    curr_locus_index = 0
                else:
                    raise Exception('Error: locus_list seems to be empty')

        if curr_locus < 0:
            flat.print_log_msg('Warning: curr_locus not found! Continuing to next partition.')
            flat.print_log_msg('Comment: This is possibly due to snp_first being very close to end of partition.')
            flat.print_log_msg('Details: ')
            flat.print_log_msg('Partition: ' + repr(p))
            flat.print_log_msg('snp_first: ' + repr(self.snp_first))
            flat.print_log_msg('curr_locus: ' + repr(curr_locus))
            continue  # continue to next partition

        # Determine end locus
        if p_num + 1 < len(self.partitions):
            # Midpoint between the end of this partition and the start of
            # the next one.
            end_locus = int((self.partitions[p_num][1] + self.partitions[p_num + 1][0]) / 2)
        else:
            # Last partition: find last locus <= snp_last, falling back to
            # the first locus in the list when none qualifies.
            for i in reversed(range(0, len(self.locus_list))):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    break
            else:
                end_locus_index = 0
                end_locus = self.locus_list[end_locus_index]

        flat.print_log_msg('Running for partition: ' + str(p))
        # This will not include the very last SNP of the complete range, but
        # that shouldn't be too important since the end of the range
        # shouldn't be a defining location for LD.
        while curr_locus <= end_locus:
            x = self.locus_list[curr_locus_index]
            y = self.locus_list[curr_locus_index]
            delta = 0

            while x >= self.partitions[p_num][0] and y <= self.partitions[p_num][1]:
                if x in self.matrix and y in self.matrix[x]['data']:
                    corr_coeff = self.matrix[x]['data'][y]['shrink'] / math.sqrt(
                        self.matrix[x]['data'][x]['shrink']
                        * self.matrix[y]['data'][y]['shrink'])
                    self.add_corr_coeff(corr_coeff, curr_locus)
                    # Just save it in the matrix ;) ...for img
                    self.matrix[x]['data'][y]['corr_coeff'] = corr_coeff

                if delta != 0:
                    # Also visit the pair one step closer on the x side.
                    x = self.locus_list[curr_locus_index - delta + 1]
                    if x in self.matrix and y in self.matrix[x]['data']:
                        corr_coeff = self.matrix[x]['data'][y]['shrink'] / math.sqrt(
                            self.matrix[x]['data'][x]['shrink']
                            * self.matrix[y]['data'][y]['shrink'])
                        self.add_corr_coeff(corr_coeff, curr_locus)
                        # Just save it in the matrix ;) ...for img
                        self.matrix[x]['data'][y]['corr_coeff'] = corr_coeff

                delta += 1

                if curr_locus_index - delta >= 0:
                    x = self.locus_list[curr_locus_index - delta]
                else:
                    break  # x index out of bounds

                if curr_locus_index + delta < len(self.locus_list):
                    y = self.locus_list[curr_locus_index + delta]
                else:
                    break  # y index out of bounds

            if curr_locus_index + 1 < len(self.locus_list):
                curr_locus_index += 1
                curr_locus = self.locus_list[curr_locus_index]
            else:
                flat.print_log_msg('curr_locus_index out of bounds')
                break

        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            if p_num + 1 < len(self.partitions):
                # diag-specific: delete up to the start of the next
                # partition rather than up to end_locus.
                delete_loc = self.partitions[p_num + 1][0]
            else:
                delete_loc = end_locus
            flat.delete_loci_smaller_than(delete_loc, self.matrix, self.locus_list, self.locus_list_deleted)
        else:
            flat.print_log_msg('locus_list size: ' + repr(len(self.locus_list)))

    if start_locus is None:
        raise Exception('Error: no locus >= snp_first found in any partition')

    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index

    self.calculation_complete = True
def init_search_full(self):
    """Fill the precomputed structures used by the local search.

    Partitions whose successor starts at or before ``self.snp_bottom`` are
    pre-read first; the remaining partitions are then read (bounded by
    ``snp_bottom`` and ``snp_top``) and processed in turn, and
    ``flat.delete_loci_smaller_than`` is called after each one.

    Every locus visited is registered with ``self.add_locus_to_precomputed``.
    If the locus is ``<= self.snp_last`` and either ``> self.snp_first`` or
    ``self.initial_breakpoint_index == 0``, each of its matrix entries with
    ``key <= self.snp_top`` contributes its squared correlation coefficient
    through ``self.add_val_to_precomputed``; otherwise a single zero dummy
    value is stored for the locus.

    Side effects: sets ``self.start_locus``, ``self.start_locus_index``,
    ``self.end_locus``, ``self.end_locus_index`` and ``self.init_complete``.

    Raises:
        Exception: if ``self.dynamic_delete`` is false, if ``locus_list`` is
            empty after a partition has been read, or if no locus
            ``>= self.snp_bottom`` is found in any partition.
    """
    if not self.dynamic_delete:
        raise Exception('Error: dynamic_delete should be True for local search!')

    flat.print_log_msg('Start local search init')

    # Bound before the loop so that a run which never finds a start locus
    # fails with an explicit error below instead of an UnboundLocalError.
    start_locus = None
    start_locus_index = None
    end_locus = None
    end_locus_index = None

    # Pre-read all relevant partitions at the beginning.
    last_p_num = -1
    for p_num_init in range(0, len(self.partitions) - 1):
        if self.snp_bottom >= self.partitions[p_num_init + 1][0]:
            flat.print_log_msg('Pre-reading partition: ' + str(self.partitions[p_num_init]))
            flat.read_partition_into_matrix(
                self.partitions, p_num_init, self.matrix, self.locus_list,
                self.name, self.input_config, self.snp_bottom, self.snp_top)
            last_p_num = p_num_init
        else:
            break

    curr_locus = -1  # -1 means "start locus not found yet"
    for p_num in range(last_p_num + 1, len(self.partitions)):
        p = self.partitions[p_num]

        flat.print_log_msg('Reading partition: ' + str(p))
        # Data must be read until snp_top!
        flat.read_partition_into_matrix(
            self.partitions, p_num, self.matrix, self.locus_list,
            self.name, self.input_config, self.snp_bottom, self.snp_top)

        # Determine first locus
        if curr_locus < 0:
            # Either first partition or not found in an earlier partition.
            if len(self.locus_list) > 0:
                # Find first locus >= snp_bottom
                for i, locus in enumerate(self.locus_list):
                    if locus >= self.snp_bottom:
                        curr_locus = locus
                        start_locus = locus
                        curr_locus_index = i
                        start_locus_index = i
                        break
            else:
                raise Exception('Error: locus_list seems to be empty')
        else:
            # curr_locus is carried over from the previous iteration, but its
            # index has changed since part of the matrix (and locus_list)
            # has been deleted.
            try:
                curr_locus_index = self.locus_list.index(curr_locus)
            except ValueError:
                if len(self.locus_list) > 0:
                    curr_locus = self.locus_list[0]
                    curr_locus_index = 0
                else:
                    raise Exception('Error: locus_list seems to be empty')

        if curr_locus < 0:
            flat.print_log_msg('Warning: curr_locus not found! Continuing to next partition.')
            flat.print_log_msg('Comment: This is possibly due to snp_bottom being very close to end of partition.')
            flat.print_log_msg('Details: ')
            flat.print_log_msg('Partition: ' + repr(p))
            flat.print_log_msg('snp_bottom: ' + repr(self.snp_bottom))
            flat.print_log_msg('curr_locus: ' + repr(curr_locus))
            continue  # continue to next partition

        if p_num + 1 < len(self.partitions):
            end_locus = self.partitions[p_num + 1][0]
            end_locus_index = -1
        else:
            # Last partition: find last locus <= snp_last, falling back to
            # the first locus in the list when none qualifies.
            # NOTE(review): partitions are read up to snp_top, but this
            # search is bounded by snp_last - confirm that is intended.
            for i in reversed(range(0, len(self.locus_list))):
                if self.locus_list[i] <= self.snp_last:
                    end_locus = self.locus_list[i]
                    end_locus_index = i
                    break
            else:
                end_locus_index = 0
                end_locus = self.locus_list[end_locus_index]

        flat.print_log_msg('Running precompute for partition: ' + str(p))
        flat.print_log_msg('start_locus: ' + repr(start_locus) + ' end_locus: ' + repr(end_locus) + ' end_locus_index ' + repr(end_locus_index))
        # This will not include the very last SNP of the complete range, but
        # that shouldn't be too important since the end of the range
        # shouldn't be a defining location for LD.
        while curr_locus <= end_locus:
            # We want snp_bottom to be added here always (for later use).
            # Same thing for snp_top.
            self.add_locus_to_precomputed(curr_locus)

            # Do not include snp_first in the calculation unless the very
            # first block is being taken into account. Do not calculate
            # anything above snp_last, just insert dummies.
            if (curr_locus > self.snp_first or self.initial_breakpoint_index == 0) and (curr_locus <= self.snp_last):
                for key, el in self.matrix[curr_locus]['data'].items():
                    # Don't take into account anything over snp_top.
                    if key <= self.snp_top:
                        corr_coeff = el['shrink'] / math.sqrt(
                            self.matrix[curr_locus]['data'][curr_locus]['shrink']
                            * self.matrix[key]['data'][key]['shrink'])
                        # The diagonal is included on purpose: per the
                        # original note it is later added and subtracted
                        # exactly once, so it does not matter.
                        self.add_val_to_precomputed(decimal.Decimal(corr_coeff**2), curr_locus, key)
            else:
                # Dummy value for snp_first, in order to be consistent for
                # some other future use of these data structures.
                self.add_val_to_precomputed(decimal.Decimal(0), curr_locus, curr_locus)

            if curr_locus_index + 1 < len(self.locus_list):
                curr_locus_index += 1
                curr_locus = self.locus_list[curr_locus_index]
            else:
                # The possibility of this happening is only at the end of
                # the range [usually chromosome] (end of last partition).
                flat.print_log_msg('curr_locus_index out of bounds')
                break

        if self.dynamic_delete:
            flat.print_log_msg('Deleting loci not required any more')
            flat.delete_loci_smaller_than(end_locus, self.matrix, self.locus_list, self.locus_list_deleted)

    if start_locus is None:
        raise Exception('Error: no locus >= snp_bottom found in any partition')

    self.start_locus = start_locus
    self.start_locus_index = start_locus_index
    self.end_locus = end_locus
    self.end_locus_index = end_locus_index

    self.init_complete = True