def calc_metric_full(self):
        """Compute the breakpoint metric over the whole SNP range and return it.

        Partitions are read into ``self.matrix`` one after another. Starting at
        the first locus >= ``self.snp_first``, loci are visited in
        ``self.locus_list`` order. For every matrix entry of the current locus
        whose key lies above the current breakpoint, the squared correlation
        coefficient is added to ``self.metric['sum']`` and
        ``self.metric['N_nonzero']`` is incremented.

        ``self.metric['N_zero']`` is built as a deferred sum: whenever a block
        closes, ``-total_N_SNPs * block_width`` is added, and once all
        partitions are processed ``total_N_SNPs * block_width_sum`` is added.
        Each closed block therefore contributes
        ``(final total - total at close) * block_width``.

        Side effects: sets ``start_locus``, ``start_locus_index``,
        ``end_locus``, ``end_locus_index`` and ``calculation_complete`` on
        ``self``, prints three diagnostic lines, and deletes loci that are no
        longer needed after each partition.

        Returns:
            ``self.metric``, updated in place.

        Raises:
            Exception: if ``self.dynamic_delete`` is false, if
                ``self.locus_list`` is found empty while locating the current
                locus, or if no locus >= ``snp_first`` exists in any partition.
        """
        if not self.dynamic_delete:
            raise Exception('Error: dynamic delete must be True for metric calculation!')

        flat.print_log_msg('Start metric')

        curr_breakpoint_index = 0
        block_height = 0
        block_width = 0

        total_N_SNPs = decimal.Decimal('0')
        block_width_sum = decimal.Decimal('0')

        # Bound up front so that "nothing was found" is reported explicitly
        # after the loop instead of surfacing as an UnboundLocalError.
        start_locus = None
        start_locus_index = None
        end_locus = None
        end_locus_index = None

        # Bound up front so the diagnostic handler in the inner loop can always
        # print them, even if the failure happens before the first entry.
        key = el = None

        # Pre-read every partition that lies entirely before snp_first.
        last_p_num = -1
        for p_num_init in range(0, len(self.partitions)-1):
            if self.snp_first >= self.partitions[p_num_init+1][0]:
                flat.print_log_msg('Pre-reading partition: '+str(self.partitions[p_num_init]))
                flat.read_partition_into_matrix(self.partitions, p_num_init, self.matrix, self.locus_list, self.name, self.input_config, self.snp_first, self.snp_last)
                last_p_num = p_num_init
            else:
                break

        curr_locus = -1  # stays negative until the first locus >= snp_first is found
        for p_num in range(last_p_num+1, len(self.partitions)):
            p = self.partitions[p_num]

            flat.print_log_msg('Reading partition: '+str(p))
            flat.read_partition_into_matrix(self.partitions, p_num, self.matrix, self.locus_list, self.name, self.input_config, self.snp_first, self.snp_last)

            # Determine first locus
            if curr_locus < 0:  # Either first partition or not found in an earlier partition
                if len(self.locus_list) > 0:
                    # Find first locus >= snp_first
                    for i, locus in enumerate(self.locus_list):
                        if locus >= self.snp_first:
                            curr_locus = locus
                            start_locus = locus
                            curr_locus_index = i
                            start_locus_index = i
                            break
                else:
                    raise Exception('Error: locus_list seems to be empty')
            else:
                # curr_locus is carried over from the previous partition, but its
                # index has changed because part of locus_list has been deleted.
                try:
                    curr_locus_index = self.locus_list.index(curr_locus)
                except ValueError:
                    if len(self.locus_list) > 0:
                        curr_locus = self.locus_list[0]
                        curr_locus_index = 0
                    else:
                        raise Exception('Error: locus_list seems to be empty')

            if curr_locus < 0:
                flat.print_log_msg('Warning: curr_locus not found! Continuing to next partition.')
                flat.print_log_msg('Comment: This is possibly due to snp_first being very close to end of partition.')
                flat.print_log_msg('Details: ')
                flat.print_log_msg('Partition: '+repr(p))
                flat.print_log_msg('snp_first: '+repr(self.snp_first))
                flat.print_log_msg('curr_locus: '+repr(curr_locus))
                continue  # continue to next partition

            # Determine last locus
            if p_num+1 < len(self.partitions):
                # Not the last partition: stop where the next partition starts.
                end_locus = self.partitions[p_num+1][0]
                end_locus_index = -1
            else:
                # Last partition: find last locus <= snp_last
                end_locus_found = False
                for i in reversed(range(0, len(self.locus_list))):
                    if self.locus_list[i] <= self.snp_last:
                        end_locus = self.locus_list[i]
                        end_locus_index = i
                        end_locus_found = True
                        break

                if not end_locus_found:
                    end_locus_index = 0
                    end_locus = self.locus_list[end_locus_index]

            flat.print_log_msg('Running metric for partition: '+str(p))
            # This will not include the very last SNP of the complete range, but that
            # shouldn't be too important since the end of the range shouldn't be a
            # defining location for LD
            while curr_locus <= end_locus:
                if curr_breakpoint_index < len(self.breakpoints):
                    if curr_locus > self.breakpoints[curr_breakpoint_index]:  # Breakpoint is the last element of the block!
                        # Negative height now; the matching positive term is added
                        # after the loop (deferred sum calculation).
                        block_height = 0 - total_N_SNPs
                        self.metric['N_zero'] += block_height * block_width
                        block_width_sum += block_width

                        curr_breakpoint_index += 1
                        block_width = 0

                if curr_breakpoint_index >= len(self.breakpoints):
                    break

                try:
                    for key, el in self.matrix[curr_locus]['data'].items():
                        if key > self.breakpoints[curr_breakpoint_index]:  # Only add those above the breakpoint!
                            corr_coeff = self.matrix[curr_locus]['data'][key]['shrink'] / math.sqrt( self.matrix[curr_locus]['data'][curr_locus]['shrink'] * self.matrix[key]['data'][key]['shrink'] )
                            self.metric['sum'] += decimal.Decimal(corr_coeff**2)
                            self.metric['N_nonzero'] += 1
                except IndexError as e:
                    # Best-effort diagnostics; processing continues afterwards.
                    print('Error!')
                    print(e)
                    print(key, el)
                    print(curr_locus)
                    print(self.matrix)
                    print(self.breakpoints)
                    print(curr_breakpoint_index)

                # block_width needs to be increased even if the locus has no values
                # in the outer part of the matrix!
                block_width += 1

                if curr_locus_index+1 < len(self.locus_list):
                    curr_locus_index += 1
                    curr_locus = self.locus_list[curr_locus_index]
                    total_N_SNPs += 1
                else:
                    flat.print_log_msg('curr_locus_index out of bounds')
                    break

            if self.dynamic_delete:
                flat.print_log_msg('Deleting loci not required any more')
                flat.delete_loci_smaller_than(end_locus, self.matrix, self.locus_list, self.locus_list_deleted)

        if start_locus is None:
            # Every partition was skipped (or there were none), so there is
            # nothing meaningful to report.
            raise Exception('Error: no locus >= snp_first found in any partition!')

        self.start_locus = start_locus
        self.start_locus_index = start_locus_index
        self.end_locus = end_locus
        self.end_locus_index = end_locus_index

        self.metric['N_zero'] += total_N_SNPs * block_width_sum  # this is in accordance with the formula for deferred sum calculation

        print('total_N_SNPs, block_width', total_N_SNPs, block_width)
        print('total_N_SNPs-block_width', total_N_SNPs-block_width)
        print('block_width_sum', block_width_sum)

        self.calculation_complete = True

        return self.metric
Esempio n. 2
0
    def calc_vert(self, dynamic_delete=True, sum_both_sides=True):
        """Deprecated vertical accumulation pass; calling it always raises.

        The guard at the top fires unconditionally, so nothing below it is
        ever executed. The remaining body is retained only as a reference for
        anyone who wants to revive the method.

        Args:
            dynamic_delete: would be stored on ``self.dynamic_delete``.
            sum_both_sides: would also credit each coefficient to the column
                locus, not only to the row locus.

        Raises:
            Exception: always, with a deprecation message.
        """
        raise Exception('calc_vert is deprecated - check code before running!')

        # ------------------------------------------------------------------
        # Unreachable reference implementation from here on.
        # ------------------------------------------------------------------
        self.dynamic_delete = dynamic_delete
        flat.print_log_msg('Start')

        for p_num, partition in enumerate(self.partitions):
            flat.print_log_msg('Reading partition: ' + str(partition))
            flat.read_partition_into_matrix(
                self.partitions, p_num, self.matrix, self.locus_list,
                self.name, self.input_config, self.snp_first, self.snp_last)

            # An empty locus list is an error for the first and for every
            # later partition alike.
            if len(self.locus_list) == 0:
                raise Exception('Error: locus_list seems to be empty')

            # Pick the locus to start from.
            curr_locus = -1
            if p_num == 0:
                # First partition: first locus >= snp_first.
                for idx, candidate in enumerate(self.locus_list):
                    if candidate >= self.snp_first:
                        curr_locus = start_locus = candidate
                        curr_locus_index = start_locus_index = idx
                        break
            else:
                curr_locus_index = 0
                curr_locus = self.locus_list[curr_locus_index]

            if curr_locus < 0:
                raise Exception('Error: curr_locus not found!')

            # Pick the locus to stop at.
            if p_num + 1 >= len(self.partitions):
                # Last partition: last locus <= snp_last.
                for idx in range(len(self.locus_list) - 1, -1, -1):
                    if self.locus_list[idx] <= self.snp_last:
                        end_locus_index = idx
                        end_locus = self.locus_list[idx]
                        break
            else:
                end_locus = self.partitions[p_num + 1][0]
                end_locus_index = -1

            flat.print_log_msg('Running for partition: ' + str(partition))
            # The very last SNP of the complete range is not included; the end
            # of the range should not be a defining location for LD anyway.
            while curr_locus < end_locus:
                row = self.matrix[curr_locus]['data']
                for key, entry in row.items():
                    denominator = math.sqrt(
                        row[curr_locus]['shrink'] *
                        self.matrix[key]['data'][key]['shrink'])
                    corr_coeff = entry['shrink'] / denominator

                    self.add_corr_coeff(corr_coeff, curr_locus)
                    if sum_both_sides:
                        self.add_corr_coeff(corr_coeff, key)

                    # Keep the coefficient in the matrix as well.
                    entry['corr_coeff'] = corr_coeff

                next_index = curr_locus_index + 1
                if next_index >= len(self.locus_list):
                    flat.print_log_msg('curr_locus_index out of bounds')
                    break
                curr_locus_index = next_index
                curr_locus = self.locus_list[curr_locus_index]

            if self.dynamic_delete:
                flat.print_log_msg('Deleting loci not required any more')
                flat.delete_loci_smaller_than(
                    end_locus, self.matrix, self.locus_list,
                    self.locus_list_deleted)

        self.start_locus, self.start_locus_index = start_locus, start_locus_index
        self.end_locus, self.end_locus_index = end_locus, end_locus_index

        self.calculation_complete = True
Esempio n. 3
0
    def calc_diag(self, dynamic_delete=True):
        """Accumulate correlation coefficients along the diagonals of the matrix.

        Partitions are read into ``self.matrix`` one after another. For each
        locus at index ``c`` of ``self.locus_list`` (starting at the first
        locus >= ``self.snp_first``), pairs ``(locus_list[c - d],
        locus_list[c + d])`` and, for ``d > 0``, ``(locus_list[c - d + 1],
        locus_list[c + d])`` are visited for growing ``d`` while the pair stays
        inside the current partition's ``[start, end]`` bounds. Each pair that
        is present in the matrix has its correlation coefficient passed to
        ``self.add_corr_coeff(coefficient, locus)`` and stored under the
        ``'corr_coeff'`` key of that matrix entry.

        For every partition but the last, processing stops at the midpoint
        between this partition's end and the next partition's start; for the
        last one it stops at the last locus <= ``self.snp_last``.

        Side effects: sets ``dynamic_delete``, ``start_locus``,
        ``start_locus_index``, ``end_locus``, ``end_locus_index`` and
        ``calculation_complete`` on ``self``.

        Args:
            dynamic_delete: when true, loci below the next partition's start
                are deleted after each partition; otherwise only the current
                ``locus_list`` size is logged.

        Raises:
            Exception: if ``self.locus_list`` is found empty while locating the
                current locus, or if no locus >= ``snp_first`` exists in any
                partition.
        """
        self.dynamic_delete = dynamic_delete

        flat.print_log_msg('Start')

        def accumulate_pair(x, y, locus):
            # Credit the (x, y) coefficient to `locus` if that pair is stored.
            if x in self.matrix and y in self.matrix[x]['data']:
                corr_coeff = self.matrix[x]['data'][y]['shrink'] / math.sqrt(
                    self.matrix[x]['data'][x]['shrink'] *
                    self.matrix[y]['data'][y]['shrink'])
                self.add_corr_coeff(corr_coeff, locus)

                # Also kept in the matrix ("for img", per the original note).
                self.matrix[x]['data'][y]['corr_coeff'] = corr_coeff

        # Bound up front so that "nothing was found" is reported explicitly
        # after the loop instead of surfacing as an UnboundLocalError.
        start_locus = None
        start_locus_index = None
        end_locus = None
        end_locus_index = None

        # Pre-read every partition that lies entirely before snp_first.
        last_p_num = -1
        for p_num_init in range(0, len(self.partitions) - 1):
            if self.snp_first >= self.partitions[p_num_init + 1][0]:
                flat.print_log_msg('Pre-reading partition: ' +
                                   str(self.partitions[p_num_init]))
                flat.read_partition_into_matrix(self.partitions, p_num_init,
                                                self.matrix, self.locus_list,
                                                self.name, self.input_config,
                                                self.snp_first, self.snp_last)
                last_p_num = p_num_init
            else:
                break

        curr_locus = -1  # stays negative until the first locus >= snp_first is found
        for p_num in range(last_p_num + 1, len(self.partitions)):
            p = self.partitions[p_num]

            flat.print_log_msg('Reading partition: ' + str(p))
            flat.read_partition_into_matrix(self.partitions, p_num,
                                            self.matrix, self.locus_list,
                                            self.name, self.input_config,
                                            self.snp_first, self.snp_last)

            # Determine first locus
            if curr_locus < 0:  # Either first partition or not found in an earlier partition
                if len(self.locus_list) > 0:
                    # Find first locus >= snp_first
                    for i, locus in enumerate(self.locus_list):
                        if locus >= self.snp_first:
                            curr_locus = locus
                            start_locus = locus
                            curr_locus_index = i
                            start_locus_index = i
                            break
                else:
                    raise Exception('Error: locus_list seems to be empty')
            else:
                # curr_locus is carried over from the previous partition, but its
                # index has changed because part of locus_list has been deleted.
                try:
                    curr_locus_index = self.locus_list.index(curr_locus)
                except ValueError:
                    if len(self.locus_list) > 0:
                        curr_locus = self.locus_list[0]
                        curr_locus_index = 0
                    else:
                        raise Exception('Error: locus_list seems to be empty')

            if curr_locus < 0:
                flat.print_log_msg(
                    'Warning: curr_locus not found! Continuing to next partition.'
                )
                flat.print_log_msg(
                    'Comment: This is possibly due to snp_first being very close to end of partition.'
                )
                flat.print_log_msg('Details: ')
                flat.print_log_msg('Partition: ' + repr(p))
                flat.print_log_msg('snp_first: ' + repr(self.snp_first))
                flat.print_log_msg('curr_locus: ' + repr(curr_locus))
                continue  # continue to next partition

            # Determine end locus
            if p_num + 1 < len(self.partitions):
                # Midpoint between this partition's end and the next one's start.
                end_locus = int(
                    (self.partitions[p_num][1] + self.partitions[p_num + 1][0])
                    / 2)
                # No list index corresponds to a midpoint; -1 matches the
                # placeholder the sibling methods use for non-final partitions.
                end_locus_index = -1
            else:
                # Last partition: find last locus <= snp_last
                end_locus_found = False
                for i in reversed(range(0, len(self.locus_list))):
                    if self.locus_list[i] <= self.snp_last:
                        end_locus = self.locus_list[i]
                        end_locus_index = i
                        end_locus_found = True
                        break

                if not end_locus_found:
                    end_locus_index = 0
                    end_locus = self.locus_list[end_locus_index]

            flat.print_log_msg('Running for partition: ' + str(p))
            # This will not include the very last SNP of the complete range, but that
            # shouldn't be too important since the end of the range shouldn't be a
            # defining location for LD
            while curr_locus <= end_locus:
                x = self.locus_list[curr_locus_index]
                y = self.locus_list[curr_locus_index]
                delta = 0

                while x >= self.partitions[p_num][0] and y <= self.partitions[
                        p_num][1]:
                    accumulate_pair(x, y, curr_locus)

                    if delta != 0:
                        # Neighbouring pair one step closer to the diagonal on
                        # the x side.
                        accumulate_pair(
                            self.locus_list[curr_locus_index - delta + 1], y,
                            curr_locus)

                    delta += 1
                    if curr_locus_index - delta >= 0:
                        x = self.locus_list[curr_locus_index - delta]
                    else:
                        break  # x index out of bounds

                    if curr_locus_index + delta < len(self.locus_list):
                        y = self.locus_list[curr_locus_index + delta]
                    else:
                        break  # y index out of bounds

                if curr_locus_index + 1 < len(self.locus_list):
                    curr_locus_index += 1
                    curr_locus = self.locus_list[curr_locus_index]
                else:
                    flat.print_log_msg('curr_locus_index out of bounds')
                    break

            if self.dynamic_delete:
                flat.print_log_msg('Deleting loci not required any more')
                if p_num + 1 < len(self.partitions):
                    # diag-specific: delete up to the next partition's start,
                    # not up to the midpoint used as end_locus.
                    delete_loc = self.partitions[p_num + 1][0]
                else:
                    delete_loc = end_locus

                flat.delete_loci_smaller_than(delete_loc, self.matrix,
                                              self.locus_list,
                                              self.locus_list_deleted)
            else:
                flat.print_log_msg('locus_list size: ' +
                                   repr(len(self.locus_list)))

        if start_locus is None:
            # Every partition was skipped (or there were none), so there is
            # nothing meaningful to report.
            raise Exception('Error: no locus >= snp_first found in any partition!')

        self.start_locus = start_locus
        self.start_locus_index = start_locus_index
        self.end_locus = end_locus
        self.end_locus_index = end_locus_index

        self.calculation_complete = True
    def init_search_full(self):
        """Fill the precomputed structures used by the local search.

        Partitions are read into ``self.matrix`` one after another, with data
        read between ``self.snp_bottom`` and ``self.snp_top``. Starting at the
        first locus >= ``snp_bottom``, every visited locus is registered with
        ``self.add_locus_to_precomputed``. Then:

        * if the locus is <= ``snp_last`` and either > ``snp_first`` or
          ``self.initial_breakpoint_index == 0``, the squared correlation
          coefficient of every matrix entry with key <= ``snp_top`` is passed
          to ``self.add_val_to_precomputed(value, locus, key)``;
        * otherwise a single dummy ``Decimal(0)`` is stored for
          ``(locus, locus)``.

        Side effects: sets ``start_locus``, ``start_locus_index``,
        ``end_locus``, ``end_locus_index`` and ``init_complete`` on ``self``,
        and deletes loci that are no longer needed after each partition.

        Raises:
            Exception: if ``self.dynamic_delete`` is false, if
                ``self.locus_list`` is found empty while locating the current
                locus, or if no locus >= ``snp_bottom`` exists in any
                partition.
        """
        if not self.dynamic_delete:
            raise Exception(
                'Error: dynamic_delete should be True for local search!')

        flat.print_log_msg('Start local search init')

        # Bound up front so that "nothing was found" is reported explicitly
        # after the loop instead of surfacing as an UnboundLocalError.
        start_locus = None
        start_locus_index = None
        end_locus = None
        end_locus_index = None

        # Pre-read every partition that lies entirely before snp_bottom.
        last_p_num = -1
        for p_num_init in range(0, len(self.partitions) - 1):
            if self.snp_bottom >= self.partitions[p_num_init + 1][0]:
                flat.print_log_msg('Pre-reading partition: ' +
                                   str(self.partitions[p_num_init]))
                flat.read_partition_into_matrix(self.partitions, p_num_init,
                                                self.matrix, self.locus_list,
                                                self.name, self.input_config,
                                                self.snp_bottom, self.snp_top)
                last_p_num = p_num_init
            else:
                break

        curr_locus = -1  # stays negative until the first locus >= snp_bottom is found
        for p_num in range(last_p_num + 1, len(self.partitions)):
            p = self.partitions[p_num]

            flat.print_log_msg('Reading partition: ' + str(p))
            # Data must be read until snp_top!
            flat.read_partition_into_matrix(self.partitions, p_num,
                                            self.matrix, self.locus_list,
                                            self.name, self.input_config,
                                            self.snp_bottom, self.snp_top)

            # Determine first locus
            if curr_locus < 0:  # Either first partition or not found in an earlier partition
                if len(self.locus_list) > 0:
                    # Find first locus >= snp_bottom
                    for i, locus in enumerate(self.locus_list):
                        if locus >= self.snp_bottom:
                            curr_locus = locus
                            start_locus = locus
                            curr_locus_index = i
                            start_locus_index = i
                            break
                else:
                    raise Exception('Error: locus_list seems to be empty')
            else:
                # curr_locus is carried over from the previous partition, but its
                # index has changed because part of locus_list has been deleted.
                try:
                    curr_locus_index = self.locus_list.index(curr_locus)
                except ValueError:
                    if len(self.locus_list) > 0:
                        curr_locus = self.locus_list[0]
                        curr_locus_index = 0
                    else:
                        raise Exception('Error: locus_list seems to be empty')

            if curr_locus < 0:
                flat.print_log_msg(
                    'Warning: curr_locus not found! Continuing to next partition.'
                )
                flat.print_log_msg(
                    'Comment: This is possibly due to snp_bottom being very close to end of partition.'
                )
                flat.print_log_msg('Details: ')
                flat.print_log_msg('Partition: ' + repr(p))
                flat.print_log_msg('snp_bottom: ' + repr(self.snp_bottom))
                flat.print_log_msg('curr_locus: ' + repr(curr_locus))
                continue  # continue to next partition

            # Determine last locus
            if p_num + 1 < len(self.partitions):
                # Not the last partition: stop where the next partition starts.
                end_locus = self.partitions[p_num + 1][0]
                end_locus_index = -1
            else:
                # Last partition: find last locus <= snp_last.
                # NOTE(review): the data is read up to snp_top, yet the stop
                # point uses snp_last - confirm this is intended.
                end_locus_found = False
                for i in reversed(range(0, len(self.locus_list))):
                    if self.locus_list[i] <= self.snp_last:
                        end_locus = self.locus_list[i]
                        end_locus_index = i
                        end_locus_found = True
                        break

                if not end_locus_found:
                    end_locus_index = 0
                    end_locus = self.locus_list[end_locus_index]

            flat.print_log_msg('Running precompute for partition: ' + str(p))

            flat.print_log_msg('start_locus: ' + repr(start_locus) +
                               ' end_locus: ' + repr(end_locus) +
                               ' end_locus_index ' + repr(end_locus_index))
            # This will not include the very last SNP of the complete range, but that
            # shouldn't be too important since the end of the range shouldn't be a
            # defining location for LD
            while curr_locus <= end_locus:
                # Every visited locus is registered, including snp_bottom and
                # snp_top, for later use.
                self.add_locus_to_precomputed(curr_locus)

                # Do not include snp_first in the calculation unless the very first
                # block is being taken into account. Do not calculate anything above
                # snp_last, just insert dummies.
                if (
                        curr_locus > self.snp_first
                        or self.initial_breakpoint_index == 0
                ) and (
                        curr_locus <= self.snp_last
                ):
                    for key, el in self.matrix[curr_locus]['data'].items():
                        # don't take into account anything over snp_top
                        if key <= self.snp_top:
                            corr_coeff = self.matrix[curr_locus]['data'][key][
                                'shrink'] / math.sqrt(
                                    self.matrix[curr_locus]['data'][curr_locus]
                                    ['shrink'] *
                                    self.matrix[key]['data'][key]['shrink'])

                            # The diagonal entry is included on purpose; per the
                            # original author it is added and subtracted exactly
                            # once later on, so it cancels out.
                            self.add_val_to_precomputed(
                                decimal.Decimal(corr_coeff**2), curr_locus, key
                            )
                else:
                    # Dummy value, to keep the data structures consistent for
                    # other uses.
                    self.add_val_to_precomputed(
                        decimal.Decimal(0), curr_locus, curr_locus
                    )

                if curr_locus_index + 1 < len(self.locus_list):
                    curr_locus_index += 1
                    curr_locus = self.locus_list[curr_locus_index]
                else:
                    # Only expected at the end of the range (end of the last
                    # partition).
                    flat.print_log_msg(
                        'curr_locus_index out of bounds'
                    )
                    break

            if self.dynamic_delete:
                flat.print_log_msg('Deleting loci not required any more')
                flat.delete_loci_smaller_than(end_locus, self.matrix,
                                              self.locus_list,
                                              self.locus_list_deleted)

        if start_locus is None:
            # Every partition was skipped (or there were none), so there is
            # nothing meaningful to report.
            raise Exception('Error: no locus >= snp_bottom found in any partition!')

        self.start_locus = start_locus
        self.start_locus_index = start_locus_index
        self.end_locus = end_locus
        self.end_locus_index = end_locus_index

        self.init_complete = True