def correlateEigenvectorWithGeneTrack(pMatrix, pEigenvector, pGeneTrack):
    '''
    Orient the eigenvectors, chromosome by chromosome, so that they do not
    correlate negatively with the gene density.

    For every chromosome the Pearson correlation between each eigenvector and
    the per-bin gene count is computed. If the correlation is negative, the
    eigenvector values of that chromosome are multiplied with -1.

    :param pMatrix: Hi-C matrix object. It must provide ``cut_intervals``,
                    ``getChrNames()``, ``getRegionBinRange(chrom, start, end)``
                    and ``getChrBinRange(chrom)``.
    :param pEigenvector: array-like with one row per bin and one column per
                         eigenvector, e.g. [[1, 2], [3, 4], [5, 6]].
                         Only the real part is used.
    :param pGeneTrack: gene track handed to ``opener`` and parsed by
                       ``ReadBed``.
    :return: numpy array in the same layout as ``pEigenvector`` (real part),
             with the sign flipped per chromosome where required.
    '''
    chromosome_list = pMatrix.getChrNames()
    # membership is tested once per BED record; a set avoids a list scan
    known_chromosomes = set(chromosome_list)
    gene_occurrence = np.zeros(len(pMatrix.cut_intervals))

    # opener() is expected to return a file-like object; close it even if
    # parsing the track fails, the original code leaked the handle.
    file_h = opener(pGeneTrack)
    try:
        for interval in ReadBed(file_h):
            if interval.chromosome not in known_chromosomes:
                continue
            # in which bin of the Hi-C matrix is the given gene?
            bin_id = pMatrix.getRegionBinRange(interval.chromosome,
                                               interval.start, interval.end)
            # add +1 for one gene occurrence; only the bin at index
            # bin_id[1] is counted.
            # NOTE(review): presumably the bin holding the interval end -
            # confirm against getRegionBinRange.
            gene_occurrence[bin_id[1]] += 1
    finally:
        file_h.close()

    # change from [[1,2], [3,4], [5,6]] to [[1,3,5],[2,4,6]], i.e. one row
    # per eigenvector. np.array() copies, so the caller's data is untouched.
    pEigenvector = np.array(pEigenvector).real.transpose()

    # correlate gene density and eigenvector values.
    # if positive correlation, do nothing, if negative, flip the values.
    # computed per chromosome
    for chromosome in chromosome_list:
        # where is the start and the end bin of a chromosome?
        bin_id = pMatrix.getChrBinRange(chromosome)
        chromosome_bins = slice(bin_id[0], bin_id[1])
        gene_density = gene_occurrence[chromosome_bins]
        if len(gene_density) < 2:
            # pearsonr needs at least two observations; a correlation is not
            # defined here, so the orientation is left as it is.
            continue
        for eigenvector in pEigenvector:
            # each row is a view, the assignment below updates pEigenvector
            correlation = pearsonr(eigenvector[chromosome_bins],
                                   gene_density)[0]
            # an undefined (nan) correlation compares False and is not flipped
            if correlation < 0:
                eigenvector[chromosome_bins] = np.negative(
                    eigenvector[chromosome_bins])

    return np.array(pEigenvector).transpose()
def correlateEigenvectorWithGeneTrack(pMatrix, pEigenvector, pGeneTrack):
    '''
    Orient the eigenvectors, chromosome by chromosome, so that they do not
    correlate negatively with the gene density.

    For every chromosome the Pearson correlation between each eigenvector and
    the per-bin gene count is computed. If the correlation is negative, the
    eigenvector values of that chromosome are multiplied with -1.

    Track records that start beyond the chromosome size known to the matrix,
    or for which no bin range is found, are reported with a warning and
    excluded from the gene count.

    :param pMatrix: Hi-C matrix object. It must provide ``cut_intervals``,
                    ``getChrNames()``, ``get_chromosome_sizes()``,
                    ``getRegionBinRange(chrom, start, end)`` and
                    ``getChrBinRange(chrom)``.
    :param pEigenvector: array-like with one row per bin and one column per
                         eigenvector, e.g. [[1, 2], [3, 4], [5, 6]].
                         Only the real part is used.
    :param pGeneTrack: gene track handed to ``opener`` and parsed by
                       ``ReadBed``.
    :return: numpy array in the same layout as ``pEigenvector`` (real part),
             with the sign flipped per chromosome where required.
    '''
    log.debug('correlate eigenvector!')

    chromosome_list = pMatrix.getChrNames()
    # membership is tested once per BED record; a set avoids a list scan
    known_chromosomes = set(chromosome_list)
    # looked up once instead of once per BED record
    chromosome_sizes = pMatrix.get_chromosome_sizes()
    gene_occurrence = np.zeros(len(pMatrix.cut_intervals))

    def _warn_size_mismatch(pInterval):
        # Report a track record that does not fit into the matrix.
        log.warning(
            'Your chromosome sizes do not match the chromosome sizes of the '
            'extraTrack data!'
        )
        log.warning(
            'Your chromosome {}; Size {}. ExtraTrack data {} {} {}'.format(
                pInterval.chromosome, chromosome_sizes[pInterval.chromosome],
                pInterval.chromosome, pInterval.start, pInterval.end))
        log.warning(
            'Please create your interaction matrix with a chromosome size file! '
            'However, if the sizes are intended and it is accepted that certain '
            'regions are not part of the correlation, you can ignore this message.'
        )

    # opener() is expected to return a file-like object; close it even if
    # parsing the track fails, the original code leaked the handle.
    file_h = opener(pGeneTrack)
    try:
        for interval in ReadBed(file_h):
            chromosome_name = interval.chromosome
            if chromosome_name not in known_chromosomes:
                continue
            if interval.start > chromosome_sizes[chromosome_name]:
                _warn_size_mismatch(interval)
                continue
            # in which bin of the Hi-C matrix is the given gene?
            bin_id = pMatrix.getRegionBinRange(chromosome_name,
                                               interval.start, interval.end)
            if bin_id is None:
                _warn_size_mismatch(interval)
                continue
            # add +1 for one gene occurrence; only the bin at index
            # bin_id[1] is counted.
            # NOTE(review): presumably the bin holding the interval end -
            # confirm against getRegionBinRange.
            gene_occurrence[bin_id[1]] += 1
    finally:
        file_h.close()

    # change from [[1,2], [3,4], [5,6]] to [[1,3,5],[2,4,6]], i.e. one row
    # per eigenvector. np.array() copies, so the caller's data is untouched.
    pEigenvector = np.array(pEigenvector).real.transpose()

    # correlate gene density and eigenvector values.
    # if positive correlation, do nothing, if negative, flip the values.
    # computed per chromosome
    for chromosome in chromosome_list:
        # where is the start and the end bin of a chromosome?
        bin_id = pMatrix.getChrBinRange(chromosome)
        chromosome_bins = slice(bin_id[0], bin_id[1])
        gene_density = gene_occurrence[chromosome_bins]
        if len(gene_density) < 2:
            # pearsonr needs at least two observations; a correlation is not
            # defined here, so the orientation is left as it is.
            continue
        for eigenvector in pEigenvector:
            # each row is a view, the assignment below updates pEigenvector
            correlation = pearsonr(eigenvector[chromosome_bins],
                                   gene_density)[0]
            # an undefined (nan) correlation compares False and is not flipped
            if correlation < 0:
                eigenvector[chromosome_bins] = np.negative(
                    eigenvector[chromosome_bins])

    return np.array(pEigenvector).transpose()