Esempio n. 1
0
def correlateEigenvectorWithGeneTrack(pMatrix, pEigenvector, pGeneTrack):
    '''
    Orient eigenvectors so that they correlate positively with gene density.

    The gene track is read as BED intervals. Every interval located on a
    chromosome known to the matrix is counted once, in the bin given by the
    second element of pMatrix.getRegionBinRange(). For each chromosome and
    each eigenvector the Pearson correlation between the eigenvector values
    and the per-bin gene counts is computed; if it is negative, the values of
    that eigenvector on that chromosome are multiplied with -1.

    Parameters
    ----------
    pMatrix : Hi-C matrix object providing cut_intervals, getChrNames(),
        getRegionBinRange() and getChrBinRange().
    pEigenvector : array-like with one row per bin and one column per
        eigenvector. Only the real part is used.
    pGeneTrack : gene track in a form accepted by opener(); its content is
        parsed with ReadBed.

    Returns
    -------
    numpy array in the same layout as pEigenvector (bins x eigenvectors)
    with the signs adjusted per chromosome. The input is not modified.
    '''
    import warnings

    chromosome_list = pMatrix.getChrNames()
    # set gives O(1) membership tests while streaming over the BED file
    known_chromosomes = set(chromosome_list)

    gene_occurrence = np.zeros(len(pMatrix.cut_intervals))

    file_h = opener(pGeneTrack)
    try:
        for interval in ReadBed(file_h):
            if interval.chromosome not in known_chromosomes:
                continue
            # in which bin of the Hi-C matrix is the given gene?
            bin_id = pMatrix.getRegionBinRange(interval.chromosome,
                                               interval.start, interval.end)
            if bin_id is None:
                # the region could not be mapped to a bin; skip it instead
                # of failing with a TypeError on the index access below
                warnings.warn(
                    'Gene track region {} {} {} is not covered by the '
                    'matrix and is skipped.'.format(interval.chromosome,
                                                    interval.start,
                                                    interval.end))
                continue
            # add +1 for one gene occurrence in this bin
            gene_occurrence[bin_id[1]] += 1
    finally:
        # release the handle even if parsing the track fails
        file_h.close()

    # change from [[1,2], [3,4], [5,6]] to [[1,3,5],[2,4,6]]
    # np.array copies, so the caller's data stays untouched
    eigenvectors = np.array(pEigenvector).real.transpose()

    # correlate gene density and eigenvector values.
    # if positive correlation, do nothing, if negative, flip the values.
    # computed per chromosome
    for chromosome in chromosome_list:
        # where is the start and the end bin of a chromosome?
        bin_range = pMatrix.getChrBinRange(chromosome)
        first_bin, last_bin = bin_range[0], bin_range[1]
        if last_bin - first_bin < 2:
            # pearsonr needs at least two observations; the orientation of
            # such a chromosome cannot be determined and is left as is
            continue
        gene_density = gene_occurrence[first_bin:last_bin]
        for eigenvector in eigenvectors:
            correlation = pearsonr(eigenvector[first_bin:last_bin],
                                   gene_density)[0]
            # a NaN correlation (constant input) compares False: no flip
            if correlation < 0:
                eigenvector[first_bin:last_bin] = np.negative(
                    eigenvector[first_bin:last_bin])

    return np.array(eigenvectors).transpose()
Esempio n. 2
0
def _warn_chromosome_size_mismatch(pChromosomeName, pChromosomeSize,
                                   pInterval):
    '''Log that a gene track interval does not fit the matrix chromosome.'''
    log.warning(
        'Your chromosome sizes do not match the chromosome sizes of the extraTrack data!'
    )
    log.warning(
        'Your chromosome {}; Size {}. ExtraTrack data {} {} {}'.format(
            pChromosomeName, pChromosomeSize, pInterval.chromosome,
            pInterval.start, pInterval.end))
    log.warning(
        'Please create your interaction matrix with a chromosome size file! However, if the sizes are intended and it is accepted that certain regions are not part of the correlation, you can ignore this message.'
    )


def correlateEigenvectorWithGeneTrack(pMatrix, pEigenvector, pGeneTrack):
    '''
    Orient eigenvectors so that they correlate positively with gene density.

    The gene track is read as BED intervals. Every interval located on a
    chromosome known to the matrix is counted once, in the bin given by the
    second element of pMatrix.getRegionBinRange(). Intervals that start
    beyond the chromosome size of the matrix, or for which no bin range is
    returned, are reported with a warning and skipped. For each chromosome
    and each eigenvector the Pearson correlation between the eigenvector
    values and the per-bin gene counts is computed; if it is negative, the
    values of that eigenvector on that chromosome are multiplied with -1.

    Parameters
    ----------
    pMatrix : Hi-C matrix object providing cut_intervals, getChrNames(),
        get_chromosome_sizes(), getRegionBinRange() and getChrBinRange().
    pEigenvector : array-like with one row per bin and one column per
        eigenvector. Only the real part is used.
    pGeneTrack : gene track in a form accepted by opener(); its content is
        parsed with ReadBed.

    Returns
    -------
    numpy array in the same layout as pEigenvector (bins x eigenvectors)
    with the signs adjusted per chromosome. The input is not modified.
    '''
    log.debug('correlate eigenvector!')

    chromosome_list = pMatrix.getChrNames()
    # set gives O(1) membership tests while streaming over the BED file
    known_chromosomes = set(chromosome_list)
    # loop invariant: query the sizes once instead of once per interval
    chromosome_sizes = pMatrix.get_chromosome_sizes()

    gene_occurrence = np.zeros(len(pMatrix.cut_intervals))

    file_h = opener(pGeneTrack)
    try:
        for interval in ReadBed(file_h):
            chromosome_name = interval.chromosome
            if chromosome_name not in known_chromosomes:
                continue
            chromosome_size = chromosome_sizes[chromosome_name]
            if interval.start > chromosome_size:
                _warn_chromosome_size_mismatch(chromosome_name,
                                               chromosome_size, interval)
                continue
            # in which bin of the Hi-C matrix is the given gene?
            bin_id = pMatrix.getRegionBinRange(interval.chromosome,
                                               interval.start, interval.end)
            if bin_id is None:
                _warn_chromosome_size_mismatch(chromosome_name,
                                               chromosome_size, interval)
                continue
            # add +1 for one gene occurrence in this bin
            gene_occurrence[bin_id[1]] += 1
    finally:
        # release the handle even if parsing the track fails
        file_h.close()

    # change from [[1,2], [3,4], [5,6]] to [[1,3,5],[2,4,6]]
    # np.array copies, so the caller's data stays untouched
    eigenvectors = np.array(pEigenvector).real.transpose()

    # correlate gene density and eigenvector values.
    # if positive correlation, do nothing, if negative, flip the values.
    # computed per chromosome
    for chromosome in chromosome_list:
        # where is the start and the end bin of a chromosome?
        bin_range = pMatrix.getChrBinRange(chromosome)
        first_bin, last_bin = bin_range[0], bin_range[1]
        if last_bin - first_bin < 2:
            # pearsonr needs at least two observations; the orientation of
            # such a chromosome cannot be determined and is left as is
            continue
        gene_density = gene_occurrence[first_bin:last_bin]
        for eigenvector in eigenvectors:
            correlation = pearsonr(eigenvector[first_bin:last_bin],
                                   gene_density)[0]
            # a NaN correlation (constant input) compares False: no flip
            if correlation < 0:
                eigenvector[first_bin:last_bin] = np.negative(
                    eigenvector[first_bin:last_bin])

    return np.array(eigenvectors).transpose()