Beispiel #1
0
def _find_ab_compartments(gamma, matrix, breaks, cmprtsec, save=True, verbose=False):
    """
    Cluster compartment sections into two groups and label them 'A' / 'B'.

    The mean of `matrix` over every pair of sections is turned into a signed
    distance, the sections are clustered hierarchically (Ward linkage), and
    the best cut of the dendrogram is kept only if it yields exactly two
    clusters. Each cluster is then labelled from its mean 'dens' value.

    :param gamma: exponent parameter of the correlation-to-distance
       transform (the exponent actually used is ``gamma + 1``)
    :param matrix: object indexed as ``matrix[i][j]``
       (presumably a correlation matrix -- confirm against the caller)
    :param breaks: only its length is used, to size the distance matrix
    :param cmprtsec: sequence of dicts with 'start', 'end' and 'dens' keys;
       a 'type' key is written into each of them when `save` is True
    :param True save: store the 'A'/'B' label in ``cmprtsec[c]['type']``
    :param False verbose: print a one-line summary of the result

    :returns: ``(score + tt, tt, prop)`` on success, where `tt` is the
       t-statistic between the 'dens' values of both groups, `prop` the
       proportion of 'A' sections and `score` a penalty computed from `prop`.
       Returns ``(0, 0, 0, 0)`` whenever the clustering is skipped.
       NOTE(review): the skip value has four items and the success value
       three; left as is because callers may depend on it.
    """
    gamma += 1

    def to_distance(val):
        # signed power transform; an undefined result (division by zero or
        # NaN) is mapped to a neutral distance of 0.0
        try:
            dist = -abs(val) ** gamma / val
        except ZeroDivisionError:
            return 0.0
        return 0.0 if isnan(dist) else dist

    # calculate distance_matrix
    size = len(breaks)
    dist_matrix = [[0 for _ in range(size)] for _ in range(size)]
    scores = {}
    for k, cmprt in enumerate(cmprtsec):
        beg1, end1 = cmprt['start'], cmprt['end']
        diff1 = end1 - beg1
        scores[(k, k)] = dist_matrix[k][k] = -1
        for l in range(k + 1, len(cmprtsec)):
            beg2, end2 = cmprtsec[l]['start'], cmprtsec[l]['end']
            # mean value of the sub-matrix spanned by both sections
            val = nansum([matrix[i][j] for i in range(beg1, end1)
                          for j in range(beg2, end2)]) / (end2 - beg2) / diff1
            dist = to_distance(val)
            scores[(k, l)] = scores[(l, k)] = dist
            dist_matrix[k][l] = dist_matrix[l][k] = dist
    # cluster compartments according to their correlation score
    try:
        clust = linkage(dist_matrix, method='ward')
    except UnboundLocalError:
        print('WARNING: Chromosome probably too small. Skipping')
        warn('WARNING: Chromosome probably too small. Skipping')
        return (0, 0, 0, 0)
    # find best place to divide dendrogram: try the last three merge heights
    solutions = {}
    for cutoff in clust[:, 2][-3:]:
        clusters = {}
        for i, j in enumerate(fcluster(clust, cutoff, criterion='distance')):
            clusters.setdefault(j, []).append(i)
        solutions[cutoff] = {'out': clusters,
                             'score': calinski_harabasz(scores, clusters)}
    # take best cluster according to calinski_harabasz score
    ranked = sorted(solutions, key=lambda x: solutions[x]['score'])
    positive = [solutions[s] for s in ranked if solutions[s]['score'] > 0]
    if not positive:
        # compartment clustering is not clear
        return (0, 0, 0, 0)
    clusters = positive[-1]['out']
    if len(clusters) != 2:
        # compartment clustering is too clear
        return (0, 0, 0, 0)
    # labelling compartments. A compartments shall have lower
    # mean intra-interactions
    dens = {}
    for k in clusters:
        members = clusters[k]
        val = sum([cmprtsec[c]['dens'] for c in members]) / len(members)
        label = 'A' if val < 1 else 'B'
        # sections spanning 2 bins or fewer are left out of the statistics
        dens[label] = [cmprtsec[c]['dens'] for c in members
                       if cmprtsec[c]['end'] - cmprtsec[c]['start'] > 2]
        if save:
            for c in members:
                cmprtsec[c]['type'] = label
    if len(dens) != 2:
        # both clusters received the same label: there is nothing to compare
        # (this used to raise KeyError)
        return (0, 0, 0, 0)
    total = len(dens['A']) + len(dens['B'])
    if not total:
        # every section was filtered out (this used to raise
        # ZeroDivisionError when computing the proportion)
        return (0, 0, 0, 0)
    try:
        tt, pval = ttest_ind(dens['A'], dens['B'])
    except ZeroDivisionError:
        return (0, 0, 0, 0)
    prop = float(len(dens['A'])) / total
    # penalty growing with the 4th power of the distance to a 50/50 split
    score = 5000 * (prop - 0.5)**4 - 2
    if verbose:
        print('g:%5s %5s%% pen:%7s tt:%7s score:%7s pv:%s' % (
            gamma - 1, round(prop * 100, 1), round(score, 3), round(tt, 3),
            round(score + tt, 3), pval))
    return score + tt, tt, prop
Beispiel #2
0
def _find_ab_compartments(gamma,
                          matrix,
                          breaks,
                          cmprtsec,
                          save=True,
                          verbose=False):
    """
    Split compartment sections into two clusters and tag them 'A' or 'B'.

    Pairwise mean values of `matrix` between sections are converted into
    signed distances, clustered with Ward linkage, and the dendrogram cut
    with the best positive Calinski-Harabasz score is selected. The result
    is used only when that cut produces exactly two clusters, which are then
    labelled according to their mean 'dens' value.

    :param gamma: exponent parameter of the correlation-to-distance
       transform (``gamma + 1`` is the exponent actually applied)
    :param matrix: object indexed as ``matrix[i][j]``
       (presumably a correlation matrix -- confirm against the caller)
    :param breaks: only ``len(breaks)`` is used, to size the distance matrix
    :param cmprtsec: sequence of dicts providing 'start', 'end' and 'dens';
       when `save` is True a 'type' key is written into each of them
    :param True save: store the 'A'/'B' label in ``cmprtsec[c]['type']``
    :param False verbose: print a one-line summary of the result

    :returns: ``(score + tt, tt, prop)`` on success: `tt` is the t-statistic
       between the 'dens' values of the two groups, `prop` the proportion of
       'A' sections and `score` a penalty derived from `prop`.
       ``(0, 0, 0, 0)`` is returned whenever the clustering is skipped.
       NOTE(review): skip value has four items, success value has three;
       kept unchanged because callers may rely on it.
    """
    gamma += 1

    def to_distance(val):
        # signed power transform; division by zero and NaN results both
        # collapse to a neutral distance of 0.0
        try:
            dist = -abs(val)**gamma / val
        except ZeroDivisionError:
            return 0.0
        if isnan(dist):
            return 0.0
        return dist

    # calculate distance_matrix
    nbreaks = len(breaks)
    dist_matrix = [[0 for _ in range(nbreaks)] for _ in range(nbreaks)]
    scores = {}
    for k, cmprt in enumerate(cmprtsec):
        beg1, end1 = cmprt['start'], cmprt['end']
        diff1 = end1 - beg1
        scores[(k, k)] = dist_matrix[k][k] = -1
        for l in range(k + 1, len(cmprtsec)):
            beg2, end2 = cmprtsec[l]['start'], cmprtsec[l]['end']
            # mean value of the sub-matrix spanned by both sections
            val = nansum([
                matrix[i][j] for i in range(beg1, end1)
                for j in range(beg2, end2)
            ]) / (end2 - beg2) / diff1
            dist = to_distance(val)
            scores[(k, l)] = scores[(l, k)] = dist
            dist_matrix[k][l] = dist_matrix[l][k] = dist
    # cluster compartments according to their correlation score
    try:
        clust = linkage(dist_matrix, method='ward')
    except UnboundLocalError:
        print('WARNING: Chromosome probably too small. Skipping')
        warn('WARNING: Chromosome probably too small. Skipping')
        return (0, 0, 0, 0)
    # find best place to divide dendrogram: try the last three merge heights
    solutions = {}
    for height in clust[:, 2][-3:]:
        clusters = {}
        assignment = fcluster(clust, height, criterion='distance')
        for i, j in enumerate(assignment):
            clusters.setdefault(j, []).append(i)
        solutions[height] = {
            'out': clusters,
            'score': calinski_harabasz(scores, clusters),
        }
    # take best cluster according to calinski_harabasz score
    by_score = sorted(solutions, key=lambda x: solutions[x]['score'])
    usable = [solutions[s] for s in by_score if solutions[s]['score'] > 0]
    if not usable:
        # compartment clustering is not clear
        return (0, 0, 0, 0)
    clusters = usable[-1]['out']
    if len(clusters) != 2:
        # compartment clustering is too clear
        return (0, 0, 0, 0)
    # labelling compartments. A compartments shall have lower
    # mean intra-interactions
    dens = {}
    for k in clusters:
        members = clusters[k]
        val = sum([cmprtsec[c]['dens'] for c in members]) / len(members)
        label = 'A' if val < 1 else 'B'
        # sections spanning 2 bins or fewer are left out of the statistics
        dens[label] = [
            cmprtsec[c]['dens'] for c in members
            if cmprtsec[c]['end'] - cmprtsec[c]['start'] > 2
        ]
        if save:
            for c in members:
                cmprtsec[c]['type'] = label
    if 'A' not in dens or 'B' not in dens:
        # both clusters fell on the same side of the threshold, so there is
        # no second group to test against (this used to raise KeyError)
        return (0, 0, 0, 0)
    n_a, n_b = len(dens['A']), len(dens['B'])
    if n_a + n_b == 0:
        # every section was filtered out (this used to raise
        # ZeroDivisionError when computing the proportion)
        return (0, 0, 0, 0)
    try:
        tt, pval = ttest_ind(dens['A'], dens['B'])
    except ZeroDivisionError:
        return (0, 0, 0, 0)
    prop = float(n_a) / (n_a + n_b)
    # penalty growing with the 4th power of the distance to a 50/50 split
    score = 5000 * (prop - 0.5)**4 - 2
    if verbose:
        print('g:%5s %5s%% pen:%7s tt:%7s score:%7s pv:%s' % (
            gamma - 1, round(prop * 100, 1), round(score, 3), round(
                tt, 3), round(score + tt, 3), pval))
    return score + tt, tt, prop