def _find_ab_compartments(gamma, matrix, breaks, cmprtsec, save=True,
                          verbose=False):
    """
    Cluster compartment sections into two groups and label them 'A' / 'B'.

    The mean of ``matrix`` over every pair of sections is turned into a
    signed score, the sections are clustered hierarchically (Ward linkage)
    on those scores, and the cut of the dendrogram with the best positive
    Calinski-Harabasz score is kept. Only a cut giving exactly two clusters
    is accepted.

    :param gamma: value added to 1 to form the exponent applied to the mean
       value of each pair of sections.
    :param matrix: 2D container read as ``matrix[i][j]`` (presumably the
       correlation matrix -- confirm against the caller).
    :param breaks: only its length is used, to size the distance matrix; it
       has to be at least as long as ``cmprtsec``.
    :param cmprtsec: sequence of dictionaries with the keys 'start', 'end'
       and 'dens'. When ``save`` is true, a 'type' key ('A' or 'B') is
       written into each clustered dictionary.
    :param True save: store the assigned label in ``cmprtsec``.
    :param False verbose: print a one-line summary of the result.

    :returns: the tuple ``(score + tt, tt, prop)`` where ``tt`` is the
       t-statistic between the densities of A and B sections, ``prop`` the
       proportion of A sections (counting only sections longer than 2 bins)
       and ``score`` a penalty growing as ``prop`` moves away from 0.5.
       When no usable two-way split is found, ``(0, 0, 0, 0)`` is returned
       (note: four elements, not three).

    :raises KeyError: if both clusters fall on the same side of the density
       threshold (mean 'dens' < 1), as only one of the 'A' / 'B' labels is
       then defined.
    """
    # exponent used to convert correlation into distances
    gamma += 1

    def func(x):
        # keeps the sign of x: -|x|**gamma / x
        return -abs(x)**gamma / x

    # calculate distance_matrix
    size = len(breaks)
    dist_matrix = [[0 for _ in xrange(size)] for _ in xrange(size)]
    scores = {}
    for k, cmprt in enumerate(cmprtsec):
        beg1, end1 = cmprt['start'], cmprt['end']
        diff1 = end1 - beg1
        scores[(k, k)] = dist_matrix[k][k] = -1
        for l in xrange(k + 1, len(cmprtsec)):
            beg2, end2 = cmprtsec[l]['start'], cmprtsec[l]['end']
            # mean value of the rectangle defined by the two sections
            val = nansum([matrix[i][j]
                          for i in xrange(beg1, end1)
                          for j in xrange(beg2, end2)]) / (end2 - beg2) / diff1
            try:
                dist = func(val)
            except ZeroDivisionError:
                dist = 0.0
            # undefined values are treated as "no relation"
            if isnan(dist):
                dist = 0.0
            scores[(k, l)] = scores[(l, k)] = dist
            dist_matrix[k][l] = dist_matrix[l][k] = dist

    # cluster compartments according to their correlation score
    try:
        clust = linkage(dist_matrix, method='ward')
    except UnboundLocalError:
        print('WARNING: Chromosome probably too small. Skipping')
        warn('WARNING: Chromosome probably too small. Skipping')
        return (0, 0, 0, 0)

    # find best place to divide dendrogram: the heights of the last three
    # merges are tried as distance thresholds
    solutions = {}
    for k in clust[:, 2][-3:]:
        clusters = {}
        for i, j in enumerate(fcluster(clust, k, criterion='distance')):
            clusters.setdefault(j, []).append(i)
        solutions[k] = {'out': clusters}
        solutions[k]['score'] = calinski_harabasz(scores, clusters)

    # take best cluster according to calinski_harabasz score, among the
    # solutions with a strictly positive score
    ranked = sorted(solutions, key=lambda x: solutions[x]['score'])
    positive = [s for s in ranked if solutions[s]['score'] > 0]
    if not positive:
        # compartment clustering is not clear. Skipping
        return (0, 0, 0, 0)
    clusters = solutions[positive[-1]]['out']
    if len(clusters) != 2:
        # compartment clustering is too clear. Skipping
        return (0, 0, 0, 0)

    # labelling compartments. A compartments shall have lower
    # mean intra-interactions
    dens = {}
    for k in clusters:
        members = clusters[k]
        val = sum([cmprtsec[c]['dens'] for c in members]) / len(members)
        label = 'A' if val < 1 else 'B'
        # sections of 2 bins or less are left out of the statistics
        dens[label] = [cmprtsec[c]['dens'] for c in members
                       if cmprtsec[c]['end'] - cmprtsec[c]['start'] > 2]
        if save:
            for c in members:
                cmprtsec[c]['type'] = label

    try:
        tt, pval = ttest_ind(dens['A'], dens['B'])
        # inside the try: the denominator is zero when no section longer
        # than 2 bins is left in either cluster
        prop = float(len(dens['A'])) / (len(dens['A']) + len(dens['B']))
    except ZeroDivisionError:
        return (0, 0, 0, 0)

    # penalty to avoid having nearly all A or nearly all B
    score = 5000 * (prop - 0.5)**4 - 2
    if verbose:
        print('g:%5s %5s%% pen:%7s tt:%7s score:%7s pv:%s' % (
            gamma - 1, round(prop * 100, 1), round(score, 3),
            round(tt, 3), round(score + tt, 3), pval))
    return score + tt, tt, prop
def _find_ab_compartments(gamma, matrix, breaks, cmprtsec, save=True,
                          verbose=False):
    """
    Cluster compartment sections into two groups and label them 'A' / 'B'.

    The mean of ``matrix`` over every pair of sections is turned into a
    signed score, the sections are clustered hierarchically (Ward linkage)
    on those scores, and the cut of the dendrogram with the best positive
    Calinski-Harabasz score is kept. Only a cut giving exactly two clusters
    is accepted.

    :param gamma: value added to 1 to form the exponent applied to the mean
       value of each pair of sections.
    :param matrix: 2D container read as ``matrix[i][j]`` (presumably the
       correlation matrix -- confirm against the caller).
    :param breaks: only its length is used, to size the distance matrix; it
       has to be at least as long as ``cmprtsec``.
    :param cmprtsec: sequence of dictionaries with the keys 'start', 'end'
       and 'dens'. When ``save`` is true, a 'type' key ('A' or 'B') is
       written into each clustered dictionary.
    :param True save: store the assigned label in ``cmprtsec``.
    :param False verbose: print a one-line summary of the result.

    :returns: the tuple ``(score + tt, tt, prop)`` where ``tt`` is the
       t-statistic between the densities of A and B sections, ``prop`` the
       proportion of A sections (counting only sections longer than 2 bins)
       and ``score`` a penalty growing as ``prop`` moves away from 0.5.
       When no usable two-way split is found, ``(0, 0, 0, 0)`` is returned
       (note: four elements, not three).

    :raises KeyError: if both clusters fall on the same side of the density
       threshold (mean 'dens' < 1), as only one of the 'A' / 'B' labels is
       then defined.
    """
    # exponent used to convert correlation into distances
    gamma += 1

    def func(x):
        # keeps the sign of x: -|x|**gamma / x
        return -abs(x)**gamma / x

    # calculate distance_matrix
    size = len(breaks)
    dist_matrix = [[0 for _ in xrange(size)] for _ in xrange(size)]
    scores = {}
    for k, cmprt in enumerate(cmprtsec):
        beg1, end1 = cmprt['start'], cmprt['end']
        diff1 = end1 - beg1
        scores[(k, k)] = dist_matrix[k][k] = -1
        for l in xrange(k + 1, len(cmprtsec)):
            beg2, end2 = cmprtsec[l]['start'], cmprtsec[l]['end']
            # mean value of the rectangle defined by the two sections
            val = nansum([matrix[i][j]
                          for i in xrange(beg1, end1)
                          for j in xrange(beg2, end2)]) / (end2 - beg2) / diff1
            try:
                dist = func(val)
            except ZeroDivisionError:
                dist = 0.0
            # undefined values are treated as "no relation"
            if isnan(dist):
                dist = 0.0
            scores[(k, l)] = scores[(l, k)] = dist
            dist_matrix[k][l] = dist_matrix[l][k] = dist

    # cluster compartments according to their correlation score
    try:
        clust = linkage(dist_matrix, method='ward')
    except UnboundLocalError:
        print('WARNING: Chromosome probably too small. Skipping')
        warn('WARNING: Chromosome probably too small. Skipping')
        return (0, 0, 0, 0)

    # find best place to divide dendrogram: the heights of the last three
    # merges are tried as distance thresholds
    solutions = {}
    for k in clust[:, 2][-3:]:
        clusters = {}
        for i, j in enumerate(fcluster(clust, k, criterion='distance')):
            clusters.setdefault(j, []).append(i)
        solutions[k] = {'out': clusters}
        solutions[k]['score'] = calinski_harabasz(scores, clusters)

    # take best cluster according to calinski_harabasz score, among the
    # solutions with a strictly positive score
    ranked = sorted(solutions, key=lambda x: solutions[x]['score'])
    positive = [s for s in ranked if solutions[s]['score'] > 0]
    if not positive:
        # compartment clustering is not clear. Skipping
        return (0, 0, 0, 0)
    clusters = solutions[positive[-1]]['out']
    if len(clusters) != 2:
        # compartment clustering is too clear. Skipping
        return (0, 0, 0, 0)

    # labelling compartments. A compartments shall have lower
    # mean intra-interactions
    dens = {}
    for k in clusters:
        members = clusters[k]
        val = sum([cmprtsec[c]['dens'] for c in members]) / len(members)
        label = 'A' if val < 1 else 'B'
        # sections of 2 bins or less are left out of the statistics
        dens[label] = [cmprtsec[c]['dens'] for c in members
                       if cmprtsec[c]['end'] - cmprtsec[c]['start'] > 2]
        if save:
            for c in members:
                cmprtsec[c]['type'] = label

    try:
        tt, pval = ttest_ind(dens['A'], dens['B'])
        # inside the try: the denominator is zero when no section longer
        # than 2 bins is left in either cluster
        prop = float(len(dens['A'])) / (len(dens['A']) + len(dens['B']))
    except ZeroDivisionError:
        return (0, 0, 0, 0)

    # penalty to avoid having nearly all A or nearly all B
    score = 5000 * (prop - 0.5)**4 - 2
    if verbose:
        print('g:%5s %5s%% pen:%7s tt:%7s score:%7s pv:%s' % (
            gamma - 1, round(prop * 100, 1), round(score, 3),
            round(tt, 3), round(score + tt, 3), pval))
    return score + tt, tt, prop