Example #1
0
def condition_clustering(distance_name, folder='/media/lalil0u/New/projects/drug_screen/results/', color_gradient='YlOrRd',
                         hit_only=False, compare_to='MITO',
                         level_row=0.4, level_column=0.5, show=False,
                         filename='Clusters_{}_{}.pkl',
                         # to avoid reloading distance files each time
                         distances=None, all_exposures=None,
                         row_method='ward'
                         ):
    '''
    Cluster conditions (exposures) on the median distance of their experiments.

    The hit list is read from 'DS_hits_1.5IQR.pkl' in `folder`. An exposure is
    called "distinct" when its number of hits divided by
    PASSED_QC_COND[exposure] is strictly greater than 0.5.

    - distance_name: key identifying the distance; forwarded to
        _return_right_distance and used to look up the plot title in
        drug_screen_utils.DISTANCES.
    - folder: results folder holding the hit pickle; the heatmap and the
        cluster pickle go to its 'inference_Freplicates' sub-folder.
    - color_gradient, level_row, level_column, show, row_method: forwarded to
        hierarchical_clustering.heatmap.
    - compare_to: We can do drug clustering based on their distances to each
        other (value 'DS') but we can also do drug clustering based on their
        distances to Mitocheck (value 'MITO').
    - hit_only: cluster the distinct hit exposures only (True) or every
        exposure (False).
    - filename: template of the output pickle, formatted with
        (distance_name, level_row); only written when hit_only is True.
    - distances, all_exposures: optional pre-computed median distance matrix
        and its row labels, to avoid reloading the distance files. They must
        be passed together; when `distances` is None both are recomputed.

    Returns (distances, all_exposures, who_cluster_hits):
    - distances: the matrix that was clustered, one row per condition.
    - all_exposures: the exposures described by who_cluster_hits. When
        hit_only is False this is the subset of the clustered exposures that
        are distinct hits, in the same order as the clustered rows.
    - who_cluster_hits: {cluster id: Counter of hit exposures in it} for ids
        1..max(clusters).
    '''
    # Binary mode is what pickle expects; the context manager closes the file
    # even if unpickling raises.
    with open(os.path.join(folder, 'DS_hits_1.5IQR.pkl'), 'rb') as f:
        _, _, exposure_hits = pickle.load(f)

    hit_counts = Counter(exposure_hits)
    hit_fraction = {el: hit_counts[el] / float(PASSED_QC_COND[el]) for el in hit_counts}
    distinct_exposure = [el for el in hit_fraction if hit_fraction[el] > 0.5]
    distinct_lookup = set(distinct_exposure)

    if distances is None:
        distances, _, exposure_, _ = _return_right_distance(distance_name, folder, filter_replicates=True,
                                                            hit_only=hit_only, compare_to=compare_to)

        if hit_only:
            all_exposures = sorted(distinct_exposure)
        else:
            all_exposures = sorted(set(exposure_))

        # One row per condition: median over that condition's experiments.
        # A list (not a generator) is required by current NumPy versions.
        distances = np.vstack([np.median(distances[np.where(exposure_ == condition)], 0)
                               for condition in all_exposures])

    if compare_to == 'MITO':
        column_header = list(range(distances.shape[1]))
    else:
        column_header = all_exposures
    print(distances.shape)

    flat_distances = distances.flatten()
    clusters = hierarchical_clustering.heatmap(distances, row_header=all_exposures, column_header=column_header,
                                    row_method=row_method, column_method='ward',
                                    row_metric='euclidean', column_metric='euclidean',
                                    color_gradient=color_gradient, filename="{}{}".format(int(hit_only), distance_name),
                                    folder='{}/inference_Freplicates'.format(folder),
                                    level_row=level_row, level_column=level_column,
                                    title=drug_screen_utils.DISTANCES[distance_name],
                                    colorbar_ticks=[-2, 0, 2],
                                    colorbar_ticklabels=[0, '', 1], show=show,
                                    colorbar_title='Distance (arbitrary units)',
                                    range_normalization=(scoreatpercentile(flat_distances, 10),
                                                         scoreatpercentile(flat_distances, per=90)))

    # NOTE(review): assumes `clusters` holds one label per row of `distances`,
    # in the order of `all_exposures` -- confirm against heatmap().
    global_ = np.bincount(clusters)
    if not hit_only:
        is_hit = np.array([el in distinct_lookup for el in all_exposures], dtype=bool)
        hit_clusters = clusters[is_hit]
        # Keep the names in the same (row) order as hit_clusters. Using
        # distinct_exposure directly would pair labels with names in dict order.
        all_exposures = [el for el, keep in zip(all_exposures, is_hit) if keep]
    else:
        hit_clusters = clusters

    print('{} {}'.format(len(hit_clusters), len(distinct_exposure)))
    print('{} {}'.format(global_, np.bincount(hit_clusters)))

    hit_names = np.array(all_exposures)
    who_cluster_hits = {k: Counter(hit_names[np.where(hit_clusters == k)])
                        for k in range(1, np.max(clusters) + 1)}

    if hit_only:
        out_path = os.path.join(folder, 'inference_Freplicates', filename.format(distance_name, level_row))
        with open(out_path, 'wb') as f:
            pickle.dump(who_cluster_hits, f)

    return distances, all_exposures, who_cluster_hits
Example #2
0
def experiment_clustering(distance_name, folder='/media/lalil0u/New/projects/drug_screen/results/', color_gradient='YlOrRd',
                         hit_only=False, compare_to='MITO',
                          level=0.4):
    '''
    Cluster individual experiments, as opposed to condition clustering.

    The hit list is read from 'DS_hits_1.5IQR.pkl' in `folder`. An exposure is
    called "distinct" when its number of hits divided by
    PASSED_QC_COND[exposure] is strictly greater than 0.5.

    - distance_name: key identifying the distance; forwarded to
        _return_right_distance and used to look up the plot title in
        drug_screen_utils.DISTANCES.
    - folder: results folder holding the hit pickle; the heatmap goes to its
        'inference_Freplicates' sub-folder.
    - color_gradient, level: forwarded to hierarchical_clustering.heatmap.
    - compare_to: We can do drug clustering based on their distances to each
        other (value 'DS') but we can also do drug clustering based on their
        distances to Mitocheck (value 'MITO').
    - hit_only: cluster only the hit experiments of distinct exposures (True)
        or every experiment (False).

    Returns (hit_clusters, who_cluster_hits, plate_clusters):
    - hit_clusters: cluster label of every hit experiment belonging to a
        distinct exposure.
    - who_cluster_hits: {cluster id: Counter of exposures} over those hits.
    - plate_clusters: {cluster id: Counter of plate numbers} over those hits.
    Both dictionaries have keys 1..max(clusters).
    '''
    distances, who_, exposure_, mito_who = _return_right_distance(distance_name, folder,
                                                                  filter_replicates=True,
                                                                  hit_only=hit_only, compare_to=compare_to)

    # Plate number is the second '_'-separated token of the part before '--'.
    plates = np.array([int(el.split('--')[0].split('_')[1]) for el in who_])
    exposure_wPL = np.array(['{}{:>10}'.format(exposure, plate) for exposure, plate in zip(exposure_, plates)])

    # Binary mode is what pickle expects; the context manager closes the file
    # even if unpickling raises.
    with open(os.path.join(folder, 'DS_hits_1.5IQR.pkl'), 'rb') as f:
        _, who_hits, exposure_hits = pickle.load(f)

    hit_counts = Counter(exposure_hits)
    hit_fraction = {el: hit_counts[el] / float(PASSED_QC_COND[el]) for el in hit_counts}
    distinct_exposure = [el for el in hit_fraction if hit_fraction[el] > 0.5]
    distinct_lookup = set(distinct_exposure)

    if hit_only:
        # Row indices of the hits of distinct exposures, plus one exposure
        # label per selected row so labels stay aligned with the rows.
        row_groups = []
        row_exposures = []
        for hit_who, hit_exposure in zip(who_hits, exposure_hits):
            if hit_exposure not in distinct_lookup:
                continue
            matches = np.where(who_ == hit_who)[0]
            row_groups.append(matches)
            row_exposures.extend([hit_exposure] * len(matches))
        # A list (not a generator) is required by current NumPy versions.
        wh_ = np.hstack(row_groups)

        distances = distances[wh_]
        if compare_to == 'DS':
            distances = distances[:, wh_]
        print(wh_)
        who_ = who_[wh_]
        exposure_wPL = exposure_wPL[wh_]
        plates = plates[wh_]

    if compare_to == 'MITO':
        column_header = mito_who
    else:
        column_header = who_

    flat_distances = distances.flatten()
    # NOTE(review): condition_clustering passes level_row/level_column whereas
    # this call passes `level` -- confirm against heatmap()'s signature.
    clusters = hierarchical_clustering.heatmap(distances, row_header=exposure_wPL, column_header=column_header,
                                    row_method='ward', column_method='ward',
                                    row_metric='euclidean', column_metric='euclidean',
                                    color_gradient=color_gradient, filename="E{}".format(distance_name),
                                    folder='{}/inference_Freplicates'.format(folder),
                                    level=level, title=drug_screen_utils.DISTANCES[distance_name],
                                    colorbar_ticks=[-2, 0, 2],
                                    colorbar_ticklabels=[0, '', 1],
                                    colorbar_title='Distance (arbitrary units)',
                                    range_normalization=(scoreatpercentile(flat_distances, 10),
                                                         scoreatpercentile(flat_distances, per=90)))

    # NOTE(review): assumes `clusters` holds one label per row of `distances`,
    # in row order -- confirm against heatmap().
    global_ = np.bincount(clusters)
    if not hit_only:
        # Build the selection in row (who_) order. Applying a mask computed in
        # who_hits order to row-ordered arrays only works when both orders and
        # lengths happen to coincide.
        exposure_of_hit = dict(zip(who_hits, exposure_hits))
        row_hit_exposure = [exposure_of_hit.get(el) for el in who_]
        is_distinct_hit = np.array([exposure in distinct_lookup for exposure in row_hit_exposure], dtype=bool)

        hit_clusters = clusters[is_distinct_hit]
        plates = plates[is_distinct_hit]
        hit_exposures = np.array([exposure for exposure, keep in zip(row_hit_exposure, is_distinct_hit) if keep])
    else:
        hit_clusters = clusters
        hit_exposures = np.array(row_exposures)
    print('{} {} {}'.format(len(hit_clusters), len(hit_exposures), len(plates)))
    print('{} {}'.format(global_, np.bincount(hit_clusters)))

    cluster_ids = range(1, np.max(clusters) + 1)
    who_cluster_hits = {k: Counter(hit_exposures[np.where(hit_clusters == k)]) for k in cluster_ids}
    plate_clusters = {k: Counter(plates[np.where(hit_clusters == k)]) for k in cluster_ids}

    return hit_clusters, who_cluster_hits, plate_clusters