Beispiel #1
0
def write_MOHGP_results(organ, strain, path, fit, geneID):
    """
    Persist the outputs of a fitted MOHGP model to disk
    (counterpart of 'write_gene_model_results').

    Arguments
    =========
    organ - Blood/Spleen
    strain - AS/CB
    path - a structured dictionary of paths returned by config
    fit - a Mixture of Hierarchical Gaussian Process model
    geneID - a pandas data frame containing ordered probeID/geneSymbols
            NOTE: this is different from probesToCluster order.
            A 'Cluster' column is added to this data frame in place.

    Returns
    =========
    None - gene/probe lists, model pickle, cluster centres and two plots
           are written to disk

    """
    stem = organ + strain  # common file stem, e.g. 'BloodAS'

    # Most probable cluster of each probe (1-based) -> name such as AS_Bl_01
    prefix = strain + '_' + organ[:2]
    assigned = np.argmax(fit.phi, axis=1) + 1
    clustName = [prefix + '_%02d' % number for number in assigned]
    geneID['Cluster'] = clustName

    # One column per cluster, in sorted cluster-name order
    header = list(np.unique(clustName))
    masks = [geneID['Cluster'] == name for name in header]
    geneList = [list(geneID.loc[mask, 'Symbol']) for mask in masks]
    probeList = [list(geneID.loc[mask, 'ProbeID']) for mask in masks]

    # Gene and probe lists
    clustPath = path['Clust']
    io.write_list_to_csv(os.path.join(clustPath['GeneList'], stem + '.csv'),
                         header, geneList)
    io.write_list_to_csv(os.path.join(clustPath['ProbeList'], stem + '.csv'),
                         header, probeList)

    # Model and its standard plot
    io.save_pickle(os.path.join(clustPath['Model'], stem + ".pickle"), fit)
    io.save_pdf(os.path.join(clustPath['Plot'], stem + ".pdf"), standard_plot(fit))

    # Test inputs are the time points stored as column names of SmoothExprs
    smooth = pd.read_csv(os.path.join(path['GPFit']['SmoothExprs'], stem + ".csv"))
    timePoints = smooth.drop(['ProbeID', 'Symbol'], axis=1).columns.values
    xTest = timePoints.astype('float64')[:, None]
    mu, var = fit.predict_components(xTest)  # posterior mean and variance per cluster

    # Cluster centres, one row per cluster
    # (original author's note: mu row ordering is biggest to smallest cluster)
    clustCentre = pd.DataFrame(data=np.array(mu),
                               columns=[str(x) for x in xTest.flatten()])
    clustCentre['Cluster'] = header
    clustCentre.to_csv(os.path.join(clustPath['Centres'], stem + '.csv'), index=False)

    # Attach the smoothed expression of every probe to its gene ID / cluster
    smoothExprs = pd.merge(geneID, smooth, how='left', on=['ProbeID', 'Symbol'])

    # Alternate plot
    hFig = alternate_plot(smoothExprs, clustCentre, config.COL[organ])
    io.save_pdf(os.path.join(clustPath['Plot'], stem + '2.pdf'), hFig)
Beispiel #2
0
def cluster_membership_boxplot(organ, strain, path):
    """
    Draw, per cluster, a boxplot of the posterior probability with which
    its genes were assigned to it.

    Arguments
    =========
    organ - Blood/Spleen
    strain - AS/CB
    path - dictionary with all results paths

    Returns
    =========
    None - Figure is saved to 'Misc' folder
    """
    stem = organ + strain

    # Load the pickled model and take the most probable cluster of each gene
    fit = io.load_pickle(os.path.join(path['Clust']['Model'], stem + ".pickle"))
    fit.reorder()
    labels = np.argmax(fit.phi, axis=1)  # 0-based cluster index

    # The centres file maps a cluster index to its name e.g 0 --> AS_Bl_01
    centres = pd.read_csv(os.path.join(path['Clust']['Centres'], stem + ".csv"),
                          sep=",")

    # For every cluster that has members: the members' probability of
    # belonging to it, and the cluster name
    present = np.unique(labels)
    membership = [fit.phi[labels == k, k] for k in present]
    clustName = [centres['Cluster'][k] for k in present]

    # Boxplot styling
    style = {
        "labels": clustName,
        "patch_artist": True,
        "notch": True,
        "boxprops": {"facecolor": COL[organ]},
        "medianprops": {"color": "black", "linewidth": 3},
        "whiskerprops": {"color": "black"},
    }
    hFig = plt.figure(figsize=(16, 5))
    hAx = hFig.gca()
    hAx.boxplot(membership, **style)
    hAx.set_ylabel("Posterior probability of assigning gene to cluster")
    hAx.set_ylim(0, 1)

    # Save figure to file
    io.save_pdf(os.path.join(path['Misc'], "ClusterMembership" + stem + '.pdf'),
                hFig)
Beispiel #3
0
def fit_plot_save(k, smoothExprs, day, probeID, geneSymbol, organ, strain, path):
    """
    Run k-means on the smoothed expression, plot diagnostics and save
    model, gene/probe lists, cluster centres and plot to disk

    Arguments
    =========
    k - no. of clusters
    smoothExprs - gene expression rows = genes, columns = day
    day - day
    probeID - probeID
    geneSymbol - geneSymbol
    organ - organ, used (with strain) to name the output files
    strain - strain, used (with organ) to name the output files
    path - path

    Returns
    =========
    None - results are plotted and saved

    """
    stem = organ + strain

    kmeans = KMeans(n_clusters=k)
    kmeans.fit(smoothExprs)
    assignment = kmeans.labels_
    clustCentre = kmeans.cluster_centers_

    # Diagnostic plots: silhouette and per-cluster time courses
    plot_silhouette(silhouette_samples(smoothExprs, assignment), assignment)
    clust.multi_plot(smoothExprs, clustCentre, day, assignment)

    # Hierarchical clustering of the centres (Ward + Euclidean)
    header = ["Cluster%i" % label for label in np.unique(assignment)]
    hclust = hc.linkage(clustCentre, method='ward', metric='euclidean')
    plt.figure()
    plt.title("Hclust() Ward + Euclidean")
    hc.dendrogram(hclust, color_threshold=0.0, labels=header)

    # Model
    io.save_pickle(os.path.join(path['Clust']['Model'], stem + ".pickle"), kmeans)

    # Gene/Probe lists, one column per cluster
    geneList = clust.get_gene_list(assignment, geneSymbol)
    probeList = clust.get_gene_list(assignment, probeID)
    io.write_list_to_csv(os.path.join(path['Clust']['GeneList'], stem + ".csv"),
                         header, geneList)
    io.write_list_to_csv(os.path.join(path['Clust']['ProbeList'], stem + ".csv"),
                         header, probeList)

    # Cluster "centres": first column is the cluster name, then one column per day
    dataMatrix = np.hstack((np.array(header)[:, None], clustCentre))
    header = ["Cluster"] + list(day)
    io.write_to_csv(os.path.join(path['Clust']['Centres'], stem + ".csv"),
                    header, dataMatrix)

    # Alternate plot
    hFig = clust.multi_plot(smoothExprs, clustCentre, day, assignment)
    io.save_pdf(os.path.join(path['Clust']['Plot'], stem + "2.pdf"), hFig)
def merge(organ, strain, groupToMerge, groupLabel, originalLabel, path):
    """
    Merge modules (clusters) into larger groups and save the merged results

    Arguments
    =========
    organ - organ, used (with strain) to name the input/output files
    strain - strain, used (with organ) to name the input/output files
    groupToMerge - list of lists e.g [[1,2], [3]]; each inner list holds the
                   original cluster labels that form one merged group
    groupLabel - list of unique group labels e.g ["A", "B"], one per entry
                 of groupToMerge
    originalLabel - numpy array with the original cluster label of each gene
    path - dictionary with all results paths

    Returns
    =========
    newLabel - object array, same length as originalLabel, holding the merged
               label (A, B, C etc.) of each gene; genes whose original label is
               not listed in groupToMerge keep their original label

    """
    fileName = organ + strain + ".csv"

    # Load gene/probe list
    oldGeneList = pandas.read_csv(os.path.join(path['Clust']['GeneList'], fileName), sep=",")
    oldProbeList = pandas.read_csv(os.path.join(path['Clust']['ProbeList'], fileName), sep=",")

    # Object dtype so that labels of any type/length can be stored
    newLabel = originalLabel.astype(object)
    newGeneList = []
    newProbeList = []
    for iGroup, members in enumerate(groupToMerge):
        tempGeneList = []
        tempProbeList = []
        for label in members:
            column = 'Cluster' + str(label)
            newLabel[originalLabel == label] = groupLabel[iGroup]
            # Columns are padded with NaN; the gene-list mask is applied to both
            # lists so that genes and probes stay aligned
            bWant = ~pandas.isnull(oldGeneList[column])
            tempGeneList.append(np.array(oldGeneList[column][bWant]))
            tempProbeList.append(np.array(oldProbeList[column][bWant]))
        newGeneList.append(list(itertools.chain.from_iterable(tempGeneList)))
        newProbeList.append(list(itertools.chain.from_iterable(tempProbeList)))

    # Save Gene/Probe List
    header = ["Cluster%s" % label for label in groupLabel]
    io.write_list_to_csv(os.path.join(path['ClustMerge']['GeneList'], fileName), header, newGeneList)
    io.write_list_to_csv(os.path.join(path['ClustMerge']['ProbeList'], fileName), header, newProbeList)

    # The old centres file is only read for its time points (column names)
    data = pandas.read_csv(os.path.join(path['Clust']['Centres'], fileName), sep=",")
    day = data.columns.values[1:].astype('float')

    # Get smooth exprs of the probes that belong to any merged group
    data = pandas.read_csv(os.path.join(path['GPFit']['SmoothExprs'], fileName), sep=",")
    allProbeID = np.array(data['ProbeID'])
    wantedProbeID = set(itertools.chain.from_iterable(newProbeList))  # O(1) membership
    bSelect = np.array([probe in wantedProbeID for probe in allProbeID], dtype=bool)
    smoothExprs = data.values[:, 2:].astype('float')[bSelect, :]

    # Naively take the mean of each merged group as its centre
    # (the original author reports that GP regression was numerically unstable here)
    # NOTE(review): indexing smoothExprs with newLabel assumes the selected rows are
    # in the same order as originalLabel - confirm against the caller
    clustCentre = np.empty((len(groupLabel), len(day)))
    for i, label in enumerate(groupLabel):
        clustCentre[i, :] = np.mean(smoothExprs[newLabel == label, :], axis=0)

    # Save Cluster "centres": first column is the cluster name, then one column per day
    dataMatrix = np.hstack((np.array(header)[:, None], clustCentre))
    header = ["Cluster"] + list(day)
    io.write_to_csv(os.path.join(path['ClustMerge']['Centres'], fileName), header, dataMatrix)

    # Save Alternate plot
    hFig = multi_plot(smoothExprs, clustCentre, day, newLabel)
    io.save_pdf(os.path.join(path['ClustMerge']['Plot'], organ + strain + "2.pdf"), hFig)

    return newLabel
def MOHGP(probesToCluster, organ, strain, prefix, K, alpha, path, seed=0):
    """
    Fit a Mixture of Hierarchical Gaussian Processes (MOHGP) to the selected
    probes and save model, gene/probe lists, cluster centres and plots

    Arguments
    =========
    probesToCluster - a set of unique probeIDs to cluster
    organ - blood/spleen
    strain - AS/CB
    prefix - all/common/only
    K - init no. of clusters
    alpha - concentration parameter/strength parameter of the Dirichlet Process Prior
    path - dictionary with all results paths
    seed - to reproduce results due to multiple local optima

    Returns
    =========
    None - a Mixture of Hierarchical Gaussian Process model is fitted and saved to disk

    """
    stem = prefix + organ + strain

    # To reproduce results
    np.random.seed(seed)

    # Load gene expression data: two ID columns followed by one column per sample
    data = pandas.read_csv(os.path.join(path['RawData']['Log2FC'], organ + strain + ".csv"), sep=",")
    probeID = np.array(data['ProbeID'])
    yTrain = data.values[:, 2:].astype('float')
    # Column names are floored to give the input value of each sample
    xTrain = np.floor(data.columns.values[2:].astype('float64'))[:, None]

    # Subset the data by keeping only probesToCluster (set => O(1) membership test)
    wanted = set(probesToCluster)
    bWant = np.array([probe in wanted for probe in probeID], dtype=bool)
    yTrain = yTrain[bWant, :]
    probeID = probeID[bWant]
    geneSymbol = np.array(data['GeneSymbol'][bWant])

    # MOHGP fitting
    # The model of any cluster of genes has a hierarchical structure, with the unknown
    # cluster-specific mean drawn from a GP, and then each gene in that cluster being
    # drawn from a GP with said unknown mean function.
    dataVariance = np.var(yTrain.ravel())
    # Covariance function for the latent function that describes EACH cluster
    covFunCluster = GPy.kern.RBF(input_dim=1, variance=dataVariance, lengthscale=LENGTHSCALE)
    # Covariance function that describes how EACH time-course (gene) deviates from the cluster
    covFunGene = GPy.kern.RBF(input_dim=1, variance=dataVariance/10, lengthscale=LENGTHSCALE) + \
                 GPy.kern.White(1, variance=NOISE_VARIANCE)
    # Set-up the clustering problem NB: For large alpha P resembles Po (i.e the base distribution)
    fit = GPclust.MOHGP(X=xTrain, kernF=covFunCluster, kernY=covFunGene, Y=yTrain, K=K,
                        prior_Z='DP', alpha=alpha)
    # Constrain lengthscales (to avoid very short lengthscales as per Topa et al. (2012) on arXiv)
    fit.rbf.lengthscale.constrain_bounded(LOWER_BOUND_LENGTHSCALE, UPPER_BOUND_LENGTHSCALE, warning=False)
    fit.add.rbf.lengthscale.constrain_bounded(LOWER_BOUND_LENGTHSCALE, UPPER_BOUND_LENGTHSCALE, warning=False)
    fit.hyperparam_opt_interval = 1000  # how often to optimize the hyperparameters
    # Optimise hyperparameters
    fit.optimize()
    fit.systematic_splits(verbose=False)
    # Name and reorder fit
    fit.name = stem
    fit.reorder()
    labels = np.argmax(fit.phi, axis=1) + 1  # most probable cluster per probe, 1-based

    # Compute cluster prediction for xTest where xTest is taken from SmoothExprs
    data = pandas.read_csv(os.path.join(path['GPFit']['SmoothExprs'], organ + strain + ".csv"), sep=",")
    # NOTE(review): bWant was computed on the Log2FC file; reusing it here assumes
    # SmoothExprs lists the probes in the same row order - confirm
    smoothExprs = data.values[:, 2:].astype('float')[bWant, :]
    xTest = data.columns.values[2:].astype('float64')[:, None]
    mu, var = fit.predict_components(xTest)
    # One row per cluster, one column per test point
    clustCentre = np.empty((len(mu), xTest.shape[0]))
    for iClust, centre in enumerate(mu):
        clustCentre[iClust, :] = centre

    # Save model and plot
    io.save_pickle(os.path.join(path['Clust']['Model'], stem + ".pickle"), fit)
    io.save_pdf(os.path.join(path['Clust']['Plot'], stem + ".pdf"), plot(fit))
    # Save Gene/Probe List
    geneList = get_gene_list(labels, geneSymbol)
    probeList = get_gene_list(labels, probeID)
    header = ["Cluster%i" % label for label in np.unique(labels)]
    io.write_list_to_csv(os.path.join(path['Clust']['GeneList'], stem + ".csv"), header, geneList)
    io.write_list_to_csv(os.path.join(path['Clust']['ProbeList'], stem + ".csv"), header, probeList)
    # Save Cluster "centres": first column is the cluster name, then one column per test point
    dataMatrix = np.hstack((np.array(header)[:, None], clustCentre))
    header = ["Cluster"] + list(xTest.ravel())
    io.write_to_csv(os.path.join(path['Clust']['Centres'], stem + ".csv"), header, dataMatrix)
    # Save Alternate plot
    hFig = multi_plot(smoothExprs, clustCentre, xTest, labels)
    io.save_pdf(os.path.join(path['Clust']['Plot'], stem + "2.pdf"), hFig)
    # Heatmap
    vis.heatmap_plot_by_clusters(organ, strain, prefix, path)
def compare_arbitrary_clusters(organA, strainA, iiClusterA, colourA, organB, strainB, iiClusterB, colourB, prefix, path):
    """
    Compare arbitrary cluster e.g CB_Bl_10 + CB_Bl_11 vs CB_Sp_07

    Arguments
    =========
    organA/B - blood/spleen
    strainA/B - AS/CB
    iiClusterA/B - list of indices of clusters to compare e.g [10, 11]
    colourA/B - colour of conditions red/blue "#e41a1c"/"#377eb8"
    prefix - all/common/only
    path - dictionary with all results paths

    Returns
    =========
    None - figures saved to disk + .csv file of common genes saved to disk

    """
    # Load probe list by cluster
    probeListA = pandas.read_csv(os.path.join(path['Clust']['ProbeList'], prefix + organA + strainA + ".csv"), sep=",")
    probeListB = pandas.read_csv(os.path.join(path['Clust']['ProbeList'], prefix + organB + strainB + ".csv"), sep=",")

    # Pick only the clusters we're interested in (columns are padded with NaN)
    probeIDA = list(itertools.chain.from_iterable(
        probeListA['Cluster' + str(iClust)].dropna() for iClust in iiClusterA))
    probeIDB = list(itertools.chain.from_iterable(
        probeListB['Cluster' + str(iClust)].dropna() for iClust in iiClusterB))

    # Find common probes
    commonProbes = set(probeIDA) & set(probeIDB)

    # Read GPR smoothed gene expression (two ID columns, then one column per day)
    dataA = pandas.read_csv(os.path.join(path['GPFit']['SmoothExprs'], organA + strainA + ".csv"), sep=",")
    allProbeIDA = np.array(dataA['ProbeID'])
    smoothExprsA = dataA.values[:, 2:].astype('float')
    day = dataA.columns.values[2:].astype('float')  # days of A are used for both panels
    dataB = pandas.read_csv(os.path.join(path['GPFit']['SmoothExprs'], organB + strainB + ".csv"), sep=",")
    allProbeIDB = np.array(dataB['ProbeID'])
    smoothExprsB = dataB.values[:, 2:].astype('float')

    titleA = strainA + '_' + organA[:2] + str(iiClusterA)
    titleB = strainB + '_' + organB[:2] + str(iiClusterB)

    # Create figure: condition A | Venn diagram | condition B
    hFig, (hAx1, hAx2, hAx3) = plt.subplots(1, 3, sharex=False, sharey=False, figsize=(16, 8.27/1.8)) # a4 = 11.69 x 8.27 inches

    # Condition A
    _plot_cluster_timecourses(hAx1, day, allProbeIDA, smoothExprsA, probeIDA, commonProbes, colourA, titleA)

    # Venn diagram
    plt.sca(hAx2)
    hVen = venn2([set(probeIDA), set(probeIDB)], set_colors=(colourA, colourB), set_labels=("", ""), alpha=0.9)
    for label in hVen.subset_labels:  # entries without 'set_fontsize' are skipped
        if hasattr(label, 'set_fontsize'):
            label.set_fontsize(20)
    if len(commonProbes) > 0:
        hVen.get_patch_by_id('11').set_color('#525252')  # make the intersection grey

    # Condition B
    _plot_cluster_timecourses(hAx3, day, allProbeIDB, smoothExprsB, probeIDB, commonProbes, colourB, titleB)

    # Save figure as .pdf
    fileName = titleA + '_vs_' + titleB
    io.save_pdf(os.path.join(path['Comparison'], fileName) + ".pdf", hFig)

    # Write common probes to .csv; gene symbols are looked up in dataset B
    header = ["ProbeID", "GeneSymbol"]
    commonList = list(commonProbes)
    symbolB = dataB['GeneSymbol'].values
    commonGeneSymbol = [symbolB[np.where(allProbeIDB == probe)[0][0]]  # probeIDs are unique
                        for probe in commonList]
    dataMatrix = np.hstack((np.array(commonList)[:, None], np.array(commonGeneSymbol)[:, None]))
    io.write_to_csv(os.path.join(path['Comparison'], fileName) + "shared_genes.csv", header, dataMatrix)


def _plot_cluster_timecourses(hAx, day, allProbeID, smoothExprs, probeIDs, commonProbes, colour, title):
    """Plot one smoothed time course per probe on hAx; probes in commonProbes are drawn thick and grey."""
    for probe in probeIDs:
        iiWant = np.where(allProbeID == probe)[0][0]  # should return only one index
        if probe in commonProbes:
            hAx.plot(day, smoothExprs[iiWant, :], linewidth=2, color="#525252", alpha=ALPHA)
        else:
            hAx.plot(day, smoothExprs[iiWant, :], linewidth=0.5, color=colour, alpha=ALPHA)
    hAx.axhline(y=0, color="black", linestyle='--', linewidth=2)
    # Set x/y limits and title
    hAx.set_xlim(XLIM)
    hAx.set_ylim(YLIM)
    hAx.set_title(title)
    # Raw string: '\l' is not a valid escape sequence in a normal string literal
    hAx.set_ylabel(r"$\log_2$(FC) wrt naive")
    hAx.set_xlabel("Days")
    hAx.grid(axis="y")
Beispiel #7
0
import enrichr
import compare

#*************************************************************************************************#
# Setup/Create folders and thus all paths (stored as a structured dictionary)
#*************************************************************************************************#
path = setup_folders()

#*************************************************************************************************#
# PCA plots: each organ on its own, then blood and spleen together.
# First pass ('All') considers all measured probes, second pass ('Top') ONLY the top ranked ones
#*************************************************************************************************#
strain = 'AS'
for suffix in ('All', 'Top'):
    # Extra keyword for the top-ranked pass only
    extra = {'topRanked': TOP_RANKED} if suffix == 'Top' else {}

    # One organ at a time
    for organ in ORGANS:
        hWt, hPCA = util.pca_plot([organ], strain, path, bLegend=True, **extra)
        tag = organ + strain + suffix + '.pdf'
        io.save_pdf(os.path.join(path['Misc'], 'PCA' + tag), hPCA)
        io.save_pdf(os.path.join(path['Misc'], 'PCAWts' + tag), hWt)

    # Spleen + Blood together
    hWt, hPCA = util.pca_plot(ORGANS, strain, path, bLegend=True, **extra)
    tag = "".join(ORGANS) + strain + suffix + '.pdf'
    io.save_pdf(os.path.join(path['Misc'], 'PCA' + tag), hPCA)
    io.save_pdf(os.path.join(path['Misc'], 'PCAWts' + tag), hWt)
Beispiel #8
0
# One panel per ranking metric; each panel shows the sorted metric values of
# every organ/strain combination (a4 = 11.69 x 8.27 inches)
hFig, hAxs = plt.subplots(2, 2, sharex=True, sharey=False, figsize=(11.69, 8.27))
metrics = ['score', 'SNR', 'maxLogFC', 'rank']
for i, hAx in enumerate(hFig.axes):
    metricName = metrics[i]
    for organ in ORGANS:
        for strain in STRAINS:
            csvPath = os.path.join(path['GPFit']['Metrics'], organ + strain + ".csv")
            data = pandas.read_csv(csvPath, sep=",")
            metric = np.array(data[metricName])
            hAx.plot(np.sort(metric), label=organ + strain, linewidth=2)
    hAx.set_xlabel("No. of Probes")
    hAx.set_ylabel(metricName)
    hAx.set_xlim((0, 46000))
    # Dashed line at 45281 - topRanked on the sorted axis
    hAx.axvline(x=45281-topRanked, c="black", linewidth=2, linestyle='--')
    hAx.legend(loc=2)
filePath = os.path.join(path['Misc'], 'DEG.pdf')
io.save_pdf(filePath, hFig)
    
#SNR = np.array(data['SNR'])
#maxLogFC = np.array(data['maxLogFC'])
#score = np.array(data['score'])
#hFig = plt.figure("DEG") # Handle to figure
#hAx = hFig.add_subplot(111, projection='3d') # Handle to axis
#hAx.scatter(SNR, maxLogFC, score, color='grey', marker='.', s=10) # marker size

#----------------------------------------------------------------------------------------------------------------------#
# D) Crude estimation for number of clusters
#----------------------------------------------------------------------------------------------------------------------#
from sklearn.cluster import KMeans
hFig = plt.figure()
hAx = plt.gca()
for organ in ORGANS:
Beispiel #9
0
def clusters(organs, strains, path, clustIndices):
    """
    Visually compare arbitrary clusters across different conditions organ/strain A/B

    Arguments
    =========
    organs - a list of two organs e.g ['Blood', 'Spleen']
    strains - a list of two strains e.g ['AS', 'CB']
    path - a structured dictionary of paths returned by config
    clustIndices - a list of cluster indices e.g [[1, 4, 5], [6, 7]]

    Returns
    =========
    hFig - figure handle, or None (after a UserWarning) when any of organs,
           strains or clustIndices does not have exactly two entries

    """
    # Simple data checking
    if len(organs) != 2 or len(strains) != 2 or len(clustIndices) != 2:
        warnings.warn("Arguments not compatible with function requirements!",
                      UserWarning)
        return None

    # Find common probes i.e to plot in grey
    shared, left, right = shared_genes(organs, strains, path, clustIndices)

    # Create figure (a4 = 11.69 x 8.27 inches)
    hFig, hAx = plt.subplots(1,
                             3,
                             sharex=False,
                             sharey=False,
                             figsize=(16, 8.27 / 1.8))
    title = [
        '{}_{}{}'.format(strains[i], organs[i][:2], clustIndices[i])
        for i in range(2)
    ]

    # Plot shared and only genes: first condition on the left panel,
    # second condition on the right panel (the middle one holds the venn diagram)
    cluster_subplot(organs[0], strains[0], path, shared, left, title[0],
                    hAx[0])
    cluster_subplot(organs[1], strains[1], path, shared, right, title[1],
                    hAx[2])

    # Plot venn diagram from the three subset sizes (left only, right only, shared)
    plt.sca(hAx[1])
    hVen = venn2([left.shape[0], right.shape[0], shared.shape[0]],
                 set_colors=(COL[organs[0]], COL[organs[1]]),
                 set_labels=("", ""),
                 alpha=0.9)
    for label in hVen.subset_labels:  # entries without 'set_fontsize' are skipped
        if hasattr(label, 'set_fontsize'):
            label.set_fontsize(20)
    if shared.shape[0] > 0:
        hVen.get_patch_by_id('11').set_color(
            COL['Shade'])  # make intersection grey

    # Create folder where to put results
    folderName = '{}_{}'.format(title[0], title[1])
    thisDir = io.create_folder(path['Comparison'], folderName)

    # Save plot and gene lists
    io.save_pdf(os.path.join(thisDir, 'timeplot.pdf'), hFig)
    shared.to_csv(os.path.join(thisDir, 'shared.csv'), index=False)
    left.to_csv(os.path.join(thisDir, 'left.csv'), index=False)
    right.to_csv(os.path.join(thisDir, 'right.csv'), index=False)

    # Hand the figure back, as the docstring promises
    return hFig