コード例 #1
0
def write_MOHGP_results(organ, strain, path, fit, geneID):
    """
    A helper function akin to 'write_gene_model_results' to save MOHGP results to disk

    Arguments
    =========
    organ - Blood/Spleen
    strain - AS/CB
    path - a structured dictionary of paths returned by config
    fit - a Mixture of Hierarchical Gaussian Process model
    geneID - a pandas data frame containing ordered probeID/geneSymbols
            NOTE: this is different from probesToCluster order
            NOTE: a 'Cluster' column is added to (or overwritten in) this
            data frame in place

    Returns
    =========
    None - data saved to disk

    """
    fileStem = organ + strain
    namePrefix = strain + '_' + organ[:2] + '_'

    # Extract the cluster assigned to each probe (1-based cluster number)
    clustNum = np.argmax(fit.phi, axis=1) + 1
    geneID['Cluster'] = [namePrefix + '%02d' % i for i in clustNum]

    # Only clusters that actually received at least one probe are reported.
    # They are ordered numerically (not by name) so that the names stay aligned
    # with the rows of the cluster centres even with more than 99 clusters.
    populated = np.unique(clustNum)
    header = [namePrefix + '%02d' % i for i in populated]

    # Extract the gene and probe list of every populated cluster
    geneList = []
    probeList = []
    for name in header:
        bWant = geneID['Cluster'] == name
        geneList.append(list(geneID.loc[bWant, 'Symbol']))
        probeList.append(list(geneID.loc[bWant, 'ProbeID']))

    # Save to disk
    io.write_list_to_csv(os.path.join(path['Clust']['GeneList'], fileStem + '.csv'),
                         header, geneList)  # Gene list
    io.write_list_to_csv(os.path.join(path['Clust']['ProbeList'], fileStem + '.csv'),
                         header, probeList)  # Probe list

    # Save model and standard plot
    io.save_pickle(os.path.join(path['Clust']['Model'], fileStem + ".pickle"), fit)
    io.save_pdf(os.path.join(path['Clust']['Plot'], fileStem + ".pdf"), standard_plot(fit))

    # Compute cluster predictions for xTest where xTest is taken from SmoothExprs
    data = pd.read_csv(os.path.join(path['GPFit']['SmoothExprs'], fileStem + ".csv"))
    xTest = data.drop(['ProbeID', 'Symbol'], axis=1).columns.values.astype('float64')[:, None]
    mu, _ = fit.predict_components(xTest)  # posterior mean (posterior variance is not used)

    # Keep only the centres of the populated clusters: cluster number i lives in
    # row i - 1 of mu. Without this, a component with no assigned probe makes the
    # number of rows differ from len(header) and the column assignment below fails.
    # (Original author's note: mu row ordering is biggest to smallest cluster.)
    centres = np.array(mu)[populated - 1]
    clustCentre = pd.DataFrame(data=centres, columns=list(map(str, xTest.flatten())))
    clustCentre['Cluster'] = header  # header = cluster name
    clustCentre.to_csv(os.path.join(path['Clust']['Centres'], fileStem + '.csv'), index=False)

    # Merge smooth expression data frame with gene ID
    smoothExprs = pd.merge(geneID, data, how='left', on=['ProbeID', 'Symbol'])

    # Produce alternate plot
    hFig = alternate_plot(smoothExprs, clustCentre, config.COL[organ])
    io.save_pdf(os.path.join(path['Clust']['Plot'], fileStem + '2.pdf'), hFig)
コード例 #2
0
ファイル: script.py プロジェクト: jjvalletta/MiceMicroArray
def fit_plot_save(k, smoothExprs, day, probeID, geneSymbol, organ, strain, path, seed=None):
    """
    Fit k-means, plot and save results

    Arguments
    =========
    k - no. of clusters
    smoothExprs - gene expression rows = genes, columns = day
    day - day (also used as column header of the saved cluster centres)
    probeID - probeID
    geneSymbol - geneSymbol
    organ - organ; organ + strain is the base name of every output file
    strain - strain; organ + strain is the base name of every output file
    path - dictionary of output paths; path['Clust'] must hold the keys
           'Model', 'GeneList', 'ProbeList', 'Centres' and 'Plot'
    seed - optional random_state passed to KMeans to reproduce results;
           the default (None) keeps the previous unseeded behaviour

    Returns
    =========
    None - results are plotted and saved

    """
    fileStem = organ + strain

    model = KMeans(n_clusters=k, random_state=seed)
    model.fit(smoothExprs)
    clustCentre = model.cluster_centers_
    labels = model.labels_

    # Plot results
    plot_silhouette(silhouette_samples(smoothExprs, labels), labels)
    clust.multi_plot(smoothExprs, clustCentre, day, labels)

    # Hierarchical clustering of the cluster centres
    # Ward + Euclidean
    clustHeader = ["Cluster%i" % label for label in np.unique(labels)]
    hclust = hc.linkage(clustCentre, method='ward', metric='euclidean')
    plt.figure()
    plt.title("Hclust() Ward + Euclidean")
    hc.dendrogram(hclust, color_threshold=0.0, labels=clustHeader)

    # Save model
    io.save_pickle(os.path.join(path['Clust']['Model'], fileStem + ".pickle"), model)

    # Save Gene/Probe List
    geneList = clust.get_gene_list(labels, geneSymbol)
    probeList = clust.get_gene_list(labels, probeID)
    io.write_list_to_csv(os.path.join(path['Clust']['GeneList'], fileStem + ".csv"),
                         clustHeader, geneList)  # Gene list
    io.write_list_to_csv(os.path.join(path['Clust']['ProbeList'], fileStem + ".csv"),
                         clustHeader, probeList)  # Probe list

    # Save Cluster "centres": first column is the cluster name, then one column per day
    dataMatrix = np.hstack((np.array(clustHeader)[:, None], clustCentre))
    centreHeader = ["Cluster"] + list(day)
    io.write_to_csv(os.path.join(path['Clust']['Centres'], fileStem + ".csv"),
                    centreHeader, dataMatrix)  # Cluster "centres"

    # Save Alternate plot
    # NOTE(review): multi_plot is called a second time here, as in the original;
    # the first call's figure is not reused because it is unclear whether
    # io.save_pdf relies on the figure being the current one - confirm and dedupe.
    hFig = clust.multi_plot(smoothExprs, clustCentre, day, labels)
    io.save_pdf(os.path.join(path['Clust']['Plot'], fileStem + "2.pdf"), hFig)  # Plot
コード例 #3
0
def MOHGP(probesToCluster, organ, strain, prefix, K, alpha, path, seed=0):
    """
    Fit a Mixture of Hierarchical Gaussian Process (MOHGP) model to a subset of
    probes, then save the model, plots, gene/probe lists and cluster centres

    Arguments
    =========
    probesToCluster - a set of unique probeIDs to cluster
    organ - blood/spleen
    strain - AS/CB
    prefix - all/common/only (prepended to every output file name)
    K - init no. of clusters
    alpha - concentration parameter/strength parameter of the Dirichlet Process Prior
    path - dictionary with all results paths
    seed - to reproduce results due to multiple local optima

    Returns
    =========
    None - a Mixture of Hierarchical Gaussian Process model is fitted and saved to disk

    """
    fileStem = prefix + organ + strain

    # To reproduce results
    np.random.seed(seed)

    # Load gene expression data (first two columns are identifiers, the rest are samples)
    data = pandas.read_csv(os.path.join(path['RawData']['Log2FC'], organ + strain + ".csv"), sep=",")
    probeID = np.array(data['ProbeID'])
    yTrain = data.values[:, 2:].astype('float')  # genes x S samples
    # Floor the numeric column labels to get the time points
    # (original author's note: int 0, 0, 2, 2, ..., 12)
    xTrain = np.floor(data.columns.values[2:].astype('float64'))[:, None]

    # Subset the data by keeping only probesToCluster.
    # Build the lookup set once so each membership test is O(1).
    wanted = set(probesToCluster)
    bWant = np.array([pid in wanted for pid in probeID], dtype=bool)  # vector of T, F, T
    yTrain = yTrain[bWant, :]
    probeID = probeID[bWant]
    geneSymbol = np.array(data['GeneSymbol'])[bWant]

    # MOHGP fitting
    # Define the covariance functions for the hierarchical GP structure
    # The model of any cluster of genes has a hierarchical structure, with the unknown cluster-specific
    # mean drawn from a GP, and then each gene in that cluster being drawn from a GP with said unknown mean function.
    signalVariance = np.var(yTrain.ravel())
    # Covariance function for the latent function that describes EACH cluster.
    covFunCluster = GPy.kern.RBF(input_dim=1, variance=signalVariance, lengthscale=LENGTHSCALE)
    # Covariance function that describes how EACH time-course (gene) deviates from the cluster
    covFunGene = GPy.kern.RBF(input_dim=1, variance=signalVariance/10, lengthscale=LENGTHSCALE) + \
                 GPy.kern.White(1, variance=NOISE_VARIANCE)
    # Set-up the clustering problem NB: For large alpha P resembles Po (i.e the base distribution)
    fit = GPclust.MOHGP(X=xTrain, kernF=covFunCluster, kernY=covFunGene, Y=yTrain, K=K, prior_Z='DP', alpha=alpha)
    # Constrain lengthscales (to avoid very short lengthscales as per Topa et al. (2012) on arXiv)
    fit.rbf.lengthscale.constrain_bounded(LOWER_BOUND_LENGTHSCALE, UPPER_BOUND_LENGTHSCALE, warning=False)
    fit.add.rbf.lengthscale.constrain_bounded(LOWER_BOUND_LENGTHSCALE, UPPER_BOUND_LENGTHSCALE, warning=False)
    fit.hyperparam_opt_interval = 1000  # how often to optimize the hyperparameters
    # Optimise hyperparameters
    fit.optimize()
    fit.systematic_splits(verbose=False)
    # Name and reorder fit
    fit.name = fileStem
    fit.reorder()
    labels = np.argmax(fit.phi, axis=1) + 1  # cluster number

    # Compute cluster prediction for xTest where xTest is taken from SmoothExprs
    # NOTE(review): bWant was computed on the Log2FC file; this assumes the
    # SmoothExprs file has the same row order - confirm.
    smoothData = pandas.read_csv(os.path.join(path['GPFit']['SmoothExprs'], organ + strain + ".csv"), sep=",")
    smoothExprs = smoothData.values[:, 2:].astype('float')[bWant, :]
    xTest = smoothData.columns.values[2:].astype('float64')[:, None]
    mu, _ = fit.predict_components(xTest)  # posterior variance is not used
    # One row per mixture component, one column per test time point
    clustCentre = np.empty((len(mu), xTest.shape[0]))
    for iClust, centre in enumerate(mu):
        clustCentre[iClust, :] = centre

    # Save model and plot
    io.save_pickle(os.path.join(path['Clust']['Model'], fileStem + ".pickle"), fit)
    io.save_pdf(os.path.join(path['Clust']['Plot'], fileStem + ".pdf"), plot(fit))
    # Save Gene/Probe List
    geneList = get_gene_list(labels, geneSymbol)
    probeList = get_gene_list(labels, probeID)
    clustHeader = ["Cluster%i" % label for label in np.unique(labels)]
    io.write_list_to_csv(os.path.join(path['Clust']['GeneList'], fileStem + ".csv"), clustHeader, geneList)  # Gene list
    io.write_list_to_csv(os.path.join(path['Clust']['ProbeList'], fileStem + ".csv"), clustHeader, probeList)  # Probe list
    # Save Cluster "centres": first column is the cluster name, then one column per test point
    dataMatrix = np.hstack((np.array(clustHeader)[:, None], clustCentre))
    centreHeader = ["Cluster"] + list(xTest.ravel())
    io.write_to_csv(os.path.join(path['Clust']['Centres'], fileStem + ".csv"), centreHeader, dataMatrix)  # Cluster "centres"
    # Save Alternate plot
    hFig = multi_plot(smoothExprs, clustCentre, xTest, labels)
    io.save_pdf(os.path.join(path['Clust']['Plot'], fileStem + "2.pdf"), hFig)  # Plot
    # Heatmap
    vis.heatmap_plot_by_clusters(organ, strain, prefix, path)