from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

# Load the inputs, which are spread over several files: `data` is the gene
# expression table, `annotations` the sample metadata and `genes` the gene
# metadata.
data = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_expression_v7.1.tsv',
    sep='\t',
    index_col=0,
)
annotations = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_samples_v7.1.tsv',
    sep='\t',
    index_col=0,
)

# The platform category has to be fetched from a separate annotation file and
# joined onto the sample table by index (inner join: unmatched samples drop).
annotations_platform = pd.read_csv(
    '/Users/pwangel/PlotlyWorkspace/combine_data/blood/outputs_for_front_end/iMac_annotations.tsv',
    sep='\t',
    index_col=0,
)
annotations = annotations.merge(
    annotations_platform['Platform_Category'],
    how='inner',
    left_index=True,
    right_index=True,
)

# `genes` should carry the platform variance fraction in addition to the
# ensembl-gene symbol conversion, so the two gene tables are left-joined.
genes_s4m = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_genes_v7.1.tsv',
    sep='\t',
    index_col=0,
)
genes_varPart = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_genes.tsv',
    sep='\t',
    index_col=0,
)
genes = genes_s4m.merge(
    genes_varPart['Platform_VarFraction'],
    how='left',
    left_index=True,
    right_index=True,
)

# Percentile-transform two views of the expression table, both restricted to
# the annotated samples: one limited to genes flagged by `genes.inclusion`,
# one with every gene.
included_gene_ids = genes.loc[genes.inclusion.values].index.values.astype(str)
cut_data = functions.transform_to_percentile(
    data.loc[included_gene_ids, annotations.index.values]
)
all_ranked_data = functions.transform_to_percentile(
    data.loc[:, annotations.index.values]
)

# Dummy values for the categories of samples that will be compared.
annotations['Cluster'] = 1

# Generate the atlas PCA and use it to remove samples that are not part of the
# DE: only samples with a negative first-component coordinate are kept.
pca = sklearn.decomposition.PCA(n_components=3, svd_solver='full')
pca_coords = pca.fit_transform(cut_data.transpose())
negative_pc1 = pca_coords[:, 0] < 0
annotations = annotations.loc[negative_pc1]
all_ranked_data = all_ranked_data[annotations.index]
cut_data = cut_data[annotations.index]

# Folder (and file stem) that p values and graphs are written to.
fname = folder = 'macrophage_tissues'
cluster_names = []
#data = data.merge(ext_data, how='inner', left_index=True, right_index=True)
#annotations = pd.concat([annotations, ext_annotations])

# In[3]:

# Samples that are both 'Illumina V4' and 'in vivo' are given their own
# platform label so they are treated as a separate platform category.
weird_index = annotations.loc[
    (annotations['Platform Category'] == 'Illumina V4')
    & (annotations['Sample Source'] == 'in vivo')
].index
annotations.loc[weird_index, 'Platform Category'] = 'Illumina V4 2'

# Drop skin and spleen samples. np.isin is used instead of np.in1d, which is
# deprecated since NumPy 2.0; for this 1-D input the result is the same mask.
annotations = annotations.loc[
    ~np.isin(annotations['Tissue Type'].values, ['skin', 'spleen'])
]

# In[4]:

# Keep only the remaining samples' columns, then percentile-transform.
data = functions.transform_to_percentile(data[annotations.index])

# The gene variance fraction only needs computing if it has not been done
# already.
# NOTE(review): this cell does recompute it and overwrites `genes`; confirm
# that is intended rather than reusing a previously saved table.

# In[5]:

# The column is renamed to the underscore form first.
# NOTE(review): presumably calculate_platform_dependence expects
# 'Platform_Category' -- confirm against the helper.
annotations.rename(columns={'Platform Category': 'Platform_Category'}, inplace=True)
genes = functions.calculate_platform_dependence(data, annotations)

# In[6]:

# Fit a 10-component PCA on the genes whose platform variance fraction is
# <= 1.0, re-ranked to percentiles and transposed before fitting.
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
pca.fit(
    functions.transform_to_percentile(
        data.loc[genes.Platform_VarFraction.values <= 1.0]
    ).transpose()
)
#data = pd.read_csv('/Users/pwangel/Downloads/pluripotent_atlas_data.tsv', sep='\t', index_col=0)

# Sample annotations, extended with the LM_Group_COLOR column from a second
# annotation table (left join on the sample index).
annotations = pd.read_csv(
    '/Users/pwangel/Downloads/pluripotent_RNASeq_annotations.tsv',
    sep='\t',
    index_col=0,
)
lizzi_anno = pd.read_csv(
    '/Users/pwangel/PlotlyWorkspace/combine_data/naive_stemcells/stemcell_annotations.tsv',
    sep='\t',
    index_col=0,
)
annotations = annotations.merge(
    lizzi_anno['LM_Group_COLOR'],
    how='left',
    left_index=True,
    right_index=True,
)

# Experiment-level annotations are re-keyed as "<chip_id>;<Dataset>" so that
# they can be joined onto `annotations` by index.
experiment_anno = pd.read_csv(
    '/Users/pwangel/Downloads/RNASeq_only_pluripotent_annotations.tsv',
    sep='\t',
    index_col=0,
)
chip_ids = experiment_anno.chip_id.values.astype(str)
dataset_ids = experiment_anno.Dataset.values.astype(int).astype(str)
experiment_anno.index = [
    chip + ';' + dataset for chip, dataset in zip(chip_ids, dataset_ids)
]
annotations = annotations.merge(
    experiment_anno[['Experiment', 'Time', 'Initial Condition']],
    how='left',
    left_index=True,
    right_index=True,
)
#annotations.Dataset = annotations.Dataset.astype(float).astype(int).astype(str)

# Gene id -> symbol table; `gene_list` is the set of symbols that are present
# in the expression data and also appear in the index of `genes_df`.
genes = pd.read_csv(
    '/Users/pwangel/Data/ensembl_hg38.91/gene_to_symbol_ensembl91_human.tsv',
    sep='\t',
    index_col=0,
    names=['symbol'],
)
shared_gene_ids = np.intersect1d(data.index, genes.index)
gene_list = np.intersect1d(
    genes.loc[shared_gene_ids].symbol.values,
    genes_df.index.values,
)

# The chip id is the part of the sample index before the first ';'.
annotations['chip_id'] = [
    sample_id.split(';')[0] for sample_id in annotations.index.values.astype(str)
]

# Keep RNASeq samples only, excluding dataset 7275, which is mistakenly
# included (it is annotated endoderm).
is_rnaseq = annotations.Platform_Category == 'RNASeq'
not_dataset_7275 = annotations.Dataset != '7275.0'
annotations = annotations.loc[is_rnaseq & not_dataset_7275]
data = data[annotations.chip_id]
data = functions.transform_to_percentile(data)

# Run pca
# NOTE(review): the model is fitted on transform_to_percentile(data.T) but
# the coordinates come from the plain data.T -- confirm this is intended.
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
pca.fit(functions.transform_to_percentile(data.transpose()))
pca_coords = pca.transform(data.transpose())
functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=['generic_sample_type', 'Platform_Category', 'Dataset'],
    colour_dict={},
    pcs=[1, 2, 3],
    out_file='/Users/pwangel/PlotlyWorkspace/combine_data/naive_stemcells/RNASeq_only_pluripotent.html',
)

#### Apply k means clustering to divide genes on/off state
kmeans = sklearn.cluster.KMeans(n_clusters=2)
#data_output = pd.DataFrame(index=gene_list, columns=['Bimodal val', 'Low_Expr', 'High_Expr', 'Low_Std', 'High_Std'])
data = pd.read_csv( '/Users/pwangel/Downloads/myeloid_atlas_expression_v7.1.tsv', sep='\t', index_col=0) annotations = pd.read_csv( '/Users/pwangel/PlotlyWorkspace/combine_data/blood/outputs_for_front_end/iMac_annotations.tsv', sep='\t', index_col=0) genes = pd.read_csv('/Users/pwangel/Downloads/myeloid_atlas_genes.tsv', sep='\t', index_col=0) # In[29]: data = functions.transform_to_percentile(data) # Only need to compute gene variance fraction if not done already, in the above we have already read a previously calculated version into the gene dataframe # In[6]: #genes = functions.calculate_platform_dependence(data, annotations) #genes.to_csv('/Users/pwangel/Downloads/myeloid_atlas_genes.tsv', sep='\t') # In[30]: pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full') pca.fit( functions.transform_to_percentile( data.loc[genes.Platform_VarFraction.values <= 0.2]).transpose()) pca_coords = pca.transform(
#### This is an example script utilising the Mann Whitney Ranksum test implement in scipy. #### The groups being tested here are the in vitro vs in vivo DC1 cells from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot from plotly.graph_objs import * data = pd.read_csv('/location/of/expression.tsv', sep='\t', index_col=0) annotations = pd.read_csv('/location/of/annotations.tsv', sep='\t', index_col=0) genes = pd.read_csv('/location/of/myeloid_atlas_genes_v7.1.tsv', sep='\t', index_col=0) cut_data = functions.transform_to_percentile(data.loc[genes.inclusion.values]) # Select only DC1 samples annotations = annotations.loc[annotations.tier1 == 'DC1'] #### Select only DC1 cells for example cut_data = cut_data[annotations.index.values] pvals = np.array([]) delta_median = np.array([]) # Define dataframe to keep results in. Also keep the mean and std of each group for the hell of it df_output = pd.DataFrame(index=gene_list, columns=[ 'P val', 'In vitro mean', 'In vivo mean',
# Cluster-aggregated single-cell expression and its metadata.
sc_data = pd.read_csv(
    '/Users/pwangel/Data/Single_Cell/Han/aggregated_by_cluster_100_0pt0.tsv',
    sep='\t',
    index_col=0,
)
sc_annotations = pd.read_csv(
    '/Users/pwangel/Data/Single_Cell/Han/aggregated_by_cluster_metadata_100_0pt0.tsv',
    sep='\t',
    index_col=0,
)
# The cell type is copied into LM_Group_COLOR, the grouping column name used
# for the existing annotations.
sc_annotations['LM_Group_COLOR'] = sc_annotations.celltype.values

# Inner-join the single-cell columns onto the existing expression table by
# index, filling missing values with 0.0, and stack the annotation tables.
data = data.merge(
    sc_data,
    how='inner',
    left_index=True,
    right_index=True,
).fillna(0.0)
annotations = pd.concat([annotations, sc_annotations])

#data = np.log2(1.e6*data/data.sum()+1)
data = functions.transform_to_percentile(data)

# No gene filtering here: both names refer to the same transformed table.
cut_data = all_ranked_data = data
#cut_data = functions.transform_to_percentile(data.loc[genes.loc[genes.inclusion.values].index.values.astype(str), annotations.index.values])
#all_ranked_data = functions.transform_to_percentile(data.loc[:, annotations.index.values])

# All-True sample mask (one entry per annotation row), so every sample is
# currently selected.
sel_samples = np.ones(annotations.shape[0], dtype=bool)
annotations = annotations.loc[sel_samples]
cut_data = cut_data.loc[:, sel_samples]
cut_unfiltered_data = all_ranked_data.loc[:, sel_samples]

gmm = sklearn.mixture.GaussianMixture(n_components=2)
# Number of distinct datasets per platform category, printed without pandas
# truncating rows or columns.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    dataset_platform_pairs = annotations[['Dataset', 'Platform_Category']].drop_duplicates()
    print(dataset_platform_pairs.groupby('Platform_Category').size())

# In[67]:

# Number of samples per platform category.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(annotations.groupby(['Platform_Category']).size())

# Now to actually make the atlas. First step of the two-step atlas process:
# transform expression values to percentile values.

# In[34]:

data = functions.transform_to_percentile(data)

# Second step: model the influence of platform upon expression for each gene.
# As this can take a while, the results are often saved and read back in
# rather than recomputed. In this case they are saved in
# 'pluripotent_atlas_genes_with_ext.tsv'.

# In[35]:

#genes = functions.calculate_platform_dependence(data, annotations)
#genes.to_csv('../data/pluripotent_atlas_genes_with_ext.tsv', sep='\t')
#genes = pd.read_csv('../data/pluripotent_atlas_genes.tsv', sep='\t')
genes = pd.read_csv(
    '../data/pluripotent_atlas_genes_with_ext.tsv',
    sep='\t',
)

# Run the PCA on the expression data of the filtered, transformed genes.
# Author's note: the gene filter threshold is 0.25; this value has not been
# examined closely, and a higher value might allow more components into the
# PCA.

# In[36]:

pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
names=['symbol']) genes_conversion = genes_conversion.loc[main_ensembl_ids] genes = genes_s4m.merge(genes_conversion, how='left', left_index=True, right_index=True) annotations = annotations.loc[np.in1d( annotations.Dataset.values.astype(int), [7124, 7135, 7240, 7253])] #, 6884, 7253])] annotations = annotations.loc[np.in1d(annotations.LM_Group_COLOR, ['naive', 'primed'])] data = data[ annotations.chip_id] #Not sure if the samples are in the right order data = functions.transform_to_percentile(data) #Loop through the sample types and genes to find differentially expressed genes for i_gene in gene_list: ensembl_id = genes_conversion.index.values[genes_conversion.symbol.values == i_gene] fig = Figure() for i_dataset in annotations.Dataset.unique(): for i_type, i_colour in zip(['naive', 'primed'], ['red', 'blue']): sel = (annotations.LM_Group_COLOR == i_type) & (annotations.Dataset == i_dataset) fig.add_trace( Histogram(x=data.loc[ensembl_id, sel.values].values[0],
# Add the external samples: inner-join their expression columns by index and
# append their annotation rows.
data = data.merge(
    ext_data,
    how='inner',
    left_index=True,
    right_index=True,
)
annotations = pd.concat([annotations, ext_annotations])

# In[49]:

# Shapes of the annotation table and the expression table, one per line.
print(annotations.shape, data.shape, sep='\n')

# In[50]:

data = functions.transform_to_percentile(data)

# The gene variance fraction only needs to be computed if that has not been
# done already; here a previously calculated version is read into the gene
# dataframe instead.

# In[51]:

#genes = functions.calculate_platform_dependence(data, annotations)
#genes.to_csv('/Users/pwangel/Downloads/temp_ext_blood_atlas_genes.tsv', sep='\t')
genes = pd.read_csv(
    '/Users/pwangel/Downloads/temp_ext_blood_atlas_genes.tsv',
    sep='\t',
    index_col=0,
)

# In[52]: