def blomen_2015():
    """
    Extract insertion-based fitness data of Blomen et al. 2015.

    Reads the '<cell line>_full_dataset' sheet of supplemental table S1
    (KBM7) and S2 (HAP1), joins both tables on their Ensembl gene IDs and
    keeps only the columns listed in ``insertion_columns``.

    Output (directory papers/blomen_2015):
        blomen_2015_fitness_orig
            selected columns, indexed by gene_ensembl
        blomen_2015_fitness_ncbi_gene
            same table after
            mapper.gene_ensembl_2_gene_ncbi_unambiguously (taxon 9606)
    """

    def _load_cellline_table(table_path, cell_line):
        # One sheet per cell line; second row of the sheet holds the header
        table = pd.read_excel(
            table_path,
            sheetname=cell_line + '_full_dataset',
            header=1)
        table['tot.insertions'] = table['tot.sense'] + table['tot.anti']
        table['selected'] = table['selected'] == 'YES'
        table = table.drop('GENE_SYMBOL', axis=1)
        table = table.set_index('ENSEMBL_ID')

        # Prefix every column with publication and cell line
        label = 'Blomen2015__' + cell_line
        table.columns = [
            '{}: {}'.format(label, column) for column in table.columns]
        return table

    def _write_gzipped(frame, basename):
        frame.to_csv(
            os.path.join(out_dir, '{}.csv.gz'.format(basename)),
            compression='gzip',
            index=True)

    out_dir = io.get_output_path('papers/blomen_2015')
    io.ensure_presence_of_directory(out_dir)

    supplements = [
        ('KBM7', 'out/papers/Blomen2015/aac7557_SM_Table_S1.xlsx'),
        ('HAP1', 'out/papers/Blomen2015/aac7557_SM_Table_S2.xlsx'),
    ]
    per_cell_line = [
        _load_cellline_table(
            io.get_geisen_manual_data_path(relative_path), cell_line)
        for cell_line, relative_path in supplements
    ]

    blomen2015 = pd.concat(
        per_cell_line, join='outer', verify_integrity=True, axis=1)
    blomen2015.index.name = 'gene_ensembl'  # science of biology nomenclature

    # Select features which describe insertions, rather than ratios
    # (note: in science of biology v.0.1 this was part of the predict
    # module)
    # NOTE(review): 'Blomen2015__HAP1: tot.sense' is not selected although
    # its KBM7 counterpart is -- confirm whether this is intentional.
    insertion_columns = [
        'Blomen2015__KBM7: tot.sense',
        'Blomen2015__KBM7: tot.anti',
        'Blomen2015__KBM7: p.val',
        'Blomen2015__KBM7: q.val',
        'Blomen2015__HAP1: tot.anti',
        'Blomen2015__HAP1: p.val',
        'Blomen2015__HAP1: q.val',
    ]
    blomen2015 = blomen2015.loc[:, insertion_columns]
    _write_gzipped(blomen2015, 'blomen_2015_fitness_orig')

    blomen2015_entrez = mapper.gene_ensembl_2_gene_ncbi_unambiguously(
        blomen2015, taxon_id=9606)
    _write_gzipped(blomen2015_entrez, 'blomen_2015_fitness_ncbi_gene')
def rosenfeld_2013():
    """
    Patent data on human genes (Rosenfeld et al. 2013).

    Note that companies usually patent an n-mer sequence, and its
    variants, thus they do not really patent individual genes, but
    sequences that have some similarity to genes.

    Output (directory papers/rosenfeld_2013), written through
    _save_orig_and_ncbi_gene_mapped_tables:
        rosenfeld_2013_patents
            de-duplicated supplemental table and its counterpart
            obtained from mapper.symbol_2_gene_ncbi
    """

    source_file = io.get_geisen_manual_data_path(
        'out/papers/rosenfeld2013/13073_2013_415_MOESM1_ESM.XLS')

    # controlled vocabulary for column names
    renaming = {'Patent': 'patent', 'Matching Gene': 'symbol_ncbi'}

    # first three rows of the spreadsheet are skipped
    patents = (
        pd.read_excel(source_file, skiprows=3)
        .drop_duplicates()
        .rename(columns=renaming))

    patents_entrez = mapper.symbol_2_gene_ncbi(patents, 9606, 'substitute')

    out_dir = io.get_output_path('papers/rosenfeld_2013')
    io.ensure_presence_of_directory(out_dir)

    _save_orig_and_ncbi_gene_mapped_tables(
        p_dir=out_dir,
        filebase='rosenfeld_2013_patents',
        df_orig=patents,
        df_ncbi=patents_entrez)
def wang_2015():
    """
    Wang et al. 2015 (loss of function mutation monitoring fitness)

    Splits supplemental table S3 into the 'CS' columns and the
    'adjusted p-value' columns of the KBM7, Jiyoye and Raji cell lines.

    Output (directory papers/wang_2015), written through
    _save_orig_and_ncbi_gene_mapped_tables:
        wang_2015_cs
        wang_2015_pvalue
    """

    source_file = io.get_geisen_manual_data_path(
        'out/papers/wang2015/aac7041_SM_Table_S3.xlsx')
    out_dir = io.get_output_path('papers/wang_2015')
    io.ensure_presence_of_directory(out_dir)

    scores = pd.read_excel(source_file)
    scores = scores.drop('sgRNAs included', axis=1)
    scores = scores.rename(columns={'Gene': 'symbol_ambiguous'})
    scores = scores.set_index('symbol_ambiguous', verify_integrity=True)

    # Remove K562 CS cells, as 39 of the 63 cell specific hits, are artifact
    # of genome location (see publication)
    scores = scores.drop(['K562 CS', 'K562 adjusted p-value'], axis=1)

    scores.columns = [
        'Wang2015: {}'.format(column) for column in scores.columns]

    # (file base, columns) of every table that gets exported
    exports = [
        ('wang_2015_cs', [
            'Wang2015: KBM7 CS',
            'Wang2015: Jiyoye CS',
            'Wang2015: Raji CS',
        ]),
        ('wang_2015_pvalue', [
            'Wang2015: KBM7 adjusted p-value',
            'Wang2015: Jiyoye adjusted p-value',
            'Wang2015: Raji adjusted p-value',
        ]),
    ]

    # Map everything first, so that nothing is written if a mapping fails
    prepared = []
    for filebase, columns in exports:
        by_symbol = scores.loc[:, columns]
        by_gene = mapper.symbol_2_gene_ncbi(
            by_symbol,
            taxon_id=9606,  # Homo sapiens
            how='median')
        prepared.append((filebase, by_symbol, by_gene))

    for filebase, by_symbol, by_gene in prepared:
        _save_orig_and_ncbi_gene_mapped_tables(
            p_dir=out_dir,
            filebase=filebase,
            df_orig=by_symbol,
            df_ncbi=by_gene)
def hart_2015():
    """
    Extracts fitness phenotypes from Hart et al., and saves them
    together with their NCBI gene ID.

    Every dataset (one 'BF_*' column of the supplemental table) is
    isolated and written as a separate pair of files: one indexed by
    the original gene symbols, one after mapper.symbol_2_gene_ncbi.

    Output (directory papers/hart_2015), for every key in
    ``column_by_key``:
        hart_2015_<key>_ordnum_orig
        hart_2015_<key>_ordnum_gene_ncbi
    """

    out_dir = io.get_output_path('papers/hart_2015')
    io.ensure_presence_of_directory(out_dir)

    source_file = io.get_geisen_manual_data_path(
        'out/papers/hart2015/mmc3_TSDeletedThoseWithExcelToDateConversion.xlsx'
    )

    hart2015 = pd.read_excel(source_file)
    hart2015 = hart2015.rename(columns={'Gene': 'symbol_ambiguous'})
    hart2015 = hart2015.set_index('symbol_ambiguous', verify_integrity=True)
    hart2015.columns = [
        'Hart2015: {}'.format(column) for column in hart2015.columns]

    hart2015_entrez = mapper.symbol_2_gene_ncbi(
        hart2015,
        taxon_id=9606,  # Homo sapiens
        how='median')

    # NOTE(review): the keys already carry 'hart2015_' and '_ordnum', so
    # the resulting file names repeat both parts, e.g.
    # 'hart_2015_hart2015_hct116_ordnum_ordnum_orig'. Left untouched, as
    # readers of these files are not visible from here -- confirm.
    column_by_key = [
        ('hart2015_hct116_ordnum', 'Hart2015: BF_hct116'),
        ('hart2015_hela_ordnum', 'Hart2015: BF_hela'),
        ('hart2015_gbm_ordnum', 'Hart2015: BF_gbm'),
        ('hart2015_rpe1_ordnum', 'Hart2015: BF_rpe1'),
        ('hart2015_dld1_ordnum', 'Hart2015: BF_dld1'),
        ('hart2015_a375_ko_ordnum', 'Hart2015: BF_a375_GeCKo'),
        ('hart2015_hct116_shRNA_ordnum', 'Hart2015: BF_hct116_shRNA'),
    ]

    def _write_gzipped(frame, basename):
        frame.to_csv(
            os.path.join(out_dir, '{}.csv.gz'.format(basename)),
            compression='gzip',
            index=True)

    for key, column in column_by_key:
        # double brackets: keep a single-column DataFrame, not a Series
        _write_gzipped(
            hart2015.loc[:, [column]],
            'hart_2015_{}_ordnum_orig'.format(key))
        _write_gzipped(
            hart2015_entrez.loc[:, [column]],
            'hart_2015_{}_ordnum_gene_ncbi'.format(key))
def _load_batches(path_pattern, dataset):
    """
    Load one dataset from every batch directory and stack the batches.

    Input:
        path_pattern    glob pattern (relative to the geisen manual data
                        directory) that matches the batch directories
        dataset         name of the table; '<dataset>.txt' is read from
                        every matched directory

    Output:
        df              DataFrame with the rows of all batches, exact
                        duplicates removed, and the columns 'InputTerm'
                        and 'Symbol' in lower case
    """

    batch_dirs = glob.glob(io.get_geisen_manual_data_path(path_pattern))
    file_name = '{}.txt'.format(dataset)

    batches = [
        pd.read_table(os.path.join(batch_dir, file_name))
        for batch_dir in batch_dirs
    ]

    # Note: duplicates are removed before lower-casing, thus rows that
    # only differ in the case of a symbol are both kept
    df = pd.concat(batches).drop_duplicates()

    # place symbols lower-case as genealacart does not appear
    # to distinguish internally
    for column in ('InputTerm', 'Symbol'):
        df.loc[:, column] = df.loc[:, column].str.lower()

    return df
def lek_2016():
    """
    ExAc database, as published by Lek et al. 2016

    Reads the 'Gene Constraint' sheet of supplemental table 13, strips
    the version suffix of the transcript identifiers, and exports two
    groups of columns; each group is written through
    _save_orig_and_ncbi_gene_mapped_tables, once indexed by rna_ensembl
    and once after mapper.rna_ensembl_2_gene_ncbi (how='median').

    Output (directory papers/lek_2016):
        lek2016_aberration_ordnum       enrichment of aberrations
                                        (syn_z, mis_z, lof_z, pLI, pRec,
                                        pNull)
        lek2016_anticipation_ordnum     anticipated background rates
                                        (exp_syn, exp_mis, exp_lof)
    """

    p_out = io.get_output_path('papers/lek_2016')
    io.ensure_presence_of_directory(p_out)

    # high level representation (at transcript level)
    p = io.get_geisen_manual_data_path(
        'out/papers/lek2016/nature19057-SI Table 13.xlsx')

    # data sheet with information on all genes
    df = pd.read_excel(p, sheetname='Gene Constraint')

    # reformatting
    df = df.rename(columns={'transcript': 'rna_ensembl'})  # controlled vocabulary

    # ignore versions of transcripts: remove everything starting at the
    # first dot (raw string, so that '\.' is not an invalid escape
    # sequence of the string literal)
    df['rna_ensembl'] = df['rna_ensembl'].replace(
        r'\..*$', '', regex=True)

    v = 'lek2016_aberration_ordnum'
    df_aberration = df[[
        'rna_ensembl', 'syn_z', 'mis_z', 'lof_z', 'pLI', 'pRec', 'pNull'
    ]].set_index('rna_ensembl')
    per_gene_aberration = mapper.rna_ensembl_2_gene_ncbi(
        df_aberration, how='median')
    _save_orig_and_ncbi_gene_mapped_tables(
        p_out,
        filebase=v,
        df_orig=df_aberration,
        df_ncbi=per_gene_aberration)

    v = 'lek2016_anticipation_ordnum'
    df_anticipation = df[[
        'rna_ensembl', 'exp_syn', 'exp_mis', 'exp_lof'
    ]].set_index('rna_ensembl')
    per_gene_anticipation = mapper.rna_ensembl_2_gene_ncbi(
        df_anticipation, how='median')
    _save_orig_and_ncbi_gene_mapped_tables(
        p_out,
        filebase=v,
        df_orig=df_anticipation,
        df_ncbi=per_gene_anticipation)
def thul_2017():
    """
    Protein subcellular localization from human protein atlas
    (Thul et al. 2017, supplemental table S6).

    Keeps the Ensembl gene ID, one column per subcellular location and
    the 'Reliability' column.

    Output (directory papers/thul_2017), written through
    _save_orig_and_ncbi_gene_mapped_tables:
        thul_2017_subcellular_localization
            table indexed by gene_ensembl, and its counterpart from
            mapper.gene_ensembl_2_gene_ncbi_unambiguously (taxon 9606)
    """

    p_in = io.get_geisen_manual_data_path(
        'out/papers/thul2017/aal3321_Thul_SM_table_S6.xlsx')

    # Own output directory, like every other paper of this module
    # (was 'papers/uhlen_2015', i.e. the directory of another dataset)
    p_out = io.get_output_path('papers/thul_2017')
    io.ensure_presence_of_directory(p_out)

    df = pd.read_excel(p_in)

    col = [
        'ENSG',
        'Nucleus',
        'Nucleoplasm',
        'Nuclear bodies',
        'Nuclear speckles',
        'Nuclear membrane',
        'Nucleoli',
        'Nucleoli (Fibrillar center)',
        'Cytosol',
        'Cytoplasmic bodies',
        'Rods and Rings',
        'Lipid droplets',
        'Aggresome',
        'Mitochondria',
        'Microtubules',
        'Microtubule ends',
        'Microtubule organizing center',
        'Centrosome',
        'Mitotic spindle',
        'Cytokinetic bridge',
        'Midbody',
        'Midbody ring',
        'Intermediate filaments',
        'Actin filaments',
        'Focal Adhesions',
        'Endoplasmic reticulum',
        'Golgi apparatus',
        'Vesicles',
        'Plasma membrane',
        'Cell Junctions',
        'Reliability',
    ]
    df = df.loc[:, col]

    df = df.rename(columns={'ENSG': 'gene_ensembl'})
    # verify_integrity: fail if an Ensembl gene ID occurs more than once
    df = df.set_index('gene_ensembl', verify_integrity=True)

    df_entrez = mapper.gene_ensembl_2_gene_ncbi_unambiguously(
        df, taxon_id=9606)

    v = 'thul_2017_subcellular_localization'
    _save_orig_and_ncbi_gene_mapped_tables(
        p_dir=p_out,
        filebase=v,
        df_orig=df,
        df_ncbi=df_entrez)
def matt_antalek_170222():
    """
    Matt Antalek (Rick Morimoto lab) downloaded on 170222 tissue data
    of several model organisms; Used cutoff was 0, and when a filter
    would be required by the web-interface he chose reasonable
    representative ones

    Every file matching 'E-*.tsv' is processed separately. Experiment
    and condition are parsed from the file name
    ('<experiment>-<digit>...-results_<condition key>.tsv'); the
    condition key must be present in ``condition_codes``.

    Output (directory gxa/matt_antalek_170222):
        <experiment>-taxon_id-<taxon>[-<qualifier>]
            written through _save_orig_and_ncbi_gene_mapped_tables, or,
            for taxa listed in ``taxa_without_nih_ensembl``, only as
            '<...>_orig.csv.gz'

    Raises:
        ValueError      if a file name does not match the expected
                        pattern, or an entry of ``condition_codes`` is
                        neither a list nor an int
        KeyError        if the condition key of a file name is not
                        part of ``condition_codes``
    """

    # manually curated condition codes:
    # dictionary with extension as key, and entries
    # - taxon_id
    # - if qualifier: [taxon_id, qualifier]
    # (keys must match the file names literally, thus spelling is kept)
    condition_codes = {
        'rattus_norvegicus_female': [10116, 'female'],
        'rattus_norvegicus_male': [10116, 'male'],
        'ovis_aries_texel': [9940, 'texel'],
        'ovis_aries_female': [9940, 'female'],
        'ovis_aries_male': [9940, 'male'],
        'mus_musculus': 10090,
        'bos_taurus': 9913,
        'gallus_gallus': 9031,
        'macaca_mulatta': 9544,
        'homo_sapiens': 9606,
        'pabio_anubis': 9555,  # olive baboon
        'monodelphis_domestica': 13616,
        'xenopus_tropicalis': 8364,
        'anolis_carolinesis': 28377,
    }

    # for some taxa NIH does not have mapping to ensembl
    taxa_without_nih_ensembl = [8364]

    # Raw string: '\.' has to reach the regular expression engine as an
    # escaped dot rather than being an invalid string escape
    file_name_pattern = re.compile(r'^(.*)-[0-9].*-results_(.*)\.tsv')

    def add_GXA_to_label(x):  # introduced in geisen v1_1
        if not x.startswith('gene'):
            x = 'GXA_' + x
        return x

    p_dir_in = io.get_geisen_manual_data_path(
        'out/'
        'ebi_expression_manual/'
        'matt_antalek_170222/'
        'E-*.tsv')  # filter for correct files

    p_out = io.get_output_path('gxa/matt_antalek_170222')
    io.ensure_presence_of_directory(p_out)

    for p in glob.glob(p_dir_in):

        # column names are taken from the fourth row of the file
        df = pd.read_table(p, header=3)
        df = df.rename(columns={'Gene ID': 'gene_ensembl'})
        df = df.drop('Gene Name', axis=1)
        df.columns = [add_GXA_to_label(y) for y in df.columns]

        _, fname = os.path.split(p)
        matched = file_name_pattern.findall(fname)
        if len(matched) != 1:
            raise ValueError('Unexpected format. Check parsing pattern.')
        experiment, k = matched[0]

        meta = condition_codes[k]
        if isinstance(meta, list):
            taxon_id, condition = meta[0], meta[1]
            v = '{}-taxon_id-{}-{}'.format(experiment, taxon_id, condition)
        elif isinstance(meta, int):
            taxon_id = meta
            v = '{}-taxon_id-{}'.format(experiment, taxon_id)
        else:
            raise ValueError('Unexpected format. Check condition_codes.')

        if taxon_id not in taxa_without_nih_ensembl:
            # If NIH has corresponding ensembl for ncbi gene IDs,
            # save original, and ncbi_gene mapped
            df_entrez = mapper.gene_ensembl_2_gene_ncbi_unambiguously(
                df, taxon_id)
            _save_orig_and_ncbi_gene_mapped_tables(
                p_dir=p_out,
                filebase=v,
                df_orig=df,
                df_ncbi=df_entrez)
        else:
            # no mapping available: only save the original table
            df.to_csv(
                os.path.join(p_out, '{}_orig.csv.gz'.format(v)),
                compression='gzip',
                index=True)
def uhlen_2015():
    """
    - RNA transcript data from human protein atlas.
    - log transform fpkm
    - Expression threshold is 1 fpkm (0 in log transform), as in
      original paper; values below the threshold are set to NaN

    Output (directory papers/uhlen_2015), each written through
    _save_orig_and_ncbi_gene_mapped_tables:
        uhlen_2015_detected_in_cells        fraction of cell lines with
                                            a value at or above threshold
        uhlen_2015_detected_in_tissuess     same for tissues
        uhlen_2015_cells_levels             log10 fpkm in cell lines
        uhlen_2015_tissue_levels            log10 fpkm in tissues
    """

    p_in = io.get_geisen_manual_data_path(
        'out/papers/uhlen2015/1260419_Excel_TablesS1-S18.xlsx')
    p_out = io.get_output_path('papers/uhlen_2015')
    io.ensure_presence_of_directory(p_out)

    def get_single_sheet(name_of_sheet):
        # A list of sheet names makes read_excel return a dictionary
        df = pd.read_excel(p_in, sheetname=[name_of_sheet])
        return df[name_of_sheet]

    def tidy_and_index(df):
        df = df.drop('gene_name', axis=1)

        df = df.set_index(['enstid'])    # They use wrong name, as identifiers
        df.index.name = 'gene_ensembl'   # are actually genes (each occurs once)

        threshold_used_by_Uhlen_2015 = 1     # Take author's detection threshold
        default_for_not_detected = np.nan    # and ignore values below
        f = df < threshold_used_by_Uhlen_2015
        df[f] = default_for_not_detected
        return df

    def log10_fun(x):
        # Vectorised over the whole table (replaces an element-wise
        # applymap, which is deprecated in recent versions of pandas);
        # NaN stays NaN
        return np.log10(x)

    def get_detected_fraction(df):
        # NaN marks values below the detection threshold
        d = 1 - df.isnull().sum(axis=1) / df.shape[1]
        return d

    df_cell_lines = get_single_sheet('S11. FPKM Cell-lines')
    df_tissues = get_single_sheet('S18. Full FPKM dataset, tissues')

    df_cell_lines = tidy_and_index(df_cell_lines)
    df_tissues = tidy_and_index(df_tissues)

    df_cell_lines_log10 = log10_fun(df_cell_lines)
    df_tissues_log10 = log10_fun(df_tissues)

    df_cell_lines_log10.columns = [
        'uhlen_2015_cells_log10fpkm: {}'.format(j)
        for j in df_cell_lines_log10.columns
    ]

    # NOTE(review): the tissue columns receive the same
    # 'uhlen_2015_cells_log10fpkm' prefix as the cell lines. Kept as is,
    # since readers of the exported tables are not visible from here --
    # confirm whether 'tissues' was intended.
    df_tissues_log10.columns = [
        'uhlen_2015_cells_log10fpkm: {}'.format(j)
        for j in df_tissues_log10.columns
    ]

    # From Science of Biology v.0.1 / Predict module
    uhlen2015_tissues_levels = df_tissues_log10
    uhlen2015_cells_levels = df_cell_lines_log10
    uhlen2015_cells_levels.columns = [
        j.replace('.MEAN', '') for j in uhlen2015_cells_levels.columns
    ]

    detected_in_cells = get_detected_fraction(
        uhlen2015_cells_levels).to_frame(
            'uhlen_2015_fraction_detection_cells')
    detected_in_tissues = get_detected_fraction(
        uhlen2015_tissues_levels).to_frame(
            'uhlen_2015_fraction_detection_tissues')

    detected_in_cells_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            detected_in_cells, taxon_id=9606)
    detected_in_tissues_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            detected_in_tissues, taxon_id=9606)

    # correct identity of cell line, also see:
    # http://www.proteinatlas.org/learn/cellines
    uhlen2015_cells_levels = uhlen2015_cells_levels.rename(columns={
        'uhlen_2015_cells_log10fpkm: km3': 'uhlen_2015_cells_log10fpkm: reh'
    })

    uhlen2015_cells_levels_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            uhlen2015_cells_levels, taxon_id=9606)

    # science of biology v.0.1 did log again
    uhlen2015_tissues_levels_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            uhlen2015_tissues_levels, taxon_id=9606)

    v = 'uhlen_2015_detected_in_cells'
    _save_orig_and_ncbi_gene_mapped_tables(
        p_dir=p_out,
        filebase=v,
        df_orig=detected_in_cells,
        df_ncbi=detected_in_cells_entrez)

    # NOTE(review): file base ends on 'tissuess'; kept, as renaming the
    # file could break existing readers -- confirm.
    v = 'uhlen_2015_detected_in_tissuess'
    _save_orig_and_ncbi_gene_mapped_tables(
        p_dir=p_out,
        filebase=v,
        df_orig=detected_in_tissues,
        df_ncbi=detected_in_tissues_entrez)

    v = 'uhlen_2015_cells_levels'
    _save_orig_and_ncbi_gene_mapped_tables(
        p_dir=p_out,
        filebase=v,
        df_orig=uhlen2015_cells_levels,
        df_ncbi=uhlen2015_cells_levels_entrez)

    v = 'uhlen_2015_tissue_levels'
    _save_orig_and_ncbi_gene_mapped_tables(
        p_dir=p_out,
        filebase=v,
        df_orig=uhlen2015_tissues_levels,
        df_ncbi=uhlen2015_tissues_levels_entrez)
def rolland_2014():
    """
    Processes supplemental data of Rolland et al. 2014 (binary
    interaction; three methods) to extract:
    - interactions with same gene or other genes (stratified by
        support level)
    - binary interaction table (note: of genes with at least
        one interaction)
    - list of genes, which were tested

    Requirement:
        papers/rolland2014/mmc3.xlsx
        (sheet '2B' needs the manually inserted header 'Tsdummyheader')

    Output (directory papers/rolland_2014):
        rolland_considered_genes
            NCBI gene IDs parsed from the bait table, sorted
        rolland_counts_of_interactions
            per considered gene: presence of self interactions, and
            number of interactions with other genes; interactions are
            counted for both partners
        rolland_table_binary_interactions
            interactions with screens_found > 0, as listed in sheet
            '2G' (every pair in the orientation of the sheet)
    """

    p_in = io.get_geisen_manual_data_path('out/papers/rolland2014/mmc3.xlsx')
    p_out = io.get_output_path('papers/rolland_2014')
    io.ensure_presence_of_directory(p_out)

    def _write_gzipped(frame, basename, with_index):
        frame.to_csv(
            os.path.join(p_out, '{}.csv.gz'.format(basename)),
            compression='gzip',
            index=with_index)

    sheets_of_interest = ['2B', '2G']
    rolland = pd.read_excel(p_in, sheetname=sheets_of_interest)

    # Considered Genes
    bait_table = rolland['2B']
    # raw string: '\|' is an escaped pipe of the regular expression
    entrez_pattern = re.compile(r'entrez_gene_id=(.*)\|')
    unique_entrez = set()
    count_of_invalid_baits = 0
    for row in bait_table.itertuples():
        t = row.Tsdummyheader  # Had manually inserted header
        ma = entrez_pattern.search(t)
        if not ma:
            continue
        matched = ma.group(1)
        if matched == 'NA':
            count_of_invalid_baits += 1
        else:
            unique_entrez.add(int(matched))

    # sorted: order of the exported list does not depend on set internals
    considered_entrez = sorted(unique_entrez)
    print('Rolland2014: Ignored {} baits that do not map to a gene.'.format(
        count_of_invalid_baits))

    v = 'rolland_considered_genes'
    df = pd.DataFrame(data=considered_entrez, columns=[v])
    _write_gzipped(df, v, with_index=False)

    interaction_table = rolland['2G']
    c = ['entrez_gene_ida', 'entrez_gene_idb', 'screens_found']
    f = interaction_table['screens_found'] > 0
    df = interaction_table.loc[f, c]

    # NOTE(review): the exported table keeps the orientation of the
    # sheet (as before); confirm whether the symmetric table created
    # below should be exported instead.
    v = 'rolland_table_binary_interactions'
    _write_gzipped(df, v, with_index=False)

    # Create table where each gene of a non-self interaction occurs
    # once as _ida, and once as _idb; note that this was ignored
    # by accident in science of biology v0.1
    #
    # The mirrored copy must carry the column labels of the original
    # table: pd.concat aligns columns by label, thus merely reordering
    # the columns would stack the very same rows a second time.
    df_i = df.iloc[:, [1, 0, 2]].copy()
    df_i.columns = df.columns
    df_j = pd.concat([df, df_i], axis=0, ignore_index=True)
    df_j = df_j.drop_duplicates()  # safety to avoid counting self twice

    # Count occurrences (note: code for readability rather than speed)
    df = pd.DataFrame(index=considered_entrez, columns=[
        'self_interaction_any_evidence',
        'self_interaction_multiple_evidence',
        'trans_interaction_any_evidence',
        'trans_interaction_multiple_evidence',
    ])
    df = df.fillna(False)  # Python internally treats False and 0 as same
    df = df.sort_index()

    for id_a, id_b, support in df_j.itertuples(index=False):
        is_self = id_a == id_b
        multiple = support > 1

        if is_self:
            df.loc[id_a, 'self_interaction_any_evidence'] = True
            if multiple:
                df.loc[id_a, 'self_interaction_multiple_evidence'] = True
        else:
            df.loc[id_a, 'trans_interaction_any_evidence'] += 1
            if multiple:
                df.loc[id_a, 'trans_interaction_multiple_evidence'] += 1

    v = 'trans_interaction_multiple_evidence'  # appears to never occur
    if not (any(df[v])):
        df = df.drop(v, axis=1)

    df.columns = ['Rolland2014: {}'.format(j) for j in df.columns]

    v = 'rolland_counts_of_interactions'
    df.index.name = 'gene_ncbi'
    _write_gzipped(df, v, with_index=True)
def itzhak_2016():
    """
    Protein localization, and abundance, as measured for HeLa cells by
    Itzhak et al. 2016

    Output (directory papers/itzhak_2016), each written through
    _save_orig_and_ncbi_gene_mapped_tables with prefix 'Itzhak2016_',
    once indexed by protein_uniprot and once after
    mapper.uniprot_protein_2_gene_ncbi:
        itzhak2016_compartment_nombool
            'Compartment Prediction', without proteins that are
            'not determined' or have 'No Prediction'
        itzhak2016_global_classifier_nombool
            'Global classifier'
        itzhak2016_localization_cytoplasm
            'Cytosolic Pool' (percent)
        itzhak2016_localization_stats_ordnum
            'Prediction Score'
        itzhak2016_protein_abundance_ordnum
            copy number, abundance percentile, concentration and
            contribution to protein mass
    """

    p_out = io.get_output_path('papers/itzhak_2016')
    io.ensure_presence_of_directory(p_out)

    p = io.get_geisen_manual_data_path(
        'out/papers/itzhak2016/'
        'elife-16950-supp1-v3-download-hela-spatial-proteome.csv')
    df = pd.read_csv(p)

    # Controlled vocabulary, and removal of footnote marks, line-break
    # hyphens and stray white space of the column names
    r = {
        'Lead Gene name': 'symbol_ambiguous',
        'Lead Protein ID': 'protein_uniprot',
        'Non-cytosolic pool1 ': 'Non-cytosolic pool',
        'Global classifier2': 'Global classifier',
        'Sub compart-ment Prediction': 'Subcompartment Prediction',
        ' Contribution to cell protein mass [ppm]':
            'Contribution to cell protein mass [ppm]'
    }

    # Columns which are not exported
    c = [
        'symbol_ambiguous',
        'Prediction Confidence',
        'Subcompartment Prediction',
        'Lead Protein name',
        'Mol. weight [kDa]',
        'Sequence length (AA)',
        'Total MS/MS Count',
        'Organellar profiles in how many maps?'
    ]

    df = df.rename(columns=r)
    df = df.drop(c, axis=1)

    def _percent_to_int(x):
        # e.g.: '25%' -> 25
        return int(x.rstrip('%'))

    df['Cytosolic Pool'] = df['Cytosolic Pool'].map(_percent_to_int)
    df['Non-cytosolic pool'] = df['Non-cytosolic pool'].map(_percent_to_int)

    # remove thousands separator before casting
    df['Estimated Copy number per cell'] = df[
        'Estimated Copy number per cell'].str.replace(',', '').astype(int)

    df['Compartment Prediction'] = df['Compartment Prediction'].fillna(
        value='not determined')

    df = df.set_index('protein_uniprot', verify_integrity=True)

    pr = 'Itzhak2016_'

    v = 'itzhak2016_compartment_nombool'
    f = df['Compartment Prediction'].isin(['not determined', 'No Prediction'])
    y = _nominal_ser_2_boolean_df(df.loc[~f, 'Compartment Prediction'])
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='any')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)

    # Global classifier has its own column (was built from
    # 'Compartment Prediction', which duplicated the table above)
    # NOTE(review): assumes 'Global classifier' has no missing values, or
    # that _nominal_ser_2_boolean_df tolerates them -- confirm.
    v = 'itzhak2016_global_classifier_nombool'
    y = _nominal_ser_2_boolean_df(df.loc[:, 'Global classifier'])
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='any')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)

    v = 'itzhak2016_localization_cytoplasm'
    y = df.loc[:, ['Cytosolic Pool']]  # adds up to 100 with non-cytoplasmic
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='median')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)

    v = 'itzhak2016_localization_stats_ordnum'
    y = df.loc[:, ['Prediction Score']]
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='median')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)

    v = 'itzhak2016_protein_abundance_ordnum'
    y = df.loc[:, [
        'Estimated Copy number per cell',
        'Copy number Abundance Percentile',
        'Median cellular con-centration [nM]',
        'Contribution to cell protein mass [ppm]'
    ]]
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='median')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)