Example #1
def rosenfeld_2013():
    """
    Patent data on human genes. Note that companies usually patent
    an n-mer sequence and its variants; thus they do not patent
    individual genes per se, but sequences with some similarity
    to genes.
    """

    p_in = io.get_geisen_manual_data_path(
        'out/papers/rosenfeld2013/13073_2013_415_MOESM1_ESM.XLS')
    df = pd.read_excel(p_in, skiprows=3)
    df = df.drop_duplicates()
    df = df.rename(columns={
        'Patent': 'patent',
        'Matching Gene': 'symbol_ncbi'
    })
    df_entrez = mapper.symbol_2_gene_ncbi(df, 9606, 'substitute')

    p_out = io.get_output_path('papers/rosenfeld_2013')
    io.ensure_presence_of_directory(p_out)

    v = 'rosenfeld_2013_patents'
    _save_orig_and_ncbi_gene_mapped_tables(p_dir=p_out,
                                           filebase=v,
                                           df_orig=df,
                                           df_ncbi=df_entrez)
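# The helper _save_orig_and_ncbi_gene_mapped_tables is used throughout this
# module but not shown in this section. A minimal sketch of its assumed
# contract, inferred from the call sites (the file suffixes and the optional
# fifth argument, used at some later call sites, are assumptions rather than
# the actual implementation):

import os


def _save_orig_and_ncbi_gene_mapped_tables_sketch(p_dir, filebase, df_orig,
                                                  df_ncbi, prefix=''):
    """
    Hypothetical sketch: saves the original table and its
    gene_ncbi-mapped counterpart as gzipped csv files.
    """
    p = os.path.join(p_dir, '{}{}_orig.csv.gz'.format(prefix, filebase))
    df_orig.to_csv(p, compression='gzip', index=True)
    p = os.path.join(p_dir, '{}{}_ncbi_gene.csv.gz'.format(prefix, filebase))
    df_ncbi.to_csv(p, compression='gzip', index=True)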
Example #2
def blomen_2015():
    """
    Extracts fitness phenotypes from Blomen et al., and saves them
    together with their NCBI gene ID. Will only retrieve the insertions
    of the gene-trap cassettes, and will do so for KBM7 and HAP1 cells.
    """

    p_out = io.get_output_path('papers/blomen_2015')
    io.ensure_presence_of_directory(p_out)

    def _tidy_blomen(file_path, cellline):
        s = cellline + '_full_dataset'
        d = pd.read_excel(file_path, sheet_name=s, header=1)
        d['tot.insertions'] = d['tot.sense'] + d['tot.anti']
        d['selected'] = d['selected'] == 'YES'
        d = d.drop('GENE_SYMBOL', axis=1)
        d = d.set_index('ENSEMBL_ID')
        c = 'Blomen2015__' + cellline
        d.columns = [c + ': {}'.format(j) for j in d.columns]
        return d

    fp_KBM7 = io.get_geisen_manual_data_path(
        'out/papers/Blomen2015/aac7557_SM_Table_S1.xlsx')
    cellline = 'KBM7'
    k = _tidy_blomen(fp_KBM7, cellline)

    fp_HAP1 = io.get_geisen_manual_data_path(
        'out/papers/Blomen2015/aac7557_SM_Table_S2.xlsx')
    cellline = 'HAP1'
    h = _tidy_blomen(fp_HAP1, cellline)

    blomen2015 = pd.concat([k, h], join='outer', verify_integrity=True, axis=1)
    blomen2015.index.name = 'gene_ensembl'  # science of biology nomenclature

    # Select features which describe insertions, rather
    # than ratios
    # (note: in science of biology v.0.1 this was part of the predict module)
    c = [
        'Blomen2015__KBM7: tot.sense', 'Blomen2015__KBM7: tot.anti',
        'Blomen2015__KBM7: p.val', 'Blomen2015__KBM7: q.val',
        'Blomen2015__HAP1: tot.anti', 'Blomen2015__HAP1: p.val',
        'Blomen2015__HAP1: q.val'
    ]
    blomen2015 = blomen2015.loc[:, c]

    v = 'blomen_2015_fitness_orig'
    blomen2015.to_csv(os.path.join(p_out, '{}.csv.gz'.format(v)),
                      compression='gzip',
                      index=True)

    blomen2015_entrez = mapper.gene_ensembl_2_gene_ncbi_unambiguously(
        blomen2015, taxon_id=9606)

    v = 'blomen_2015_fitness_ncbi_gene'
    blomen2015_entrez.to_csv(os.path.join(p_out, '{}.csv.gz'.format(v)),
                             compression='gzip',
                             index=True)
def symbol_2_gene_ncbi(df, taxon_id, how):
    """
    - Maps a dataframe with gene symbol IDs to gene_ncbi
    - Places gene_ncbi as the index
    - Only returns genes that could be mapped (inner join)
    - Aggregates according to how (e.g.: median)

    Input:
        df    DataFrame, with symbol_ncbi (or as fallback: symbol_ambiguous)
        taxon_id  int with ncbi taxonomy ID; Required as the same symbols
                    are often used for homologs of different taxa
        how  str, e.g.: median (or 'substitute' to skip aggregation)

    """

    id_name = 'symbol_ncbi'  # Science of Biology nomenclature
    is_column, is_index = _check_for_presence(df,
                                              id_name,
                                              require_presence=False)

    if not is_column and not is_index:
        id_name = 'symbol_ambiguous'  # Science of Biology fall-back
        is_column, is_index = _check_for_presence(
            df, id_name, require_presence=True)  # throw error if no match

    p_mapper = io.get_output_path(
        'ncbi/gene_info/gene_info_taxon_{}.gz'.format(taxon_id))

    if not os.path.exists(p_mapper):
        raise EnvironmentError(
            'symbol_2_gene_ncbi() requires taxon specific gene_info')
    mapper = pd.read_csv(p_mapper, usecols=['gene_ncbi', 'symbol_ncbi'])

    if is_index:
        df = df.reset_index()

    dfm = pd.merge(df,
                   mapper,
                   left_on=id_name,
                   right_on='symbol_ncbi',
                   how='inner')
    dfm = dfm.drop(id_name, axis=1)

    if id_name != 'symbol_ncbi':
        dfm = dfm.drop('symbol_ncbi', axis=1)

    if how != 'substitute':
        df_fused = _group_aggregate_to_gene_ncbi(dfm, how)
    else:
        df_fused = dfm
        if is_index:
            df_fused = df_fused.set_index('gene_ncbi', verify_integrity=True)

    return df_fused
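# symbol_2_gene_ncbi relies on two helpers that are defined elsewhere.
# Minimal sketches of their assumed behavior, inferred from the call sites
# (hypothetical implementations, not the module's actual code):


def _check_for_presence_sketch(df, id_name, require_presence):
    # report whether id_name is a column or the index of df;
    # optionally raise if it is neither
    is_column = id_name in df.columns
    is_index = df.index.name == id_name
    if require_presence and not (is_column or is_index):
        raise ValueError('{} not found in dataframe'.format(id_name))
    return is_column, is_index


def _group_aggregate_to_gene_ncbi_sketch(dfm, how):
    # aggregate rows that map to the same gene_ncbi (e.g. how='median')
    # and place gene_ncbi as the index
    return dfm.groupby('gene_ncbi').agg(how)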
Example #4
def wang_2015():
    """
    Wang et al. 2015 (loss-of-function mutations monitoring fitness)

    """

    p_in = io.get_geisen_manual_data_path(
        'out/papers/wang2015/aac7041_SM_Table_S3.xlsx')
    p_out = io.get_output_path('papers/wang_2015')
    io.ensure_presence_of_directory(p_out)

    df = pd.read_excel(p_in)
    df = df.drop('sgRNAs included', axis=1)
    df = df.rename(columns={'Gene': 'symbol_ambiguous'})
    df = df.set_index('symbol_ambiguous', verify_integrity=True)

    # Remove K562 CS cells, as 39 of the 63 cell-specific hits are
    # artifacts of genome location (see publication)
    excl = ['K562 CS', 'K562 adjusted p-value']
    df = df.drop(excl, axis=1)

    df.columns = ['Wang2015: {}'.format(j) for j in df.columns]

    c = ['Wang2015: KBM7 CS', 'Wang2015: Jiyoye CS', 'Wang2015: Raji CS']
    wang_cs = df.loc[:, c]

    wang_cs_entrez = mapper.symbol_2_gene_ncbi(
        wang_cs,
        taxon_id=9606,  # Homo sapiens
        how='median')

    c = [
        'Wang2015: KBM7 adjusted p-value', 'Wang2015: Jiyoye adjusted p-value',
        'Wang2015: Raji adjusted p-value'
    ]
    wang_pvalue = df.loc[:, c]

    wang_pvalue_entrez = mapper.symbol_2_gene_ncbi(
        wang_pvalue,
        taxon_id=9606,  # Homo sapiens
        how='median')

    v = 'wang_2015_cs'
    _save_orig_and_ncbi_gene_mapped_tables(p_dir=p_out,
                                           filebase=v,
                                           df_orig=wang_cs,
                                           df_ncbi=wang_cs_entrez)

    v = 'wang_2015_pvalue'
    _save_orig_and_ncbi_gene_mapped_tables(p_dir=p_out,
                                           filebase=v,
                                           df_orig=wang_pvalue,
                                           df_ncbi=wang_pvalue_entrez)
Example #5
def hart_2015():
    """
    Extracts fitness phenotypes from Hart et al., and saves them
    together with their NCBI gene ID.

    Will isolate individual datasets as separate files.
    """

    p_out = io.get_output_path('papers/hart_2015')
    io.ensure_presence_of_directory(p_out)

    p_in = io.get_geisen_manual_data_path(
        'out/papers/hart2015/mmc3_TSDeletedThoseWithExcelToDateConversion.xlsx'
    )

    hart2015 = pd.read_excel(p_in)
    hart2015 = hart2015.rename(columns={'Gene': 'symbol_ambiguous'})
    hart2015 = hart2015.set_index('symbol_ambiguous', verify_integrity=True)
    hart2015.columns = ['Hart2015: {}'.format(j) for j in hart2015.columns]

    hart2015_entrez = mapper.symbol_2_gene_ncbi(
        hart2015,
        taxon_id=9606,  # Homo sapiens
        how='median')

    out_settings = {  # cell-line : column name
        'hct116': 'Hart2015: BF_hct116',
        'hela': 'Hart2015: BF_hela',
        'gbm': 'Hart2015: BF_gbm',
        'rpe1': 'Hart2015: BF_rpe1',
        'dld1': 'Hart2015: BF_dld1',
        'a375_ko': 'Hart2015: BF_a375_GeCKo',
        'hct116_shRNA': 'Hart2015: BF_hct116_shRNA'
    }

    for cellline, dataset in out_settings.items():

        v = 'hart_2015_{}_ordnum_orig'.format(cellline)
        h = hart2015.loc[:, [dataset]]
        h.to_csv(os.path.join(p_out, '{}.csv.gz'.format(v)),
                 compression='gzip',
                 index=True)

        v = 'hart_2015_{}_ordnum_gene_ncbi'.format(cellline)
        h = hart2015_entrez.loc[:, [dataset]]
        h.to_csv(os.path.join(p_out, '{}.csv.gz'.format(v)),
                 compression='gzip',
                 index=True)
Example #6
def lek_2016():
    """
    ExAC database, as published by Lek et al. 2016

    Output:
        lek2016_aberration_ordnum       enrichment of aberrations
        lek2016_anticipation_ordnum     anticipated background rates
    """

    p_out = io.get_output_path('papers/lek_2016')
    io.ensure_presence_of_directory(p_out)

    # high level representation (at transcript level)
    p = io.get_geisen_manual_data_path(
        'out/papers/lek2016/nature19057-SI Table 13.xlsx')
    # data sheet with information on all genes
    df = pd.read_excel(p, sheet_name='Gene Constraint')

    # reformatting
    df = df.rename(columns={'transcript':
                            'rna_ensembl'})  # controlled vocabulary
    df['rna_ensembl'] = df['rna_ensembl'].replace(
        r'\..*$', '', regex=True)  # ignore versions of transcripts

    v = 'lek2016_aberration_ordnum'
    df_aberration = df[[
        'rna_ensembl', 'syn_z', 'mis_z', 'lof_z', 'pLI', 'pRec', 'pNull'
    ]].set_index('rna_ensembl')
    per_gene_aberration = mapper.rna_ensembl_2_gene_ncbi(df_aberration,
                                                         how='median')
    _save_orig_and_ncbi_gene_mapped_tables(p_out,
                                           filebase=v,
                                           df_orig=df_aberration,
                                           df_ncbi=per_gene_aberration)

    v = 'lek2016_anticipation_ordnum'
    df_anticipation = df[['rna_ensembl', 'exp_syn', 'exp_mis',
                          'exp_lof']].set_index('rna_ensembl')
    per_gene_anticipation = mapper.rna_ensembl_2_gene_ncbi(df_anticipation,
                                                           how='median')
    _save_orig_and_ncbi_gene_mapped_tables(p_out,
                                           filebase=v,
                                           df_orig=df_anticipation,
                                           df_ncbi=per_gene_anticipation)
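# The version-stripping replace above turns versioned transcript IDs into
# bare ensembl IDs; a toy illustration (the IDs below are merely examples):

import pandas as pd

s = pd.Series(['ENST00000335295.4', 'ENST00000380152'])
print(s.replace(r'\..*$', '', regex=True))
# 0    ENST00000335295
# 1    ENST00000380152
# dtype: object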
def locustag_2_gene_ncbi_unambiguously(df, taxon_id):
    """
    Maps locus tags to NCBI (Entrez) gene IDs. Will only
    consider unambiguous 1:1 mappings.

    Input:
        df      dataframe with LocusTag

    Output:
        dfm     dataframe with gene_ncbi as index

    """

    id_name = 'LocusTag'
    is_column, is_index = _check_for_presence(df,
                                              id_name,
                                              require_presence=True)

    # Construct Mapper from gene_info
    p_mapper = io.get_output_path(
        'ncbi/gene_info/gene_info_taxon_{}.gz'.format(taxon_id))
    if not os.path.exists(p_mapper):
        raise EnvironmentError('locustag_2_gene_ncbi_unambiguously()'
                               ' requires gene_info')
    mapper = pd.read_csv(p_mapper, usecols=['gene_ncbi',
                                            'LocusTag']).drop_duplicates()

    # Tidy mapper: only consider unambiguous ones
    forbidden_ncbi = _get_duplicates(mapper['gene_ncbi'])
    forbidden_locus = _get_duplicates(mapper['LocusTag'])
    f = ((~mapper['gene_ncbi'].isin(forbidden_ncbi)) &
         (~mapper['LocusTag'].isin(forbidden_locus)))
    mapper = mapper.loc[f, :]

    if is_index:
        df = df.reset_index()

    dfm = pd.merge(df, mapper, left_on=id_name, right_on=id_name, how='inner')
    dfm = dfm.drop(id_name, axis=1)

    dfm = dfm.set_index('gene_ncbi')

    return dfm
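# _get_duplicates is defined elsewhere; a minimal sketch of its assumed
# behavior (hypothetical implementation, inferred from its usage above):


def _get_duplicates_sketch(series):
    # return the set of values that occur more than once in series
    counts = series.value_counts()
    return set(counts[counts > 1].index)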
def uniprot_protein_2_gene_ncbi(df, how):
    """
    - Maps a dataframe with uniprot_protein IDs to gene_ncbi
    - Places gene_ncbi as the index
    - Only returns genes that could be mapped (inner join)
    - Aggregates according to how (e.g.: median)

    Input:
        df    DataFrame, with protein_uniprot
        how  str, e.g.: median

    """

    id_name = 'protein_uniprot'  # Science of Biology nomenclature
    is_column, is_index = _check_for_presence(df,
                                              id_name,
                                              require_presence=True)

    p_mapper = io.get_output_path('uniprot/uniprot_id_mapper.h5')
    if not os.path.exists(p_mapper):
        raise EnvironmentError(
            'uniprot_protein_2_gene_ncbi() requires uniprot_id_mapper')
    mapper = pd.read_hdf(p_mapper,
                         'table',
                         columns=['protein_uniprot', 'gene_ncbi'])

    if is_index:
        df = df.reset_index()

    dfm = pd.merge(df,
                   mapper,
                   left_on='protein_uniprot',
                   right_on='protein_uniprot',
                   how='inner')
    dfm = dfm.drop(id_name, axis=1)

    df_fused = _group_aggregate_to_gene_ncbi(dfm, how)

    return df_fused
Example #9
def thul_2017():
    """
    Protein subcellular localization from the Human Protein
    Atlas
    """

    p_in = io.get_geisen_manual_data_path(
        'out/papers/thul2017/aal3321_Thul_SM_table_S6.xlsx')
    p_out = io.get_output_path('papers/thul_2017')
    io.ensure_presence_of_directory(p_out)

    df = pd.read_excel(p_in)

    col = [
        'ENSG', 'Nucleus', 'Nucleoplasm', 'Nuclear bodies', 'Nuclear speckles',
        'Nuclear membrane', 'Nucleoli', 'Nucleoli (Fibrillar center)',
        'Cytosol', 'Cytoplasmic bodies', 'Rods and Rings', 'Lipid droplets',
        'Aggresome', 'Mitochondria', 'Microtubules', 'Microtubule ends',
        'Microtubule organizing center', 'Centrosome', 'Mitotic spindle',
        'Cytokinetic bridge', 'Midbody', 'Midbody ring',
        'Intermediate filaments', 'Actin filaments', 'Focal Adhesions',
        'Endoplasmic reticulum', 'Golgi apparatus', 'Vesicles',
        'Plasma membrane', 'Cell Junctions', 'Reliability'
    ]

    df = df.loc[:, col]
    df = df.rename(columns={'ENSG': 'gene_ensembl'})
    df = df.set_index('gene_ensembl', verify_integrity=True)

    df_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            df,
            taxon_id=9606)

    v = 'thul_2017_subcellular_localization'
    _save_orig_and_ncbi_gene_mapped_tables(p_dir=p_out,
                                           filebase=v,
                                           df_orig=df,
                                           df_ncbi=df_entrez)
def rna_ensembl_2_gene_ncbi(df, how):
    """
    Maps ensembl transcript ID to NCBI (Entrez) gene IDs.

    The present mapper uses gene2ensembl from NIH. Note that
    this can be different from EBI Biomart (which appears to map
    by overlap of any sequence)

    Input:
        df      dataframe with rna_ensembl
        how     str, method for aggregation (e.g.: median)

    Output:
        dfm     dataframe with gene_ncbi as index

    """

    id_name = 'rna_ensembl'  # Science of Biology nomenclature
    is_column, is_index = _check_for_presence(df,
                                              id_name,
                                              require_presence=True)

    p_mapper = io.get_output_path('ncbi/gene2ensembl.gz')
    if not os.path.exists(p_mapper):
        raise EnvironmentError(
            'rna_ensembl_2_gene_ncbi() requires gene2ensembl')
    mapper = pd.read_csv(p_mapper, usecols=['gene_ncbi',
                                            'rna_ensembl']).drop_duplicates()

    if is_index:
        df = df.reset_index()

    dfm = pd.merge(df, mapper, left_on=id_name, right_on=id_name, how='inner')
    dfm = dfm.drop(id_name, axis=1)

    df_fused = _group_aggregate_to_gene_ncbi(dfm, how)

    return df_fused
def gene_ensembl_2_gene_ncbi_unambiguously(df, taxon_id):
    """
    Maps ensembl gene IDs to NCBI (Entrez) gene IDs. Will only
    consider unambiguous 1:1 mappings of ensembl and entrez gene IDs.

    Although NCBI and Ensembl run a joint project on creating
    a uniform mapping for mouse and human, the mapping is not
    necessarily unambiguous, and the mappers available from
    different organizations follow different mapping schemes.

    The present mapper uses NIH's gene_info. Note that this
    can be different from EBI Biomart (which appears to map
    by overlap of any sequence).

    If the mapping of genes is not 1:1 within gene_info, those
    genes will be ignored.

    Note that for some taxa ensembl does not carry unique identifiers,
    but relies on external databases, which are also listed as other
    databases in NIH (e.g.: flybase or wormbase IDs). -> when moving
    to additional taxa, one may need to implement taxon-specific
    external references (that would also be used by ensembl)

    Further note: in contrast to "Science of Biology v0.1" this
    function uses NIH's gene_info rather than NIH's gene2ensembl,
    as the former covers more taxa

    Input:
        df      dataframe with gene_ensembl

    Output:
        dfm     dataframe with gene_ncbi as index

    """

    id_name = 'gene_ensembl'  # Science of Biology nomenclature
    is_column, is_index = _check_for_presence(df,
                                              id_name,
                                              require_presence=True)

    # Construct Mapper from gene_info
    p_mapper = io.get_output_path(
        'ncbi/gene_info/gene_info_taxon_{}.gz'.format(taxon_id))
    if not os.path.exists(p_mapper):
        raise EnvironmentError('gene_ensembl_2_gene_ncbi_unambiguously()'
                               ' requires gene_info')
    m = pd.read_csv(p_mapper, usecols=['gene_ncbi',
                                       'dbXrefs']).drop_duplicates()
    m = m.set_index('gene_ncbi')

    # get taxon specific pattern for extracting ensembl id
    # for some taxa, ensembl has inherited identifiers from other databases
    if taxon_id in [6239]:
        p = 'WormBase:([A-Za-z0-9]*)'
    elif taxon_id in [7227]:
        p = 'FLYBASE:([A-Za-z0-9]*)'
    else:  # Default
        p = 'Ensembl:([A-Z0-9]*)'

    m = m.loc[:, 'dbXrefs'].str.extractall(p)
    m = m.rename(columns={0: 'gene_ensembl'})
    m = m.reset_index()

    mapper = m[['gene_ncbi', 'gene_ensembl']]

    # Tidy mapper: only consider unambiguous ones
    forbidden_ncbi = _get_duplicates(mapper['gene_ncbi'])
    forbidden_ensg = _get_duplicates(mapper['gene_ensembl'])
    f = ((~mapper['gene_ncbi'].isin(forbidden_ncbi)) &
         (~mapper['gene_ensembl'].isin(forbidden_ensg)))
    mapper = mapper.loc[f, :]

    if is_index:
        df = df.reset_index()

    dfm = pd.merge(df, mapper, left_on=id_name, right_on=id_name, how='inner')
    dfm = dfm.drop(id_name, axis=1)

    dfm = dfm.set_index('gene_ncbi')

    return dfm
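# For orientation, the dbXrefs extraction above works on pipe-delimited
# cross-references in the style of NIH's gene_info; a toy illustration
# (the rows below are illustrative, not taken from gene_info):

import pandas as pd

m = pd.DataFrame(
    {'dbXrefs': ['MIM:138670|HGNC:HGNC:4296|Ensembl:ENSG00000111640',
                 'HGNC:HGNC:5|Ensembl:ENSG00000121410']},
    index=pd.Index([2597, 1], name='gene_ncbi'))

extracted = m.loc[:, 'dbXrefs'].str.extractall('Ensembl:([A-Z0-9]*)')
print(extracted)
# one row per match, indexed by (gene_ncbi, match); column 0 holds the
# extracted ensembl gene ID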
def matt_antalek_170222():
    """
    Matt Antalek (Rick Morimoto lab) downloaded tissue expression
    data of several model organisms on 170222; the cutoff used
    was 0, and when the web interface required a filter, he chose
    reasonable, representative ones
    """

    # manually curated condition codes:
    # dictionary with filename extension as key; each entry is either
    # - taxon_id
    # - [taxon_id, qualifier] if a qualifier applies
    condition_codes = {
        'rattus_norvegicus_female': [10116, 'female'],
        'rattus_norvegicus_male': [10116, 'male'],
        'ovis_aries_texel': [9940, 'texel'],
        'ovis_aries_female': [9940, 'female'],
        'ovis_aries_male': [9940, 'male'],
        'mus_musculus': 10090,
        'bos_taurus': 9913,
        'gallus_gallus': 9031,
        'macaca_mulatta': 9544,
        'homo_sapiens': 9606,
        'pabio_anubis': 9555,  # olive baboon
        'monodelphis_domestica': 13616,
        'xenopus_tropicalis': 8364,
        'anolis_carolinesis': 28377,
    }

    p_dir_in = io.get_geisen_manual_data_path(
        'out/'
        'ebi_expression_manual/'
        'matt_antalek_170222/'
        'E-*.tsv')  # filter for correct files

    p_out = io.get_output_path('gxa/matt_antalek_170222')
    io.ensure_presence_of_directory(p_out)

    files = glob.glob(p_dir_in)

    for p in files:

        df = pd.read_csv(p, sep='\t', header=3)
        df = df.rename(columns={'Gene ID': 'gene_ensembl'})
        df = df.drop('Gene Name', axis=1)

        def add_GXA_to_label(x):  # introduced in geisen v1_1
            if not x.startswith('gene'):
                x = 'GXA_' + x
            return x

        df.columns = [add_GXA_to_label(y) for y in df.columns]

        _, fname = os.path.split(p)

        matched = re.findall(r'^(.*)-[0-9].*-results_(.*)\.tsv', fname)

        if len(matched) != 1:
            raise ValueError('Unexpected format. Check parsing pattern.')

        experiment = matched[0][0]

        k = matched[0][1]
        meta = condition_codes[k]

        if isinstance(meta, list):
            taxon_id = meta[0]
            condition = meta[1]
            v = '{}-taxon_id-{}-{}'.format(experiment, taxon_id, condition)
        elif isinstance(meta, int):
            taxon_id = meta
            v = '{}-taxon_id-{}'.format(experiment, taxon_id)
        else:
            raise ValueError('Unexpected format. Check condition_codes.')

        taxa_without_nih_ensembl = [8364]

        if taxon_id not in taxa_without_nih_ensembl:

            # If NIH has corresponding ensembl for ncbi gene IDs,
            # save original, and ncbi_gene mapped

            df_entrez = mapper.gene_ensembl_2_gene_ncbi_unambiguously(
                df, taxon_id)

            _save_orig_and_ncbi_gene_mapped_tables(p_dir=p_out,
                                                   filebase=v,
                                                   df_orig=df,
                                                   df_ncbi=df_entrez)

        else:  # for some taxa NIH does not have mapping to ensembl

            df.to_csv(os.path.join(p_out, '{}_orig.csv.gz'.format(v)),
                      compression='gzip',
                      index=True)
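# The filename parsing above strips the numeric part of the accession;
# a toy illustration (the filename below is illustrative):

import re

fname = 'E-MTAB-513-results_homo_sapiens.tsv'
matched = re.findall(r'^(.*)-[0-9].*-results_(.*)\.tsv', fname)
print(matched)
# [('E-MTAB', 'homo_sapiens')] -- the digits of the accession are consumed
# by the '-[0-9].*' part, so 'experiment' keeps only the accession prefix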
Example #13
def uhlen_2015():
    """
    - RNA transcript data from the Human Protein Atlas.
    - log-transforms FPKM
    - Expression threshold is 1 FPKM (0 in log transform), as
        in the original paper

    """

    p_in = io.get_geisen_manual_data_path(
        'out/papers/uhlen2015/1260419_Excel_TablesS1-S18.xlsx')
    p_out = io.get_output_path('papers/uhlen_2015')
    io.ensure_presence_of_directory(p_out)

    def get_single_sheet(name_of_sheet):
        df = pd.read_excel(p_in, sheet_name=[name_of_sheet])
        df = df[name_of_sheet]
        return df

    df_cell_lines = get_single_sheet('S11. FPKM Cell-lines')
    df_tissues = get_single_sheet('S18. Full FPKM dataset, tissues')

    def tidy_and_index(df):
        df = df.drop('gene_name', axis=1)
        df = df.set_index(['enstid'])  # sheet mislabels IDs as transcripts;
        df.index.name = 'gene_ensembl'  # they are genes (each occurs once)
        threshold_used_by_Uhlen_2015 = 1  # Take author's detection threshold
        default_for_not_detected = np.nan  # and ignore values below

        f = df < threshold_used_by_Uhlen_2015
        df[f] = default_for_not_detected

        return df

    def log10_fun(x):
        return np.log10(x)  # element-wise log10; NaNs stay NaN

    df_cell_lines = tidy_and_index(df_cell_lines)
    df_tissues = tidy_and_index(df_tissues)
    df_cell_lines_log10 = log10_fun(df_cell_lines)
    df_tissues_log10 = log10_fun(df_tissues)

    df_cell_lines_log10.columns = [
        'uhlen_2015_cells_log10fpkm: {}'.format(j)
        for j in df_cell_lines_log10.columns
    ]
    df_tissues_log10.columns = [
        'uhlen_2015_tissues_log10fpkm: {}'.format(j)
        for j in df_tissues_log10.columns
    ]

    # From Science of Biology v.0.1 / Predict module
    uhlen2015_tissues_levels = df_tissues_log10
    uhlen2015_cells_levels = df_cell_lines_log10

    uhlen2015_cells_levels.columns = [
        j.replace('.MEAN', '') for j in uhlen2015_cells_levels.columns
    ]

    def get_detected_fraction(df):
        d = 1 - df.isnull().sum(axis=1) / df.shape[1]
        return d

    detected_in_cells = get_detected_fraction(uhlen2015_cells_levels).to_frame(
        'uhlen_2015_fraction_detection_cells')
    detected_in_tissues = get_detected_fraction(
        uhlen2015_tissues_levels).to_frame(
            'uhlen_2015_fraction_detection_tissues')

    detected_in_cells_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            detected_in_cells, taxon_id=9606)

    detected_in_tissues_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            detected_in_tissues, taxon_id=9606)

    # correct identity of cell line, also see:
    # http://www.proteinatlas.org/learn/cellines
    uhlen2015_cells_levels = uhlen2015_cells_levels.rename(columns={
        'uhlen_2015_cells_log10fpkm: km3':
        'uhlen_2015_cells_log10fpkm: reh'
    })

    uhlen2015_cells_levels_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            uhlen2015_cells_levels,
            taxon_id=9606)  # science of biology v.0.1 did log again

    uhlen2015_tissues_levels_entrez = \
        mapper.gene_ensembl_2_gene_ncbi_unambiguously(
            uhlen2015_tissues_levels,
            taxon_id=9606)

    v = 'uhlen_2015_detected_in_cells'
    _save_orig_and_ncbi_gene_mapped_tables(p_dir=p_out,
                                           filebase=v,
                                           df_orig=detected_in_cells,
                                           df_ncbi=detected_in_cells_entrez)

    v = 'uhlen_2015_detected_in_tissues'
    _save_orig_and_ncbi_gene_mapped_tables(p_dir=p_out,
                                           filebase=v,
                                           df_orig=detected_in_tissues,
                                           df_ncbi=detected_in_tissues_entrez)

    v = 'uhlen_2015_cells_levels'
    _save_orig_and_ncbi_gene_mapped_tables(
        p_dir=p_out,
        filebase=v,
        df_orig=uhlen2015_cells_levels,
        df_ncbi=uhlen2015_cells_levels_entrez)

    v = 'uhlen_2015_tissue_levels'
    _save_orig_and_ncbi_gene_mapped_tables(
        p_dir=p_out,
        filebase=v,
        df_orig=uhlen2015_tissues_levels,
        df_ncbi=uhlen2015_tissues_levels_entrez)
Example #14
def rolland_2014():
    """
    Processes supplemental data of Rolland et al. 2014
    (binary interaction; three methods) to extract:
    - interactions with the same gene or with other genes
        (stratified by support level)
    - binary interaction table (note: of genes with at least one interaction)
    - list of genes which were tested

    Requirement:
        papers/rolland2014/mmc3.xlsx

    Output:
        rolland_considered_genes
        rolland_counts_of_interactions
        rolland_table_binary_interactions

    """

    p_in = io.get_geisen_manual_data_path('out/papers/rolland2014/mmc3.xlsx')
    p_out = io.get_output_path('papers/rolland_2014')
    io.ensure_presence_of_directory(p_out)

    sheets_of_interest = ['2B', '2G']
    rolland = pd.read_excel(p_in, sheet_name=sheets_of_interest)

    bait_table = rolland['2B']

    considered_entrez = []
    count_of_invalid_baits = 0

    # Considered Genes
    for row in bait_table.itertuples():
        t = row.Tsdummyheader  # Had manually inserted header
        ma = re.search(r'entrez_gene_id=(.*)\|', t)
        if ma:
            matched = ma.group(1)
            if matched == 'NA':
                count_of_invalid_baits += 1
            else:
                attach = int(matched)
                considered_entrez.append(attach)

    considered_entrez = list(set(considered_entrez))
    print('Rolland2014: Ignored {} baits that do not map to a gene.'.format(
        count_of_invalid_baits))

    v = 'rolland_considered_genes'
    df = pd.DataFrame(data=list(considered_entrez), columns=[v])
    df.to_csv(os.path.join(p_out, '{}.csv.gz'.format(v)),
              compression='gzip',
              index=False)

    # Create table where each gene of a non-self interaction occurs
    # once as _ida, and once as _idb; note that this was ignored
    # by accident in science of biology v0.1
    interaction_table = rolland['2G']
    c = ['entrez_gene_ida', 'entrez_gene_idb', 'screens_found']
    f = interaction_table['screens_found'] > 0
    df = interaction_table.loc[f, c]
    df_i = df.rename(columns={'entrez_gene_ida': 'entrez_gene_idb',
                              'entrez_gene_idb': 'entrez_gene_ida'})
    # (renaming, rather than positional reordering, is required since
    # pd.concat aligns columns by name)
    df_j = pd.concat([df, df_i], axis=0, ignore_index=True)
    df_j = df_j.drop_duplicates()  # safety to avoid counting self twice

    v = 'rolland_table_binary_interactions'
    df.to_csv(os.path.join(p_out, '{}.csv.gz'.format(v)),
              compression='gzip',
              index=False)

    # Count occurrences (note: code for readability rather than speed)
    df = pd.DataFrame(index=considered_entrez,
                      columns=[
                          'self_interaction_any_evidence',
                          'self_interaction_multiple_evidence',
                          'trans_interaction_any_evidence',
                          'trans_interaction_multiple_evidence',
                      ])

    df = df.fillna(False)  # Python internally treats False and 0 as the same
    df = df.sort_index()

    for row in df_j.itertuples():
        ix, id_a, id_b, support = row

        if id_a == id_b:
            df.loc[id_a, 'self_interaction_any_evidence'] = True
            if support > 1:
                df.loc[id_a, 'self_interaction_multiple_evidence'] = True
        else:
            df.loc[id_a, 'trans_interaction_any_evidence'] += 1
            if support > 1:
                df.loc[id_a, 'trans_interaction_multiple_evidence'] += 1

    v = 'trans_interaction_multiple_evidence'  # appears to never occur
    if not (any(df[v])):
        df = df.drop(v, axis=1)

    df.columns = ['Rolland2014: {}'.format(j) for j in df.columns]

    v = 'rolland_counts_of_interactions'
    df.index.name = 'gene_ncbi'
    df.to_csv(os.path.join(p_out, '{}.csv.gz'.format(v)),
              compression='gzip',
              index=True)
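# A toy check of the symmetrization used above: renaming the columns (rather
# than reordering them positionally) is what makes pd.concat, which aligns
# columns by name, actually swap _ida and _idb (values invented):

import pandas as pd

df = pd.DataFrame({'entrez_gene_ida': [1, 3],
                   'entrez_gene_idb': [2, 3],
                   'screens_found': [1, 2]})
df_i = df.rename(columns={'entrez_gene_ida': 'entrez_gene_idb',
                          'entrez_gene_idb': 'entrez_gene_ida'})
df_j = pd.concat([df, df_i], axis=0, ignore_index=True).drop_duplicates()
print(df_j)
# the 1-2 interaction now occurs once as _ida=1 and once as _ida=2,
# while the 3-3 self-interaction is kept only once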
Example #15
def itzhak_2016():
    """
    Protein localization, and abundance, as measured for HeLa cells
    by Itzhak et al. 2016
    """

    p_out = io.get_output_path('papers/itzhak_2016')
    io.ensure_presence_of_directory(p_out)

    p = io.get_geisen_manual_data_path(
        'out/papers/itzhak2016/'
        'elife-16950-supp1-v3-download-hela-spatial-proteome.csv')

    df = pd.read_csv(p)

    r = {
        'Lead Gene name': 'symbol_ambiguous',
        'Lead Protein ID': 'protein_uniprot',
        'Non-cytosolic pool1 ': 'Non-cytosolic pool',
        'Global classifier2': 'Global classifier',
        'Sub compart-ment Prediction': 'Subcompartment Prediction',
        ' Contribution to cell protein mass [ppm]':
            'Contribution to cell protein mass [ppm]'
    }

    c = [
        'symbol_ambiguous', 'Prediction Confidence',
        'Subcompartment Prediction', 'Lead Protein name', 'Mol. weight [kDa]',
        'Sequence length (AA)', 'Total MS/MS Count',
        'Organellar profiles in how many maps?'
    ]

    df = df.rename(columns=r)
    df = df.drop(c, axis=1)

    df['Cytosolic Pool'] = df['Cytosolic Pool'].map(
        lambda x: int(x.rstrip('%')))
    df['Non-cytosolic pool'] = df['Non-cytosolic pool'].map(
        lambda x: int(x.rstrip('%')))

    df['Estimated Copy number per cell'] = df[
        'Estimated Copy number per cell'].str.replace(',', '').astype(int)

    df['Compartment Prediction'] = df['Compartment Prediction'].fillna(
        value='not determined')
    df = df.set_index('protein_uniprot', verify_integrity=True)

    pr = 'Itzhak2016_'

    v = 'itzhak2016_compartment_nombool'
    f = df['Compartment Prediction'].isin(['not determined', 'No Prediction'])
    y = _nominal_ser_2_boolean_df(df.loc[~f, 'Compartment Prediction'])
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='any')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)

    v = 'itzhak2016_global_classifier_nombool'
    y = _nominal_ser_2_boolean_df(df.loc[:, 'Global classifier'])
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='any')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)

    v = 'itzhak2016_localization_cytoplasm'
    y = df.loc[:, ['Cytosolic Pool']]  # adds up to 100 with non-cytoplasmic
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='median')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)

    v = 'itzhak2016_localization_stats_ordnum'
    y = df.loc[:, ['Prediction Score']]
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='median')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)

    v = 'itzhak2016_protein_abundance_ordnum'
    y = df.loc[:, [
        'Estimated Copy number per cell', 'Copy number Abundance Percentile',
        'Median cellular con-centration [nM]',
        'Contribution to cell protein mass [ppm]'
    ]]
    d = mapper.uniprot_protein_2_gene_ncbi(df=y, how='median')
    _save_orig_and_ncbi_gene_mapped_tables(p_out, v, y, d, pr)
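# _nominal_ser_2_boolean_df is defined elsewhere; a minimal sketch of its
# assumed behavior (hypothetical implementation, inferred from its usage):

import pandas as pd


def _nominal_ser_2_boolean_df_sketch(ser):
    # expand a nominal series into a boolean indicator dataframe,
    # one column per category, preserving the original index
    return pd.get_dummies(ser).astype(bool)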
def export_selected_genealacart_datasets(patch_absent=False):
    """
    Will load selected datasets from genealacart and export them in a
    format that is consistent with the science of biology project

    Requirements:
        geisen_manual       with genealacart

    Input:
        patch_absent    optional; default: False; If True,
                            absent files will be added (e.g.:
                            if novel features of GeneCards should
                            be extracted)
    """

    p_out = io.get_output_path('genealacart')
    io.ensure_presence_of_directory(p_out)
    if io.check_number_of_files_in_directory(p_out, 'gz') > 0:
        raise EnvironmentError('Output directory needs to be empty')

    def export(df, name):
        o = os.path.join(p_out, 'genealacart_{}.gz'.format(name))

        if patch_absent:
            if not os.path.exists(o):
                df.to_csv(o, index=True, compression='gzip')
                print('Added absent file {}'.format(o))
        else:
            io.ensure_absence_of_file(o)
            df.to_csv(o, index=True, compression='gzip')

    def add_counts_for_absent_reference_genes(df):
        d = pd.merge(reference_genes,
                     df,
                     left_on='gene_ncbi',
                     right_index=True,
                     how='left')
        d = d.fillna(0)
        d = d.set_index('gene_ncbi')
        d = d.astype(int)

        return d

    # Reference genes: all genes that are in genealacart, and
    # unambiguously map to gene_ncbi gene IDs
    reference_genes = load_genealacart_dataset('ExternalIdentifiers')
    reference_genes = reference_genes[['EntrezGene_x']]
    reference_genes = reference_genes.rename(
        columns={'EntrezGene_x': 'gene_ncbi'})

    print('Start processing ENCODE')
    amount_of_enhancers, tf_by_gene = _get_encode()
    export(amount_of_enhancers, 'encode_amount_of_tfs')
    export(tf_by_gene, 'encode_tfs_by_gene')

    print('Start processing Promoters (ENSRs)')
    amount_of_tfs, tf_by_gene = _get_promoters()
    export(amount_of_tfs, 'promoters_amount_of_tfs')
    export(tf_by_gene, 'promoters_tfs_by_gene')

    print('Start processing intolerance')
    df_gdi, df_rvis = _get_intolerance()
    export(df_gdi, 'intolerance_gdi')
    export(df_rvis, 'intolerance_rvis')

    print('Start processing selected disease databases')
    dbs = ['DISEASES', 'Orphanet', 'OMIM']

    for disease in dbs:
        amount_of_diseases, df_stack_diseases = _get_disease(disease)
        amount_of_diseases = add_counts_for_absent_reference_genes(
            amount_of_diseases)
        export(amount_of_diseases, '{}_amount'.format(disease.lower()))
        export(df_stack_diseases, '{}_kind'.format(disease.lower()))

    print('Start processing human phenotypes')
    amount_of_phenotypes, df_stack_phenotype = _get_human_phenotype_ontology()
    amount_of_phenotypes = add_counts_for_absent_reference_genes(
        amount_of_phenotypes)
    export(amount_of_phenotypes, 'phenotype_ontology_amount')
    export(df_stack_phenotype, 'phenotype_ontology_kind')

    print('Start processing GIFTS score')
    gifts = _get_gifts()
    export(gifts, 'annotation_range_gifts')
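# A toy illustration of add_counts_for_absent_reference_genes above:
# reference genes without an entry receive an explicit count of 0
# (values invented):

import pandas as pd

reference_genes = pd.DataFrame({'gene_ncbi': [1, 2, 3]})
counts = pd.DataFrame({'amount': [5]},
                      index=pd.Index([2], name='gene_ncbi'))
d = pd.merge(reference_genes, counts, left_on='gene_ncbi',
             right_index=True, how='left')
d = d.fillna(0).set_index('gene_ncbi').astype(int)
print(d)
#           amount
# gene_ncbi
# 1              0
# 2              5
# 3              0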