Python get_path 예제들, access_science_shared.inout.get_path Python 예제들

예제 #1

0

파일 보기

파일: annotation.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

    def _load_go_v1_1(taxon_id):
        """
        Load gene2go records of one taxon together with GO term labels

        Input:
            taxon_id    taxon ID; converted with int() to build file name

        Output:
            df          gene2go records, left-joined on 'GO_ID' with the
                        table stored in go_id_to_term.csv.gz
        """

        path_annotation = inout.get_path(
            'geisen',
            'ncbi/gene2go/gene2go_taxon_{}.csv.gz'.format(int(taxon_id)))
        annotation = pd.read_csv(path_annotation)

        path_terms = inout.get_path(
            'geisen', 'ncbi/gene2go/go_id_to_term.csv.gz')
        terms = pd.read_csv(path_terms)

        # left join: every annotation record is kept, labelled or not
        labelled = pd.merge(
            annotation, terms,
            left_on='GO_ID', right_on='GO_ID', how='left')
        return labelled

예제 #2

0

파일 보기

파일: annotation.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def human_phenotype_genealacart(taxon_id=9606, add_absenece=True):
    """
    Load human phenotypes of genes
    Source: Human Phenotype Ontology through Genealacart

    Input:
        taxon_id        int; must be 9606
        add_absenece    bool; default is True; if True, genes that are
                        listed in the "amount" table, but have no
                        phenotype record, are appended with the
                        phenotype name 'No known human phenotype'
    Output:
        dataframe
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    col_id = 'human_phenotype_genealacart: human_phenotype_id'
    col_name = 'human_phenotype_genealacart: human_phenotype_name'

    phenotypes = pd.read_csv(inout.get_path(
        'geisen', 'genealacart/genealacart_phenotype_ontology_kind.gz'))

    # round trip through the index places gene_ncbi as first column
    phenotypes = phenotypes.set_index('gene_ncbi').reset_index()

    if not add_absenece:
        return phenotypes

    counts = pd.read_csv(inout.get_path(
        'geisen', 'genealacart/genealacart_phenotype_ontology_amount.gz'))

    genes_without_record = np.setdiff1d(
        np.array(counts['gene_ncbi'].unique()),
        np.array(phenotypes['gene_ncbi'].unique()))

    # one placeholder row per gene lacking a record; the id stays empty
    placeholder = pd.DataFrame(
        index=genes_without_record, columns=[col_id, col_name])
    placeholder.loc[:, col_name] = 'No known human phenotype'
    placeholder.index.name = 'gene_ncbi'
    placeholder = placeholder.reset_index()

    return pd.concat([phenotypes, placeholder], axis=0)

예제 #3

0

파일 보기

파일: patents.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def rosenfeld_2013(cols_to_use=None):
    """
    Load gene-linked patent data as described by Rosenfeld et al.

    Most patents are filed for sequences rather than genes, are
    ambiguous, and include related genes on purpose. See the
    publication of Rosenfeld et al. on that socio-economical problem,
    and for the cutoff / mapping scheme applied by them.

    Note: the original dataset might have some excess records that
    could not be unambiguously mapped to gene_ncbi (Entrez gene IDs)

    Input:
        cols_to_use     accepted, but not used by this function

    Output:
        linkage table between patent and gene_ncbi, without
        duplicated rows
    """

    file_path = inout.get_path(
        'geisen',
        os.path.join(
            'papers',
            'rosenfeld_2013',
            'rosenfeld_2013_patents_ncbi_gene.csv.gz'))

    # naming the two columns leaves out the index column of the file
    wanted = ['patent', 'gene_ncbi']
    links = pd.read_csv(file_path, usecols=wanted)

    return links.drop_duplicates()

예제 #4

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def allelepool_lek_2016_anticipation(taxon_id=9606):
    """
    Anticipated variability in human populations (allele diversity)
    Original source data: Lek et al. 2015

    Input:
        taxon_id    int (safety check); must be 9606

    Output:
        df          ordinal numbers; every column whose name does not
                    start with 'gene_ncbi' gets the prefix
                    'Population variability Lek '
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    file_name = 'lek2016_anticipation_ordnum_ncbi_gene.csv.gz'
    table = pd.read_csv(
        inout.get_path('geisen', 'papers/lek_2016/{}'.format(file_name)))

    prefix = 'Population variability Lek '
    table.columns = [
        name if name.startswith('gene_ncbi') else prefix + name
        for name in table.columns
    ]
    return table

예제 #5

0

파일 보기

파일: nar180310_mega_integrator.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def challenged_proteins():
    """
    Load genes whose protein-coding status got challenged, as listed
    in the supplemental table of ezkurdia2014 stored under
    'publications'

    Output:
        cl      copy of dd
        dd      dataframe indexed by gene_ncbi with the boolean column
                'ezkurdia_challenged'; True if the Ensembl gene is
                listed in sheet 'Possible non-coding set'
        ge      sorted list of gene_ncbi whose Ensembl gene is listed
                in sheet 'All G12 genes'

    Raises:
        ValueError  if sheet 'Possible non-coding set' contains an
                    Ensembl gene that is absent from 'All G12 genes'
    """

    p = inout.get_path('publications', 'ezkurdia2014/ddu309supp_tables1.xlsx')

    df_all = pd.read_excel(p, sheet_name='All G12 genes')
    df_tagged = pd.read_excel(p, sheet_name='Possible non-coding set')

    # Consistency check of the source table. Formerly a failing check
    # only left `ge` undefined, which surfaced as an UnboundLocalError
    # at the return statement.
    if not df_tagged['ENSEMBL'].isin(df_all['ENSEMBL']).all():
        raise ValueError(
            'Some genes of the possible non-coding set are absent '
            'from the set of all genes.')

    gi = _get_gene_ncbi_2_ensembl()

    is_considered = gi['gene_ensembl'].isin(df_all['ENSEMBL'])
    ge = sorted(gi.loc[is_considered, 'gene_ncbi'])

    # explicit copy, so that adding a column never writes into gi
    df_suspicious = gi.loc[is_considered, :].copy()
    df_suspicious['ezkurdia_challenged'] = df_suspicious[
        'gene_ensembl'].isin(df_tagged['ENSEMBL'])

    dd = df_suspicious[
        ['gene_ncbi', 'ezkurdia_challenged']].set_index('gene_ncbi')
    cl = dd.copy()

    return cl, dd, ge

예제 #6

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def protein_abundance_itzhak_2015(taxon_id=9606):
    """
    Protein abundance in a human cell line
    Original source data: Itzhak et al. 2015

    Input:
        taxon_id    int (safety check); must be 9606

    Output:
        df          ordinal numbers; every column whose name does not
                    start with 'gene_ncbi' gets the prefix
                    'Protein Itzhak '
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    folder = 'papers/itzhak_2016/'
    file_name = 'itzhak2016_protein_abundance_ordnum_ncbi_gene.csv.gz'
    table = pd.read_csv(inout.get_path('geisen', folder + file_name))

    prefix = 'Protein Itzhak '
    table.columns = [
        name if name.startswith('gene_ncbi') else prefix + name
        for name in table.columns
    ]
    return table

예제 #7

0

파일 보기

파일: annotation.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def generif(taxon_id):
    """
    Load gene RIFs, optionally restricted to some taxa

    Input:
        taxon_id    list of taxa, or int of taxon id, or 'all'

    Output:
        generifs    dataframe; unless taxon_id is 'all', only records
                    whose 'taxon_ncbi' is among the requested taxa
    """

    generifs = pd.read_csv(
        inout.get_path('geisen', 'ncbi/generifs_basic.gz'),
        low_memory=False)

    # a single taxon is treated as a list with one element
    requested = [taxon_id] if isinstance(taxon_id, int) else taxon_id

    if requested != 'all':
        is_requested = generifs.loc[:, 'taxon_ncbi'].isin(requested)
        generifs = generifs.loc[is_requested, :]

    return generifs

예제 #8

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def genbank_genomic_cds(taxon_id):
    """
    Features for predicted coding sequences. Source is one of the
    most complete genbank releases (manually selected one per species).
    Features have been extracted for a manually selected set of species
    (roughly corresponding to heavily studied species).

    For genbank_genomic_cds the features correspond to individual
    nucleotides and individual codons.

    Note that genomic CDS may not be defined for some species within
    the original data source (genbank)

    Input:
        taxon_id    int
    Output:
        df          ordinal numbers

    Raises EnvironmentError (after printing the expected path) if
    there is no file for taxon_id.
    """

    file_path = inout.get_path(
        'geisen',
        'genbank/genomic_cds/genbank_genomic_cds_{}.csv'.format(taxon_id))

    if not os.path.exists(file_path):
        print(file_path)
        raise EnvironmentError(
            'Did not find data for taxon {}'.format(taxon_id))

    return pd.read_csv(file_path)

예제 #9

0

파일 보기

파일: phenotype_collections.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def genome_rnai(taxon_id):
    """
    Will load genome_rnai for datasets mappable to a
    Pubmed ID

    Input:
        taxon_id  e.g.: 9606 for human (the only supported taxon)

    Output:
        df     Genome RNAi table; all values are text as read from the
               file; records with phenotype 'Inconclusive' are removed;
               columns: genome_rnai_id, gene_ncbi, gene_id_ambiguous,
               gene_symbol_ambiguous, reagent_id, Score, phenotype,
               Conditions, Follow Up, Comment, pubmed_id
    """

    if taxon_id == 9606:
        pheno_set = 'GenomeRNAi_v17_Homo_sapiens.txt'
    else:
        raise EnvironmentError('This function presenlty only supports human.')

    # import genome RNAi dataset
    p = inout.get_path('genome_rnai', pheno_set)

    pubmed_key = '#Pubmed ID='

    agg = []

    is_valid_pubmed = False
    pubmed_id = 'not_set'
    with open(p, 'r') as rea:
        for line in rea:
            line = line.strip('\n')
            if line.startswith(pubmed_key):
                pubmed_id = line[len(pubmed_key):]
                is_valid_pubmed = len(pubmed_id) > 1
            elif line.startswith('#Screen Title'):
                # A new screen starts: forget the Pubmed ID of the
                # preceding screen. Formerly only pubmed_id was reset,
                # so a screen without a Pubmed ID line kept the validity
                # of its predecessor and its records were collected
                # with the pubmed_id 'invalid'.
                pubmed_id = 'invalid'
                is_valid_pubmed = False
            elif not line.startswith(('#', '//')) and is_valid_pubmed:
                # data record: tab-separated fields, plus Pubmed ID
                agg.append(line.split('\t') + [pubmed_id])

    co = [
        'Stable ID', 'Entrez ID', 'Gene ID', 'Gene Symbol', 'Reagent ID',
        'Score', 'Phenotype', 'Conditions', 'Follow Up', 'Comment', 'pubmed_id'
    ]
    df = pd.DataFrame(data=agg, columns=co)

    f = df['Phenotype'] == 'Inconclusive'
    df = df.loc[~f, :]

    df = df.rename(
        columns={
            'Entrez ID': 'gene_ncbi',
            'Gene ID': 'gene_id_ambiguous',
            'Gene Symbol': 'gene_symbol_ambiguous',
            'Stable ID': 'genome_rnai_id',
            'Phenotype': 'phenotype',
            'Reagent ID': 'reagent_id'
        })
    df = df.reset_index(drop=True)

    return df

예제 #10

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def compartment_itzhak_2016_cytoplasmic(taxon_id=9606):
    """
    Cytoplasmic location (note complements with non-cytoplasmic
    localization to 100); Note that the amount of records is higher
    than the amount of records for other predictions / features
    of the original underlying publication (the finely resolved
    localization)

    Input:
        taxon_id    int (safety check); must be 9606

    Output:
        df          nominal data; content of the dedicated cytoplasm
                    file if it exists, otherwise the columns
                    'gene_ncbi' and 'Localization Itzhak Cytosolic Pool'
                    of compartment_itzhak_2016_global_scores()
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    p = inout.get_path(
        'geisen', 'papers/itzhak_2016/' +
        'itzhak2016_localization_cytoplasm_ncbi_gene.csv.gz')

    if os.path.exists(p):  # geisen v1_1 or higher
        return pd.read_csv(p)

    # Fallback for geisen v1. Formerly this path called a helper
    # without parameters while passing taxon_id, and thus raised a
    # TypeError instead of loading the data.
    df = compartment_itzhak_2016_global_scores(taxon_id)
    col_to_use = ['gene_ncbi', 'Localization Itzhak Cytosolic Pool']
    return df.loc[:, col_to_use]

예제 #11

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def compartment_itzhak_2016_global_scores(taxon_id=9606):
    """
    Prediction accuracy, and broad classification by nucleo-cytoplasmic
    localization

    Input:
        taxon_id    int (safety check); must be 9606

    Output:
        df      ordinal numbers; every column whose name does not
                start with 'gene_ncbi' gets the prefix
                'Localization Itzhak '
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    folder = 'papers/itzhak_2016/'
    file_name = 'itzhak2016_localization_stats_ordnum_ncbi_gene.csv.gz'
    table = pd.read_csv(inout.get_path('geisen', folder + file_name))

    prefix = 'Localization Itzhak '
    table.columns = [
        name if name.startswith('gene_ncbi') else prefix + name
        for name in table.columns
    ]
    return table

예제 #12

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def transcript_abundance_gerstein(taxon_id=6239):
    """
    Abundance of transcripts under mixed conditions
    Source: modENCODE through Gerstein lab

    NaN if original value below 1 (0 on log scale) or below detection
    threshold

    Note that the meta-annotation of the source does not allow to
    separate treatments from tissues etc.; a request for clarification,
    relayed by modENCODE to the Gerstein lab, remained unanswered.

    Input:
        taxon_id    int (safety check); must be 6239

    Output:
        df          ordinal numbers,
                    and NaN (below reliable measurement threshold)
    """

    if taxon_id != 6239:
        raise EnvironmentError('Only supports taxon 6239, C. elegans')

    file_path = inout.get_path(
        'geisen', 'gerstein/gerstein_expression__ncbi_gene.csv.gz')

    return pd.read_csv(file_path)

예제 #13

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def genbank_validated_rna(taxon_id):
    """
    Features for RNA with high support by genbank curators.
    Source is one of the most complete genbank releases
    (manually selected one per species). Features have been extracted
    for a manually selected set of species
    (roughly corresponding to heavily studied species).

    For genbank_validated_rna the features correspond to
    individual nucleotides, codons, and codon bias.

    Note that validated RNA may not be defined for some species within
    the original data source (genbank)

    Input:
        taxon_id    int
    Output:
        df          ordinal numbers

    Raises EnvironmentError if there is no file for taxon_id.
    """

    file_path = inout.get_path(
        'geisen',
        'genbank/validated_rna/genbank_validated_rna_{}.csv'.format(taxon_id))

    if not os.path.exists(file_path):
        raise EnvironmentError(
            'Did not find data for taxon {}'.format(taxon_id))

    return pd.read_csv(file_path)

예제 #14

0

파일 보기

파일: annotation.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def omim_genealacart(taxon_id=9606, add_absenece=True):
    """
    Mendelian diseases
    Source: OMIM through Genealacart

    Note that the source OMIM database, which is quite messy, would
    in principle allow a higher level of stratification than Genealacart

    Input:
        taxon_id        int; must be 9606
        add_absenece    bool; default is True; if True, genes that are
                        listed in the "amount" table, but have no
                        disease record, are appended with the
                        omim_disease 'No entry in OMIM'
    Output:
        dataframe
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    diseases = pd.read_csv(
        inout.get_path('geisen', 'genealacart/genealacart_omim_kind.gz'))

    # prefix all columns except gene_ncbi (kept aside in the index);
    # the disease name gets the short label 'omim_disease'
    diseases = diseases.set_index('gene_ncbi')
    diseases.columns = [
        'omim_disease__{}'.format(name) for name in diseases.columns]
    diseases = diseases.rename(
        columns={'omim_disease__disease_name': 'omim_disease'})
    diseases = diseases.reset_index()

    if not add_absenece:
        return diseases

    counts = pd.read_csv(
        inout.get_path('geisen', 'genealacart/genealacart_omim_amount.gz'))

    genes_without_record = np.setdiff1d(
        np.array(counts['gene_ncbi'].unique()),
        np.array(diseases['gene_ncbi'].unique()))

    placeholder = pd.DataFrame(
        index=genes_without_record, columns=['omim_disease'])
    placeholder.loc[:, 'omim_disease'] = 'No entry in OMIM'
    placeholder.index.name = 'gene_ncbi'
    placeholder = placeholder.reset_index()

    return pd.concat([diseases, placeholder], axis=0)

예제 #15

0

파일 보기

파일: meta.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

    def load_all_taxa(usecols=usecols):
        """
        Load gene info of all taxa from gene_info_full.gz

        Input:
            usecols     columns to load; passed on to pd.read_csv

        Raises EnvironmentError if the file does not exist.
        """
        path_full = inout.get_path(
            'geisen', 'ncbi/gene_info/gene_info_full.gz')

        if not os.path.exists(path_full):
            raise EnvironmentError('Did not find gene info for all taxa')

        return pd.read_csv(path_full, usecols=usecols)

예제 #16

0

파일 보기

def genes_2_drugs_and_status(taxon_id, target_class):
    """
    Loads drugbank IDs, and status for individual genes of
    taxon_id

    Input:
        taxon_id        int  taxon ID; only 9606 has a species label
                             defined (others raise KeyError)
        target_class    str  'pharmacologically_active' or 'all'

    Output:
        df              unique combinations of gene_ncbi, drug_drugbank
                        and status, sorted by gene_ncbi, status and
                        drug_drugbank
    """

    # label used by drugbank for the species of a taxon
    species_by_taxon = {9606: 'Human'}

    # neglect separation by drug class (e.g: small molecule)
    statuses = [
        'approved', 'experimental', 'illicit', 'investigational',
        'nutrazeutical', 'withdrawn'
    ]

    template = 'protein_identifiers/drug_target_identifiers/{}/{}.csv'

    per_status = []
    for status in statuses:
        targets = pd.read_csv(inout.get_path(
            'drugbank', template.format(status, target_class)))

        is_species = targets['Species'] == species_by_taxon[taxon_id]
        targets = targets.loc[is_species, ['UniProt ID', 'Drug IDs']]
        targets.loc[:, 'status'] = status
        per_status.append(targets)

    drugs = pd.concat(per_status, axis=0)

    # one row per drug, rather than ';'-separated lists of drugs
    drugs = utils.split_text_to_multiple_rows(drugs, 'Drug IDs', ';')
    drugs = drugs.rename(columns={
        'UniProt ID': 'protein_uniprot',
        'Drug IDs': 'drug_drugbank'
    })
    drugs = drugs.drop_duplicates()

    p_mapper = mapper._get_geisen_path('uniprot/uniprot_id_mapper.h5')
    if not os.path.exists(p_mapper):
        raise EnvironmentError(
            'uniprot_protein_2_gene_ncbi() requires uniprot_id_mapper')
    protein_2_gene = pd.read_hdf(
        p_mapper,
        'table',
        columns=['protein_uniprot', 'gene_ncbi'],
        where='taxon_ncbi={}'.format(taxon_id))

    # map proteins to genes, and keep gene-level information only
    by_gene = pd.merge(drugs, protein_2_gene)
    by_gene = by_gene[['gene_ncbi', 'drug_drugbank', 'status']]
    by_gene = by_gene.drop_duplicates()

    by_gene = by_gene.sort_values(['gene_ncbi', 'status', 'drug_drugbank'])
    return by_gene.reset_index(drop=True)

예제 #17

0

파일 보기

파일: annotation.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def disease_genealacart(taxon_id=9606, add_absenece=True):
    """
    Unified diseases
    Source: Genealacart

    Input:
        taxon_id        int; must be 9606
        add_absenece    bool; default is True; if True, genes that are
                        listed in the "amount" table, but have no
                        disease record, are appended with the
                        unified_disease 'No known disease'
    Output:
        dataframe
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    diseases = pd.read_csv(
        inout.get_path('geisen', 'genealacart/genealacart_diseases_kind.gz'))

    # prefix all columns except gene_ncbi (kept aside in the index);
    # the disease name gets the short label 'unified_disease'
    diseases = diseases.set_index('gene_ncbi')
    diseases.columns = [
        'unified_disease__{}'.format(name) for name in diseases.columns]
    diseases = diseases.rename(
        columns={'unified_disease__disease_name': 'unified_disease'})
    diseases = diseases.reset_index()

    if not add_absenece:
        return diseases

    counts = pd.read_csv(
        inout.get_path(
            'geisen', 'genealacart/genealacart_diseases_amount.gz'))

    genes_without_record = np.setdiff1d(
        np.array(counts['gene_ncbi'].unique()),
        np.array(diseases['gene_ncbi'].unique()))

    placeholder = pd.DataFrame(
        index=genes_without_record, columns=['unified_disease'])
    placeholder.loc[:, 'unified_disease'] = 'No known disease'
    placeholder.index.name = 'gene_ncbi'
    placeholder = placeholder.reset_index()

    return pd.concat([diseases, placeholder], axis=0)

예제 #18

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

    def load_file(code):
        """
        Load the hart_2015 table of one code

        Input:
            code    inserted into the file name

        Raises EnvironmentError (after printing the expected path) if
        the file does not exist.
        """
        file_name = (
            'hart_2015_hart2015_{}_ordnum_ordnum_gene_ncbi.csv.gz'.format(
                code))
        file_path = inout.get_path(
            'geisen', 'papers/hart_2015/{}'.format(file_name))

        if not os.path.exists(file_path):
            print(file_path)
            raise EnvironmentError('Did not find data for {}'.format(code))

        return pd.read_csv(file_path)

예제 #19

0

파일 보기

파일: meta.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

    def load_taxon(taxon_id, usecols):
        """
        Load gene info of a single taxon

        Input:
            taxon_id    inserted into the file name
            usecols     columns to load; passed on to pd.read_csv

        Raises EnvironmentError if the file does not exist.
        """
        path_taxon = inout.get_path(
            'geisen', 'ncbi/gene_info/gene_info_taxon_{}.gz'.format(taxon_id))

        if not os.path.exists(path_taxon):
            raise EnvironmentError(
                'Did not find gene info for taxon {}'.format(taxon_id))

        return pd.read_csv(path_taxon, usecols=usecols)

예제 #20

0

파일 보기

파일: gene_mapper.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def _get_geisen_path(ext):
    """
    Support function to get to geisen files.
    Note that this function is built to later allow updates of mapping
    functions through considering updates in Geisen

    Input:
        ext     extension within geisen folder

    Output:
        result of inout.get_path('geisen', ext)
    """

    return inout.get_path('geisen', ext)

예제 #21

0

파일 보기

def external_links():
    """
    Load external links of drugs from drugbank

    Output:
        df      table of drug_links.csv, indexed by drug_drugbank
                (the 'DrugBank ID' column; must be unique)

    Raises ValueError if a drug name occurs more than once, when
    ignoring case.
    """
    links = pd.read_csv(
        inout.get_path('drugbank', 'external_drug_links/drug_links.csv'))

    links = links.rename(columns={'DrugBank ID': 'drug_drugbank'})
    links = links.set_index('drug_drugbank', verify_integrity=True)

    occurrences = links['Name'].str.lower().value_counts()
    if occurrences.max() > 1:
        raise ValueError('Drug names are not unambigous')

    return links

예제 #22

0

파일 보기

def gene2pubmed(taxon_id=None, usecols=None, paper_kind=None, ref_genes=None):
    """
    Loads gene2pubmed from NIH; duplicated records are only removed
    in case that columns are specified.

    Input:
        taxon_id    int, or 'all' (anything else raises
                        EnvironmentError, including the default None)
        usecols     optional, list of columns to be returned
        paper_kind  optional; filter for articles, e.g:
                        'research' to filter for papers in
                        Medline, where meta data suggests
                        that it is a research paper
        ref_genes   optional; filter for genes in ref_genes

    Output:
        gene2pubmed df

    """

    path_table = inout.get_path('geisen', 'ncbi/gene2pubmed.h5')

    if not os.path.exists(path_table):
        raise EnvironmentError('Did not find gene2pubmed')

    # Implement input specific behavior of gene_info
    if taxon_id == 'all':
        records = pd.read_hdf(path_table, 'table')
    elif isinstance(taxon_id, int):
        # only the records of the taxon are read from the file
        query = 'taxon_ncbi=={}'.format(taxon_id)
        records = pd.read_hdf(path_table, 'table', where=query)
    else:
        raise EnvironmentError('Did not recognize format of taxon_id')

    if usecols is not None:
        records = records.loc[:, usecols].drop_duplicates()

    if paper_kind is not None:
        records = standardizer.filter_by_paper_kind(records, paper_kind)

    if ref_genes is not None:
        is_ref_gene = records['gene_ncbi'].isin(ref_genes)
        records = records[is_ref_gene]

    return records

예제 #23

0

파일 보기

파일: wos.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def dais(subset, allowed_dais='all', allowed_wos='all'):
    """
    Loads all disambiguated author data for Web Of Science,
    together with authorship information; Will iterate through
    batched data; Thus filtering might be used to reduce memory
    footprint (e.g.: if wos IDs are already known)

    Input:
        subset      wos_dais that should be loaded; options:
                        'gene-linked' and 'all'
        allowed_dais    list of dais that should be loaded, or 'all'
        allowed_wos     list of wos IDs that should be loaded, or 'all'

    Output:
        df_wos_dais     with dais_id as int, and wos_id as str that
                        is left-padded with zeros to 15 characters

    Raises:
        EnvironmentError    if subset is not one of the options
        ValueError          (from pd.concat) if no batch file is found
    """

    p = inout.get_path('rbusa', 'disambiguation/wos_dais')

    if subset == 'gene-linked':
        mask = os.path.join(p, 'wos_dais_gene_mapped_batch_*.csv.gz')
    elif subset == 'all':
        mask = os.path.join(p, 'wos_dais_all_batch*.csv.gz')
    else:
        raise EnvironmentError('subset not specified')

    def is_all(value):
        # Compare by value, and only for strings: the former identity
        # test (is not 'all') depended on string interning.
        return isinstance(value, str) and value == 'all'

    filter_dais = not is_all(allowed_dais)
    filter_wos = not is_all(allowed_wos)

    agg = []
    for fi in glob.glob(mask):
        df = pd.read_csv(fi)

        if filter_dais:
            f = df.loc[:, 'DAIS'].isin(allowed_dais)
            df = df.loc[f, :]

        if filter_wos:
            f = df.loc[:, 'WOS'].isin(allowed_wos)
            df = df.loc[f, :]

        agg.append(df)

    df = pd.concat(agg)
    df = df.rename(columns={'WOS': 'wos_id', 'DAIS': 'dais_id'})

    # Plain column assignment replaces the column, so the converted
    # dtype is kept; writing through .loc[:, col] may instead set the
    # values into the existing column in recent pandas versions.
    df['dais_id'] = df['dais_id'].astype(int)

    target_amount_of_numbes = 15
    df['wos_id'] = df['wos_id'].astype(str).apply(
        lambda x: x.zfill(target_amount_of_numbes))

    return df

예제 #24

0

파일 보기

def _load_batches(path_pattern, dataset):
    """
    Load one dataset from all batch folders, and pool the records

    Input:
        path_pattern    glob pattern (within 'geisen_manual') of the
                        folders that contain the batches
        dataset         name of the dataset; '<dataset>.txt' is read
                        from every matching folder

    Output:
        df              pooled records without duplicated rows;
                        'InputTerm' and 'Symbol' are lower-case
    """
    folder_pattern = inout.get_path(
        'geisen_manual', os.path.join(path_pattern))
    file_name = '{}.txt'.format(dataset)

    tables = [
        pd.read_table(os.path.join(folder, file_name))
        for folder in glob.glob(folder_pattern)
    ]
    pooled = pd.concat(tables).drop_duplicates()

    # place symbols lower-case as genealacart does not appear
    # to distinguish internally
    for column in ['InputTerm', 'Symbol']:
        pooled.loc[:, column] = pooled.loc[:, column].str.lower()

    return pooled

예제 #25

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def transcript_abundance_gex_mantalek_170222(mask):
    """
    Tissue specific gene expression
    Source: EMBL-EBI Expression Atlas (https://www.ebi.ac.uk/gxa/)
            Selected datasets manually downloaded by M. Antalek on
            170222 using cutoff of 0

    Unlike many other gene expression datasets, the returned gene
    expression values are not log-transformed

    Input:
        mask    int or str; e.g: taxon_id, or taxon_id-condition
                (where condition is sample specific, e.g.: 10116-female)

    Output:
        df      ordinal numbers; if several files match the mask, they
                are merged on gene_ncbi

    Raises:
        EnvironmentError    if no file matches the mask, or if the
                            merged table contains missing values
    """

    p = inout.get_path(
        'geisen', 'gxa/matt_antalek_170222/*-{}[-_]*_gene.csv.gz'.format(mask))

    # glob returns files in arbitrary order; sort to obtain the same
    # starting table (and column order) on every run
    g = sorted(glob.glob(p))

    if len(g) == 0:
        raise EnvironmentError(
            'Did not find any dataset matching the mask {}'.format(mask))

    ref_c = 'gene_ncbi'
    df = pd.read_csv(g[0])

    # Only merge the remaining files: formerly the first file was
    # merged with itself, which duplicated its columns (suffixed with
    # _x and _y).
    for gn in g[1:]:
        df_n = pd.read_csv(gn)

        # NOTE(review): a dataset covering other genes is skipped
        # without notice (behavior kept as is) - confirm this is wanted
        if sorted(df[ref_c].values) == sorted(df_n[ref_c].values):
            df = pd.merge(df,
                          df_n,
                          left_on=ref_c,
                          right_on=ref_c,
                          how='outer')

    if len(g) > 1 and any(df.isnull().sum()):
        raise EnvironmentError(
            'Some measurements are lacking in one of the datasets.')

    return df

예제 #26

0

파일 보기

파일: meta.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def taxon_name(taxon_id):
    """
    Obtains name of taxon

    Note that the original reference data would also
    contain synonymous names, and the class of the name
    (e.g.: whether it is a scientific name)

    Input:
        taxon_id    int; optionally set to 'all' to load
                        table with full taxonomy information

    Output:
        taxon_name  str; if no record is found, a notice is printed
                        and a placeholder naming the taxon_id is
                        returned; for 'all', a dataframe indexed by
                        taxon_ncbi

    Raises:
        EnvironmentError    if there are several records for taxon_id
    """

    p = inout.get_path('geisen', 'ncbi/taxon_names.h5')

    if taxon_id == 'all':
        df = pd.read_hdf(p, 'table')
        return df.set_index('taxon_ncbi', verify_integrity=True)

    q = 'taxon_ncbi=={}'.format(taxon_id)
    df = pd.read_hdf(p, 'table', where=q)

    v = df['taxon_name'].values

    # The three cases below (several, none, exactly one record) are
    # exhaustive; a former fourth branch could never be reached.
    if len(v) > 1:
        raise EnvironmentError(
            'Found multipole records for taxon {}'.format(taxon_id))

    if len(v) == 0:
        print('Could not find name of taxon {}'.format(taxon_id))
        return 'not found taxon; id is {}'.format(taxon_id)

    return str(v[0])

예제 #27

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def transcript_halflife_tani_2012_assuming_48h_for_stable(taxon_id=9606):
    """
    Transcript halflife in HeLa cells, as measured by Tani et al.
    Note that the original data is on RNA, and different RNA of
    same gene can get pooled (here: median). Thus this function uses
    an educated guess for > 24h; more specifically, 48 hours will be
    used so that, if one transcript is >24 and there is only one <24h,
    the presence of a long lived transcript will be shown in the
    aggregated readout of both RNA species of the same gene

    Input:
        taxon_id    int (safety check); must be 9606

    Output:
        df          ordnum (having given categorial value >24
                        the auxiliary value 48) - see above
    """

    stable_halflife_h = 48  # manually selected by Thomas Stoeger

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    file_path = inout.get_path(
        'publications', 'tani2012/Tani_Supp_Tables_revised2.xls')
    raw = pd.read_excel(file_path, sheet_name='Table S1', skiprows=3)

    # ignore RNA whose halflife had not been determined
    is_determined = raw['t1/2 (h)'] != 'N.D.'
    halflife = raw.loc[is_determined, ['RepName', 't1/2 (h)']]
    halflife = halflife.rename(columns={
        'RepName': 'rna_ncbi',
        't1/2 (h)': 'rna_halflife_h'
    })

    # one row per RNA, rather than ','-separated lists of RNA
    halflife['rna_ncbi'] = halflife['rna_ncbi'].str.strip(',')
    halflife = utils.split_text_to_multiple_rows(halflife, 'rna_ncbi', ',')

    in_hours = halflife['rna_halflife_h'].replace('>24', stable_halflife_h)
    halflife['rna_halflife_h'] = in_hours.astype(float)

    # median per RNA, followed by median per gene
    per_rna = halflife.groupby('rna_ncbi').agg(np.median)
    per_gene = mapper.rna_ncbi_2_gene_ncbi(per_rna, 'median')

    return per_gene.reset_index()

예제 #28

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def transcript_detection_uhlen_2015_tissues(taxon_id=9606):
    """
    Fraction of tissue samples with expression >= 1FPKM
    Source: Uhlen et al. 2015

    Input:
        taxon_id    int (safety check); must be 9606

    Output:
        df          ordinal numbers
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    file_name = 'uhlen_2015_detected_in_tissuess_ncbi_gene.csv.gz'
    file_path = inout.get_path(
        'geisen', 'papers/uhlen_2015/{}'.format(file_name))

    return pd.read_csv(file_path)

예제 #29

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def transcription_factors_genealacart_promoters(taxon_id=9606):
    """
    Occurrences of transcription factors in promoter regions
    Source: Genealacart

    Input:
        taxon_id    int (safety check); must be 9606

    Output:
        df          ordinal numbers
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    file_name = 'genealacart_promoters_tfs_by_gene.gz'
    file_path = inout.get_path(
        'geisen', 'genealacart/{}'.format(file_name))

    return pd.read_csv(file_path)

예제 #30

0

파일 보기

파일: properties.py 프로젝트: tstoeger/plos_biology_2018_ignored_genes

def rvis(taxon_id=9606):
    """
    RVIS mutagenesis intolerance
    Source: RVIS through Genealacart

    Input:
        taxon_id    int (safety check); must be 9606

    Output:
        df          ordinal numbers
    """

    if taxon_id != 9606:
        raise EnvironmentError('Only supports taxon 9606, H**o sapiens')

    file_path = inout.get_path(
        'geisen', 'genealacart/genealacart_intolerance_rvis.gz')

    return pd.read_csv(file_path)