# Example no. 1
# 0
def get_transcript_ids_of_gene_ids(gene_ids, ensembl):
    """ Build a transcript-to-gene mapping for a collection of gene ids.

    Every gene id is handed, together with `ensembl`, to
    `get_transcript_ids_of_gene_id_df` through `parallel.apply_iter_simple`.
    The per-gene results are then passed through
    `collection_utils.flatten_lists` and `collection_utils.remove_nones`
    before being loaded into a single data frame.

    N.B. A single transcript id may match multiple genes.

    Parameters
    ----------
    gene_ids: iterable of strings
        The gene ids to look up

    ensembl: pyensembl.Genome
        The annotations used for the lookup

    Returns
    -------
    transcript_gene_id_df: pd.DataFrame
        A dataframe mapping between transcripts and genes. Its columns are:

            transcript_id
            gene_id
    """
    per_gene_results = parallel.apply_iter_simple(
        gene_ids, get_transcript_ids_of_gene_id_df, ensembl)

    # one flat list of records, with the missing entries filtered out
    records = collection_utils.remove_nones(
        collection_utils.flatten_lists(per_gene_results))

    return pd.DataFrame(records)
# Example no. 2
# 0
def read_ame(ame_file, method='ranksum'):
    """ Parse the text output of AME into a data frame.

    Please see the documentation for more about AME:
        http://meme-suite.org/doc/ame.html

    N.B. The parsing code here is brittle. If the results seem like nonsense,
    then it is possible the output format of AME has changed, and this code
    needs to be updated.

    This was written based on output from ame version 4.11.2.

    The options to ame which may affect the output were:
        --scoring avg
        --method ranksum

    Every line of the file is handed to the line parser for `method`; the
    parsed results are passed through `collection_utils.remove_nones`
    (presumably the parsers return None for lines which are not result
    lines -- confirm against the parser helpers) and collected into a data
    frame.

    Parameters
    ----------
    ame_file: string
        The path to the txt output of AME

    method: string. see meme_utils.AME_METHODS for allowed methods
        The method used when running AME. Line parsers are currently
        available for 'ranksum' and 'fisher'.

    Returns
    -------
    ame_df: pd.DataFrame
        A data frame with the following columns:
            motif_name: the name of the motif (from the meme files)
            num_seqs: the number of sequences used in the test
            {left,right,twotailed}_pvalue: the respective p-values
            corrected_{left,right,twotailed}_pvalue: the respective p-values
                after bonferroni correction
            u_value: an "association score" (see the paper for details)

    Raises
    ------
    ValueError
        If `method` is not in AME_METHODS, or if it is an allowed method for
        which no line parser is implemented here.
    """
    if method not in AME_METHODS:
        msg = "[meme_utils.read_ame]: invalid method: {}".format(method)
        raise ValueError(msg)

    # Select the parser once, up front. Previously the choice was made per
    # line with no fallback, so an allowed-but-unhandled method hit an
    # unbound local on the first line of the file.
    if method == 'ranksum':
        parse_line = _parse_ame_ranksum_line
    elif method == 'fisher':
        parse_line = _parse_ame_fisher_line
    else:
        msg = ("[meme_utils.read_ame]: no parser available for method: "
               "{}".format(method))
        raise ValueError(msg)

    with open(ame_file) as f:
        ame_df = [parse_line(line) for line in f]

    ame_df = collection_utils.remove_nones(ame_df)
    ame_df = pd.DataFrame(ame_df)
    return ame_df
def process_chunk_count_vectorizer(df, config):
    """ Fit an IncrementalCountVectorizer on the note files of one chunk.

    A filename is computed for each row of `df` by applying
    `mp_filenames.get_note_event_filename_row` row-wise with the extra
    arguments `(config, NOTE_CLEANED)`. The resulting filenames are passed
    through `collection_utils.remove_nones` and then used to fit a new
    vectorizer.

    Parameters
    ----------
    df: pd.DataFrame
        The rows of this chunk; one filename is derived per row

    config: object
        Forwarded unchanged to `mp_filenames.get_note_event_filename_row`

    Returns
    -------
    icv_fit: the value returned by `IncrementalCountVectorizer.fit`
    """
    note_files = list(
        df.apply(mp_filenames.get_note_event_filename_row,
                 axis=1,
                 args=(config, NOTE_CLEANED)))
    note_files = collection_utils.remove_nones(note_files)

    vectorizer = IncrementalCountVectorizer(
        prune=False,
        create_mapping=False,
        get_tokens=get_tokens)

    return vectorizer.fit(note_files)
# Example no. 4
# 0
def query_mygene(gene_ids: Iterable[str],
                 species: Optional[str] = 'all',
                 scopes: Optional[str] = None,
                 unique_only: bool = True,
                 mygene_url: str = "http://mygene.info/v3") -> pd.DataFrame:
    """ Look up annotations for `gene_ids` on mygene.info

    The annotations collected (via mygene) come from Swiss-Prot, TrEMBL,
    Interpro, PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.

    Parameters
    ----------
    gene_ids : typing.Iterable[str]
        The gene identifiers for the query. They are passed directly to
        MyGeneInfo.getgenes, so any identifiers which are valid for it are
        valid here.

        As of 1.7.2016, it supports entrez/ensembl gene ids, where the entrez
        gene id can be either a string or integer.

        Example valid identifiers:
            ENSG00000004059 (from human)
            ENSMUSG00000051951 (from mouse)
            WBGene00022277 (from c. elegans)

    species : typing.Optional[str]
        Optionally, a comma-delimited list of species. By default, all
        species are considered. For ensembl identifiers, this is unlikely to
        be a problem; for other scopes (such as gene symbols), it could
        cause unexpected behavior, though.

        Please see the mygene.info docs for more details about valid
        species: http://mygene.info/doc/data.html#species

    scopes : typing.Optional[str]
        Optionally, a comma-delimited list of scopes for the `gene_ids` (so
        they need not necessarily be entrez/ensembl gene identifiers).

        Please see the mygene.info docs for more details about
        valid scopes: http://mygene.info/doc/query_service.html#available-fields

    unique_only : bool
        Whether to use only unique `gene_ids`. Typically, duplicates in the
        `gene_ids` list can cause problems for the joins in this function and
        cause huge memory usage. Unless some specific use-case is known,
        duplicates should be removed.

    mygene_url : str
        The url used to connect to the mygene server. In principle, "v2"
        could be used instead of "v3".

    Returns
    -------
    df_gene_info : pandas.DataFrame
        A data frame with the following columns. All values are strings and
        separated by semi-colons if multiple values are present:

        * 'gene_id': the original query
        * 'swiss_prot': any matching Swiss-Prot identifier (e.g., "Q9N4D9")
        * 'trembl': any matching TrEMBL identifiers (e.g., "H2KZI1;Q95Y41")
        * 'name': a short description of the gene
        * 'summary': a longer description of the gene
        * 'pdb': any matching Protein Data Bank identifiers (e.g., "2B6H")
        * 'pfam': any matching Pfam protein families (e.g., "PF00254;PF00515")
        * 'prosite': any matching PROSITE protein domains, etc. (e.g., "PS50114;PS51156")
        * 'kegg_pathways': the name of any matching KEGG pathways
        * 'go_terms': the term name of any associated GO terms
        * 'interpro_families': the description of any associated Interpro family
    """

    def _parse_annotations(log_msg, raw_df, row_parser):
        # log the step, run the row parser over the raw mygene frame, and
        # build a data frame from whatever survives remove_nones
        logger.info(log_msg)
        parsed = pd_utils.apply(raw_df, row_parser)
        parsed = collection_utils.remove_nones(parsed)
        return pd.DataFrame(parsed)

    client = mygene.MyGeneInfo()
    client.url = mygene_url

    # fetch the raw annotations
    logger.info("Pulling annotations from mygene.info")

    if unique_only:
        gene_ids = set(gene_ids)

    raw = client.getgenes(gene_ids,
                          fields=FIELDS,
                          as_dataframe=True,
                          species=species,
                          scopes=scopes).reset_index()

    # the nested fields each get their own data frame
    kegg_df = _parse_annotations(
        "Parsing KEGG pathways", raw, parse_kegg_pathways)
    go_df = _parse_annotations(
        "Parsing GO annotations", raw, parse_go_terms)
    interpro_df = _parse_annotations(
        "Parsing Interpro families", raw, parse_interpro)

    # rename the flat columns, then guarantee every expected column exists
    logger.info("Cleaning mygene data frame")

    raw = raw.rename(columns=MYGENE_COLUMN_MAP)

    expected_columns = list(MYGENE_COLUMN_MAP.keys()) + list(MYGENE_COLUMNS)
    for column in expected_columns:
        if column not in raw:
            raw[column] = None

    # only the columns of interest take part in the merge
    flat_df = raw[MYGENE_COLUMNS]

    logger.info("Merging results")

    # frames without any columns are left out of the join
    candidates = [flat_df, kegg_df, go_df, interpro_df]
    non_empty = [frame for frame in candidates if frame.shape[1] > 0]

    merged = pd_utils.join_df_list(non_empty, join_col='gene_id', how='inner')

    logger.info("Cleaning mygene data frame, phase 2")

    merged = merged.drop_duplicates(subset='gene_id').fillna('')

    # these fields hold either a string or a list of strings; wrap each
    # value and then join it with semi-colons
    for column in ('trembl', 'pdb', 'pfam', 'prosite'):
        wrapped = merged[column].apply(collection_utils.wrap_string_in_list)
        merged[column] = wrapped.apply(";".join)

    logger.info("Finished compiling mygene annotations")

    return merged