Example #1
def test_CogAtLemmatizer():
    """A smoke test for CogAtLemmatizer."""
    cogat = extract.download_cognitive_atlas(data_dir=utils.get_resource_path(), overwrite=False)
    id_df = pd.read_csv(cogat["ids"])
    id_df = id_df.loc[id_df["id"] == "trm_4aae62e4ad209"]
    lem = annotate.cogat.CogAtLemmatizer(id_df)
    true_text = "trm_4aae62e4ad209 is great"
    test_text = "Cognitive control is great"
    assert lem.transform(test_text) == true_text
Example #2
def test_cogat(testdata_laird):
    """A smoke test for CogAt-related functions."""
    # A small test dataset with abstracts
    ns_dset_laird = testdata_laird.copy()
    cogat = extract.download_cognitive_atlas(data_dir=utils.get_resource_path(), overwrite=False)
    id_df = pd.read_csv(cogat["ids"])
    rel_df = pd.read_csv(cogat["relationships"])
    weights = {"isKindOf": 1, "isPartOf": 1, "inCategory": 1}
    counts_df, rep_text_df = annotate.cogat.extract_cogat(
        ns_dset_laird.texts, id_df, text_column="abstract"
    )
    assert "id" in ns_dset_laird.texts.columns
    expanded_df = annotate.cogat.expand_counts(counts_df, rel_df, weights)
    assert isinstance(expanded_df, pd.DataFrame)
Example #3
def expand_counts(counts_df, rel_df=None, weights=None):
    """Perform hierarchical expansion of counts across labels.

    Parameters
    ----------
    counts_df : (D x T) :obj:`pandas.DataFrame`
        Term counts for a corpus. T = term, D = document.
    rel_df : :obj:`pandas.DataFrame`
        Long-form DataFrame of term-term relationships with at least three columns:
        'input', 'output', and 'rel_type'.
    weights : :obj:`dict`
        Dictionary of weights per relationship type, e.g., {'isKindOf': 1}.
        Unspecified relationship types default to 0.

    Returns
    -------
    weighted_df : (D x T) :obj:`pandas.DataFrame`
        Term counts for a corpus after hierarchical expansion.
    """
    if rel_df is None:
        cogat = download_cognitive_atlas()
        rel_df = pd.read_csv(cogat["relationships"])
    weights_df = utils._generate_weights(rel_df, weights=weights)

    # First, reorganize counts_df so it has the same columns, in the same
    # order, as weights_df
    counts_columns = counts_df.columns.tolist()
    weights_columns = weights_df.columns.tolist()
    w_not_c = set(weights_columns) - set(counts_columns)
    c_not_w = set(counts_columns) - set(weights_columns)
    if c_not_w:
        raise Exception(
            f"Columns found in counts but not weights: {', '.join(c_not_w)}")

    for col in w_not_c:
        counts_df[col] = 0

    counts_df = counts_df[weights_columns]

    # Now matrix multiplication
    counts = counts_df.values
    weights = weights_df.values
    weighted = np.dot(counts, weights)
    weighted_df = pd.DataFrame(index=counts_df.index,
                               columns=counts_df.columns,
                               data=weighted)
    return weighted_df
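
The docstring above describes hierarchical expansion abstractly; the core operation is the matrix product of a (D x T) counts matrix with a (T x T) weight matrix. Below is a minimal sketch with a hand-built toy weight matrix standing in for the one utils._generate_weights would produce; the term IDs and values are made up for illustration.

import numpy as np
import pandas as pd

# Two documents (rows) and two terms (columns).
counts_df = pd.DataFrame(
    [[2, 0], [0, 1]],
    index=["doc1", "doc2"],
    columns=["trm_child", "trm_parent"],
)

# Toy (T x T) weight matrix: the diagonal keeps each term's own counts, and
# the off-diagonal 1 propagates "trm_child" counts up to "trm_parent".
weights_df = pd.DataFrame(
    [[1, 1], [0, 1]],
    index=["trm_child", "trm_parent"],
    columns=["trm_child", "trm_parent"],
)

expanded_df = pd.DataFrame(
    np.dot(counts_df.values, weights_df.values),
    index=counts_df.index,
    columns=weights_df.columns,
)
print(expanded_df)
# doc1's two mentions of trm_child now also count toward trm_parent.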
Example #4
def test_cogat():
    """
    A smoke test for CogAt-related functions.
    """
    # A small test dataset with abstracts
    ns_dset_laird = nimare.dataset.Dataset.load(
        op.join(get_test_data_path(), 'neurosynth_laird_studies.pkl.gz'))
    cogat = extract.download_cognitive_atlas(data_dir=get_test_data_path(),
                                             overwrite=False)
    id_df = pd.read_csv(cogat['ids'])
    rel_df = pd.read_csv(cogat['relationships'])
    weights = {'isKindOf': 1, 'isPartOf': 1, 'inCategory': 1}
    counts_df, rep_text_df = annotate.ontology.cogat.extract_cogat(
        ns_dset_laird.texts, id_df, text_column='abstract')
    expanded_df = annotate.ontology.cogat.expand_counts(
        counts_df, rel_df, weights)
    assert isinstance(expanded_df, pd.DataFrame)
Example #5
    def __init__(self, ontology_df=None):
        if ontology_df is None:
            cogat = download_cognitive_atlas()
            self.ontology_ = pd.read_csv(cogat["ids"])
        else:
            assert isinstance(ontology_df, pd.DataFrame)
            self.ontology_ = ontology_df
        assert "id" in self.ontology_.columns
        assert "name" in self.ontology_.columns
        assert "alias" in self.ontology_.columns

        # Create regex dictionary
        regex_dict = {}
        for term in self.ontology_["alias"].values:
            term_for_regex = term.replace("(", r"\(").replace(")", r"\)")
            regex = "\\b" + term_for_regex + "\\b"
            pattern = re.compile(regex, re.MULTILINE | re.IGNORECASE)
            regex_dict[term] = pattern
        self.regex_ = regex_dict
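
The regex construction above escapes parentheses, wraps each alias in word boundaries, and compiles the pattern case-insensitively. A quick standalone check of that pattern logic with plain re, using a made-up alias (no NiMARE needed):

import re

alias = "working memory"
pattern = re.compile(r"\b" + alias + r"\b", re.MULTILINE | re.IGNORECASE)

print(bool(pattern.search("A Working Memory task")))  # True: case-insensitive match
print(bool(pattern.search("networking memory")))      # False: \b rejects the mid-word match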
Example #6
import os

import pandas as pd

import nimare
from nimare import annotate, extract
from nimare.tests.utils import get_test_data_path

###############################################################################
# Load dataset with abstracts
# ---------------------------
dset = nimare.dataset.Dataset.load(
    os.path.join(get_test_data_path(), 'neurosynth_laird_studies.pkl.gz'))

###############################################################################
# Download Cognitive Atlas
# ------------------------
cogatlas = extract.download_cognitive_atlas(data_dir=get_test_data_path(),
                                            overwrite=False)
id_df = pd.read_csv(cogatlas['ids'])
rel_df = pd.read_csv(cogatlas['relationships'])

###############################################################################
# ID DataFrame
id_df.head()

###############################################################################
# Relationships DataFrame
rel_df.head()

###############################################################################
# Extract Cognitive Atlas terms from text
# ---------------------------------------
counts_df, rep_text_df = annotate.cogat.extract_cogat(dset.texts, id_df,
                                                       text_column='abstract')
Example #7
# (content:annotation:cogat)=
# ## Cognitive Atlas term extraction and hierarchical expansion
#
# **Cognitive Atlas term extraction** leverages the structured nature of the Cognitive Atlas to count individual terms and their synonyms in the ontology, and to hierarchically expand those counts based on the relationships specified between terms.
# This method produces both basic term counts and expanded term counts based on the weights applied to different relationship types present in the ontology.
#
# First, we must use {py:func}`~nimare.extract.download_cognitive_atlas` to download the current version of the Cognitive Atlas ontology.
# This includes both information about individual terms in the ontology and asserted relationships between those terms.
#
# NiMARE automatically attempts to extrapolate likely alternate forms of each term in the ontology, to make extraction easier.
# For an example, see {numref}`tbl:table_cogat_forms`.

# In[4]:

cogatlas = extract.download_cognitive_atlas(data_dir=data_path,
                                            overwrite=False)
id_df = pd.read_csv(cogatlas["ids"])
rel_df = pd.read_csv(cogatlas["relationships"])

cogat_counts_df, rep_text_df = annotate.cogat.extract_cogat(
    neurosynth_dset_first_500.texts, id_df, text_column="abstract")

# In[5]:

example_forms = id_df.loc[id_df["name"] == "dot motion task"][[
    "id", "name", "alias"
]]
glue("table_cogat_forms", example_forms)

# ```{glue:figure} table_cogat_forms
# :name: "tbl:table_cogat_forms"
Example #8
import os

import pandas as pd

from nimare import annotate, extract
from nimare.dataset import Dataset
from nimare.utils import get_resource_path

###############################################################################
# Load dataset with abstracts
# -----------------------------------------------------------------------------
dset = Dataset(
    os.path.join(get_resource_path(), "neurosynth_laird_studies.json"))

###############################################################################
# Download Cognitive Atlas
# -----------------------------------------------------------------------------
cogatlas = extract.download_cognitive_atlas(data_dir=get_resource_path(),
                                            overwrite=False)
id_df = pd.read_csv(cogatlas["ids"])
rel_df = pd.read_csv(cogatlas["relationships"])

###############################################################################
# ID DataFrame
id_df.head()

###############################################################################
# Relationships DataFrame
rel_df.head()

###############################################################################
# Extract Cognitive Atlas terms from text
# -----------------------------------------------------------------------------
counts_df, rep_text_df = annotate.cogat.extract_cogat(dset.texts, id_df,
                                                       text_column="abstract")
Example #9
def extract_cogat(text_df, id_df=None, text_column="abstract"):
    """Extract Cognitive Atlas terms and count instances using regular expressions.

    Parameters
    ----------
    text_df : (D x 2) :obj:`pandas.DataFrame`
        Pandas dataframe with at least two columns: 'id' and the text.
        D = document.

    id_df : (T x 3) :obj:`pandas.DataFrame`
        Cognitive Atlas ontology dataframe with one row for each term and at least three columns:

        - ``"id"``: A unique identifier for each term.
        - ``"alias"``: A natural language expression for each term.
        - ``"name"``: The preferred name of each term. Currently unused.

    text_column : :obj:`str`, optional
        Name of column in text_df that contains text. Default is 'abstract'.

    Returns
    -------
    counts_df : (D x T) :obj:`pandas.DataFrame`
        Term counts for documents in the corpus.
        One row for each document and one column for each term.

    rep_text_df : (D x 2) :obj:`pandas.DataFrame`
        An updated version of the ``text_df`` DataFrame with terms in the text column replaced
        with their CogAt IDs.

    Notes
    -----
    The Cognitive Atlas :footcite:p:`poldrack2011cognitive` is an ontology for describing
    cognitive neuroscience concepts and tasks.

    References
    ----------
    .. footbibliography::

    See Also
    --------
    nimare.extract.download_cognitive_atlas : This function will be called automatically if
                                              ``id_df`` is not provided.
    """
    text_df = text_df.copy()
    if id_df is None:
        cogat = download_cognitive_atlas()
        id_df = pd.read_csv(cogat["ids"])
    gazetteer = sorted(id_df["id"].unique().tolist())
    if "id" in text_df.columns:
        text_df.set_index("id", inplace=True)

    text_df[text_column] = text_df[text_column].fillna("")
    text_df[text_column] = text_df[text_column].apply(_uk_to_us)

    # Create regex dictionary
    regex_dict = {}
    for term in id_df["alias"].values:
        term_for_regex = term.replace("(", r"\(").replace(")", r"\)")
        regex = "\\b" + term_for_regex + "\\b"
        pattern = re.compile(regex, re.MULTILINE | re.IGNORECASE)
        regex_dict[term] = pattern

    # Count
    count_arr = np.zeros((text_df.shape[0], len(gazetteer)), int)
    counts_df = pd.DataFrame(columns=gazetteer,
                             index=text_df.index,
                             data=count_arr)
    for term_idx in id_df.index:
        term = id_df["alias"].loc[term_idx]
        term_id = id_df["id"].loc[term_idx]
        pattern = regex_dict[term]
        counts_df[term_id] += text_df[text_column].str.count(pattern).astype(
            int)
        text_df[text_column] = text_df[text_column].str.replace(
            pattern, term_id, regex=True)

    return counts_df, text_df
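
To tie the pieces together, here is a hedged end-to-end sketch of calling extract_cogat on a miniature, hand-made text DataFrame. The term IDs, aliases, and abstracts are invented for illustration; only the function name, signature, and expected return shapes come from the code above.

import pandas as pd
from nimare import annotate

# Hypothetical miniature corpus: one row per document, 'id' plus a text column.
text_df = pd.DataFrame(
    {
        "id": ["study-01", "study-02"],
        "abstract": [
            "This study examined cognitive control and working memory.",
            "Participants performed a working memory task.",
        ],
    }
)

# Hypothetical miniature ontology: one row per term with 'id', 'name', and 'alias'.
id_df = pd.DataFrame(
    {
        "id": ["trm_cc", "trm_wm"],
        "name": ["cognitive control", "working memory"],
        "alias": ["cognitive control", "working memory"],
    }
)

counts_df, rep_text_df = annotate.cogat.extract_cogat(text_df, id_df, text_column="abstract")
print(counts_df)                         # (D x T): one row per document, one column per term ID
print(rep_text_df["abstract"].tolist())  # aliases in the text replaced with their term IDs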