def test_CogAtLemmatizer():
    """Smoke-test that CogAtLemmatizer swaps a known term for its CogAt ID."""
    atlas_files = extract.download_cognitive_atlas(
        data_dir=utils.get_resource_path(),
        overwrite=False,
    )
    ontology = pd.read_csv(atlas_files["ids"])
    # Keep only the rows for the single term exercised below.
    ontology = ontology.loc[ontology["id"] == "trm_4aae62e4ad209"]
    lemmatizer = annotate.cogat.CogAtLemmatizer(ontology)

    raw_text = "Cognitive control is great"
    expected_text = "trm_4aae62e4ad209 is great"
    assert lemmatizer.transform(raw_text) == expected_text
def test_cogat(testdata_laird):
    """Smoke-test CogAt term extraction followed by count expansion."""
    # Work on a copy of the small abstracts dataset provided by the fixture.
    dset = testdata_laird.copy()

    atlas_files = extract.download_cognitive_atlas(
        data_dir=utils.get_resource_path(),
        overwrite=False,
    )
    ontology = pd.read_csv(atlas_files["ids"])
    relationships = pd.read_csv(atlas_files["relationships"])
    rel_weights = {"isKindOf": 1, "isPartOf": 1, "inCategory": 1}

    term_counts, _replaced_text = annotate.cogat.extract_cogat(
        dset.texts,
        ontology,
        text_column="abstract",
    )
    # The dataset's own texts table must still carry its "id" column
    # after extraction has run on it.
    assert "id" in dset.texts.columns

    expanded = annotate.cogat.expand_counts(
        term_counts, relationships, rel_weights
    )
    assert isinstance(expanded, pd.DataFrame)
def expand_counts(counts_df, rel_df=None, weights=None):
    """Perform hierarchical expansion of counts across labels.

    Parameters
    ----------
    counts_df : (D x T) :obj:`pandas.DataFrame`
        Term counts for a corpus. T = term, D = document.
        This DataFrame is not modified.
    rel_df : :obj:`pandas.DataFrame`, optional
        Long-form DataFrame of term-term relationships with at least three
        columns: 'input', 'output', and 'rel_type'.
        If None, the relationships file returned by
        ``download_cognitive_atlas`` is loaded.
    weights : :obj:`dict`, optional
        Dictionary of weights per relationship type. E.g., {'isKind': 1}.
        Unspecified relationship types default to 0.

    Returns
    -------
    weighted_df : (D x T) :obj:`pandas.DataFrame`
        Term counts for a corpus after hierarchical expansion. Columns follow
        the column order of the weight matrix.

    Raises
    ------
    ValueError
        If ``counts_df`` contains columns that are absent from the weight
        matrix.
    """
    if rel_df is None:
        cogat = download_cognitive_atlas()
        rel_df = pd.read_csv(cogat["relationships"])

    # NOTE(review): assumes _generate_weights returns a square term-by-term
    # matrix whose rows and columns share the same order -- confirm in utils.
    weights_df = utils._generate_weights(rel_df, weights=weights)

    weights_columns = weights_df.columns.tolist()
    c_not_w = set(counts_df.columns) - set(weights_columns)
    if c_not_w:
        # Sorted so the message is deterministic across runs.
        missing = ", ".join(sorted(map(str, c_not_w)))
        raise ValueError(f"Columns found in counts but not weights: {missing}")

    # Align counts to the weight matrix: same columns, same order, with terms
    # missing from the counts filled with zeros. ``reindex`` returns a new
    # frame, so the caller's ``counts_df`` is left untouched.
    aligned_df = counts_df.reindex(columns=weights_columns, fill_value=0)

    # Now matrix multiplication
    weighted = np.dot(aligned_df.values, weights_df.values)
    weighted_df = pd.DataFrame(
        index=aligned_df.index,
        columns=aligned_df.columns,
        data=weighted,
    )
    return weighted_df
def test_cogat():
    """Smoke-test CogAt extraction and hierarchical expansion."""
    # Load a small pickled dataset that ships with abstracts.
    dataset_file = op.join(
        get_test_data_path(), "neurosynth_laird_studies.pkl.gz"
    )
    dset = nimare.dataset.Dataset.load(dataset_file)

    atlas_files = extract.download_cognitive_atlas(
        data_dir=get_test_data_path(),
        overwrite=False,
    )
    ontology = pd.read_csv(atlas_files["ids"])
    relationships = pd.read_csv(atlas_files["relationships"])
    rel_weights = {"isKindOf": 1, "isPartOf": 1, "inCategory": 1}

    term_counts, _replaced_text = annotate.ontology.cogat.extract_cogat(
        dset.texts,
        ontology,
        text_column="abstract",
    )
    expanded = annotate.ontology.cogat.expand_counts(
        term_counts, relationships, rel_weights
    )
    assert isinstance(expanded, pd.DataFrame)
def __init__(self, ontology_df=None):
    """Store the ontology table and precompile one regex per alias.

    Parameters
    ----------
    ontology_df : :obj:`pandas.DataFrame`, optional
        Ontology table with at least the columns ``"id"``, ``"name"`` and
        ``"alias"``. If None, the IDs file returned by
        ``download_cognitive_atlas`` is loaded instead.

    Attributes set
    --------------
    ontology_ : the ontology DataFrame in use.
    regex_ : dict mapping each alias string to a compiled, case-insensitive,
        word-bounded pattern.
    """
    if ontology_df is None:
        cogat = download_cognitive_atlas()
        self.ontology_ = pd.read_csv(cogat["ids"])
    else:
        assert isinstance(ontology_df, pd.DataFrame)
        self.ontology_ = ontology_df

    assert "id" in self.ontology_.columns
    assert "name" in self.ontology_.columns
    assert "alias" in self.ontology_.columns

    # Create regex dictionary.
    # Iterate over ``self.ontology_`` rather than the ``ontology_df``
    # argument: the argument is None when the default ontology is loaded,
    # and indexing None would raise a TypeError.
    regex_dict = {}
    for term in self.ontology_["alias"].values:
        # Only parentheses are escaped; the rest of the alias is used as-is.
        term_for_regex = term.replace("(", r"\(").replace(")", r"\)")
        regex = "\\b" + term_for_regex + "\\b"
        regex_dict[term] = re.compile(regex, re.MULTILINE | re.IGNORECASE)
    self.regex_ = regex_dict
import pandas as pd import nimare from nimare import annotate, extract from nimare.tests.utils import get_test_data_path ############################################################################### # Load dataset with abstracts # --------------------------- dset = nimare.dataset.Dataset.load( os.path.join(get_test_data_path(), 'neurosynth_laird_studies.pkl.gz')) ############################################################################### # Download Cognitive Atlas # ------------------------ cogatlas = extract.download_cognitive_atlas(data_dir=get_test_data_path(), overwrite=False) id_df = pd.read_csv(cogatlas['ids']) rel_df = pd.read_csv(cogatlas['relationships']) ############################################################################### # ID DataFrame id_df.head() ############################################################################### # Relationships DataFrame rel_df.head() ############################################################################### # Extract Cognitive Atlas terms from text # --------------------------------------- counts_df, rep_text_df = annotate.cogat.extract_cogat(dset.texts,
# (content:annotation:cogat)= # ## Cognitive Atlas term extraction and hierarchical expansion # # **Cognitive Atlas term extraction** leverages the structured nature of the Cognitive Atlas in order to extract counts for individual terms and their synonyms in the ontology, as well as to apply hierarchical expansion to these counts based on the relationships specified between terms. # This method produces both basic term counts and expanded term counts based on the weights applied to different relationship types present in the ontology. # # First, we must use {py:func}`~nimare.extract.download_cognitive_atlas` to download the current version of the Cognitive Atlas ontology. # This includes both information about individual terms in the ontology and asserted relationships between those terms. # # NiMARE will automatically attempt to extrapolate likely alternate forms of each term in the ontology, in order to make extraction easier. # For an example, see {numref}`tbl:table_cogat_forms`. # In[4]: cogatlas = extract.download_cognitive_atlas(data_dir=data_path, overwrite=False) id_df = pd.read_csv(cogatlas["ids"]) rel_df = pd.read_csv(cogatlas["relationships"]) cogat_counts_df, rep_text_df = annotate.cogat.extract_cogat( neurosynth_dset_first_500.texts, id_df, text_column="abstract") # In[5]: example_forms = id_df.loc[id_df["name"] == "dot motion task"][[ "id", "name", "alias" ]] glue("table_cogat_forms", example_forms) # ```{glue:figure} table_cogat_forms # :name: "tbl:table_cogat_forms"
import pandas as pd from nimare import annotate, extract from nimare.dataset import Dataset from nimare.utils import get_resource_path ############################################################################### # Load dataset with abstracts # ----------------------------------------------------------------------------- dset = Dataset( os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) ############################################################################### # Download Cognitive Atlas # ----------------------------------------------------------------------------- cogatlas = extract.download_cognitive_atlas(data_dir=get_resource_path(), overwrite=False) id_df = pd.read_csv(cogatlas["ids"]) rel_df = pd.read_csv(cogatlas["relationships"]) ############################################################################### # ID DataFrame id_df.head() ############################################################################### # Relationships DataFrame rel_df.head() ############################################################################### # Extract Cognitive Atlas terms from text # ----------------------------------------------------------------------------- counts_df, rep_text_df = annotate.cogat.extract_cogat(dset.texts,
def extract_cogat(text_df, id_df=None, text_column="abstract"):
    """Extract Cognitive Atlas terms and count instances using regular expressions.

    Parameters
    ----------
    text_df : (D x 2) :obj:`pandas.DataFrame`
        Pandas dataframe with at least two columns: 'id' and the text.
        D = document. The input is copied and not modified.
    id_df : (T x 3) :obj:`pandas.DataFrame`
        Cognitive Atlas ontology dataframe with one row for each term and at least three columns:

        - ``"id"``: A unique identifier for each term.
        - ``"alias"``: A natural language expression for each term.
        - ``"name"``: The preferred name of each term. Currently unused.
    text_column : :obj:`str`, optional
        Name of column in text_df that contains text. Default is 'abstract'.

    Returns
    -------
    counts_df : (D x T) :obj:`pandas.DataFrame`
        Term counts for documents in the corpus.
        One row for each document and one column for each term.
    rep_text_df : (D x 2) :obj:`pandas.DataFrame`
        An updated version of the ``text_df`` DataFrame with terms in the text column replaced
        with their CogAt IDs.

    Notes
    -----
    The Cognitive Atlas :footcite:p:`poldrack2011cognitive` is an ontology for describing
    cognitive neuroscience concepts and tasks.

    Aliases are matched and replaced in the row order of ``id_df``; text
    replaced by an earlier alias is no longer available to later ones.

    References
    ----------
    .. footbibliography::

    See Also
    --------
    nimare.extract.download_cognitive_atlas : This function will be called automatically if
                                              ``id_df`` is not provided.
    """
    text_df = text_df.copy()
    if id_df is None:
        cogat = download_cognitive_atlas()
        id_df = pd.read_csv(cogat["ids"])
    gazetteer = sorted(id_df["id"].unique().tolist())
    if "id" in text_df.columns:
        text_df.set_index("id", inplace=True)

    text_df[text_column] = text_df[text_column].fillna("")
    text_df[text_column] = text_df[text_column].apply(_uk_to_us)

    # Create regex dictionary
    regex_dict = {}
    for term in id_df["alias"].values:
        # Only parentheses are escaped; the rest of the alias is used as-is.
        term_for_regex = term.replace("(", r"\(").replace(")", r"\)")
        regex = "\\b" + term_for_regex + "\\b"
        regex_dict[term] = re.compile(regex, re.MULTILINE | re.IGNORECASE)

    # Count
    count_arr = np.zeros((text_df.shape[0], len(gazetteer)), int)
    counts_df = pd.DataFrame(
        columns=gazetteer, index=text_df.index, data=count_arr
    )
    # Walk the alias/ID pairs positionally. Label-based ``.loc`` lookups
    # return a Series instead of a scalar when ``id_df`` has repeated index
    # labels, which would break the regex lookup below.
    for term, term_id in zip(id_df["alias"].values, id_df["id"].values):
        pattern = regex_dict[term]
        counts_df[term_id] += (
            text_df[text_column].str.count(pattern).astype(int)
        )
        # ``regex=True`` must be explicit: ``Series.str.replace`` rejects a
        # compiled pattern when ``regex`` is False, which is the default in
        # pandas >= 2.0.
        text_df[text_column] = text_df[text_column].str.replace(
            pattern, term_id, regex=True
        )
    return counts_df, text_df