Example #1
import json

from cognitiveatlas.api import get_concept


def concept_json_dump():
    '''use the cognitiveatlas library to dump all concepts to a json file'''
    all_concepts = []
    concepts = get_concept().json
    for concept in concepts:
        all_concepts.append(get_concept(id=concept['id']).json)
    with open("all_concepts.json", 'w') as fp:
        json.dump(all_concepts, fp)
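A minimal sketch of reading the dump back, assuming concept_json_dump() above has already been run in the working directory:

import json

# each entry is the JSON payload returned by get_concept(id=...)
with open("all_concepts.json") as fp:
    all_concepts = json.load(fp)
print(len(all_concepts))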
Example #2
def get_cognitiveatlas_task(task_id):
    """get_cognitiveatlas_task
    return the database entry for CognitiveAtlasTask if it exists, and update concepts for that task. If not, create it.
    :param task_id: the unique id for the cognitive atlas task
    """
    try:
        task = get_task(id=task_id).json[0]
        cogatlas_task, _ = CognitiveAtlasTask.objects.update_or_create(
            cog_atlas_id=task["id"], defaults={"name": task["name"]}
        )
        concept_list = []
        if "concepts" in task.keys():
            for concept in task["concepts"]:
                cogatlas_concept = get_concept(id=concept["concept_id"]).json[0]
                cogatlas_concept, _ = CognitiveAtlasConcept.objects.update_or_create(
                    cog_atlas_id=cogatlas_concept["id"],
                    # keep mutable fields in defaults so they are updated,
                    # not used as lookup keys
                    defaults={
                        "name": cogatlas_concept["name"],
                        "definition": cogatlas_concept["definition_text"],
                    },
                )
                concept_list.append(cogatlas_concept)
        cogatlas_task.concepts = concept_list
        cogatlas_task.save()
        return cogatlas_task
    except Exception:
        # Any error with the API etc. returns None
        return None
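For orientation, a hedged sketch of the payload shape these helpers assume; the task id is taken from Example #8's list and the keys are the ones used above:

from cognitiveatlas.api import get_concept, get_task

task = get_task(id="trm_4f2456027809f").json[0]  # single task record
print(task["name"])
for concept in task.get("concepts", []):  # each entry carries a "concept_id"
    print(get_concept(id=concept["concept_id"]).json[0]["name"])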
Example #3
def get_cognitiveatlas_task(task_id):
    '''get_cognitiveatlas_task
    return the database entry for CognitiveAtlasTask if it exists, and update concepts for that task. If not, create it.
    :param task_id: the unique id for the cognitive atlas task
    '''
    try:
        task = get_task(id=task_id).json[0]
        cogatlas_task, _ = CognitiveAtlasTask.objects.update_or_create(
            cog_atlas_id=task["id"], defaults={"name": task["name"]})
        concept_list = []
        if "concepts" in task.keys():
            for concept in task["concepts"]:
                cogatlas_concept = get_concept(
                    id=concept["concept_id"]).json[0]
                cogatlas_concept, _ = CognitiveAtlasConcept.objects.update_or_create(
                    cog_atlas_id=cogatlas_concept["id"],
                    # keep mutable fields in defaults so they are updated, not used as lookup keys
                    defaults={"name": cogatlas_concept["name"],
                              "definition": cogatlas_concept["definition_text"]})
                concept_list.append(cogatlas_concept)
        cogatlas_task.concepts = concept_list
        cogatlas_task.save()
        return cogatlas_task
    except Exception:
        # Any error with API, etc, return None
        return None
Example #4
    def get_concepts(self, force_load=False):
        # use the cached CSV if present, unless a refresh is forced
        if os.path.exists(os.path.join(self.datadir,
                                       'concepts_df.csv')) and not force_load:
            print('using cached cognitive atlas concepts')
            self.concepts_df = pandas.read_csv(
                os.path.join(self.datadir, 'concepts_df.csv'))
        else:
            self.concepts_df = get_concept().pandas
            self.concepts_df.to_csv(
                os.path.join(self.datadir, 'concepts_df.csv'))
        self.concepts = self.concepts_df.name.tolist()
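The method above caches the full concept listing so repeated runs skip the API. A standalone sketch of the same cache-or-fetch pattern; load_concepts is an illustrative name, not part of the original class:

import os

import pandas
from cognitiveatlas.api import get_concept


def load_concepts(datadir, force_load=False):
    cache = os.path.join(datadir, 'concepts_df.csv')
    if os.path.exists(cache) and not force_load:
        return pandas.read_csv(cache, index_col=0)
    df = get_concept().pandas  # full concept listing as a DataFrame
    df.to_csv(cache)
    return df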
Example #5
from nose.tools import assert_equal  # assumed nose-style assert helper

from cognitiveatlas.api import get_concept


def test_concepts():
    print("### TESTING CONCEPT QUERIES:")
    concept_id = "trm_5022ef7599294"
    concept_name = "anxiety"
    contrast_id = "cnt_5299143fed521"

    # concept_id
    result = get_concept(id=concept_id)
    assert_equal(result.json[0]["name"], concept_name)

    # concept_name
    result = get_concept(name=concept_name)
    assert_equal(result.json[0]["id"], concept_id)

    # contrast_id
    result = get_concept(contrast_id=contrast_id)
    assert_equal(result.json[0]["id"], concept_id)

    # concept_id and concept_name
    result = get_concept(id=concept_id, name=concept_name)
    assert_equal(result.json[0]["name"], concept_name)

    # concept_id and contrast_id
    result = get_concept(id=concept_id, contrast_id=contrast_id)
    assert_equal(result.json[0]["name"], concept_name)

    # concept_name and contrast_id
    result = get_concept(name=concept_name, contrast_id=contrast_id)
    assert_equal(result.json[0]["id"], concept_id)
Example #6
from nose.tools import assert_equal, assert_true  # assumed nose-style assert helpers

from cognitiveatlas.api import get_concept


def test_concepts():
    print("### TESTING CONCEPT QUERIES:")
    concept_id = "trm_5022ef7599294"
    concept_name = "anxiety"
    contrast_id = "cnt_5299143fed521"

    # concept_id
    result = get_concept(id=concept_id)
    assert_equal(result.json[0]["name"], concept_name)

    # concept_name
    result = get_concept(name=concept_name)
    assert_equal(result.json[0]["id"], concept_id)

    # contrast_id
    result = get_concept(contrast_id=contrast_id)
    assert_true(any(concept_id == obj["id"] for obj in result.json))
    #assert_equal(result.json[0]["id"],concept_id)

    # concept_id and concept_name
    result = get_concept(id=concept_id, name=concept_name)
    assert_equal(result.json[0]["name"], concept_name)

    # concept_id, and contrast_id
    result = get_concept(id=concept_id, contrast_id=contrast_id)
    assert_equal(result.json[0]["name"], concept_name)

    # concept_name and contrast_id
    result = get_concept(name=concept_name, contrast_id=contrast_id)
    assert_equal(result.json[0]["id"], concept_id)
Example #7
import os
import sys

import requests
from cognitiveatlas.api import get_concept, get_disorder


def get_isAbout_label(url):
    '''
    Added by DBK to get labels for isAbout urls
    :param url: url to get label for
    :return: string label
    '''

    scicrunch_base_uri = 'https://scicrunch.org/api/1/ilx/search/curie/'

    # load user's api key from environment variable. If not found then exit with error message
    try:
        user_key = os.environ["INTERLEX_API_KEY"]
    except KeyError:
        print("Please set the environment variable INTERLEX_API_KEY")
        sys.exit(1)

    if "cognitiveatlas" in url:
        #skip for things that aren't concepts or disorders for the time being
        if ("concept" not in url) and ("disorder" not in url):
            # for now if we don't have a concept or disorder url from cogatlas then just return nothing for label
            # will need to work with cog atlas folks about how to retrieve tasks and other types from cog atlas
            return ""
        # parse out the term id and fetch it with the cognitive atlas python tool
        # (term_id avoids shadowing the builtin id)
        term_id = url.rsplit('/', 1)[0].rsplit('/', 1)[1]
        # we don't know whether this is a concept or a disorder, so try both
        try:
            tmp = get_concept(id=term_id, silent=True)
            label = tmp.json['name'].lower()
        except Exception:
            tmp = get_disorder(id=term_id, silent=True)
            label = tmp.json['name'].lower()

    elif "interlex" in url:
        # get label for interlex terms
        payload = {}
        headers = {}
        full_url = scicrunch_base_uri + url.rsplit('/', 1)[1].replace(
            '_', ':').rstrip("']'") + "?key=" + user_key
        #print(full_url)
        response = requests.request("GET",
                                    full_url,
                                    headers=headers,
                                    data=payload)
        # the response is a json dictionary; we want the label
        label = response.json()["data"]["label"].lower()

    return label
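A hedged usage sketch, assuming INTERLEX_API_KEY is set in the environment; the concept URL is illustrative but matches the parsing above, and trm_5022ef7599294 is the "anxiety" concept from Example #5:

label = get_isAbout_label(
    "https://www.cognitiveatlas.org/concept/id/trm_5022ef7599294/")
print(label)  # expected "anxiety", per Example #5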
Example #8
def main():

    tasks = get_task()

    output_folder = os.path.abspath("data")

    print "Generating Cognitive Atlas Data..."

    # Cognitive Atlas tasks we are interested in from NeuroVault tags
    task_uids = [
        'trm_553e77e53497d', 'trm_553ebfc390256', 'trm_553e88a66b676',
        'trm_553fd2fc7a648', 'trm_4ebd482eba5b1', 'trm_4ebc98cc77e7b',
        'trm_4ebc728326a13', 'trm_4ebc6a6b75ebf', 'trm_4ebc9d2e397f2',
        'trm_553fce5d21da7', 'trm_553fcbbe974ba', 'trm_4da890594742a',
        'trm_4d559bcd67c18', 'trm_4cacee4a1d875', 'trm_4c898c0786246',
        'trm_4ebd47b8bab6b', 'tsk_4a57abb949a4f', 'trm_4f2456027809f',
        'trm_553e73e29cf7d', 'trm_4c8a834779883', 'trm_4cacf22a22d80',
        'trm_4e8dd3831f0cc', 'trm_53c4465b0466f', 'trm_553fbbf79ebc5',
        'trm_5542841f3dcd5', 'trm_5346938eed092', 'trm_534692ef3b5df',
        'trm_534690b0e9dc5', 'trm_5346927710e88', 'trm_4f244ad7dcde7',
        'trm_551b1460e89a3', 'trm_553e6b8e33da4', 'trm_553e85265f51e',
        'tsk_4a57abb949bf6', 'trm_4f24179122380', 'tsk_4a57abb949e1a',
        'trm_4cacf3fbc503b', 'trm_5181f83b77fa4', 'trm_5181f863d24f4',
        'trm_553eb45e2b709', 'trm_550b5b066d37b', 'trm_550b50095d4a3',
        'trm_550b53d7dd674', 'trm_550b5c1a7f4db', 'trm_550b54a8b30f4',
        'trm_550b557e5f90e', 'trm_550b5a47aa23e', 'trm_553eb28436233',
        'trm_50df0dd9d0b6f', 'trm_553fc858cacc5'
    ]

    # Functions for making nodes
    def make_node(nid,name,color):
        return {"nid":nid,"name":name,"color":color}

    for task in tasks.json:
        if task["name"] != "":
            print "Parsing task %s..." %task["name"]
            task_name = task["name"].replace(" ","_").replace("/","_").lower()
            if task["id"] in task_uids:
                task_node = make_node(task["id"],task["name"],"#63506d")
                single_task = get_task(id=task["id"]).json[0]
                # We only want to see contrasts with associated concepts
                task_contrasts = single_task["contrasts"]
                task_concepts = []
                for contrast in task_contrasts:
                    try:
                        contrast_node = make_node(contrast["id"],contrast["contrast_text"],"#d89013")
                        contrast_concepts = get_concept(contrast_id=contrast["id"])
                        children = []
                        current_names = []
                        for concept in contrast_concepts.json:
                            if concept["name"] not in current_names:
                                children.append(make_node(concept["id"],concept["name"],"#3c7263"))
                                current_names.append(concept["name"])
                        contrast_node["children"] = children
                        # Only append contrast if it has children
                        if len(children) > 0:
                            task_concepts.append(contrast_node)
                    except Exception:
                        pass
                task_node["children"] = task_concepts
                # Save to file if we have children
                if len(task_concepts) > 0:
                    filey = open('%s/%s.json' %(output_folder,task_name),'w')
                    filey.write(json.dumps(task_node, sort_keys=True,indent=4, separators=(',', ': ')))
                    filey.close()
def get_task_graph(task_id, images=None):
    """get_task_graph will return a tree for a single cognitive atlas task defined in NeuroVault
    :param task_id: the Cognitive Atlas task id
    :param images: optional image set to attach below concept nodes (defaults to images tagged with each contrast)
    """
    # Get all contrasts defined for Cognitive Atlas
    task = CognitiveAtlasTask.objects.filter(cog_atlas_id=task_id)[0]
    task_node = make_node(task.cog_atlas_id, task.name, "#63506d")
    task_contrasts = CognitiveAtlasContrast.objects.filter(task=task)
    task_concepts = []

    for contrast in task_contrasts:
        contrast_node = make_node(contrast.cog_atlas_id, contrast.name, "#d89013")
        try:
            contrast_concepts = get_concept(contrast_id=contrast.cog_atlas_id)
        except ValueError:
            # TODO: disparity between NeuroVault and Cognitive Atlas!
            continue

        children = []  # concept children of a contrast
        current_names = []

        # Do we have images tagged with the contrast?
        if not images:
            images = StatisticMap.objects.filter(cognitive_contrast_cogatlas=contrast)

        for concept in contrast_concepts.json:
            if concept and concept["name"] not in current_names:
                concept_node = make_node(concept["id"], concept["name"], "#3c7263")

                # Image nodes
                if len(images) > 0:
                    stat_map_nodes = [make_node(i.pk, i.name, "#337ab7", "/images/%s" % i.pk) for i in images]
                    concept_node["children"] = stat_map_nodes

                children.append(concept_node)
                current_names.append(concept["name"])
        # attach children once per contrast, after the loop
        contrast_node["children"] = children
        # Only append contrast if it has children
        if len(children) > 0:
            task_concepts.append(contrast_node)

    task_node["children"] = task_concepts    

    return task_node
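For reference, a hedged sketch of the node structure get_task_graph returns; the field names follow make_node above and the ids are placeholders:

example_tree = {
    "nid": "tsk_XXX", "name": "some task", "color": "#63506d",
    "children": [
        {"nid": "cnt_XXX", "name": "some contrast", "color": "#d89013",
         "children": [
             {"nid": "trm_XXX", "name": "some concept", "color": "#3c7263"},
         ]},
    ],
}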
        concept_acc.loc[concept,
                        "correct_rejection"] = calculate_hits(Ya, Yp, 0, 0)
    return concept_acc


base_acc = get_concept_acc(predictions)
forward_acc = get_concept_acc(predictions_forward)

# Just compare the two for now
diff_acc = forward_acc - base_acc

# Add the concept names to each
from cognitiveatlas.api import get_concept
concept_names = []
for concept in diff_acc.index:
    concept_names.append(get_concept(id=concept).json[0]["name"])

base_acc["name"] = concept_names
forward_acc["name"] = concept_names
diff_acc["name"] = concept_names

# Add the number of images
number_images = []
for concept in diff_acc.index:
    number_images.append(Ymat.loc[:, concept].sum())

base_acc["number_images"] = number_images
forward_acc["number_images"] = number_images
diff_acc["number_images"] = number_images

diff_acc = diff_acc.sort(columns=["hit"], ascending=False)
Example #11
#class Concept(models.NodeModel):
#    name = models.StringProperty()
#    uid = models.StringProperty(indexed=True)
#    definition = models.StringProperty()
#    related_to = models.Relationship('self',rel_type='RELATEDTO')
#    part_of = models.Relationship('self',rel_type='PARTOF')
#    is_a = models.Relationship('self',rel_type='ISA')
#    measured_by = models.Relationship(Contrast,rel_type='MEASUREDBY')
#    mentioned_in = models.Relationship('PMID',rel_type='MENTIONEDIN')

for row in concepts.iterrows():
    uid = row[1].url.split("/")[-1]
    name = row[1].term
    try:
        concept = get_concept(id=uid, silent=True).json
        definition = concept[0]["definition_text"]
    except Exception:
        definition = ""
    properties = {"definition": definition}
    node = make_node("concept", uid, name, properties)

# Assertions!
# We will store the old uid as a property, in case we need to map back to original data

for row in assertions.iterrows():
    uid = row[1].id
    user = row[1].id_user
    subject = row[1].id_subject
    relation = row[1].id_relation
    id_type = row[1].id_type
Example #12
results = "%s/results" % base  # any kind of tsv/result file
decode_folder = "%s/decode" % base

if not os.path.exists(decode_folder):
    os.mkdir(decode_folder)

# Images by Concepts data frame
labels_tsv = "%s/images_contrasts_df.tsv" % results
images = pandas.read_csv(labels_tsv, sep="\t", index_col=0)
output_folder = "%s/classification_final" % results

# Get standard mask, 4mm
standard_mask = get_standard_mask(4)

# Get all cognitive atlas concepts
all_concepts = get_concept().json
concepts = dict()
for concept in all_concepts:
    concepts[concept["id"]] = str(concept["name"])

# You will need to copy abstracts.txt into this folder from the repo
abstracts = pandas.read_csv("%s/abstracts.txt" % decode_folder,
                            sep="\t",
                            index_col=0,
                            header=None)
abstracts.columns = ["text"]


# Functions to parse text
def remove_nonenglish_chars(text):
    return re.sub("[^a-zA-Z]", " ", text)
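A quick hedged check of the helper above (assuming the surrounding script's imports):

print(remove_nonenglish_chars("working-memory (2-back)"))  # non-letters become spaces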
results = "%s/results" %base  # any kind of tsv/result file
decode_folder = "%s/decode" %base

if not os.path.exists(decode_folder):
    os.mkdir(decode_folder)

# Images by Concepts data frame
labels_tsv = "%s/images_contrasts_df.tsv" %results
images = pandas.read_csv(labels_tsv,sep="\t",index_col=0)
output_folder = "%s/classification_final" %results

# Get standard mask, 4mm
standard_mask=get_standard_mask(4)

# Get all cognitive atlas concepts
all_concepts = get_concept().json
concepts = dict()
for concept in all_concepts:
    concepts[concept["id"]] = str(concept["name"])

# You will need to copy abstracts.txt into this folder from the repo
abstracts = pandas.read_csv("%s/abstracts.txt" %decode_folder,sep="\t",index_col=0,header=None)
abstracts.columns = ["text"]

# Functions to parse text
def remove_nonenglish_chars(text):
    return re.sub("[^a-zA-Z]", " ", text)
    
def text2sentences(text,remove_non_english_chars=True):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')    
    if remove_non_english_chars:
       rsa_df.loc[task,"RSA"] = rsa(df1,df2)

# Now let's calculate for contrasts (note - we have no images tagged with same contrast)
contrasts = images["cognitive_contrast_cogatlas_id"].unique().tolist()
for contrast in contrasts:
    # filter by the current contrast (the original re-used `task` from an earlier loop, a copy-paste bug)
    contrast_images = images.image_id[images.cognitive_contrast_cogatlas_id == contrast].tolist()
    if len(contrast_images) > 2:
        df1 = spatial.loc[spatial.index.isin(contrast_images), spatial.columns.isin(contrast_images)]
        df2 = graph.loc[graph.index.isin(contrast_images), graph.columns.isin(contrast_images)]
        rsa_df.loc[contrast, "RSA"] = rsa(df1, df2)

# However - we can look at RSA for concept images! Let's make a df of contrast_id by images
from cognitiveatlas.api import get_concept
concepts = []
for contrast in contrasts:
    tmp = get_concept(contrast_id=contrast).json
    concepts = concepts + [t["id"] for t in tmp if "id" in t]

concepts = numpy.unique(concepts).tolist()
contrast_df = pandas.DataFrame(0,index=contrasts,columns=concepts)

# Now fill in the data frame
for contrast in contrasts:
    tmp = get_concept(contrast_id=contrast).json
    contrast_concepts = [t["id"] for t in tmp if "id" in t]
    contrast_df.loc[contrast,contrast_concepts] = 1

# Save if we want it later
contrast_df.to_csv("data/contrast_by_concept_binary_df.tsv",sep="\t")
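A small hedged sketch of querying the binary matrix just saved; the concept id is one that appears elsewhere on this page, and any column of contrast_df would do:

tagged = contrast_df.index[contrast_df["trm_4a3fd79d0a33b"] == 1].tolist()
print(tagged)  # contrasts annotated with this concept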

# Now let's perform RSA by concept
#class Concept(models.NodeModel):
#    name = models.StringProperty()
#    uid = models.StringProperty(indexed=True)
#    definition = models.StringProperty()
#    related_to = models.Relationship('self',rel_type='RELATEDTO')
#    part_of = models.Relationship('self',rel_type='PARTOF')
#    is_a = models.Relationship('self',rel_type='ISA')
#    measured_by = models.Relationship(Contrast,rel_type='MEASUREDBY')
#    mentioned_in = models.Relationship('PMID',rel_type='MENTIONEDIN')

for row in concepts.iterrows():
    uid = row[1].url.split("/")[-1]
    name = row[1].term
    try:
        concept = get_concept(id=uid, silent=True).json
        definition = concept[0]["definition_text"]
    except Exception:
        definition = ""
    properties = {"definition": definition}
    node = make_node("concept", uid, name, properties)

# Assertions!
# We will store the old uid as a property, in case we need to map back to original data

for row in assertions.iterrows():
    uid = row[1].id
    user = row[1].id_user
    subject = row[1].id_subject
    relation = row[1].id_relation
    id_type = row[1].id_type
import os
import json
import pandas
from cogpheno.apps.assessments.models import CognitiveAtlasTask, CognitiveAtlasConcept
    
from cognitiveatlas.api import get_task, get_concept
tasks = get_task()
concepts = get_concept()

for t in range(0, len(tasks.json)):
    task = tasks.json[t]
    print("%s of %s" % (t, len(tasks.json)))
    task, _ = CognitiveAtlasTask.objects.update_or_create(cog_atlas_id=task["id"], defaults={"name": task["name"]})
    task.save()

# Or just update those not in
#termid_present = [ct.cog_atlas_id for ct in CognitiveAtlasTask.objects.all()]
#termid = [tasks.json[x]["id"] for x in range(0,len(tasks.json))]
#termid_missing = [x for x in range(0,len(termid)) if termid[x] not in termid_present]
#for m in termid_missing:
#    task = tasks.json[m]
#    task, _ = CognitiveAtlasTask.objects.update_or_create(cog_atlas_id=task["id"], defaults={"name":task["name"]})
#    task.save()

for c in range(0, len(concepts.json)):
    concept = concepts.json[c]
    print("%s of %s" % (c, len(concepts.json)))
    # keep definition in defaults so it is updated rather than used as a lookup key
    concept, _ = CognitiveAtlasConcept.objects.update_or_create(
        cog_atlas_id=concept["id"],
        defaults={"name": concept["name"], "definition": concept["definition_text"]})
    concept.save()
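Once synced, the records can be queried with the usual Django ORM; a hedged sketch using the model fields populated above:

anxiety = CognitiveAtlasConcept.objects.get(name="anxiety")
print(anxiety.cog_atlas_id, anxiety.definition)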

     meta_single["images"] = neurovault_row["thumbnail"].tolist()
     meta_single["task"] = neurovault_row["cognitive_paradigm_cogatlas"].tolist()[0]
     meta_single["contrast"] = neurovault_row["cognitive_contrast_cogatlas"].tolist()[0]
     meta_single["download"] = neurovault_row["file"].tolist()[0]
     meta_single["concept"] = concepts
     if neurovault_row["description"].tolist()[0]:
         meta_single["description"] =  str(neurovault_row["description"].tolist()[0]).encode("utf-8")
     else:
         meta_single["description"] = ""
     if len(meta_single["description"]) > 600:
         meta_single["description"] = "%s..." % meta_single["description"][0:600]
 else: # A concept node
     if node != "1":
         relationship_table_row = relationship_table[relationship_table.id==node]
         contrast_name = relationship_table_row.name.tolist()[0]
         concept = get_concept(id=node).json
         # Reverse inference scores - all images
         if node in scores_df.node.unique().tolist(): # a node with images below it
             meta_single["scores"] = scores_df[scores_df.node == node].to_json(orient="records")
             image_ids = scores_df[scores_df.node == node].image_id.unique().tolist()
             meta_single["images"] = images["thumbnail"][images.image_id.isin(image_ids)].tolist()
         # Cognitive Atlas meta data
         meta_single["url"] = "http://www.cognitiveatlas.org/term/id/%s" %node
         meta_single["type"] = "concept"
         meta_single["thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png"
         meta_single["concept"] = [relationship_table.name[relationship_table.id==node].tolist()[0]]
         meta_single["task"] = ""
         meta_single["contrast"] = []
         meta_single["category"] = ""
         meta_single["download"] = "http://www.cognitiveatlas.org/rdf/id/%s" %node
         if concept[0]["definition_text"]:
Example #18
def main():

    tasks = get_task()

    output_folder = os.path.abspath("data")

    print "Generating Cognitive Atlas Data..."

    # Cognitive Atlas tasks we are interested in from NeuroVault tags
    task_uids = [
        'trm_553e77e53497d', 'trm_553ebfc390256', 'trm_553e88a66b676',
        'trm_553fd2fc7a648', 'trm_4ebd482eba5b1', 'trm_4ebc98cc77e7b',
        'trm_4ebc728326a13', 'trm_4ebc6a6b75ebf', 'trm_4ebc9d2e397f2',
        'trm_553fce5d21da7', 'trm_553fcbbe974ba', 'trm_4da890594742a',
        'trm_4d559bcd67c18', 'trm_4cacee4a1d875', 'trm_4c898c0786246',
        'trm_4ebd47b8bab6b', 'tsk_4a57abb949a4f', 'trm_4f2456027809f',
        'trm_553e73e29cf7d', 'trm_4c8a834779883', 'trm_4cacf22a22d80',
        'trm_4e8dd3831f0cc', 'trm_53c4465b0466f', 'trm_553fbbf79ebc5',
        'trm_5542841f3dcd5', 'trm_5346938eed092', 'trm_534692ef3b5df',
        'trm_534690b0e9dc5', 'trm_5346927710e88', 'trm_4f244ad7dcde7',
        'trm_551b1460e89a3', 'trm_553e6b8e33da4', 'trm_553e85265f51e',
        'tsk_4a57abb949bf6', 'trm_4f24179122380', 'tsk_4a57abb949e1a',
        'trm_4cacf3fbc503b', 'trm_5181f83b77fa4', 'trm_5181f863d24f4',
        'trm_553eb45e2b709', 'trm_550b5b066d37b', 'trm_550b50095d4a3',
        'trm_550b53d7dd674', 'trm_550b5c1a7f4db', 'trm_550b54a8b30f4',
        'trm_550b557e5f90e', 'trm_550b5a47aa23e', 'trm_553eb28436233',
        'trm_50df0dd9d0b6f', 'trm_553fc858cacc5'
    ]

    # Functions for making nodes
    def make_node(nid, name, color):
        return {"nid": nid, "name": name, "color": color}

    for task in tasks.json:
        if task["name"] != "":
            print "Parsing task %s..." % task["name"]
            task_name = task["name"].replace(" ", "_").replace("/",
                                                               "_").lower()
            if task["id"] in task_uids:
                task_node = make_node(task["id"], task["name"], "#63506d")
                single_task = get_task(id=task["id"]).json[0]
                # We only want to see contrasts with associated concepts
                task_contrasts = single_task["contrasts"]
                task_concepts = []
                for contrast in task_contrasts:
                    try:
                        contrast_node = make_node(contrast["id"],
                                                  contrast["contrast_text"],
                                                  "#d89013")
                        contrast_concepts = get_concept(
                            contrast_id=contrast["id"])
                        children = []
                        current_names = []
                        for concept in contrast_concepts.json:
                            if concept["name"] not in current_names:
                                children.append(
                                    make_node(concept["id"], concept["name"],
                                              "#3c7263"))
                                current_names.append(concept["name"])
                        contrast_node["children"] = children
                        # Only append contrast if it has children
                        if len(children) > 0:
                            task_concepts.append(contrast_node)
                    except Exception:
                        pass
                task_node["children"] = task_concepts
                # Save to file if we have children
                if len(task_concepts) > 0:
                    filey = open('%s/%s.json' % (output_folder, task_name),
                                 'w')
                    filey.write(
                        json.dumps(task_node,
                                   sort_keys=True,
                                   indent=4,
                                   separators=(',', ': ')))
                    filey.close()
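A hedged sketch of consuming the files this script writes to data/; the file name is illustrative, following the task_name pattern in the code above:

import json

with open("data/stop_signal_task.json") as fp:
    tree = json.load(fp)
print(tree["name"], len(tree["children"]))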
Example #19
def concept_node_triples(image_dict=None,
                         output_file="concept_node_triples.tsv",
                         delim="\t",
                         save_to_file=True,
                         lookup_key_type="contrast"):
    '''concept_node_triples
    Export a list of nodes, in triples
    :param image_dict [OPTIONAL]: dict
        a dictionary of {term_id: [image_files]} pairs, e.g.
    ..note::
         {"cnt_4decfedb91973":["image1.nii.gz","image2.nii.gz"]}
        This means the images in the list will be assigned to all concept nodes associated with the term specified. This allows for inference over the tree (for example, some relationship with concept nodes that are parents of assigned nodes). Specifying an image dictionary will append the images as the base nodes of the tree; no image dictionary means that the base nodes will be the lowest-level concepts. You must specify the term type as "contrast" or "task" (see lookup_key_type).
    :param output_file: path to the output file
    :param delim: str
        delimiter for the output file; default is tab.
    :param save_to_file: boolean; False will return a pandas data frame instead
    :param lookup_key_type: the term type used as a key in the image_dict, either "task" or "contrast" (default is "contrast")
    ..note::
         Output looks like

        id    parent  name
        1 none BASE                   # there is always a base node
        trm_12345 1   MEMORY                  # high level concept groups
        trm_23456 1   PERCEPTION
        trm_34567 trm_12345   WORKING MEMORY          # concepts
        trm_56789 trm_12345   LONG TERM MEMORY
        trm_67890 trm_34567   image1.nii.gz           # associated images (discovered by way of contrasts)
        trm_78901 trm_34567   image2.nii.gz

    '''
    concepts = filter_concepts()
    if save_to_file:
        filey = init_output_file(output_file, delim=delim)
    df = pandas.DataFrame(columns=["id", "parent", "name"])
    df.loc[0] = ["1", "None", "BASE"]

    # Generate a unique id for each concept
    concept_lookup = dict()
    for c in range(0, len(concepts)):
        concept_lookup[concepts[c]["id"]] = c + 2
    count = 1

    # Generate tree for main concepts
    for concept in concepts:
        parents = []
        if "relationships" in concept:
            for relation in concept["relationships"]:
                if relation["direction"] == "parent":
                    # We can only use "kind of" otherwise we get circular reference
                    if relation["relationship"] == "kind of":
                        if relation["id"] in concept_lookup:
                            parents.append(relation["id"])
        if not parents:
            # make_node(node_id,name,parent,delim,file_obj):
            if save_to_file:
                make_node(concept["id"], concept["name"], "1", delim, filey)
            df.loc[count] = [concept["id"], "1", concept["name"]]
            count += 1
        else:
            for parent in parents:
                # make_node(node_id,name,parent,delim,file_obj):
                if save_to_file:
                    make_node(concept["id"], concept["name"], parent, delim,
                              filey)
                df.loc[count] = [concept["id"], parent, concept["name"]]
                count += 1

    # Now add an entry for each image / contrast, may be multiple for each image
    if image_dict:
        node_id = max(concept_lookup.values()) + 1
        for conid, image_paths in image_dict.items():
            if lookup_key_type == "contrast":
                concepts_single = get_concept(contrast_id=conid).json
                key_id = "id"
            else:
                concepts_single = get_task(id=conid).json[0]
                if "concepts" in list(concepts_single.keys()):
                    concepts_single = concepts_single["concepts"]
                else:
                    concepts_single = None
                key_id = "concept_id"

            if concepts_single is not None:
                for con in concepts_single:  # The concept is the parent of the image
                    if con:
                        for image_path in image_paths:
                            # make_node(node_id,name,parent,delim,file_obj):
                            if save_to_file:
                                make_node("node_%s" % node_id, image_path,
                                          con[key_id], delim, filey)
                            df.loc[count] = [
                                "node_%s" % node_id, con[key_id], image_path
                            ]
                            node_id += 1
                            count += 1
    if save_to_file:
        filey.close()
        print("%s has been created." % output_file)
    return df
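A hedged usage sketch of concept_node_triples; the contrast id and file names are the illustrative ones from the docstring:

image_dict = {"cnt_4decfedb91973": ["image1.nii.gz", "image2.nii.gz"]}
df = concept_node_triples(image_dict=image_dict, save_to_file=False)
print(df.head())  # id / parent / name triples, images at the leaves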
def concept_node_triples(image_dict=None,output_file="concept_node_triples.tsv",
                         delim="\t",save_to_file=True,lookup_key_type="contrast"):
    '''concept_node_triples
    Export a list of nodes, in triples
    :param image_dict [OPTIONAL]: dict
        a dictionary of {term_id: [image_files]} pairs, e.g.
    ..note::
         {"cnt_4decfedb91973":["image1.nii.gz","image2.nii.gz"]}
        This means the images in the list will be assigned to all concept nodes associated with the term specified. This allows for inference over the tree (for example, some relationship with concept nodes that are parents of assigned nodes). Specifying an image dictionary will append the images as the base nodes of the tree; no image dictionary means that the base nodes will be the lowest-level concepts. You must specify the term type as "contrast" or "task" (see lookup_key_type).
    :param output_file: path to the output file
    :param delim: str
        delimiter for the output file; default is tab.
    :param save_to_file: boolean; False will return a pandas data frame instead
    :param lookup_key_type: the term type used as a key in the image_dict, either "task" or "contrast" (default is "contrast")
    ..note::
         Output looks like

        id    parent  name
        1 none BASE                   # there is always a base node
        trm_12345 1   MEMORY                  # high level concept groups
        trm_23456 1   PERCEPTION
        trm_34567 trm_12345   WORKING MEMORY          # concepts
        trm_56789 trm_12345   LONG TERM MEMORY
        trm_67890 trm_34567   image1.nii.gz           # associated images (discovered by way of contrasts)
        trm_78901 trm_34567   image2.nii.gz

    '''
    concepts = filter_concepts()
    if save_to_file:
        filey = init_output_file(output_file,delim=delim)
    df = pandas.DataFrame(columns=["id","parent","name"])
    df.loc[0] = ["1","None","BASE"]

    # Generate a unique id for each concept
    concept_lookup = dict()
    for c in range(0,len(concepts)):
        concept_lookup[concepts[c]["id"]] = c+2
    count=1

    # Generate tree for main concepts
    for concept in concepts:
        parents = []
        if "relationships" in concept:
            for relation in concept["relationships"]:
                if relation["direction"] == "parent":
                    # We can only use "kind of" otherwise we get circular reference
                    if relation["relationship"] == "kind of":
                        if relation["id"] in concept_lookup:
                            parents.append(relation["id"])
        if not parents:
            # make_node(node_id,name,parent,delim,file_obj):
            if save_to_file:
                make_node(concept["id"],concept["name"],"1",delim,filey)
            df.loc[count] = [concept["id"],"1",concept["name"]]
            count+=1
        else:
            for parent in parents:    
                # make_node(node_id,name,parent,delim,file_obj):
                if save_to_file:
                    make_node(concept["id"],concept["name"],parent,delim,filey)
                df.loc[count] = [concept["id"],parent,concept["name"]]
                count+=1

    # Now add an entry for each image / contrast, may be multiple for each image
    if image_dict:
        node_id = max(concept_lookup.values()) + 1
        for conid, image_paths in image_dict.items():
            if lookup_key_type == "contrast":
                concepts_single = get_concept(contrast_id=conid).json
                key_id = "id"
            else:
                concepts_single = get_task(id=conid).json[0]
                if "concepts" in list(concepts_single.keys()):
                    concepts_single = concepts_single["concepts"]
                else:
                    concepts_single = None
                key_id = "concept_id"

            if concepts_single is not None:
                for con in concepts_single: # The concept is the parent of the image
                    if con:
                        for image_path in image_paths:
                            # make_node(node_id,name,parent,delim,file_obj):
                            if save_to_file:
                                make_node("node_%s" %node_id,image_path,con[key_id],delim,filey)
                            df.loc[count] = ["node_%s" %node_id,con[key_id],image_path]
                            node_id +=1
                            count+=1
    if save_to_file:
        filey.close()
        print("%s has been created." % output_file) 
    return df
images = pandas.read_csv("%s/contrast_defined_images_filtered.tsv" %results,sep="\t")

unique_concepts = dict()
for row in images.iterrows():
    idx = row[1].image_id
    # Bug with getting contrasts for images:
    if idx == 109:
        unique_concepts[idx] = ["trm_567982752ff4a","trm_4a3fd79d0afcf","trm_5534111a8bc96",
                                "trm_557b48a224b95","trm_557b4a81a4a17","trm_4a3fd79d0b64e","trm_4a3fd79d0a33b",
                                "trm_557b4a7315f1b","trm_4a3fd79d0af71","trm_557b4b56de455","trm_557b4add1837e"]
    elif idx == 118:
        unique_concepts[idx] = ["trm_4a3fd79d0b642","trm_4a3fd79d0a33b","trm_557b4a7315f1b","trm_4a3fd79d0af71",
                                "trm_557b4b56de455"]
    else:
        contrast = row[1].cognitive_contrast_cogatlas_id
        concepts = get_concept(contrast_id=contrast)
        concepts = numpy.unique(concepts.pandas.id).tolist() 
        unique_concepts[idx] = concepts
    
all_concepts = []
for image_id, concepts in unique_concepts.items():  # .iteritems() is Python 2 only
    for concept in concepts:
        if concept not in all_concepts:
            all_concepts.append(concept)


res = {"all_concepts":all_concepts,"unique_concepts":unique_concepts,"images":images}

## STEP 1: GENERATE IMAGE BY CONCEPT DATA FRAME
concept_df = pandas.DataFrame(0,columns=all_concepts,index=images.image_id.unique().tolist())
for image_id, concepts in unique_concepts.items():
if not os.path.exists(decode_folder):
    os.mkdir(decode_folder)

# Images by Concepts data frame
labels_tsv = "%s/images_contrasts_df.tsv" %results
images = pandas.read_csv(labels_tsv,sep="\t",index_col=0)
output_folder = "%s/classification_final" %results

# Get standard mask, 4mm
standard_mask=get_standard_mask(4)

# Load the regression params data frame
result = pickle.load(open("%s/regression_params_dfs.pkl" %output_folder,"rb"))

all_concepts = get_concept().json
concepts = dict()
for concept in all_concepts:
    concepts[concept["id"]] = str(concept["name"])

# You will need to copy abstracts.txt into this folder from the repo
abstracts = pandas.read_csv("%s/abstracts.txt" %decode_folder,sep="\t",index_col=0,header=None)
abstracts.columns = ["text"]

# Functions to parse text
def remove_nonenglish_chars(text):
    return re.sub("[^a-zA-Z]", " ", text)
    
def text2sentences(text,remove_non_english_chars=True):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')    
    if remove_non_english_chars:
Example #23
def main():

    base = "data/"

    # Make a folder for mean images
    if not os.path.exists("mr"):
        os.mkdir("mr")

    # Get Neurovault Images with defined cognitive atlas contrast
    collections = get_collections()

    # Filter images to those that have a DOI
    collections = collections[collections.DOI.isnull() == False]

    # Get image meta data for collections (N=1023)
    images = get_images(collection_pks=collections.collection_id.tolist())

    # Filter images to those with contrasts defined (N=98)
    images = images[images.cognitive_contrast_cogatlas_id.isnull() == False]

    # Get rid of any not in MNI
    images = images[images.not_mni == False]

    # Get rid of thresholded images
    images = images[images.is_thresholded == False]

    ### Step 1: Load meta data sources
    unique_contrasts = images.cognitive_contrast_cogatlas_id.unique().tolist()

    # Images that do not match the correct identifier will not be used (eg, "Other")
    expression = re.compile("cnt_*")
    unique_contrasts = [u for u in unique_contrasts if expression.match(u)]

    # Make sure exists in cognitive atlas
    existing_contrasts = []
    for u in unique_contrasts:
        try:
            tmp = get_concept(contrast_id=u, silent=True)
            existing_contrasts.append(u)
        except Exception:
            print("%s is defined in NeuroVault, does not exist in Cognitive Atlas" % u)

    image_lookup = dict()
    for u in existing_contrasts:
        image_lookup[u] = images.image_id[images.cognitive_contrast_cogatlas_id
                                          == u].tolist()

    # Create a data structure of tasks and contrasts for our analysis
    relationship_table = concept_node_triples(image_dict=image_lookup,
                                              save_to_file=False)

    unique_nodes = relationship_table.id.unique().tolist()

    # We will store a data frame of meta data
    # Lookup for meta_data is the id of the node!
    meta_data = {}

    for node in unique_nodes:
        meta_single = {}
        # This is an image node
        if re.search("node_", node):
            print "Found image node!"
            relationship_table_row = relationship_table[relationship_table.id
                                                        == node]
            image_id = relationship_table_row.name.tolist()[0]
            meta_single["category"] = ""
            meta_single["type"] = "nii"
            # NeuroVault metadata
            concepts = relationship_table.parent[relationship_table.name ==
                                                 image_id]
            concepts = [
                relationship_table.name[relationship_table.id == c].tolist()[0]
                for c in concepts
            ]
            neurovault_row = images[images.image_id == int(image_id)]
            collection_row = collections[collections.collection_id ==
                                         neurovault_row.collection_id.tolist()
                                         [0]]
            collection_meta = {
                "DOI":
                collection_row["DOI"].tolist()[0],
                "authors":
                collection_row["authors"].tolist()[0],
                "journal":
                collection_row["journal_name"].tolist()[0],
                "url":
                collection_row["url"].tolist()[0],
                "subjects":
                collection_row["number_of_subjects"].tolist()[0],
                "smoothing_fwhm":
                str(collection_row["smoothing_fwhm"].tolist()[0]).encode(
                    "utf-8")
            }
            meta_single["url"] = neurovault_row["url"].tolist()[0]
            meta_single["thumbnail"] = neurovault_row["thumbnail"].tolist()[0]
            meta_single["images"] = neurovault_row["thumbnail"].tolist()
            meta_single["task"] = neurovault_row[
                "cognitive_paradigm_cogatlas"].tolist()[0]
            meta_single["contrast"] = neurovault_row[
                "cognitive_contrast_cogatlas"].tolist()[0]
            meta_single["download"] = neurovault_row["file"].tolist()[0]
            meta_single["concept"] = concepts
            if neurovault_row["description"].tolist()[0]:
                meta_single["description"] = str(
                    neurovault_row["description"].tolist()[0]).encode("utf-8")
            else:
                meta_single["description"] = ""
            if len(meta_single["description"]) > 600:
                meta_single["description"] = "%s..." % meta_single[
                    "description"][0:600]
        else:  # A concept node
            if node != "1":
                relationship_table_row = relationship_table[
                    relationship_table.id == node]
                concept = get_concept(id=node, silent=True).json
                children_nodes = [
                    relationship_table.name.tolist()[x]
                    for x in range(relationship_table.shape[0])
                    if relationship_table.parent.tolist()[x] == node
                ]
                while len(
                    [x for x in children_nodes if not isinstance(x, int)]) > 0:
                    new_parent_nodes = [
                        x for x in children_nodes if not isinstance(x, int)
                    ]
                    children_nodes = [
                        x for x in children_nodes if x not in new_parent_nodes
                    ]
                    for new_parent in new_parent_nodes:
                        node_name = relationship_table.id[
                            relationship_table.name == new_parent].tolist()[0]
                        children_nodes = children_nodes + [
                            relationship_table.name.tolist()[x]
                            for x in range(relationship_table.shape[0]) if
                            relationship_table.parent.tolist()[x] == node_name
                        ]
                # Now only keep children that are images
                meta_single["images"] = images["thumbnail"][
                    images.image_id.isin(children_nodes)].tolist()
                # Cognitive Atlas meta data
                meta_single[
                    "url"] = "http://www.cognitiveatlas.org/term/id/%s" % node
                meta_single["type"] = "concept"
                meta_single[
                    "thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png"
                meta_single["concept"] = [
                    relationship_table.name[relationship_table.id ==
                                            node].tolist()[0]
                ]
                meta_single["task"] = ""
                meta_single["contrast"] = []
                meta_single[
                    "download"] = "http://www.cognitiveatlas.org/rdf/id/%s" % node
                if concept[0]["definition_text"]:
                    meta_single["description"] = concept[0][
                        "definition_text"].encode("utf-8")
                else:
                    meta_single["description"] = ""
                if len(meta_single["description"]) > 600:
                    meta_single["description"] = "%s..." % meta_single[
                        "description"][0:600]
        meta_data[node] = meta_single

    ## STEP 2: VISUALIZATION WITH PYBRAINCOMPARE
    from pybraincompare.ontology.tree import named_ontology_tree_from_tsv, make_ontology_tree_d3

    # First let's look at the tree structure
    # output_json = "%s/task_contrast_tree.json" % outfolder
    tree = named_ontology_tree_from_tsv(relationship_table,
                                        output_json=None,
                                        meta_data=meta_data)
    html_snippet = make_ontology_tree_d3(tree)
    web_folder = base
    make_analysis_web_folder(html_snippet, web_folder)

    # To get a dump of just the tree (for use in more advanced custom web interface)
    filey = open('%s/reverseinference.json' % base, 'w')  # text mode for json
    filey.write(
        json.dumps(tree, sort_keys=True, indent=4, separators=(',', ': ')))
    filey.close()

    ## STEP 3: Export individual nodes

    ### Images
    unique_images = images.image_id.unique().tolist()

    # Images
    for s in range(0, len(unique_images)):
        image_id = unique_images[s]
        meta_data = {}
        meta_data["image_id"] = image_id
        print "Parsing data for images %s of %s" % (s, len(unique_images))
        concepts = relationship_table.parent[relationship_table.name == str(
            image_id)].tolist()
        concepts = [
            relationship_table.name[relationship_table.id == c].tolist()[0]
            for c in concepts
        ]
        concepts_ids = [
            relationship_table.id[relationship_table.id == c].tolist()[0]
            for c in concepts
        ]
        neurovault_row = images[images.image_id == int(image_id)]
        collection_row = collections[collections.collection_id ==
                                     neurovault_row.collection_id.tolist()[0]]
        collection_meta = {
            "DOI":
            collection_row["DOI"].tolist()[0],
            "authors":
            collection_row["authors"].tolist()[0],
            "journal":
            collection_row["journal_name"].tolist()[0],
            "url":
            collection_row["url"].tolist()[0],
            "subjects":
            collection_row["number_of_subjects"].tolist()[0],
            "smoothing_fwhm":
            str(collection_row["smoothing_fwhm"].tolist()[0]).encode("utf-8"),
            "title":
            collection_row["name"].tolist()[0]
        }
        meta_data["collection"] = collection_meta
        meta_data["url"] = neurovault_row["url"].tolist()[0]
        meta_data["thumbnail"] = neurovault_row["thumbnail"].tolist()[0]
        meta_data["images"] = neurovault_row["thumbnail"].tolist()
        meta_data["task"] = neurovault_row[
            "cognitive_paradigm_cogatlas"].tolist()[0]
        meta_data["contrast"] = neurovault_row[
            "cognitive_contrast_cogatlas"].tolist()[0]
        meta_data["download"] = neurovault_row["file"].tolist()[0]
        meta_data["concept"] = concepts
        meta_data["concept_id"] = concepts_ids
        if neurovault_row["description"].tolist()[0]:
            try:
                description = str(
                    neurovault_row["description"].tolist()[0]).encode("utf-8")
            except Exception:
                description = ""
            if description != "nan":
                meta_data["description"] = description
            else:
                meta_data["description"] = ""
        else:
            meta_data["description"] = ""
        if len(meta_data["description"]) > 600:
            meta_data[
                "description"] = "%s..." % meta_data["description"][0:600]
        output_file = "%s/ri_%s.json" % (base, meta_data["image_id"])
        filey = open(output_file, 'w')
        filey.write(
            json.dumps(meta_data,
                       sort_keys=True,
                       indent=4,
                       separators=(',', ': ')))
        filey.close()

    ### Concepts
    for node in unique_nodes:
        # This is a concept node
        if not re.search("node_", node):
            if node != "1":
                relationship_table_row = relationship_table[
                    relationship_table.id == node]
                concept = get_concept(id=node).json
                meta_single = {}
                children_nodes = [
                    relationship_table.name.tolist()[x]
                    for x in range(relationship_table.shape[0])
                    if relationship_table.parent.tolist()[x] == node
                ]
                while len(
                    [x for x in children_nodes if not isinstance(x, int)]) > 0:
                    new_parent_nodes = [
                        x for x in children_nodes if not isinstance(x, int)
                    ]
                    children_nodes = [
                        x for x in children_nodes if x not in new_parent_nodes
                    ]
                    for new_parent in new_parent_nodes:
                        node_name = relationship_table.id[
                            relationship_table.name == new_parent].tolist()[0]
                        children_nodes = children_nodes + [
                            relationship_table.name.tolist()[x]
                            for x in range(relationship_table.shape[0]) if
                            relationship_table.parent.tolist()[x] == node_name
                        ]
                # Now only keep children that are images
                meta_single["images"] = images["thumbnail"][
                    images.image_id.isin(children_nodes)].tolist()
                meta_single["image_list"] = children_nodes
                # Cognitive Atlas meta data
                meta_single[
                    "url"] = "http://www.cognitiveatlas.org/term/id/%s" % node
                meta_single["type"] = "concept"
                meta_single[
                    "thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png"
                meta_single["concept"] = [
                    relationship_table.name[relationship_table.id ==
                                            node].tolist()[0]
                ]
                meta_single["task"] = ""
                meta_single["contrast"] = []
                meta_single[
                    "download"] = "http://www.cognitiveatlas.org/rdf/id/%s" % node
                if concept[0]["definition_text"]:
                    meta_single["description"] = concept[0][
                        "definition_text"].encode("utf-8")
                else:
                    meta_single["description"] = ""
                if len(meta_single["description"]) > 600:
                    meta_single["description"] = "%s..." % meta_single[
                        "description"][0:600]
                output_file = "%s/ri_%s.json" % (base, node)
                filey = open(output_file, 'w')
                filey.write(
                    json.dumps(meta_single,
                               sort_keys=True,
                               indent=4,
                               separators=(',', ': ')))
                filey.close()
        concept_acc.loc[concept,"aprime"] = aprime
        concept_acc.loc[concept,"miss"] = calculate_hits(Ya,Yp,1,0)
        concept_acc.loc[concept,"correct_rejection"] = calculate_hits(Ya,Yp,0,0)
    return concept_acc 

base_acc = get_concept_acc(predictions)
forward_acc = get_concept_acc(predictions_forward)

# Just compare the two for now
diff_acc = forward_acc - base_acc

# Add the concept names to each
from cognitiveatlas.api import get_concept
concept_names = []
for concept in diff_acc.index:
    concept_names.append(get_concept(id=concept).json[0]["name"])

base_acc["name"] = concept_names
forward_acc["name"] = concept_names
diff_acc["name"] = concept_names

# Add the number of images
number_images = []
for concept in diff_acc.index:
    number_images.append(Ymat.loc[:,concept].sum())

base_acc["number_images"] = number_images
forward_acc["number_images"] = number_images
diff_acc["number_images"] = number_images

diff_acc = diff_acc.sort(columns=["hit"],ascending=False)
                                  index_col=0)
images = pandas.read_csv("data/contrast_defined_images_filtered.tsv",
                         sep="\t",
                         index_col=0)

# source will be in rows, target in columns
concept_kindof = pandas.DataFrame()
concept_partof = pandas.DataFrame()
concepts = contrast_lookup.columns.tolist()
seen = []

while len(concepts) > 0:
    concept = concepts.pop(0)
    seen.append(concept)
    try:
        tmp = get_concept(id=concept).json[0]
        if 'relationships' in tmp:
            for relation in tmp["relationships"]:
                if relation['id'] not in seen and relation[
                        'id'] not in concepts:
                    print "Adding concept %s" % (relation['id'])
                    concepts.append(relation['id'])
                if relation['direction'] == "parent":
                    if relation['relationship'] == 'kind of':
                        concept_kindof.loc[tmp['id'], relation['id']] = 1
                    elif relation['relationship'] == 'part of':
                        concept_partof.loc[tmp['id'], relation['id']] = 1
                elif relation['direction'] == "child":
                    if relation['relationship'] == 'kind of':
                        concept_kindof.loc[relation['id'], tmp['id']] = 1
                    elif relation['relationship'] == 'part of':
Example #26
def main(argv):
    parser = ArgumentParser(
        description=
        'This program will find all *.jsonld files in the list of input '
        'directories and compute the frequency of use of isAbout concepts. '
        'The frequency table will be exported as a markdown table for use in '
        'web documents or GitHub README markdown files.')

    parser.add_argument('-jsonld',
                        dest='jsonld',
                        nargs='+',
                        default=[],
                        required=True,
                        help="space separated list"
                        "of directories to evaluate for jsonld files.")
    parser.add_argument('-outfile',
                        dest='outfile',
                        required=True,
                        help="Output file for markdown table, full path")
    parser.add_argument(
        '-jsonld_output_dir',
        dest='jsonld_output_dir',
        required=True,
        help="This is a directory"
        "where we'll store the concept json-ld files using the NIDM-Terms properties"
    )

    args = parser.parse_args()

    isAbout_terms = {}
    total_concept_count = 0

    # download context file for json-ld files of concepts used
    # try to open the url and get the pointed to file
    try:
        # open url and get file
        opener = url.urlopen(CONTEXT)
        # write temporary file to disk and use for stats
        temp = tempfile.NamedTemporaryFile(delete=False)
        temp.write(opener.read())
        temp.close()
        context_file = temp.name
    except Exception:
        print("ERROR! Can't open url: %s" % CONTEXT)
        exit()
    # load downloaded context
    with open(context_file) as context_data:
        context = json.load(context_data)

    # Retrieve cognitive atlas concepts and disorders
    cogatlas_concepts = get_concept(silent=True)
    cogatlas_disorders = get_disorder(silent=True)

    # for each input directory
    for direct in args.jsonld:
        # find *.jsonld files
        files = glob2.glob(direct + '/**/*.jsonld', recursive=True)
        # loop through each file and get isAbout terms
        for file in files:
            # read file with json
            # open the file as a dictionary
            print("opening file: %s" % file)
            with open(file) as dct:
                json_tmp = json.load(dct)

            if type(json_tmp['terms']) is dict:
                # for each key (term) in jsonld file, check for isAbout property
                for term in json_tmp['terms'].keys():
                    # expanded = jsonld.expand(json_tmp[term])
                    # for jsonld files with only a single term we have a simple dictionary where the term label isn't
                    # the highest-level key so we handle differently
                    if term == 'isAbout':
                        if isinstance(json_tmp['terms'][term], list):
                            # if not a dictionary then a list of dictionaries
                            for isabout_entry in json_tmp['terms'][term]:

                                # add concept to dictionary
                                total_concept_count = add_to_dict(
                                    id, isabout_entry, isAbout_terms,
                                    total_concept_count)

                                if INTERLEX_URI_FRAG in isabout_entry['@id']:
                                    # for storing concept as json-ld file
                                    concept_jsonld = get_interlex_concept_properties(
                                        isabout_entry['@id'], context)
                                elif (COGATLAS_URI_FRAG
                                      in isabout_entry['@id']) and (
                                          "task" not in isabout_entry['@id']):
                                    concept_jsonld = get_cogatlas_properties(
                                        isabout_entry, context,
                                        cogatlas_concepts, cogatlas_disorders)
                                elif "task" in isabout_entry['@id']:
                                    concept_jsonld = get_cogatlas_task_properties(
                                        isabout_entry, context)

                                else:
                                    continue
                                # write concept jsonld file
                                write_jsonld(concept_jsonld,
                                             args.jsonld_output_dir)

                        # else it's a dictionary with a single isAbout entry
                        else:
                            total_concept_count = add_to_dict(
                                id, json_tmp['terms'][term], isAbout_terms,
                                total_concept_count)
                            if INTERLEX_URI_FRAG in json_tmp['terms'][term]['@id']:
                                # for storing concept as json-ld file
                                concept_jsonld = get_interlex_concept_properties(
                                    json_tmp['terms'][term]['@id'], context)
                            elif COGATLAS_URI_FRAG in json_tmp['terms'][term]['@id'] and \
                                ("task" not in json_tmp['terms'][term]['@id']):
                                concept_jsonld = get_cogatlas_properties(
                                    json_tmp['terms'][term], context,
                                    cogatlas_concepts, cogatlas_disorders)
                            elif "task" in json_tmp['terms'][term]:
                                concept_jsonld = get_cogatlas_task_properties(
                                    json_tmp['terms'][term], context)

                            else:
                                continue
                            # write concept jsonld file
                            write_jsonld(concept_jsonld,
                                         args.jsonld_output_dir)

            elif type(json_tmp['terms']) is list:
                for term in json_tmp['terms']:
                    # expanded = jsonld.expand(json_tmp[term])
                    # for jsonld files with only a single term we have a simple dictionary where the term label isn't
                    # the highest-level key so we handle differently
                    for property in term:
                        if property == 'isAbout':
                            # for each concept in isAbout property
                            if isinstance(term[property], list):
                                for isabout_entry in term[property]:
                                    total_concept_count = add_to_dict(
                                        id, isabout_entry, isAbout_terms,
                                        total_concept_count)
                                    if INTERLEX_URI_FRAG in isabout_entry[
                                            '@id']:
                                        # for storing concept as json-ld file
                                        concept_jsonld = get_interlex_concept_properties(
                                            isabout_entry['@id'], context)
                                    elif COGATLAS_URI_FRAG in isabout_entry['@id'] and \
                                            ("task" not in isabout_entry['@id']):
                                        concept_jsonld = get_cogatlas_properties(
                                            isabout_entry, context,
                                            cogatlas_concepts,
                                            cogatlas_disorders)
                                    elif "task" in isabout_entry['@id']:
                                        concept_jsonld = get_cogatlas_task_properties(
                                            isabout_entry['@id'],
                                            isabout_entry['label'], context)

                                    else:
                                        continue
                                    # write concept jsonld file
                                    write_jsonld(concept_jsonld,
                                                 args.jsonld_output_dir)

                            else:
                                total_concept_count = add_to_dict(
                                    id, term[property], isAbout_terms,
                                    total_concept_count)
                                if INTERLEX_URI_FRAG in term[property]['@id']:
                                    # for storing concept as json-ld file
                                    concept_jsonld = get_interlex_concept_properties(
                                        term[property]['@id'], context)
                                elif COGATLAS_URI_FRAG in term[property]['@id'] and \
                                            ("task" not in term[property]['@id']):
                                    concept_jsonld = get_cogatlas_properties(
                                        term[property], context,
                                        cogatlas_concepts, cogatlas_disorders)
                                elif "task" in term[property]['@id']:
                                    concept_jsonld = get_cogatlas_task_properties(
                                        term[property]['@id'],
                                        term[property]['label'], context)
                                else:
                                    continue
                                # write concept jsonld file
                                write_jsonld(concept_jsonld,
                                             args.jsonld_output_dir)

    # open markdown txt file
    md_file = open(args.outfile, "w")
    print('opening output file:', args.outfile)
    # set up header of table
    md_file.write("| concept URL | label | use frequency (%) |\n")
    md_file.write("| ----------- | ----- | ----------------- |\n")

    # now cycle through isAbout_terms dictionary and compute frequencies
    for key in isAbout_terms.keys():
        isAbout_terms[key]['freq'] = (isAbout_terms[key]['count'] /
                                      total_concept_count) * 100.0

    res = OrderedDict(
        sorted(isAbout_terms.items(),
               key=lambda x: getitem(x[1], 'freq'),
               reverse=True))

    # write markdown table sorted
    for key in res.keys():
        # add to markdown table file
        md_file.write("| %s | %s | %f |\n" %
                      (key, res[key]['label'], res[key]['freq']))

    print('Markdown table successfully written to', args.outfile)

    md_file.close()

    # if a single-file jsonld file already exists then add these terms to it, else create a new one
    output_dir = os.path.split(args.jsonld_output_dir)[0]
    if isfile(join(output_dir, "NIDM_Concepts.jsonld")):
        cmd = "python " + join(sys.path[0], "combinebidsjsonld.py") + " -inputDir " + args.jsonld_output_dir + " -outputDir " + \
              join(output_dir, "NIDM_Concepts.jsonld") + " -association \"NIDM\"" + " -jsonld " + \
              join(output_dir, "NIDM_Concepts.jsonld")
    else:
        cmd = "python " + join(sys.path[0], "combinebidsjsonld.py") + " -inputDir " + args.jsonld_output_dir + " -outputDir " + \
              join(output_dir, "NIDM_Concepts.jsonld") + " -association \"NIDM\""

    print(cmd)
    system(cmd)
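
The script above relies on a helper add_to_dict that is not shown. A minimal sketch consistent with its call sites (hypothetical; the actual NIDM-Terms helper may differ) could look like:

# Hypothetical sketch of the add_to_dict helper assumed above: tally one
# isAbout entry keyed by its @id and return the updated running total.
# The first parameter mirrors the call sites, which pass the builtin id.
def add_to_dict(id, isabout_entry, isAbout_terms, total_concept_count):
    term_id = isabout_entry['@id']
    if term_id not in isAbout_terms:
        isAbout_terms[term_id] = {'label': isabout_entry.get('label', ''),
                                  'count': 0}
    isAbout_terms[term_id]['count'] += 1
    return total_concept_count + 1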
Exemple #27
0
#!/usr/bin/python

#Make coordinate plane

import pandas
data_file = "data.tsv"
from cognitiveatlas.api import get_concept

# Retrieve concepts from the cognitive atlas
concepts = get_concept().pandas
concept_names = concepts["name"].tolist()
concept_definition = concepts["definition_text"].tolist()
concept_definition = [d.encode("utf-8") for d in concept_definition]
df=pandas.DataFrame(columns=["X","Y","concept","definition","id"])
nrows=30 #nrows and ncol for ~900 points
x=range(1,31)*30
y=[]
for i in range(1,31):
    y = y + [i]*30

assert len(y) == len(x)
df.X = x[0:810]
df.Y = y[0:810]
df.id = concepts["id"].tolist()
df.concept = concept_names
df.definition = concept_definition
df.to_csv("data.tsv",sep="\t",index=False)
        if predicted == actual:
            concepts_df.loc[actual_concepts,"correct"] = concepts_df.loc[actual_concepts,"correct"] + predicted_count
        else:
            concepts_df.loc[actual_concepts,"incorrect"] = concepts_df.loc[actual_concepts,"incorrect"] + predicted_count

# Add the number of images
for concept_name in labels.columns:
    number_images = labels[concept_name][labels[concept_name]==1].shape[0]
    concepts_df.loc[concept_name,"number_images"] = number_images

concepts_df.to_csv("%s/classification_concept_confusion_cogatid_perform.tsv" %results,sep="\t")

# Replace concept ids with concept names
conceptnames = []
for concept_id in concepts_df.index:
    conceptnames.append(get_concept(id=concept_id).json[0]["name"])

concepts_df.index = conceptnames        
concepts_df.to_csv("%s/classification_concept_confusion_perform.tsv" %results,sep="\t")

# Normalize by the row count (to see what percentage of the time we get it wrong/right)
concepts_df_norm = pandas.DataFrame(columns=["correct","incorrect","number_images"])
for row in concepts_df.iterrows():
   rowsum = row[1][0:2].sum()
   if rowsum != 0:
       norm_values = [float(x)/rowsum for x in row[1].tolist()[0:2]]
       norm_values.append(concepts_df.loc[row[0],"number_images"])
       concepts_df_norm.loc[row[0]] = norm_values

concepts_df_norm.sort(columns=["correct"],ascending=False,inplace=True)
concepts_df_norm.to_csv("%s/classification_concept_confusion_norm_perform.tsv" %results,sep="\t")
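
The row-wise normalization loop above can also be written with vectorized pandas operations; a sketch assuming the same three-column layout:

# Vectorized sketch of the normalization above: divide correct/incorrect
# by their row sums, skipping rows whose counts sum to zero.
rowsums = concepts_df[["correct", "incorrect"]].sum(axis=1)
nonzero = rowsums != 0
concepts_df_norm = concepts_df.loc[nonzero, ["correct", "incorrect"]].div(
    rowsums[nonzero], axis=0)
concepts_df_norm["number_images"] = concepts_df.loc[nonzero, "number_images"]
concepts_df_norm = concepts_df_norm.sort_values(by="correct", ascending=False)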
Exemple #29
0
#!/usr/bin/python

#Make coordinate plane

import pandas

data_file = "data.tsv"
from cognitiveatlas.api import get_concept

# Retrieve concepts from the cognitive atlas
concepts = get_concept().pandas
concept_names = concepts["name"].tolist()
concept_definition = concepts["definition_text"].tolist()
concept_definition = [d.encode("utf-8") for d in concept_definition]
df = pandas.DataFrame(columns=["X", "Y", "concept", "definition", "id"])
nrows = 30  #nrows and ncol for ~900 points
x = range(1, 31) * 30
y = []
for i in range(1, 31):
    y = y + [i] * 30

assert len(y) == len(x)
df.X = x[0:810]
df.Y = y[0:810]
df.id = concepts["id"].tolist()
df.concept = concept_names
df.definition = concept_definition
df.to_csv("data.tsv", sep="\t", index=False)
from cognitiveatlas.api import get_concept, get_task
from py2neo import Graph, Path, Node, Rel, authenticate
import os

# Get concepts, tasks
concepts = get_concept()
concept_ids = concepts.pandas.id.tolist()
concept_names = concepts.pandas.name.tolist()
tasks = get_task()
task_ids = tasks.pandas.id.tolist()
task_names = tasks.pandas.name.tolist()

# get contrasts from tasks
contrast_ids = []
contrast_names = []
contrast_tasks = []
for t in tasks.json:
    task = get_task(id=t["id"])
    contrasts = task.json[0]["contrasts"]
    for contrast in contrasts:
        contrast_tasks.append(t["id"])
        contrast_ids.append(contrast["id"])
        contrast_names.append(contrast["contrast_text"])

# set up authentication parameters
with open('neo4j_pw') as fp:
    pw = fp.readline().strip()
authenticate("localhost:7474", "neo4j", pw)

# connect to authenticated graph database
graph = Graph()
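
The script connects to the database but stops before writing anything; an illustrative continuation using the py2neo v2 objects imported above (a sketch, not part of the original) might be:

# Illustrative sketch only: create one node per concept with the
# py2neo v2 Node/Graph API imported above.
for concept_id, concept_name in zip(concept_ids, concept_names):
    graph.create(Node("concept", id=concept_id, name=concept_name))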
Exemple #31
0
def download_cognitive_atlas(data_dir=None, overwrite=False, verbose=1):
    """Download Cognitive Atlas ontology and extract IDs and relationships.

    .. versionadded:: 0.0.2

    Parameters
    ----------
    data_dir : :obj:`str`, optional
        Location in which to place Cognitive Atlas files.
        Default is None, which uses the package's default path for downloaded
        data.
    overwrite : :obj:`bool`, optional
        Whether to overwrite existing files or not. Default is False.
    verbose : :obj:`int`, optional
        Default is 1.

    Returns
    -------
    out_dict : :obj:`dict`
        Dictionary with two keys: 'ids' and 'relationships'. Each points to a
        csv file. The 'ids' file contains CogAt identifiers, canonical names,
        and aliases, sorted by alias length (number of characters).
        The 'relationships' file contains associations between CogAt items,
        with three columns: input, output, and rel_type (relationship type).
    """
    from cognitiveatlas.api import get_concept, get_disorder, get_task

    dataset_name = "cognitive_atlas"
    data_dir = _get_dataset_dir(dataset_name,
                                data_dir=data_dir,
                                verbose=verbose)

    ids_file = op.join(data_dir, "cogat_aliases.csv")
    rels_file = op.join(data_dir, "cogat_relationships.csv")
    if overwrite or not all([op.isfile(f) for f in [ids_file, rels_file]]):
        concepts = get_concept(silent=True).pandas
        tasks = get_task(silent=True).pandas
        disorders = get_disorder(silent=True).pandas

        # Identifiers and aliases
        long_concepts = _longify(concepts)
        long_tasks = _longify(tasks)

        # Disorders currently lack aliases
        disorders["name"] = disorders["name"].str.lower()
        disorders = disorders.assign(alias=disorders["name"])
        disorders = disorders[["id", "name", "alias"]]

        # Combine into aliases DataFrame
        aliases = pd.concat((long_concepts, long_tasks, disorders), axis=0)
        aliases = _expand_df(aliases)
        aliases = aliases.replace("", np.nan)
        aliases = aliases.dropna(axis=0)
        aliases = aliases.reset_index(drop=True)

        # Relationships
        relationship_list = []
        for i, id_ in enumerate(concepts["id"].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            concept = get_concept(id=id_, silent=True).json
            for rel in concept["relationships"]:
                reltype = _get_concept_reltype(rel["relationship"],
                                               rel["direction"])
                if reltype is not None:
                    row = [id_, rel["id"], reltype]
                    relationship_list.append(row)

        for i, id_ in enumerate(tasks["id"].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            task = get_task(id=id_, silent=True).json
            for rel in task["concepts"]:
                row = [id_, rel["concept_id"], "measures"]
                relationship_list.append(row)
                row = [rel["concept_id"], id_, "measuredBy"]
                relationship_list.append(row)

        for i, id_ in enumerate(disorders["id"].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            disorder = get_disorder(id=id_, silent=True).json
            for rel in disorder["disorders"]:
                if rel["relationship"] == "ISA":
                    rel_type = "isA"
                else:
                    rel_type = rel["relationship"]
                row = [id_, rel["id"], rel_type]
                relationship_list.append(row)

        relationships = pd.DataFrame(columns=["input", "output", "rel_type"],
                                     data=relationship_list)
        ctp_df = concepts[["id", "id_concept_class"]]
        ctp_df = ctp_df.assign(rel_type="inCategory")
        ctp_df.columns = ["input", "output", "rel_type"]
        ctp_df["output"].replace("", np.nan, inplace=True)
        ctp_df.dropna(axis=0, inplace=True)
        relationships = pd.concat((ctp_df, relationships))
        relationships = relationships.reset_index(drop=True)
        aliases.to_csv(ids_file, index=False)
        relationships.to_csv(rels_file, index=False)
    out_dict = {"ids": ids_file, "relationships": rels_file}

    return out_dict
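
A typical call, based on the docstring above (the data_dir path is illustrative, and pd/pandas is assumed to be imported in this module):

# Illustrative usage: the returned dict points at the two CSV files
# described in the docstring.
files = download_cognitive_atlas(data_dir="/tmp/cogat", overwrite=False)
aliases = pd.read_csv(files["ids"])
relationships = pd.read_csv(files["relationships"])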
Exemple #32
0
task_uids = [task["id"] for task in get_task().json]
contrasts = dict() # contrast lookup by task uid

# Now we can retrieve the full data. We are interested in contrasts, so let's save those.
for task in task_uids:
    if task not in contrasts:
        task_complete = get_task(task).json[0]
        # Only save if we have contrasts
        if len(task_complete["contrasts"]) > 0:
            contrasts[task] = task_complete["contrasts"]

# How many tasks have contrasts?
len(contrasts)
# 437

# Step 3: Make a contrast --> concept lookup
concepts = dict()
for task_uid,contrast_set in contrasts.iteritems():
    for contrast in contrast_set:
        contrast_uid = contrast["id"]
        if contrast_uid not in concepts:
            try: # Some calls don't work
                concepts[contrast_uid] = get_concept(contrast_id=contrast_uid).json[0]
            except:
                pass

# How many concepts are asserted to measure different contrasts?
len(concepts)

def get_concepts():
    return get_concept().json
def main():

    base = "data/"

    # Make a folder for mean images
    if not os.path.exists("mr"):
        os.mkdir("mr")

    # Get Neurovault Images with defined cognitive atlas contrast    
    collections = get_collections()

    # Filter images to those that have a DOI
    collections = collections[collections.DOI.notnull()]
    
    # Get image meta data for collections (N=1023)
    images = get_images(collection_pks=collections.collection_id.tolist())

    # Filter images to those with contrasts defined (N=98)
    images = images[images.cognitive_contrast_cogatlas_id.notnull()]

    # Get rid of any not in MNI
    images = images[images.not_mni == False]

    # Get rid of thresholded images
    images = images[images.is_thresholded == False]

    ### Step 1: Load meta data sources 
    unique_contrasts = images.cognitive_contrast_cogatlas_id.unique().tolist()

    # Images that do not match the correct identifier will not be used (eg, "Other")
    expression = re.compile("cnt_*")
    unique_contrasts = [u for u in unique_contrasts if expression.match(u)]

    # Make sure exists in cognitive atlas
    existing_contrasts = []
    for u in unique_contrasts:
        try:
            tmp = get_concept(contrast_id=u, silent=True)
            existing_contrasts.append(u)
        except:
            print "%s is defined in NeuroVault but does not exist in Cognitive Atlas" % u

    image_lookup = dict()
    for u in existing_contrasts:
        image_lookup[u] = images.image_id[images.cognitive_contrast_cogatlas_id==u].tolist()

    # Create a data structure of tasks and contrasts for our analysis
    relationship_table = concept_node_triples(image_dict=image_lookup,save_to_file=False)

    unique_nodes = relationship_table.id.unique().tolist()

    # We will store a data frame of meta data
    # Lookup for meta_data is the id of the node!
    meta_data = {}

    for node in unique_nodes:
        meta_single = {}
        # This is an image node
        if re.search("node_",node):
            print "Found image node!"
            relationship_table_row = relationship_table[relationship_table.id==node]
            image_id = relationship_table_row.name.tolist()[0]
            meta_single["category"] = ""
            meta_single["type"] = "nii"
            # NeuroVault metadata
            concepts = relationship_table.parent[relationship_table.name == image_id]
            concepts = [relationship_table.name[relationship_table.id==c].tolist()[0] for c in concepts]
            neurovault_row = images[images.image_id == int(image_id)]
            collection_row = collections[collections.collection_id == neurovault_row.collection_id.tolist()[0]]
            collection_meta = {"DOI":collection_row["DOI"].tolist()[0],
                               "authors":collection_row["authors"].tolist()[0],
                               "journal":collection_row["journal_name"].tolist()[0],
                               "url":collection_row["url"].tolist()[0],
                               "subjects":collection_row["number_of_subjects"].tolist()[0],
                               "smoothing_fwhm":str(collection_row["smoothing_fwhm"].tolist()[0]).encode("utf-8")}
            meta_single["url"] = neurovault_row["url"].tolist()[0]
            meta_single["thumbnail"] = neurovault_row["thumbnail"].tolist()[0]
            meta_single["images"] = neurovault_row["thumbnail"].tolist()
            meta_single["task"] = neurovault_row["cognitive_paradigm_cogatlas"].tolist()[0]
            meta_single["contrast"] = neurovault_row["cognitive_contrast_cogatlas"].tolist()[0]
            meta_single["download"] = neurovault_row["file"].tolist()[0]
            meta_single["concept"] = concepts
            if neurovault_row["description"].tolist()[0]:
                meta_single["description"] =  str(neurovault_row["description"].tolist()[0]).encode("utf-8")
            else:
                meta_single["description"] = ""
            if len(meta_single["description"]) > 600:
                meta_single["description"] = "%s..." % meta_single["description"][0:600]
        else: # A concept node
            if node != "1":
                relationship_table_row = relationship_table[relationship_table.id==node]
                concept = get_concept(id=node,silent=True).json
                children_nodes = [relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x]==node]
                while len([x for x in children_nodes if not isinstance(x,int)]) > 0:
                    new_parent_nodes = [x for x in children_nodes if not isinstance(x,int)]
                    children_nodes = [x for x in children_nodes if x not in new_parent_nodes]
                    for new_parent in new_parent_nodes:
                        node_name = relationship_table.id[relationship_table.name==new_parent].tolist()[0]
                        children_nodes = children_nodes + [relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x]==node_name]
                # Now only keep children that are images
                meta_single["images"] = images["thumbnail"][images.image_id.isin(children_nodes)].tolist()
                # Cognitive Atlas meta data
                meta_single["url"] = "http://www.cognitiveatlas.org/term/id/%s" %node
                meta_single["type"] = "concept"
                meta_single["thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png"
                meta_single["concept"] = [relationship_table.name[relationship_table.id==node].tolist()[0]]
                meta_single["task"] = ""
                meta_single["contrast"] = []
                meta_single["download"] = "http://www.cognitiveatlas.org/rdf/id/%s" %node
                if concept[0]["definition_text"]:
                    meta_single["description"] = concept[0]["definition_text"].encode("utf-8")
                else:
                    meta_single["description"] = ""
                if len(meta_single["description"]) > 600:
                    meta_single["description"] = "%s..." % meta_single["description"][0:600]
        meta_data[node] = meta_single
    
    
    ## STEP 2: VISUALIZATION WITH PYBRAINCOMPARE
    from pybraincompare.ontology.tree import named_ontology_tree_from_tsv, make_ontology_tree_d3

    # First let's look at the tree structure
    # output_json = "%s/task_contrast_tree.json" % outfolder
    tree = named_ontology_tree_from_tsv(relationship_table,output_json=None,meta_data=meta_data)
    html_snippet = make_ontology_tree_d3(tree)
    web_folder = base
    make_analysis_web_folder(html_snippet,web_folder)

    # To get a dump of just the tree (for use in more advanced custom web interface)
    filey = open('%s/reverseinference.json' %base,'wb')
    filey.write(json.dumps(tree, sort_keys=True,indent=4, separators=(',', ': ')))
    filey.close()

    ## STEP 3: Export individual nodes

    ### Images
    unique_images = images.image_id.unique().tolist()

    # Images
    for s in range(0,len(unique_images)):
        image_id = unique_images[s]
        meta_data = {}
        meta_data["image_id"] = image_id
        print "Parsing data for images %s of %s" %(s,len(unique_images))
        concepts = relationship_table.parent[relationship_table.name == str(image_id)].tolist()
        concepts = [relationship_table.name[relationship_table.id==c].tolist()[0] for c in concepts]
        concepts_ids = [relationship_table.id[relationship_table.name==c].tolist()[0] for c in concepts]
        neurovault_row = images[images.image_id == int(image_id)]            
        collection_row = collections[collections.collection_id == neurovault_row.collection_id.tolist()[0]]
        collection_meta = {"DOI":collection_row["DOI"].tolist()[0],
                           "authors":collection_row["authors"].tolist()[0],
                           "journal":collection_row["journal_name"].tolist()[0],
                           "url":collection_row["url"].tolist()[0],
                           "subjects":collection_row["number_of_subjects"].tolist()[0],
                           "smoothing_fwhm":str(collection_row["smoothing_fwhm"].tolist()[0]).encode("utf-8"),
                           "title":collection_row["name"].tolist()[0]}
        meta_data["collection"] = collection_meta
        meta_data["url"] = neurovault_row["url"].tolist()[0]
        meta_data["thumbnail"] = neurovault_row["thumbnail"].tolist()[0]
        meta_data["images"] = neurovault_row["thumbnail"].tolist()
        meta_data["task"] = neurovault_row["cognitive_paradigm_cogatlas"].tolist()[0]
        meta_data["contrast"] = neurovault_row["cognitive_contrast_cogatlas"].tolist()[0]
        meta_data["download"] = neurovault_row["file"].tolist()[0]
        meta_data["concept"] = concepts
        meta_data["concept_id"] = concepts_ids
        if neurovault_row["description"].tolist()[0]:
            try:
                description = str(neurovault_row["description"].tolist()[0]).encode("utf-8")
            except:
                description = ""
            if description != "nan":
                meta_data["description"] =  description
            else:
                meta_data["description"] = ""
        else:
            meta_data["description"] = ""
        if len(meta_data["description"]) > 600:
            meta_data["description"] = "%s..." % meta_data["description"][0:600]
        output_file = "%s/ri_%s.json" %(base,meta_data["image_id"])
        filey = open(output_file,'wb')
        filey.write(json.dumps(meta_data, sort_keys=True,indent=4, separators=(',', ': ')))
        filey.close()
    

    ### Concepts
    for node in unique_nodes:
        # This is a concept node
        if not re.search("node_",node):
            if node != "1":
                relationship_table_row = relationship_table[relationship_table.id==node]
                concept = get_concept(id=node).json
                meta_single = {}
                children_nodes = [relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x]==node]
                while len([x for x in children_nodes if not isinstance(x,int)]) > 0:
                    new_parent_nodes = [x for x in children_nodes if not isinstance(x,int)]
                    children_nodes = [x for x in children_nodes if x not in new_parent_nodes]
                    for new_parent in new_parent_nodes:
                        node_name = relationship_table.id[relationship_table.name==new_parent].tolist()[0]
                        children_nodes = children_nodes + [relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x]==node_name]
                # Now only keep children that are images
                meta_single["images"] = images["thumbnail"][images.image_id.isin(children_nodes)].tolist()
                meta_single["image_list"] = children_nodes
                # Cognitive Atlas meta data
                meta_single["url"] = "http://www.cognitiveatlas.org/term/id/%s" %node
                meta_single["type"] = "concept"
                meta_single["thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png"
                meta_single["concept"] = [relationship_table.name[relationship_table.id==node].tolist()[0]]
                meta_single["task"] = ""
                meta_single["contrast"] = []
                meta_single["download"] = "http://www.cognitiveatlas.org/rdf/id/%s" %node
                if concept[0]["definition_text"]:
                    meta_single["description"] = concept[0]["definition_text"].encode("utf-8")
                else:
                    meta_single["description"] = ""
                if len(meta_single["description"]) > 600:
                    meta_single["description"] = "%s..." % meta_single["description"][0:600]
                output_file = "%s/ri_%s.json" %(base,node)
                filey = open(output_file,'wb')
                filey.write(json.dumps(meta_single, sort_keys=True,indent=4, separators=(',', ': ')))
                filey.close()
    # Bug with getting contrasts for images:
    if idx == 109:
        unique_concepts[idx] = [
            "trm_567982752ff4a", "trm_4a3fd79d0afcf", "trm_5534111a8bc96",
            "trm_557b48a224b95", "trm_557b4a81a4a17", "trm_4a3fd79d0b64e",
            "trm_4a3fd79d0a33b", "trm_557b4a7315f1b", "trm_4a3fd79d0af71",
            "trm_557b4b56de455", "trm_557b4add1837e"
        ]
    elif idx == 118:
        unique_concepts[idx] = [
            "trm_4a3fd79d0b642", "trm_4a3fd79d0a33b", "trm_557b4a7315f1b",
            "trm_4a3fd79d0af71", "trm_557b4b56de455"
        ]
    else:
        contrast = row[1].cognitive_contrast_cogatlas_id
        concepts = get_concept(contrast_id=contrast)
        concepts = numpy.unique(concepts.pandas.id).tolist()
        unique_concepts[idx] = concepts

all_concepts = []
for image_id, concepts in unique_concepts.iteritems():
    for concept in concepts:
        if concept not in all_concepts:
            all_concepts.append(concept)

res = {
    "all_concepts": all_concepts,
    "unique_concepts": unique_concepts,
    "images": images
}
# Let's make a concept by concept data frame
contrast_lookup = pandas.read_csv("data/contrast_by_concept_binary_df.tsv",sep="\t",index_col=0)
images = pandas.read_csv("data/contrast_defined_images_filtered.tsv",sep="\t",index_col=0)

# source will be in rows, target in columns
concept_kindof = pandas.DataFrame()
concept_partof = pandas.DataFrame()
concepts = contrast_lookup.columns.tolist()
seen = []

while len(concepts) > 0:
    concept = concepts.pop(0)
    seen.append(concept)
    try:
        tmp = get_concept(id=concept).json[0]
        if 'relationships' in tmp:
            for relation in tmp["relationships"]:
                if relation['id'] not in seen and relation['id'] not in concepts:
                    print "Adding concept %s" %(relation['id'])
                    concepts.append(relation['id'])
                if relation['direction'] == "parent":
                    if relation['relationship'] == 'kind of':
                        concept_kindof.loc[tmp['id'],relation['id']] = 1
                    elif relation['relationship'] == 'part of':
                        concept_partof.loc[tmp['id'],relation['id']] = 1
                elif relation['direction'] == "child":
                    if relation['relationship'] == 'kind of':
                        concept_kindof.loc[relation['id'],tmp['id']] = 1
                    elif relation['relationship'] == 'part of':
                        concept_partof.loc[relation['id'],tmp['id']] = 1
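
Once filled, the kind-of and part-of adjacency DataFrames can be flattened into edge lists for downstream graph tools; a sketch (not part of the original):

# Flatten the kind-of adjacency matrix into (source, target) edge
# pairs; stack() drops the NaN cells left by the sparse .loc fills.
kindof_edges = concept_kindof.stack().reset_index()
kindof_edges.columns = ["source", "target", "value"]
kindof_edges = kindof_edges.loc[kindof_edges["value"] == 1,
                                ["source", "target"]]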
Exemple #37
0
def pull_ontology(out_dir='auto', overwrite=False):
    """
    Download Cognitive Atlas ontology and combine Concepts, Tasks, and
    Disorders to create ID and relationship DataFrames.
    """
    if out_dir == 'auto':
        out_dir = op.join(get_resource_path(), 'ontology')
    else:
        out_dir = op.abspath(out_dir)

    ids_file = op.join(out_dir, 'cogat_ids.csv')
    rels_file = op.join(out_dir, 'cogat_relationships.csv')
    if overwrite or not all([op.isfile(f) for f in [ids_file, rels_file]]):
        concepts = get_concept(silent=True).pandas
        tasks = get_task(silent=True).pandas
        disorders = get_disorder(silent=True).pandas

        # Identifiers and aliases
        long_concepts = _longify(concepts)
        long_tasks = _longify(tasks)

        # Disorders currently lack aliases
        disorders['name'] = disorders['name'].str.lower()
        disorders = disorders.assign(alias=disorders['name'])
        disorders = disorders[['id', 'name', 'alias']]

        # Combine into id_df
        id_df = pd.concat((long_concepts, long_tasks, disorders), axis=0)
        id_df = _expand_df(id_df)
        id_df = id_df.replace('', np.nan)
        id_df = id_df.dropna(axis=0)
        id_df = id_df.reset_index(drop=True)

        # Relationships
        relationships = []
        for i, id_ in enumerate(concepts['id'].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, 'isSelf']
            relationships.append(row)
            concept = get_concept(id=id_, silent=True).json
            for rel in concept['relationships']:
                reltype = _get_concept_reltype(rel['relationship'],
                                               rel['direction'])
                if reltype is not None:
                    row = [id_, rel['id'], reltype]
                    relationships.append(row)

        for i, id_ in enumerate(tasks['id'].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, 'isSelf']
            relationships.append(row)
            task = get_task(id=id_, silent=True).json
            for rel in task['concepts']:
                row = [id_, rel['concept_id'], 'measures']
                relationships.append(row)
                row = [rel['concept_id'], id_, 'measuredBy']
                relationships.append(row)

        for i, id_ in enumerate(disorders['id'].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, 'isSelf']
            relationships.append(row)
            disorder = get_disorder(id=id_, silent=True).json
            for rel in disorder['disorders']:
                if rel['relationship'] == 'ISA':
                    rel_type = 'isA'
                else:
                    rel_type = rel['relationship']
                row = [id_, rel['id'], rel_type]
                relationships.append(row)

        rel_df = pd.DataFrame(columns=['input', 'output', 'rel_type'],
                              data=relationships)
        ctp_df = concepts[['id', 'id_concept_class']]
        ctp_df = ctp_df.assign(rel_type='inCategory')
        ctp_df.columns = ['input', 'output', 'rel_type']
        ctp_df['output'].replace('', np.nan, inplace=True)
        ctp_df.dropna(axis=0, inplace=True)
        rel_df = pd.concat((ctp_df, rel_df))
        rel_df = rel_df.reset_index(drop=True)
        id_df.to_csv(ids_file, index=False)
        rel_df.to_csv(rels_file, index=False)
    else:
        id_df = pd.read_csv(ids_file)
        rel_df = pd.read_csv(rels_file)

    return id_df, rel_df
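
Both pull_ontology and the download_cognitive_atlas variant above delegate to a _get_concept_reltype helper that is not shown; a plausible sketch consistent with the kind of/part of vocabulary used here (hypothetical; the packaged helper may differ):

# Hypothetical sketch: map a Cognitive Atlas relationship description
# plus direction to a directed relation type, or None if unrecognized.
def _get_concept_reltype(relationship, direction):
    mapping = {
        ("kind of", "parent"): "isKindOf",
        ("part of", "parent"): "isPartOf",
        ("kind of", "child"): "hasKind",
        ("part of", "child"): "hasPart",
    }
    return mapping.get((relationship, direction))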
from cognitiveatlas.api import get_concept
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import *
import pandas
import numpy
import json
import nltk
import re

paragraphs = json.load(open("william_james.json","rb"))

# Get all cognitive atlas concepts
all_concepts = get_concept().json
concepts = dict()
for concept in all_concepts:
    concepts[concept["id"]] = str(concept["name"])

# Functions to parse text
def remove_nonenglish_chars(text):
    return re.sub("[^a-zA-Z]", " ", text)
    
def text2sentences(text,remove_non_english_chars=True):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')    
    if remove_non_english_chars:
        text = remove_nonenglish_chars(text)
    for s in tokenizer.tokenize(text):
        yield s
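
processText below references a sentence2words helper that is not shown; a plausible sketch using the imports above (hypothetical):

# Hypothetical sketch of the sentence2words helper referenced below:
# extract lowercase alphabetic tokens and drop English stopwords.
def sentence2words(sentence, remove_stop_words=True):
    words = re.findall("[a-zA-Z]+", sentence.lower())
    if remove_stop_words:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words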

def processText(text):
    '''combines text2sentences and sentence2words'''