def concept_json_dump():
    """Use the cognitiveatlas library to dump all concepts to a JSON file.

    Fetches the full concept listing from the Cognitive Atlas API, retrieves
    the detailed record for each concept id, and writes the collected records
    to "all_concepts.json" in the current working directory.
    """
    # One detailed API lookup per id returned by the bare listing call.
    all_concepts = [get_concept(id=entry['id']).json
                    for entry in get_concept().json]
    with open("all_concepts.json", 'w') as fp:
        json.dump(all_concepts, fp)
def get_cognitiveatlas_task(task_id):
    """get_cognitiveatlas_task return the database entry for CognitiveAtlasTask
    if it exists, and update concepts for that task. If not, create it.

    :param task_id: the unique id for the cognitive atlas task
    :returns: the CognitiveAtlasTask instance, or None on any API/database error
    """
    try:
        task = get_task(id=task_id).json[0]
        cogatlas_task, _ = CognitiveAtlasTask.objects.update_or_create(
            cog_atlas_id=task["id"], defaults={"name": task["name"]}
        )
        concept_list = []
        if "concepts" in task:
            for concept in task["concepts"]:
                cogatlas_concept = get_concept(id=concept["concept_id"]).json[0]
                # FIX: definition must live in defaults. Passed as a bare kwarg
                # it becomes part of the lookup, so a changed definition text
                # upstream would create a duplicate row instead of updating.
                cogatlas_concept, _ = CognitiveAtlasConcept.objects.update_or_create(
                    cog_atlas_id=cogatlas_concept["id"],
                    defaults={
                        "name": cogatlas_concept["name"],
                        "definition": cogatlas_concept["definition_text"],
                    },
                )
                cogatlas_concept.save()
                concept_list.append(cogatlas_concept)
        cogatlas_task.concepts = concept_list
        cogatlas_task.save()
        return cogatlas_task
    # Narrowed from a bare except: so SystemExit/KeyboardInterrupt still propagate.
    except Exception:
        # Any error with API, etc, return None
        return None
def get_cognitiveatlas_task(task_id):
    '''get_cognitiveatlas_task return the database entry for CognitiveAtlasTask
    if it exists, and update concepts for that task. If not, create it.

    :param task_id: the unique id for the cognitive atlas task
    :returns: the CognitiveAtlasTask instance, or None on any API/database error
    '''
    try:
        task = get_task(id=task_id).json[0]
        cogatlas_task, _ = CognitiveAtlasTask.objects.update_or_create(
            cog_atlas_id=task["id"], defaults={"name": task["name"]})
        concept_list = []
        if "concepts" in task:
            for concept in task["concepts"]:
                cogatlas_concept = get_concept(
                    id=concept["concept_id"]).json[0]
                # FIX: definition moved into defaults. As a bare kwarg it was
                # part of the lookup, so a changed definition text would create
                # a duplicate row instead of updating the existing one.
                cogatlas_concept, _ = CognitiveAtlasConcept.objects.update_or_create(
                    cog_atlas_id=cogatlas_concept["id"],
                    defaults={"name": cogatlas_concept["name"],
                              "definition": cogatlas_concept["definition_text"]})
                cogatlas_concept.save()
                concept_list.append(cogatlas_concept)
        cogatlas_task.concepts = concept_list
        cogatlas_task.save()
        return cogatlas_task
    # Narrowed from BaseException so SystemExit/KeyboardInterrupt propagate.
    except Exception:
        # Any error with API, etc, return None
        return None
def get_concepts(self, force_load=False):
    """Load the Cognitive Atlas concept listing, preferring a local cache.

    Reads concepts_df.csv from self.datadir when it exists (unless
    force_load is True); otherwise fetches the listing from the Cognitive
    Atlas API and writes the cache. Populates self.concepts_df (DataFrame)
    and self.concepts (list of concept names from the ``name`` column).

    :param force_load: when True, bypass the cache and re-query the API
    """
    # Hoist the cache path: it is needed for the existence test, the read,
    # and the write.
    cache_file = os.path.join(self.datadir, 'concepts_df.csv')
    if os.path.exists(cache_file) and not force_load:
        print('using cached cognitive atlas concepts')
        self.concepts_df = pandas.read_csv(cache_file)
    else:
        self.concepts_df = get_concept().pandas
        self.concepts_df.to_csv(cache_file)
    self.concepts = self.concepts_df.name.tolist()
def test_concepts():
    """Exercise get_concept() with every supported query-parameter
    combination and check each lookup resolves to the anxiety concept."""
    # Single-argument print() is valid in both Python 2 and 3, unlike the
    # original py2-only print statement.
    print("### TESTING CONCEPT QUERIES:")
    concept_id = "trm_5022ef7599294"
    concept_name = "anxiety"
    contrast_id = "cnt_5299143fed521"
    # concept_id
    result = get_concept(id=concept_id)
    assert_equal(result.json[0]["name"], concept_name)
    # concept_name
    result = get_concept(name=concept_name)
    assert_equal(result.json[0]["id"], concept_id)
    # contrast_id
    result = get_concept(contrast_id=contrast_id)
    assert_equal(result.json[0]["id"], concept_id)
    # concept_id and concept_name
    result = get_concept(id=concept_id, name=concept_name)
    assert_equal(result.json[0]["name"], concept_name)
    # concept_id, and contrast_id
    result = get_concept(id=concept_id, contrast_id=contrast_id)
    assert_equal(result.json[0]["name"], concept_name)
    # concept_name and contrast_id
    result = get_concept(name=concept_name, contrast_id=contrast_id)
    assert_equal(result.json[0]["id"], concept_id)
def test_concepts():
    """Query concepts by id, name, contrast id, and pairwise combinations,
    verifying every lookup resolves to the anxiety concept."""
    print("### TESTING CONCEPT QUERIES:")
    cid = "trm_5022ef7599294"
    cname = "anxiety"
    contrast = "cnt_5299143fed521"

    # --- single-parameter lookups ---------------------------------------
    assert_equal(get_concept(id=cid).json[0]["name"], cname)
    assert_equal(get_concept(name=cname).json[0]["id"], cid)
    # A contrast can map to several concepts; the target must be among them.
    hits = get_concept(contrast_id=contrast).json
    assert_true(any(cid == entry["id"] for entry in hits))

    # --- two-parameter lookups ------------------------------------------
    assert_equal(get_concept(id=cid, name=cname).json[0]["name"], cname)
    assert_equal(get_concept(id=cid, contrast_id=contrast).json[0]["name"], cname)
    assert_equal(get_concept(name=cname, contrast_id=contrast).json[0]["id"], cid)
def get_isAbout_label(url):
    '''
    Added by DBK to get labels for isAbout urls

    :param url: url to get label for
    :return: string label ("" when no label can be resolved)
    '''
    scicrunch_base_uri = 'https://scicrunch.org/api/1/ilx/search/curie/'
    # load user's api key from environment variable. If not found then exit
    # with error message
    try:
        user_key = os.environ["INTERLEX_API_KEY"]
    except KeyError:
        print("Please set the environment variable INTERLEX_API_KEY")
        sys.exit(1)

    # FIX: previously `label` was unbound when the url matched neither branch
    # below, raising NameError at the return; default to empty string.
    label = ""
    if "cognitiveatlas" in url:
        # skip for things that aren't concepts or disorders for the time being
        if ("concept" not in url) and ("disorder" not in url):
            # for now if we don't have a concept or disorder url from cogatlas
            # then just return nothing for label. Will need to work with cog
            # atlas folks about how to retrieve tasks and other types.
            return ""
        # parse out id of term: .../<id>/<slug> -> <id>
        id = url.rsplit('/', 1)[0].rsplit('/', 1)[1]
        # don't know if this is a concept or disorder so we'll try both;
        # narrowed from a bare except so Ctrl-C and SystemExit still propagate.
        try:
            tmp = get_concept(id=id, silent=True)
            label = tmp.json['name'].lower()
        except Exception:
            tmp = get_disorder(id=id, silent=True)
            label = tmp.json['name'].lower()
    elif "interlex" in url:
        # get label for interlex terms via the SciCrunch curie endpoint
        payload = {}
        headers = {}
        full_url = scicrunch_base_uri + url.rsplit('/', 1)[1].replace(
            '_', ':').rstrip("']'") + "?key=" + user_key
        response = requests.request("GET", full_url, headers=headers,
                                    data=payload)
        # response is a json dictionary. here we want the label
        label = response.json()["data"]["label"].lower()
    return label
def main():
    """Generate per-task JSON trees (task -> contrasts -> concepts) for the
    Cognitive Atlas tasks tagged in NeuroVault, one file per task in data/."""
    tasks = get_task()
    output_folder = os.path.abspath("data")
    # print() with a single argument works in both Python 2 and 3.
    print("Generating Cognitive Atlas Data...")

    # Cognitive Atlas tasks we are interested in from NeuroVault tags
    task_uids = ['trm_553e77e53497d', 'trm_553ebfc390256', 'trm_553e88a66b676',
                 'trm_553fd2fc7a648', 'trm_4ebd482eba5b1', 'trm_4ebc98cc77e7b',
                 'trm_4ebc728326a13', 'trm_4ebc6a6b75ebf', 'trm_4ebc9d2e397f2',
                 'trm_553fce5d21da7', 'trm_553fcbbe974ba', 'trm_4da890594742a',
                 'trm_4d559bcd67c18', 'trm_4cacee4a1d875', 'trm_4c898c0786246',
                 'trm_4ebd47b8bab6b', 'tsk_4a57abb949a4f', 'trm_4f2456027809f',
                 'trm_553e73e29cf7d', 'trm_4c8a834779883', 'trm_4cacf22a22d80',
                 'trm_4e8dd3831f0cc', 'trm_53c4465b0466f', 'trm_553fbbf79ebc5',
                 'trm_5542841f3dcd5', 'trm_5346938eed092', 'trm_534692ef3b5df',
                 'trm_534690b0e9dc5', 'trm_5346927710e88', 'trm_4f244ad7dcde7',
                 'trm_551b1460e89a3', 'trm_553e6b8e33da4', 'trm_553e85265f51e',
                 'tsk_4a57abb949bf6', 'trm_4f24179122380', 'tsk_4a57abb949e1a',
                 'trm_4cacf3fbc503b', 'trm_5181f83b77fa4', 'trm_5181f863d24f4',
                 'trm_553eb45e2b709', 'trm_550b5b066d37b', 'trm_550b50095d4a3',
                 'trm_550b53d7dd674', 'trm_550b5c1a7f4db', 'trm_550b54a8b30f4',
                 'trm_550b557e5f90e', 'trm_550b5a47aa23e', 'trm_553eb28436233',
                 'trm_50df0dd9d0b6f', 'trm_553fc858cacc5']

    # Functions for making nodes
    def make_node(nid, name, color):
        return {"nid": nid, "name": name, "color": color}

    for task in tasks.json:
        if task["name"] != "":
            print("Parsing task %s..." % task["name"])
            task_name = task["name"].replace(" ", "_").replace("/", "_").lower()
            if task["id"] in task_uids:
                task_node = make_node(task["id"], task["name"], "#63506d")
                single_task = get_task(id=task["id"]).json[0]
                # We only want to see contrasts with associated concepts
                task_contrasts = single_task["contrasts"]
                task_concepts = []
                for contrast in task_contrasts:
                    try:
                        contrast_node = make_node(contrast["id"],
                                                  contrast["contrast_text"],
                                                  "#d89013")
                        contrast_concepts = get_concept(contrast_id=contrast["id"])
                        children = []
                        current_names = []
                        for concept in contrast_concepts.json:
                            if concept["name"] not in current_names:
                                children.append(make_node(concept["id"],
                                                          concept["name"],
                                                          "#3c7263"))
                                current_names.append(concept["name"])
                        contrast_node["children"] = children
                        # Only append contrast if it has children
                        if len(children) > 0:
                            task_concepts.append(contrast_node)
                    # Narrowed from a bare except; a contrast that fails the
                    # concept lookup is simply skipped (best effort).
                    except Exception:
                        pass
                task_node["children"] = task_concepts
                # Save to file if we have children; with-statement guarantees
                # the handle is closed even if the write raises.
                if len(task_concepts) > 0:
                    with open('%s/%s.json' % (output_folder, task_name), 'w') as filey:
                        filey.write(json.dumps(task_node, sort_keys=True,
                                               indent=4, separators=(',', ': ')))
def get_task_graph(task_id, images=None): """get_task_graph will return a tree for a single cognitive atlas tasks defined in NeuroVault :param task_id: the Cognitive Atlas task id :param get_images_with_contrasts: boolean to return images that have contrasts (default False) """ # Get all contrasts defined for Cognitive Atlas task = CognitiveAtlasTask.objects.filter(cog_atlas_id=task_id)[0] task_node = make_node(task.cog_atlas_id, task.name, "#63506d") task_contrasts = CognitiveAtlasContrast.objects.filter(task=task) task_concepts = [] for contrast in task_contrasts: contrast_node = make_node(contrast.cog_atlas_id, contrast.name, "#d89013") try: contrast_concepts = get_concept(contrast_id=contrast.cog_atlas_id) except ValueError: # TODO: disparity between NeuroVault and Cognitive Atlas! continue children = [] # concept children of a contrast current_names = [] # Do we have images tagged with the contrast? if not images: images = StatisticMap.objects.filter(cognitive_contrast_cogatlas=contrast) for concept in contrast_concepts.json: if concept and concept["name"] not in current_names: concept_node = make_node(concept["id"], concept["name"], "#3c7263") # Image nodes if len(images) > 0: stat_map_nodes = [make_node(i.pk, i.name, "#337ab7", "/images/%s" % i.pk) for i in images] concept_node["children"] = stat_map_nodes children.append(concept_node) current_names.append(concept["name"]) contrast_node["children"] = children # Only append contrast if it has children if len(children) > 0: task_concepts.append(contrast_node) task_node["children"] = task_concepts return task_node
concept_acc.loc[concept, "correct_rejection"] = calculate_hits(Ya, Yp, 0, 0) return concept_acc base_acc = get_concept_acc(predictions) forward_acc = get_concept_acc(predictions_forward) # Just compare the two for now diff_acc = forward_acc - base_acc # Add the concept names to each from cognitiveatlas.api import get_concept concept_names = [] for concept in diff_acc.index: concept_names.append(get_concept(id=concept).json[0]["name"]) base_acc["name"] = concept_names forward_acc["name"] = concept_names diff_acc["name"] = concept_names # Add the number of images number_images = [] for concept in diff_acc.index: number_images.append(Ymat.loc[:, concept].sum()) base_acc["number_images"] = number_images forward_acc["number_images"] = number_images diff_acc["number_images"] = number_images diff_acc = diff_acc.sort(columns=["hit"], ascending=False)
#class Concept(models.NodeModel): # name = models.StringProperty() # uid = models.StringProperty(indexed=True) # definition = models.StringProperty() # related_to = models.Relationship('self',rel_type='RELATEDTO') # part_of = models.Relationship('self',rel_type='PARTOF') # is_a = models.Relationship('self',rel_type='ISA') # measured_by = models.Relationship(Contrast,rel_type='MEASUREDBY') # mentioned_in = models.Relationship('PMID',rel_type='MENTIONEDIN') for row in concepts.iterrows(): uid = row[1].url.split("/")[-1] name = row[1].term try: concept = get_concept(id=uid, silent=True).json definition = concept[0]["definition_text"] except: definition = "" properties = {"definition": definition} node = make_node("concept", uid, name, properties) # Assertions! # We will store the old uid as a property, in case we need to map back to original data for row in assertions.iterrows(): uid = row[1].id user = row[1].id_user subject = row[1].id_subject relation = row[1].id_relation id_type = row[1].id_type
results = "%s/results" % base # any kind of tsv/result file decode_folder = "%s/decode" % base if not os.path.exists(decode_folder): os.mkdir(decode_folder) # Images by Concepts data frame labels_tsv = "%s/images_contrasts_df.tsv" % results images = pandas.read_csv(labels_tsv, sep="\t", index_col=0) output_folder = "%s/classification_final" % results # Get standard mask, 4mm standard_mask = get_standard_mask(4) # Get all cognitive atlas concepts all_concepts = get_concept().json concepts = dict() for concept in all_concepts: concepts[concept["id"]] = str(concept["name"]) # You will need to copy abstracts.txt into this folder from the repo abstracts = pandas.read_csv("%s/abstracts.txt" % decode_folder, sep="\t", index_col=0, header=None) abstracts.columns = ["text"] # Functions to parse text def remove_nonenglish_chars(text): return re.sub("[^a-zA-Z]", " ", text)
results = "%s/results" %base # any kind of tsv/result file decode_folder = "%s/decode" %base if not os.path.exists(decode_folder): os.mkdir(decode_folder) # Images by Concepts data frame labels_tsv = "%s/images_contrasts_df.tsv" %results images = pandas.read_csv(labels_tsv,sep="\t",index_col=0) output_folder = "%s/classification_final" %results # Get standard mask, 4mm standard_mask=get_standard_mask(4) # Get all cognitive atlas concepts all_concepts = get_concept().json concepts = dict() for concept in all_concepts: concepts[concept["id"]] = str(concept["name"]) # You will need to copy abstracts.txt into this folder from the repo abstracts = pandas.read_csv("%s/abstracts.txt" %decode_folder,sep="\t",index_col=0,header=None) abstracts.columns = ["text"] # Functions to parse text def remove_nonenglish_chars(text): return re.sub("[^a-zA-Z]", " ", text) def text2sentences(text,remove_non_english_chars=True): tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') if remove_non_english_chars:
rsa_df.loc[task,"RSA"] = rsa(df1,df2) # Now let's calculate for contrasts (note - we have no images tagged with same contrast) contrasts = images["cognitive_contrast_cogatlas_id"].unique().tolist() for contrast in contrasts: contrast_images = images.image_id[images.cognitive_contrast_cogatlas_id==task].tolist() if len(contrast_images) > 2: df1 = spatial.loc[spatial.index.isin(contrast_images),spatial.columns.isin(contrast_images)] df2 = graph.loc[graph.index.isin(contrast_images),graph.columns.isin(contrast_images)] rsa_df.loc[task,"RSA"] = rsa(df1,df2) # However - we can look at RSA for concept images! Let's make a df of contrast_id by images from cognitiveatlas.api import get_concept concepts = [] for contrast in contrasts: tmp = get_concept(contrast_id=contrast).json concepts = concepts + [t["id"] for t in tmp if "id" in t] concepts = numpy.unique(concepts).tolist() contrast_df = pandas.DataFrame(0,index=contrasts,columns=concepts) # Now fill in the data frame for contrast in contrasts: tmp = get_concept(contrast_id=contrast).json contrast_concepts = [t["id"] for t in tmp if "id" in t] contrast_df.loc[contrast,contrast_concepts] = 1 # Save if we want it later contrast_df.to_csv("data/contrast_by_concept_binary_df.tsv",sep="\t") # Now let's perform RSA by concept
#class Concept(models.NodeModel): # name = models.StringProperty() # uid = models.StringProperty(indexed=True) # definition = models.StringProperty() # related_to = models.Relationship('self',rel_type='RELATEDTO') # part_of = models.Relationship('self',rel_type='PARTOF') # is_a = models.Relationship('self',rel_type='ISA') # measured_by = models.Relationship(Contrast,rel_type='MEASUREDBY') # mentioned_in = models.Relationship('PMID',rel_type='MENTIONEDIN') for row in concepts.iterrows(): uid = row[1].url.split("/")[-1] name = row[1].term try: concept = get_concept(id=uid,silent=True).json definition = concept[0]["definition_text"] except: definition = "" properties={"definition":definition} node = make_node("concept",uid,name,properties) # Assertions! # We will store the old uid as a property, in case we need to map back to original data for row in assertions.iterrows(): uid = row[1].id user = row[1].id_user subject = row[1].id_subject relation = row[1].id_relation id_type = row[1].id_type
import os
import json
import pandas
from cogpheno.apps.assessments.models import CognitiveAtlasTask, CognitiveAtlasConcept
from cognitiveatlas.api import get_task, get_concept

# Sync the local CognitiveAtlasTask / CognitiveAtlasConcept tables with the
# current Cognitive Atlas API listings.
tasks = get_task()
concepts = get_concept()

for t in range(0, len(tasks.json)):
    task = tasks.json[t]
    # print() with one argument is valid under both Python 2 and 3.
    print("%s of %s" % (t, len(tasks.json)))
    task, _ = CognitiveAtlasTask.objects.update_or_create(
        cog_atlas_id=task["id"], defaults={"name": task["name"]})
    task.save()

# Or just update those not in
#termid_present = [ct.cog_atlas_id for ct in CognitiveAtlasTask.objects.all()]
#termid = [tasks.json[x]["id"] for x in range(0,len(tasks.json))]
#termid_missing = [x for x in range(0,len(termid)) if termid[x] not in termid_present]
#for m in termid_missing:
#    task = tasks.json[m]
#    task, _ = CognitiveAtlasTask.objects.update_or_create(cog_atlas_id=task["id"], defaults={"name":task["name"]})
#    task.save()

for c in range(0, len(concepts.json)):
    concept = concepts.json[c]
    print("%s of %s" % (c, len(concepts.json)))
    # FIX: definition moved into defaults. As a bare kwarg it was part of the
    # lookup, so a changed definition text would create a duplicate row
    # instead of updating the existing one.
    concept, _ = CognitiveAtlasConcept.objects.update_or_create(
        cog_atlas_id=concept["id"],
        defaults={"name": concept["name"],
                  "definition": concept["definition_text"]})
    concept.save()
meta_single["images"] = neurovault_row["thumbnail"].tolist() meta_single["task"] = neurovault_row["cognitive_paradigm_cogatlas"].tolist()[0] meta_single["contrast"] = neurovault_row["cognitive_contrast_cogatlas"].tolist()[0] meta_single["download"] = neurovault_row["file"].tolist()[0] meta_single["concept"] = concepts if neurovault_row["description"].tolist()[0]: meta_single["description"] = str(neurovault_row["description"].tolist()[0]).encode("utf-8") else: meta_single["description"] = "" if len(meta_single["description"]) > 600: meta_single["description"] = "%s..." % meta_single["description"][0:600] else: # A concept node if node != "1": relationship_table_row = relationship_table[relationship_table.id==node] contrast_name = relationship_table_row.name.tolist()[0] concept = get_concept(id=node).json # Reverse inference scores - all images if node in scores_df.node.unique().tolist(): # a node with images below it meta_single["scores"] = scores_df[scores_df.node == node].to_json(orient="records") image_ids = scores_df[scores_df.node == node].image_id.unique().tolist() meta_single["images"] = images["thumbnail"][images.image_id.isin(image_ids)].tolist() # Cognitive Atlas meta data meta_single["url"] = "http://www.cognitiveatlas.org/term/id/%s" %node meta_single["type"] = "concept" meta_single["thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png" meta_single["concept"] = [relationship_table.name[relationship_table.id==node].tolist()[0]] meta_single["task"] = "" meta_single["contrast"] = [] meta_single["category"] = "" meta_single["download"] = "http://www.cognitiveatlas.org/rdf/id/%s" %node if concept[0]["definition_text"]:
def main():
    """Generate per-task JSON trees (task -> contrasts -> concepts) for the
    Cognitive Atlas tasks tagged in NeuroVault, one file per task in data/."""
    tasks = get_task()
    output_folder = os.path.abspath("data")
    # print() with a single argument works in both Python 2 and 3.
    print("Generating Cognitive Atlas Data...")

    # Cognitive Atlas tasks we are interested in from NeuroVault tags
    task_uids = [
        'trm_553e77e53497d', 'trm_553ebfc390256', 'trm_553e88a66b676',
        'trm_553fd2fc7a648', 'trm_4ebd482eba5b1', 'trm_4ebc98cc77e7b',
        'trm_4ebc728326a13', 'trm_4ebc6a6b75ebf', 'trm_4ebc9d2e397f2',
        'trm_553fce5d21da7', 'trm_553fcbbe974ba', 'trm_4da890594742a',
        'trm_4d559bcd67c18', 'trm_4cacee4a1d875', 'trm_4c898c0786246',
        'trm_4ebd47b8bab6b', 'tsk_4a57abb949a4f', 'trm_4f2456027809f',
        'trm_553e73e29cf7d', 'trm_4c8a834779883', 'trm_4cacf22a22d80',
        'trm_4e8dd3831f0cc', 'trm_53c4465b0466f', 'trm_553fbbf79ebc5',
        'trm_5542841f3dcd5', 'trm_5346938eed092', 'trm_534692ef3b5df',
        'trm_534690b0e9dc5', 'trm_5346927710e88', 'trm_4f244ad7dcde7',
        'trm_551b1460e89a3', 'trm_553e6b8e33da4', 'trm_553e85265f51e',
        'tsk_4a57abb949bf6', 'trm_4f24179122380', 'tsk_4a57abb949e1a',
        'trm_4cacf3fbc503b', 'trm_5181f83b77fa4', 'trm_5181f863d24f4',
        'trm_553eb45e2b709', 'trm_550b5b066d37b', 'trm_550b50095d4a3',
        'trm_550b53d7dd674', 'trm_550b5c1a7f4db', 'trm_550b54a8b30f4',
        'trm_550b557e5f90e', 'trm_550b5a47aa23e', 'trm_553eb28436233',
        'trm_50df0dd9d0b6f', 'trm_553fc858cacc5'
    ]

    # Functions for making nodes
    def make_node(nid, name, color):
        return {"nid": nid, "name": name, "color": color}

    for task in tasks.json:
        if task["name"] != "":
            print("Parsing task %s..." % task["name"])
            task_name = task["name"].replace(" ", "_").replace("/", "_").lower()
            if task["id"] in task_uids:
                task_node = make_node(task["id"], task["name"], "#63506d")
                single_task = get_task(id=task["id"]).json[0]
                # We only want to see contrasts with associated concepts
                task_contrasts = single_task["contrasts"]
                task_concepts = []
                for contrast in task_contrasts:
                    try:
                        contrast_node = make_node(contrast["id"],
                                                  contrast["contrast_text"],
                                                  "#d89013")
                        contrast_concepts = get_concept(
                            contrast_id=contrast["id"])
                        children = []
                        current_names = []
                        for concept in contrast_concepts.json:
                            if concept["name"] not in current_names:
                                children.append(
                                    make_node(concept["id"], concept["name"],
                                              "#3c7263"))
                                current_names.append(concept["name"])
                        contrast_node["children"] = children
                        # Only append contrast if it has children
                        if len(children) > 0:
                            task_concepts.append(contrast_node)
                    # Narrowed from a bare except; a contrast that fails the
                    # concept lookup is simply skipped (best effort).
                    except Exception:
                        pass
                task_node["children"] = task_concepts
                # Save to file if we have children; with-statement guarantees
                # the handle is closed even if the write raises.
                if len(task_concepts) > 0:
                    with open('%s/%s.json' % (output_folder, task_name), 'w') as filey:
                        filey.write(
                            json.dumps(task_node, sort_keys=True, indent=4,
                                       separators=(',', ': ')))
def concept_node_triples(image_dict=None,
                         output_file="concept_node_triples.tsv",
                         delim="\t", save_to_file=True,
                         lookup_key_type="contrast"):
    '''concept_node_triples
    Export a list of nodes, in triples

    :param output_file: path for the output file
    :param delim: str delimiter for output file, default is tab.
    :param save_to_file: boolean, False will return pandas data frame
    :param image_dict [OPTIONAL]: dict
        a dictionary of [term_id:image_file] pairs, eg

    .. note::

        {"cnt_4decfedb91973":["image1.nii.gz","image2.nii.gz"]}

        This will mean that the images in the list will be assigned to all
        concept nodes associated with the term specified. This allows for
        inference over the tree (for example, some relationship with concept
        nodes that are parents of assigned nodes). Specifying an image
        dictionary will append the images as the base nodes of the tree. No
        image dictionary means that the base nodes will be the lowest level
        concepts. You must specify the term type as "contrast" or "task"
        (see lookup_key_type)

    :param lookup_key_type: the term type used as a key in the image_dict.
        Either "task" or "contrast" (default is contrast)

    .. note::

        Output looks like

        id    parent    name
        1    none    BASE                # there is always a base node
        trm_12345    1    MEMORY            # high level concept groups
        trm_23456    1    PERCEPTION
        trm_34567    trm_12345    WORKING MEMORY    # concepts
        trm_56789    trm_12345    LONG TERM MEMORY
        trm_67890    trm_34567    image1.nii.gz    # associated images (discovered by way of contrasts)
        trm_78901    trm_34567    image2.nii.gz
    '''
    concepts = filter_concepts()
    # Idiomatic truthiness tests replace "== True" comparisons throughout.
    if save_to_file:
        filey = init_output_file(output_file, delim=delim)
    df = pandas.DataFrame(columns=["id", "parent", "name"])
    df.loc[0] = ["1", "None", "BASE"]

    # Generate a unique id for each concept
    concept_lookup = dict()
    for c in range(0, len(concepts)):
        concept_lookup[concepts[c]["id"]] = c + 2
    count = 1

    # Generate tree for main concepts
    for concept in concepts:
        parents = []
        if "relationships" in concept:
            for relation in concept["relationships"]:
                if relation["direction"] == "parent":
                    # We can only use "kind of" otherwise we get circular reference
                    if relation["relationship"] == "kind of":
                        if relation["id"] in concept_lookup:
                            parents.append(relation["id"])
        if not parents:
            # Orphan concepts hang directly off the BASE node.
            # make_node(node_id,name,parent,delim,file_obj):
            if save_to_file:
                make_node(concept["id"], concept["name"], "1", delim, filey)
            df.loc[count] = [concept["id"], "1", concept["name"]]
            count += 1
        else:
            for parent in parents:
                # make_node(node_id,name,parent,delim,file_obj):
                if save_to_file:
                    make_node(concept["id"], concept["name"], parent,
                              delim, filey)
                df.loc[count] = [concept["id"], parent, concept["name"]]
                count += 1

    # Now add an entry for each image / contrast, may be multiple for each image
    if image_dict:
        node_id = max(concept_lookup.values()) + 1
        for conid, image_paths in image_dict.items():
            if lookup_key_type == "contrast":
                concepts_single = get_concept(contrast_id=conid).json
                key_id = "id"
            else:
                concepts_single = get_task(id=conid).json[0]
                if "concepts" in list(concepts_single.keys()):
                    concepts_single = concepts_single["concepts"]
                else:
                    concepts_single = None
                key_id = "concept_id"
            # Identity comparison with None instead of "!= None".
            if concepts_single is not None:
                for con in concepts_single:
                    # The concept is the parent of the image
                    if con:
                        for image_path in image_paths:
                            # make_node(node_id,name,parent,delim,file_obj):
                            if save_to_file:
                                make_node("node_%s" % node_id, image_path,
                                          con[key_id], delim, filey)
                            df.loc[count] = [
                                "node_%s" % node_id, con[key_id], image_path
                            ]
                            node_id += 1
                            count += 1
    if save_to_file:
        filey.close()
        print("%s has been created." % output_file)
    return df
def concept_node_triples(image_dict=None,output_file="concept_node_triples.tsv",
                         delim="\t",save_to_file=True,lookup_key_type="contrast"):
    '''concept_node_triples
    Export a list of nodes, in triples

    :param delim: delimiter for output file
    :param save_to_file: boolean, False will return pandas data frame
    :param image_dict [OPTIONAL]: dict
        a dictionary of [term_id:image_file] pairs, eg

    ..note::

        {"cnt_4decfedb91973":["image1.nii.gz","image2.nii.gz"]}

        This will mean that the images in the list will be assigned to all
        concept nodes associated with the term specified. This allows for
        inference over the tree (for example, some relationship with concept
        nodes that are parents of assigned nodes). Specifying an image
        dictionary will append the images as the base nodes of the tree. No
        image dictionary means that the base nodes will be the lowest level
        concepts. You must specify the term type as "contrast" or "task"
        (see lookup_key_type)

    :param delim: str delimeter for output file, default is tab.
    :param output_file: path
    :param lookup_key_type: the term type used as a key in the image_dict.
        Either "task" or "contrast" (default is contrast)

    ..note::

        Output looks like

        id    parent    name
        1    none    BASE                # there is always a base node
        trm_12345    1    MEMORY            # high level concept groups
        trm_23456    1    PERCEPTION
        trm_34567    trm_12345    WORKING MEMORY    # concepts
        trm_56789    trm_12345    LONG TERM MEMORY
        trm_67890    trm_34567    image1.nii.gz    # associated images (discovered by way of contrasts)
        trm_78901    trm_34567    image2.nii.gz
    '''
    concepts = filter_concepts()
    if save_to_file == True:
        filey = init_output_file(output_file,delim=delim)
    df = pandas.DataFrame(columns=["id","parent","name"])
    df.loc[0] = ["1","None","BASE"]

    # Generate a unique id for each concept
    concept_lookup = dict()
    for c in range(0,len(concepts)):
        concept_lookup[concepts[c]["id"]] = c+2
    count=1

    # Generate tree for main concepts
    for concept in concepts:
        parents = []
        if "relationships" in concept:
            for relation in concept["relationships"]:
                if relation["direction"] == "parent":
                    # We can only use "kind of" otherwise we get circular reference
                    if relation["relationship"] == "kind of":
                        if relation["id"] in concept_lookup:
                            parents.append(relation["id"])
        if not parents:
            # Orphan concepts hang directly off the BASE node.
            # make_node(node_id,name,parent,delim,file_obj):
            if save_to_file == True:
                make_node(concept["id"],concept["name"],"1",delim,filey)
            df.loc[count] = [concept["id"],"1",concept["name"]]
            count+=1
        else:
            for parent in parents:
                # make_node(node_id,name,parent,delim,file_obj):
                if save_to_file == True:
                    make_node(concept["id"],concept["name"],parent,delim,filey)
                df.loc[count] = [concept["id"],parent,concept["name"]]
                count+=1

    # Now add an entry for each image / contrast, may be multiple for each image
    if image_dict:
        node_id = max(concept_lookup.values()) + 1
        for conid, image_paths in image_dict.items():
            if lookup_key_type == "contrast":
                concepts_single = get_concept(contrast_id=conid).json
                key_id = "id"
            else:
                concepts_single = get_task(id=conid).json[0]
                if "concepts" in list(concepts_single.keys()):
                    concepts_single = concepts_single["concepts"]
                else:
                    concepts_single = None
                key_id = "concept_id"
            if concepts_single != None:
                for con in concepts_single:
                    # The concept is the parent of the image
                    if con:
                        for image_path in image_paths:
                            # make_node(node_id,name,parent,delim,file_obj):
                            if save_to_file == True:
                                make_node("node_%s" %node_id,image_path,con[key_id],delim,filey)
                            df.loc[count] = ["node_%s" %node_id,con[key_id],image_path]
                            node_id +=1
                            count+=1
    if save_to_file == True:
        filey.close()
        print("%s has been created." % output_file)
    return df
images = pandas.read_csv("%s/contrast_defined_images_filtered.tsv" %results,sep="\t") unique_concepts = dict() for row in images.iterrows(): idx = row[1].image_id # Bug with getting contrasts for images: if idx == 109: unique_concepts[idx] = ["trm_567982752ff4a","trm_4a3fd79d0afcf","trm_5534111a8bc96", "trm_557b48a224b95","trm_557b4a81a4a17","trm_4a3fd79d0b64e","trm_4a3fd79d0a33b", "trm_557b4a7315f1b","trm_4a3fd79d0af71","trm_557b4b56de455","trm_557b4add1837e"] elif idx == 118: unique_concepts[idx] = ["trm_4a3fd79d0b642","trm_4a3fd79d0a33b","trm_557b4a7315f1b","trm_4a3fd79d0af71", "trm_557b4b56de455"] else: contrast = row[1].cognitive_contrast_cogatlas_id concepts = get_concept(contrast_id=contrast) concepts = numpy.unique(concepts.pandas.id).tolist() unique_concepts[idx] = concepts all_concepts = [] for image_id,concepts in unique_concepts.iteritems(): for concept in concepts: if concept not in all_concepts: all_concepts.append(concept) res = {"all_concepts":all_concepts,"unique_concepts":unique_concepts,"images":images} ## STEP 1: GENERATE IMAGE BY CONCEPT DATA FRAME concept_df = pandas.DataFrame(0,columns=all_concepts,index=images.image_id.unique().tolist()) for image_id,concepts in unique_concepts.iteritems():
if not os.path.exists(decode_folder): os.mkdir(decode_folder) # Images by Concepts data frame labels_tsv = "%s/images_contrasts_df.tsv" %results images = pandas.read_csv(labels_tsv,sep="\t",index_col=0) output_folder = "%s/classification_final" %results # Get standard mask, 4mm standard_mask=get_standard_mask(4) # Load the regression params data frame result = pickle.load(open("%s/regression_params_dfs.pkl" %output_folder,"rb")) all_concepts = get_concept().json concepts = dict() for concept in all_concepts: concepts[concept["id"]] = str(concept["name"]) # You will need to copy abstracts.txt into this folder from the repo abstracts = pandas.read_csv("%s/abstracts.txt" %decode_folder,sep="\t",index_col=0,header=None) abstracts.columns = ["text"] # Functions to parse text def remove_nonenglish_chars(text): return re.sub("[^a-zA-Z]", " ", text) def text2sentences(text,remove_non_english_chars=True): tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') if remove_non_english_chars:
def main(): base = "data/" # Make a folder for mean images if not os.path.exists("mr"): os.mkdir("mr") # Get Neurovault Images with defined cognitive atlas contrast collections = get_collections() # Filter images to those that have a DOI collections = collections[collections.DOI.isnull() == False] # Get image meta data for collections (N=1023) images = get_images(collection_pks=collections.collection_id.tolist()) # Filter images to those with contrasts defined (N=98) images = images[images.cognitive_contrast_cogatlas_id.isnull() == False] # Get rid of any not in MNI images = images[images.not_mni == False] # Get rid of thresholded images images = images[images.is_thresholded == False] ### Step 1: Load meta data sources unique_contrasts = images.cognitive_contrast_cogatlas_id.unique().tolist() # Images that do not match the correct identifier will not be used (eg, "Other") expression = re.compile("cnt_*") unique_contrasts = [u for u in unique_contrasts if expression.match(u)] # Make sure exists in cognitive atlas existing_contrasts = [] for u in unique_contrasts: try: tmp = get_concept(contrast_id=u, silent=True) existing_contrasts.append(u) except: print "%s is defined in NeuroVault, does not exist in Cognitive Atlas" % u image_lookup = dict() for u in existing_contrasts: image_lookup[u] = images.image_id[images.cognitive_contrast_cogatlas_id == u].tolist() # Create a data structure of tasks and contrasts for our analysis relationship_table = concept_node_triples(image_dict=image_lookup, save_to_file=False) unique_nodes = relationship_table.id.unique().tolist() # We will store a data frame of meta data # Lookup for meta_data is the id of the node! meta_data = {} for node in unique_nodes: meta_single = {} # This is an image node if re.search("node_", node): print "Found image node!" 
relationship_table_row = relationship_table[relationship_table.id == node] image_id = relationship_table_row.name.tolist()[0] meta_single["category"] = "" meta_single["type"] = "nii" # NeuroVault metadata concepts = relationship_table.parent[relationship_table.name == image_id] concepts = [ relationship_table.name[relationship_table.id == c].tolist()[0] for c in concepts ] neurovault_row = images[images.image_id == int(image_id)] collection_row = collections[collections.collection_id == neurovault_row.collection_id.tolist() [0]] collection_meta = { "DOI": collection_row["DOI"].tolist()[0], "authors": collection_row["authors"].tolist()[0], "journal": collection_row["journal_name"].tolist()[0], "url": collection_row["url"].tolist()[0], "subjects": collection_row["number_of_subjects"].tolist()[0], "smoothing_fwhm": str(collection_row["smoothing_fwhm"].tolist()[0]).encode( "utf-8") } meta_single["url"] = neurovault_row["url"].tolist()[0] meta_single["thumbnail"] = neurovault_row["thumbnail"].tolist()[0] meta_single["images"] = neurovault_row["thumbnail"].tolist() meta_single["task"] = neurovault_row[ "cognitive_paradigm_cogatlas"].tolist()[0] meta_single["contrast"] = neurovault_row[ "cognitive_contrast_cogatlas"].tolist()[0] meta_single["download"] = neurovault_row["file"].tolist()[0] meta_single["concept"] = concepts if neurovault_row["description"].tolist()[0]: meta_single["description"] = str( neurovault_row["description"].tolist()[0]).encode("utf-8") else: meta_single["description"] = "" if len(meta_single["description"]) > 600: meta_single["description"] = "%s..." 
% meta_single[ "description"][0:600] else: # A concept node if node != "1": relationship_table_row = relationship_table[ relationship_table.id == node] concept = get_concept(id=node, silent=True).json children_nodes = [ relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x] == node ] while len( [x for x in children_nodes if not isinstance(x, int)]) > 0: new_parent_nodes = [ x for x in children_nodes if not isinstance(x, int) ] children_nodes = [ x for x in children_nodes if x not in new_parent_nodes ] for new_parent in new_parent_nodes: node_name = relationship_table.id[ relationship_table.name == new_parent].tolist()[0] children_nodes = children_nodes + [ relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x] == node_name ] # Now only keep children that are images meta_single["images"] = images["thumbnail"][ images.image_id.isin(children_nodes)].tolist() # Cognitive Atlas meta data meta_single[ "url"] = "http://www.cognitiveatlas.org/term/id/%s" % node meta_single["type"] = "concept" meta_single[ "thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png" meta_single["concept"] = [ relationship_table.name[relationship_table.id == node].tolist()[0] ] meta_single["task"] = "" meta_single["contrast"] = [] meta_single[ "download"] = "http://www.cognitiveatlas.org/rdf/id/%s" % node if concept[0]["definition_text"]: meta_single["description"] = concept[0][ "definition_text"].encode("utf-8") else: meta_single["description"] = "" if len(meta_single["description"]) > 600: meta_single["description"] = "%s..." 
% meta_single[ "description"][0:600] meta_data[node] = meta_single ## STEP 2: VISUALIZATION WITH PYBRAINCOMPARE from pybraincompare.ontology.tree import named_ontology_tree_from_tsv, make_ontology_tree_d3 # First let's look at the tree structure # output_json = "%s/task_contrast_tree.json" % outfolder tree = named_ontology_tree_from_tsv(relationship_table, output_json=None, meta_data=meta_data) html_snippet = make_ontology_tree_d3(tree) web_folder = base make_analysis_web_folder(html_snippet, web_folder) # To get a dump of just the tree (for use in more advanced custom web interface) filey = open('%s/reverseinference.json' % base, 'wb') filey.write( json.dumps(tree, sort_keys=True, indent=4, separators=(',', ': '))) filey.close() ## STEP 3: Export individual nodes ### Images unique_images = images.image_id.unique().tolist() # Images for s in range(0, len(unique_images)): image_id = unique_images[s] meta_data = {} meta_data["image_id"] = image_id print "Parsing data for images %s of %s" % (s, len(unique_images)) concepts = relationship_table.parent[relationship_table.name == str( image_id)].tolist() concepts = [ relationship_table.name[relationship_table.id == c].tolist()[0] for c in concepts ] concepts_ids = [ relationship_table.id[relationship_table.id == c].tolist()[0] for c in concepts ] neurovault_row = images[images.image_id == int(image_id)] collection_row = collections[collections.collection_id == neurovault_row.collection_id.tolist()[0]] collection_meta = { "DOI": collection_row["DOI"].tolist()[0], "authors": collection_row["authors"].tolist()[0], "journal": collection_row["journal_name"].tolist()[0], "url": collection_row["url"].tolist()[0], "subjects": collection_row["number_of_subjects"].tolist()[0], "smoothing_fwhm": str(collection_row["smoothing_fwhm"].tolist()[0]).encode("utf-8"), "title": collection_row["name"].tolist()[0] } meta_data["collection"] = collection_meta meta_data["url"] = neurovault_row["url"].tolist()[0] meta_data["thumbnail"] = 
neurovault_row["thumbnail"].tolist()[0] meta_data["images"] = neurovault_row["thumbnail"].tolist() meta_data["task"] = neurovault_row[ "cognitive_paradigm_cogatlas"].tolist()[0] meta_data["contrast"] = neurovault_row[ "cognitive_contrast_cogatlas"].tolist()[0] meta_data["download"] = neurovault_row["file"].tolist()[0] meta_data["concept"] = concepts meta_data["concept_id"] = concepts_ids if neurovault_row["description"].tolist()[0]: try: description = str( neurovault_row["description"].tolist()[0]).encode("utf-8") except: description = "" if description != "nan": meta_data["description"] = description else: meta_data["description"] = "" else: meta_data["description"] = "" if len(meta_data["description"]) > 600: meta_data[ "description"] = "%s..." % meta_data["description"][0:600] output_file = "%s/ri_%s.json" % (base, meta_data["image_id"]) filey = open(output_file, 'wb') filey.write( json.dumps(meta_data, sort_keys=True, indent=4, separators=(',', ': '))) filey.close() ### Concepts for node in unique_nodes: # This is a concept node if not re.search("node_", node): if node != "1": relationship_table_row = relationship_table[ relationship_table.id == node] concept = get_concept(id=node).json meta_single = {} children_nodes = [ relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x] == node ] while len( [x for x in children_nodes if not isinstance(x, int)]) > 0: new_parent_nodes = [ x for x in children_nodes if not isinstance(x, int) ] children_nodes = [ x for x in children_nodes if x not in new_parent_nodes ] for new_parent in new_parent_nodes: node_name = relationship_table.id[ relationship_table.name == new_parent].tolist()[0] children_nodes = children_nodes + [ relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x] == node_name ] # Now only keep children that are images meta_single["images"] = images["thumbnail"][ 
images.image_id.isin(children_nodes)].tolist() meta_single["image_list"] = children_nodes # Cognitive Atlas meta data meta_single[ "url"] = "http://www.cognitiveatlas.org/term/id/%s" % node meta_single["type"] = "concept" meta_single[ "thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png" meta_single["concept"] = [ relationship_table.name[relationship_table.id == node].tolist()[0] ] meta_single["task"] = "" meta_single["contrast"] = [] meta_single[ "download"] = "http://www.cognitiveatlas.org/rdf/id/%s" % node if concept[0]["definition_text"]: meta_single["description"] = concept[0][ "definition_text"].encode("utf-8") else: meta_single["description"] = "" if len(meta_single["description"]) > 600: meta_single["description"] = "%s..." % meta_single[ "description"][0:600] output_file = "%s/ri_%s.json" % (base, node) filey = open(output_file, 'wb') filey.write( json.dumps(meta_single, sort_keys=True, indent=4, separators=(',', ': '))) filey.close()
concept_acc.loc[concept,"aprime"] = aprime concept_acc.loc[concept,"miss"] = calculate_hits(Ya,Yp,1,0) concept_acc.loc[concept,"correct_rejection"] = calculate_hits(Ya,Yp,0,0) return concept_acc base_acc = get_concept_acc(predictions) forward_acc = get_concept_acc(predictions_forward) # Just compare the two for now diff_acc = forward_acc - base_acc # Add the concept names to each from cognitiveatlas.api import get_concept concept_names = [] for concept in diff_acc.index: concept_names.append(get_concept(id=concept).json[0]["name"]) base_acc["name"] = concept_names forward_acc["name"] = concept_names diff_acc["name"] = concept_names # Add the number of images number_images = [] for concept in diff_acc.index: number_images.append(Ymat.loc[:,concept].sum()) base_acc["number_images"] = number_images forward_acc["number_images"] = number_images diff_acc["number_images"] = number_images diff_acc = diff_acc.sort(columns=["hit"],ascending=False)
index_col=0) images = pandas.read_csv("data/contrast_defined_images_filtered.tsv", sep="\t", index_col=0) # source will be in rows, target in columns concept_kindof = pandas.DataFrame() concept_partof = pandas.DataFrame() concepts = contrast_lookup.columns.tolist() seen = [] while len(concepts) > 0: concept = concepts.pop(0) seen.append(concept) try: tmp = get_concept(id=concept).json[0] if 'relationships' in tmp: for relation in tmp["relationships"]: if relation['id'] not in seen and relation[ 'id'] not in concepts: print "Adding concept %s" % (relation['id']) concepts.append(relation['id']) if relation['direction'] == "parent": if relation['relationship'] == 'kind of': concept_kindof.loc[tmp['id'], relation['id']] = 1 elif relation['relationship'] == 'part of': concept_partof.loc[tmp['id'], relation['id']] = 1 elif relation['direction'] == "child": if relation['relationship'] == 'kind of': concept_kindof.loc[relation['id'], tmp['id']] = 1 elif relation['relationship'] == 'part of':
def main(argv): parser = ArgumentParser( description= 'This program will find all *.jsonld files in the list of input' 'directories and compute the frequency of use of isAbout concepts. ' 'The frequency table will be exported as a markdown table for use in' 'web documents or GitHub README markdown files. ') parser.add_argument('-jsonld', dest='jsonld', nargs='+', default=[], required=True, help="space separated list" "of directories to evaluate for jsonld files.") parser.add_argument('-outfile', dest='outfile', required=True, help="Output file for markdown table, full path") parser.add_argument( '-jsonld_output_dir', dest='jsonld_output_dir', required=True, help="This is a directory" "where we'll store the concept json-ld files using the NIDM-Terms properties" ) args = parser.parse_args() isAbout_terms = {} total_concept_count = 0 # download context file for json-ld files of concepts used # try to open the url and get the pointed to file try: # open url and get file opener = url.urlopen(CONTEXT) # write temporary file to disk and use for stats temp = tempfile.NamedTemporaryFile(delete=False) temp.write(opener.read()) temp.close() context_file = temp.name except: print("ERROR! 
Can't open url: %s" % CONTEXT) exit() # load downloaded context with open(context_file) as context_data: context = json.load(context_data) # Retrieve cognitive atlas concepts and disorders cogatlas_concepts = get_concept(silent=True) cogatlas_disorders = get_disorder(silent=True) # for each input directory for direct in args.jsonld: # find *.jsonld files files = glob2.glob(direct + '/**/*.jsonld', recursive=True) # loop through each file and get isAbout terms for file in files: # read file with json # open the file as a dictionary print("opening file: %s" % file) with open(file) as dct: json_tmp = json.load(dct) if type(json_tmp['terms']) is dict: # for each key (term) in jsonld file, check for isAbout property for term in json_tmp['terms'].keys(): # expanded = jsonld.expand(json_tmp[term]) # for jsonld files with only a single term we have a simple dictionary where the term label isn't # the highest-level key so we handle differently if term == 'isAbout': if isinstance(json_tmp['terms'][term], list): # if not a dictionary then a list of dictionaries for isabout_entry in json_tmp['terms'][term][ 'isAbout']: # add concept to dictionary total_concept_count = add_to_dict( id, isabout_entry, isAbout_terms, total_concept_count) if INTERLEX_URI_FRAG in isabout_entry['@id']: # for storing concept as json-ld file concept_jsonld = get_interlex_concept_properties( isabout_entry['@id'], context) elif (COGATLAS_URI_FRAG in isabout_entry['@id']) and ( "task" not in isabout_entry['@id']): concept_jsonld = get_cogatlas_properties( isabout_entry, context, cogatlas_concepts, cogatlas_disorders) elif "task" in isabout_entry['@id']: concept_jsonld = get_cogatlas_task_properties( isabout_entry, context) else: continue # write concept jsonld file write_jsonld(concept_jsonld, args.jsonld_output_dir) # else it's a dictionary with a single isAbout entry else: total_concept_count = add_to_dict( id, json_tmp['terms'][term], isAbout_terms, total_concept_count) if INTERLEX_URI_FRAG in 
json_tmp['terms'][term]: # for storing concept as json-ld file concept_jsonld = get_interlex_concept_properties( json_tmp['terms'][term]['@id'], context) elif COGATLAS_URI_FRAG in json_tmp['terms'][term]['@id'] and \ ("task" not in json_tmp['terms'][term]['@id']): concept_jsonld = get_cogatlas_properties( json_tmp['terms'][term], context, cogatlas_concepts, cogatlas_disorders) elif "task" in json_tmp['terms'][term]: concept_jsonld = get_cogatlas_task_properties( json_tmp['terms'][term], context) else: continue # write concept jsonld file write_jsonld(concept_jsonld, args.jsonld_output_dir) elif type(json_tmp['terms']) is list: for term in json_tmp['terms']: # expanded = jsonld.expand(json_tmp[term]) # for jsonld files with only a single term we have a simple dictionary where the term label isn't # the highest-level key so we handle differently for property in term: if property == 'isAbout': # for each concept in isAbout property if isinstance(term[property], list): for isabout_entry in term[property]: total_concept_count = add_to_dict( id, isabout_entry, isAbout_terms, total_concept_count) if INTERLEX_URI_FRAG in isabout_entry[ '@id']: # for storing concept as json-ld file concept_jsonld = get_interlex_concept_properties( isabout_entry['@id'], context) elif COGATLAS_URI_FRAG in isabout_entry['@id'] and \ ("task" not in isabout_entry['@id']): concept_jsonld = get_cogatlas_properties( isabout_entry, context, cogatlas_concepts, cogatlas_disorders) elif "task" in isabout_entry['@id']: concept_jsonld = get_cogatlas_task_properties( isabout_entry['@id'], isabout_entry['label'], context) else: continue # write concept jsonld file write_jsonld(concept_jsonld, args.jsonld_output_dir) else: total_concept_count = add_to_dict( id, term[property], isAbout_terms, total_concept_count) if INTERLEX_URI_FRAG in term[property]['@id']: # for storing concept as json-ld file concept_jsonld = get_interlex_concept_properties( term[property]['@id'], context) elif COGATLAS_URI_FRAG in 
term[property]['@id'] and \ ("task" not in term[property]['@id']): concept_jsonld = get_cogatlas_properties( term[property], context, cogatlas_concepts, cogatlas_disorders) elif "task" in term[property]['@id']: concept_jsonld = get_cogatlas_task_properties( term[property]['@id'], term[property]['label'], context) else: continue # write concept jsonld file write_jsonld(concept_jsonld, args.jsonld_output_dir) # open markdown txt file md_file = open(args.outfile, "w") ## Added by NQ to test GitHub Actions print('opening output file in', args.outfile) # set up header of table md_file.write("| concept URL | label | use frequency (%) |\n") md_file.write("| ----------- | ----- | ----------------- |\n") # now cycle through isAbout_terms dictionary and compute frequencies for key in isAbout_terms.keys(): isAbout_terms[key]['freq'] = (isAbout_terms[key]['count'] / total_concept_count) * 100.0 res = OrderedDict( sorted(isAbout_terms.items(), key=lambda x: getitem(x[1], 'freq'), reverse=True)) # write markdown table sorted for key in res.keys(): # add to markdown table file md_file.write("| %s | %s | %f |\n" % (key, res[key]['label'], res[key]['freq'])) ##Added by NQ to show that the code finished running print('File has been successfully written in', md_file) md_file.close() # if a single-file jsonld file already exists than add these terms to it else create a new one output_dir = os.path.split(args.jsonld_output_dir)[0] if isfile(join(output_dir, "NIDM_Concepts.jsonld")): cmd = "python " + join(sys.path[0], "combinebidsjsonld.py") + " -inputDir " + args.jsonld_output_dir + " -outputDir " + \ join(output_dir, "NIDM_Concepts.jsonld") + " -association \"NIDM\"" + " -jsonld " + \ join(output_dir, "NIDM_Concepts.jsonld") else: cmd = "python " + join(sys.path[0], "combinebidsjsonld.py") + " -inputDir " + args.jsonld_output_dir + " -outputDir " + \ join(output_dir, "NIDM_Concepts.jsonld") + " -association \"NIDM\"" print(cmd) system(cmd)
#!/usr/bin/python
# Build a coordinate plane of Cognitive Atlas concepts: each concept gets an
# (X, Y) grid position plus its name, definition, and id, written to data.tsv
# for downstream visualization.
import pandas

data_file = "data.tsv"
from cognitiveatlas.api import get_concept

# Retrieve concepts from the cognitive atlas
concepts = get_concept().pandas
concept_names = concepts["name"].tolist()
concept_definition = concepts["definition_text"].tolist()
# NOTE: .encode() yields bytes on Python 3; kept as-is to preserve output format.
concept_definition = [d.encode("utf-8") for d in concept_definition]

df = pandas.DataFrame(columns=["X", "Y", "concept", "definition", "id"])
nrows = 30  # nrows and ncol for ~900 points
# Build a 30x30 grid of coordinates. list() keeps sequence repetition working
# on Python 3, where range() returns a lazy object (original was Python 2 only).
x = list(range(1, 31)) * 30
y = []
for i in range(1, 31):
    y = y + [i] * 30
# The original had a bare `len(y)==len(x)` no-op expression; make the
# intended sanity check real.
assert len(y) == len(x)
# Only the first 810 grid points are used (27 full rows of 30).
df.X = x[0:810]
df.Y = y[0:810]
df.id = concepts["id"].tolist()
df.concept = concept_names
df.definition = concept_definition
df.to_csv(data_file, sep="\t", index=False)
if predicted == actual: concepts_df.loc[actual_concepts,"correct"] = concepts_df.loc[actual_concepts,"correct"] + predicted_count else: concepts_df.loc[actual_concepts,"incorrect"] = concepts_df.loc[actual_concepts,"incorrect"] + predicted_count # Add the number of images for concept_name in labels.columns: number_images = labels[concept_name][labels[concept_name]==1].shape[0] concepts_df.loc[concept_name,"number_images"] = number_images concepts_df.to_csv("%s/classification_concept_confusion_cogatid_perform.tsv" %results,sep="\t") # Replace concept ids with concept names conceptnames = [] for conceptname in concepts_df.index: conceptnames.append(get_concept(id=conceptname).json[0]["name"]) concepts_df.index = conceptnames concepts_df.to_csv("%s/classification_concept_confusion_perform.tsv" %results,sep="\t") # Normalize by the row count (to see what percentage of the time we get it wrong/right) concepts_df_norm = pandas.DataFrame(columns=["correct","incorrect","number_images"]) for row in concepts_df.iterrows(): rowsum = row[1][0:2].sum() if rowsum != 0: norm_values = [float(x)/rowsum for x in row[1].tolist()[0:2]] norm_values.append(concepts_df.loc[row[0],"number_images"]) concepts_df_norm.loc[row[0]] = norm_values concepts_df_norm.sort(columns=["correct"],ascending=False,inplace=True) concepts_df_norm.to_csv("%s/classification_concept_confusion_norm_perform.tsv" %results,sep="\t")
#!/usr/bin/python
# Build a coordinate plane of Cognitive Atlas concepts: each concept gets an
# (X, Y) grid position plus its name, definition, and id, written to data.tsv
# for downstream visualization.
import pandas

data_file = "data.tsv"
from cognitiveatlas.api import get_concept

# Retrieve concepts from the cognitive atlas
concepts = get_concept().pandas
concept_names = concepts["name"].tolist()
concept_definition = concepts["definition_text"].tolist()
# NOTE: .encode() yields bytes on Python 3; kept as-is to preserve output format.
concept_definition = [d.encode("utf-8") for d in concept_definition]

df = pandas.DataFrame(columns=["X", "Y", "concept", "definition", "id"])
nrows = 30  # nrows and ncol for ~900 points
# Build a 30x30 grid of coordinates. list() keeps sequence repetition working
# on Python 3, where range() returns a lazy object (original was Python 2 only).
x = list(range(1, 31)) * 30
y = []
for i in range(1, 31):
    y = y + [i] * 30
# The original had a bare `len(y) == len(x)` no-op expression; make the
# intended sanity check real.
assert len(y) == len(x)
# Only the first 810 grid points are used (27 full rows of 30).
df.X = x[0:810]
df.Y = y[0:810]
df.id = concepts["id"].tolist()
df.concept = concept_names
df.definition = concept_definition
df.to_csv(data_file, sep="\t", index=False)
from cognitiveatlas.api import get_concept, get_task
from py2neo import Graph, Path, Node, Rel, authenticate
import os

# Pull all concepts and tasks from the Cognitive Atlas API; the .pandas view
# gives column-wise access to ids and names.
concepts = get_concept()
concept_ids = concepts.pandas.id.tolist()
concept_names = concepts.pandas.name.tolist()
tasks = get_task()
task_ids = tasks.pandas.id.tolist()
task_names = tasks.pandas.name.tolist()

# Collect contrasts from each task. The three lists stay index-aligned:
# owning task uid, contrast uid, and contrast text.
contrast_ids = []
contrast_names = []
contrast_tasks = []
for t in tasks.json:
    task = get_task(id=t["id"])  # one API call per task for full detail
    contrasts = task.json[0]["contrasts"]
    for contrast in contrasts:
        contrast_tasks.append(t["id"])
        contrast_ids.append(contrast["id"])
        contrast_names.append(contrast["contrast_text"])

# Read the Neo4j password from a local file. Use a context manager so the
# handle is closed (the original `open(...).readline()` leaked it).
with open('neo4j_pw') as pw_file:
    pw = pw_file.readline().strip()
authenticate("localhost:7474", "neo4j", pw)

# connect to authenticated graph database
graph = Graph()
def download_cognitive_atlas(data_dir=None, overwrite=False, verbose=1):
    """Download Cognitive Atlas ontology and extract IDs and relationships.

    .. versionadded:: 0.0.2

    Parameters
    ----------
    data_dir : :obj:`str`, optional
        Location in which to place Cognitive Atlas files.
        Default is None, which uses the package's default path for downloaded data.
    overwrite : :obj:`bool`, optional
        Whether to overwrite existing files or not. Default is False.
    verbose : :obj:`int`, optional
        Default is 1.

    Returns
    -------
    out_dict : :obj:`dict`
        Dictionary with two keys: 'ids' and 'relationships'. Each points to a csv file.
        The 'ids' file contains CogAt identifiers, canonical names, and aliases,
        sorted by alias length (number of characters).
        The 'relationships' file contains associations between CogAt items,
        with three columns: input, output, and rel_type (relationship type).
    """
    # Deferred import — presumably so cognitiveatlas is only required when
    # this downloader is actually invoked (TODO confirm it is optional).
    from cognitiveatlas.api import get_concept, get_disorder, get_task

    dataset_name = "cognitive_atlas"
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose)
    ids_file = op.join(data_dir, "cogat_aliases.csv")
    rels_file = op.join(data_dir, "cogat_relationships.csv")
    # Only hit the API when forced or when either output file is missing;
    # otherwise the cached csv paths are returned directly.
    if overwrite or not all([op.isfile(f) for f in [ids_file, rels_file]]):
        concepts = get_concept(silent=True).pandas
        tasks = get_task(silent=True).pandas
        disorders = get_disorder(silent=True).pandas

        # Identifiers and aliases
        long_concepts = _longify(concepts)
        long_tasks = _longify(tasks)

        # Disorders currently lack aliases, so use the lowercased name as alias
        disorders["name"] = disorders["name"].str.lower()
        disorders = disorders.assign(alias=disorders["name"])
        disorders = disorders[["id", "name", "alias"]]

        # Combine into aliases DataFrame, dropping empty/missing aliases
        aliases = pd.concat((long_concepts, long_tasks, disorders), axis=0)
        aliases = _expand_df(aliases)
        aliases = aliases.replace("", np.nan)
        aliases = aliases.dropna(axis=0)
        aliases = aliases.reset_index(drop=True)

        # Relationships: each item gets a reflexive "isSelf" row, then one row
        # per API-reported relationship.
        relationship_list = []
        for i, id_ in enumerate(concepts["id"].unique()):
            if i % 100 == 0:
                # Pause every 100 requests — presumably to rate-limit calls
                # to the Cognitive Atlas API (TODO confirm necessity).
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            concept = get_concept(id=id_, silent=True).json
            for rel in concept["relationships"]:
                reltype = _get_concept_reltype(rel["relationship"], rel["direction"])
                if reltype is not None:
                    row = [id_, rel["id"], reltype]
                    relationship_list.append(row)
        for i, id_ in enumerate(tasks["id"].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            task = get_task(id=id_, silent=True).json
            # Task<->concept links are recorded in both directions.
            for rel in task["concepts"]:
                row = [id_, rel["concept_id"], "measures"]
                relationship_list.append(row)
                row = [rel["concept_id"], id_, "measuredBy"]
                relationship_list.append(row)
        for i, id_ in enumerate(disorders["id"].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            disorder = get_disorder(id=id_, silent=True).json
            for rel in disorder["disorders"]:
                # Normalize the API's "ISA" label; other relationship labels
                # pass through unchanged.
                if rel["relationship"] == "ISA":
                    rel_type = "isA"
                else:
                    rel_type = rel["relationship"]
                row = [id_, rel["id"], rel_type]
                relationship_list.append(row)

        relationships = pd.DataFrame(columns=["input", "output", "rel_type"],
                                     data=relationship_list)

        # Concept -> concept-class membership rows ("inCategory"), with empty
        # classes dropped, prepended to the relationship table.
        ctp_df = concepts[["id", "id_concept_class"]]
        ctp_df = ctp_df.assign(rel_type="inCategory")
        ctp_df.columns = ["input", "output", "rel_type"]
        ctp_df["output"].replace("", np.nan, inplace=True)
        ctp_df.dropna(axis=0, inplace=True)

        relationships = pd.concat((ctp_df, relationships))
        relationships = relationships.reset_index(drop=True)
        aliases.to_csv(ids_file, index=False)
        relationships.to_csv(rels_file, index=False)
    out_dict = {"ids": ids_file, "relationships": rels_file}
    return out_dict
# Fetch every task uid, then the complete record for each task; only tasks
# that actually define contrasts are kept.
task_uids = [task["id"] for task in get_task().json]
contrasts = dict()  # contrast lookup by task uid
# Now we can retrieve the full data. We are interested in contrasts, so let's save those.
for task in task_uids:
    if task not in contrasts:
        # One API call per task; .json is a list whose first entry is the record.
        task_complete = get_task(task).json[0]
        # Only save if we have contrasts
        if len(task_complete["contrasts"]) > 0:
            contrasts[task] = task_complete["contrasts"]
# How many tasks have contrasts? (bare expression — only informative in an
# interactive session)
len(contrasts)
# 437
# Step 3: Make a contrast --> concept lookup
concepts = dict()
# NOTE(review): .iteritems() makes this script Python 2 only.
for task_uid,contrast_set in contrasts.iteritems():
    for contrast in contrast_set:
        contrast_uid = contrast["id"]
        if contrast_uid not in concepts:
            try:
                # Some calls don't work
                concepts[contrast_uid] = get_concept(contrast_id=contrast_uid).json[0]
            except:
                # Best-effort: contrasts with no retrievable concept are skipped.
                pass
# How many concepts are asserted to measure different contrasts? (bare
# expression — only informative in an interactive session)
len(concepts)
def get_concepts():
    """Return the full list of Cognitive Atlas concepts as parsed JSON."""
    response = get_concept()
    return response.json
def main():
    """Build the reverse-inference web visualization data.

    Pulls contrast-tagged images from NeuroVault, cross-references their
    contrasts against the Cognitive Atlas, builds a concept/image
    relationship tree, renders it with pybraincompare, and exports one
    JSON meta-data file per image and per concept node under `base`.
    Requires network access (NeuroVault and Cognitive Atlas APIs) and
    write access to the working directory.
    """
    base = "data/"

    # Make a folder for mean images
    if not os.path.exists("mr"):
        os.mkdir("mr")

    # Get Neurovault Images with defined cognitive atlas contrast
    collections = get_collections()

    # Filter images to those that have a DOI
    collections = collections[collections.DOI.isnull()==False]

    # Get image meta data for collections (N=1023)
    images = get_images(collection_pks=collections.collection_id.tolist())

    # Filter images to those with contrasts defined (N=98)
    images = images[images.cognitive_contrast_cogatlas_id.isnull()==False]

    # Get rid of any not in MNI
    images = images[images.not_mni == False]

    # Get rid of thresholded images
    images = images[images.is_thresholded == False]

    ### Step 1: Load meta data sources
    unique_contrasts = images.cognitive_contrast_cogatlas_id.unique().tolist()

    # Images that do not match the correct identifier will not be used (eg, "Other")
    expression = re.compile("cnt_*")
    unique_contrasts = [u for u in unique_contrasts if expression.match(u)]

    # Make sure exists in cognitive atlas: keep only contrasts the API resolves
    existing_contrasts = []
    for u in unique_contrasts:
        try:
            tmp = get_concept(contrast_id=u,silent=True)
            existing_contrasts.append(u)
        except:
            print "%s is defined in NeuroVault, does not exist in Cognitive Atlas" %u

    # Map each existing contrast id --> list of NeuroVault image ids tagged with it
    image_lookup = dict()
    for u in existing_contrasts:
        image_lookup[u] = images.image_id[images.cognitive_contrast_cogatlas_id==u].tolist()

    # Create a data structure of tasks and contrasts for our analysis
    relationship_table = concept_node_triples(image_dict=image_lookup,save_to_file=False)

    unique_nodes = relationship_table.id.unique().tolist()

    # We will store a data frame of meta data
    # Lookup for meta_data is the id of the node!
    meta_data = {}
    for node in unique_nodes:
        meta_single = {}
        # This is an image node (image node ids carry a "node_" prefix)
        if re.search("node_",node):
            print "Found image node!"
            relationship_table_row = relationship_table[relationship_table.id==node]
            image_id = relationship_table_row.name.tolist()[0]
            meta_single["category"] = ""
            meta_single["type"] = "nii"
            # NeuroVault metadata
            concepts = relationship_table.parent[relationship_table.name == image_id]
            concepts = [relationship_table.name[relationship_table.id==c].tolist()[0] for c in concepts]
            neurovault_row = images[images.image_id == int(image_id)]
            collection_row = collections[collections.collection_id == neurovault_row.collection_id.tolist()[0]]
            # NOTE(review): collection_meta is built here but never attached to
            # meta_single; the per-image export in Step 3 is where it is used.
            # Possibly dead code -- confirm before removing.
            collection_meta = {"DOI":collection_row["DOI"].tolist()[0],
                               "authors":collection_row["authors"].tolist()[0],
                               "journal":collection_row["journal_name"].tolist()[0],
                               "url":collection_row["url"].tolist()[0],
                               "subjects":collection_row["number_of_subjects"].tolist()[0],
                               "smoothing_fwhm":str(collection_row["smoothing_fwhm"].tolist()[0]).encode("utf-8")}
            meta_single["url"] = neurovault_row["url"].tolist()[0]
            meta_single["thumbnail"] = neurovault_row["thumbnail"].tolist()[0]
            meta_single["images"] = neurovault_row["thumbnail"].tolist()
            meta_single["task"] = neurovault_row["cognitive_paradigm_cogatlas"].tolist()[0]
            meta_single["contrast"] = neurovault_row["cognitive_contrast_cogatlas"].tolist()[0]
            meta_single["download"] = neurovault_row["file"].tolist()[0]
            meta_single["concept"] = concepts
            if neurovault_row["description"].tolist()[0]:
                meta_single["description"] = str(neurovault_row["description"].tolist()[0]).encode("utf-8")
            else:
                meta_single["description"] = ""
            # Truncate long descriptions for display
            if len(meta_single["description"]) > 600:
                meta_single["description"] = "%s..." % meta_single["description"][0:600]
        else: # A concept node
            if node != "1":
                relationship_table_row = relationship_table[relationship_table.id==node]
                concept = get_concept(id=node,silent=True).json
                # Collect all descendants: integer entries are image ids,
                # string entries are child concept nodes still to be expanded.
                children_nodes = [relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x]==node]
                while len([x for x in children_nodes if not isinstance(x,int)]) > 0:
                    new_parent_nodes = [x for x in children_nodes if not isinstance(x,int)]
                    children_nodes = [x for x in children_nodes if x not in new_parent_nodes]
                    for new_parent in new_parent_nodes:
                        node_name = relationship_table.id[relationship_table.name==new_parent].tolist()[0]
                        children_nodes = children_nodes + [relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x]==node_name]
                # Now only keep children that are images
                meta_single["images"] = images["thumbnail"][images.image_id.isin(children_nodes)].tolist()
                # Cognitive Atlas meta data
                meta_single["url"] = "http://www.cognitiveatlas.org/term/id/%s" %node
                meta_single["type"] = "concept"
                meta_single["thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png"
                meta_single["concept"] = [relationship_table.name[relationship_table.id==node].tolist()[0]]
                meta_single["task"] = ""
                meta_single["contrast"] = []
                meta_single["download"] = "http://www.cognitiveatlas.org/rdf/id/%s" %node
                if concept[0]["definition_text"]:
                    meta_single["description"] = concept[0]["definition_text"].encode("utf-8")
                else:
                    meta_single["description"] = ""
                if len(meta_single["description"]) > 600:
                    meta_single["description"] = "%s..." % meta_single["description"][0:600]
        meta_data[node] = meta_single

    ## STEP 2: VISUALIZATION WITH PYBRAINCOMPARE
    from pybraincompare.ontology.tree import named_ontology_tree_from_tsv, make_ontology_tree_d3

    # First let's look at the tree structure
    # output_json = "%s/task_contrast_tree.json" % outfolder
    tree = named_ontology_tree_from_tsv(relationship_table,output_json=None,meta_data=meta_data)
    html_snippet = make_ontology_tree_d3(tree)
    web_folder = base
    make_analysis_web_folder(html_snippet,web_folder)

    # To get a dump of just the tree (for use in more advanced custom web interface)
    filey = open('%s/reverseinference.json' %base,'wb')
    filey.write(json.dumps(tree, sort_keys=True,indent=4, separators=(',', ': ')))
    filey.close()

    ## STEP 3: Export individual nodes
    ### Images
    unique_images = images.image_id.unique().tolist()

    # Images
    for s in range(0,len(unique_images)):
        image_id = unique_images[s]
        # NOTE: `meta_data` (the tree lookup built above) is rebound here for
        # each image; the tree has already been rendered so this is safe,
        # but the name reuse is confusing.
        meta_data = {}
        meta_data["image_id"] = image_id
        print "Parsing data for images %s of %s" %(s,len(unique_images))
        concepts = relationship_table.parent[relationship_table.name == str(image_id)].tolist()
        concepts = [relationship_table.name[relationship_table.id==c].tolist()[0] for c in concepts]
        # NOTE(review): at this point `concepts` holds names, so filtering on
        # relationship_table.id==c looks wrong (likely intended name==c) --
        # confirm against the relationship_table schema.
        concepts_ids = [relationship_table.id[relationship_table.id==c].tolist()[0] for c in concepts]
        neurovault_row = images[images.image_id == int(image_id)]
        collection_row = collections[collections.collection_id == neurovault_row.collection_id.tolist()[0]]
        collection_meta = {"DOI":collection_row["DOI"].tolist()[0],
                           "authors":collection_row["authors"].tolist()[0],
                           "journal":collection_row["journal_name"].tolist()[0],
                           "url":collection_row["url"].tolist()[0],
                           "subjects":collection_row["number_of_subjects"].tolist()[0],
                           "smoothing_fwhm":str(collection_row["smoothing_fwhm"].tolist()[0]).encode("utf-8"),
                           "title":collection_row["name"].tolist()[0]}
        meta_data["collection"] = collection_meta
        meta_data["url"] = neurovault_row["url"].tolist()[0]
        meta_data["thumbnail"] = neurovault_row["thumbnail"].tolist()[0]
        meta_data["images"] = neurovault_row["thumbnail"].tolist()
        meta_data["task"] = neurovault_row["cognitive_paradigm_cogatlas"].tolist()[0]
        meta_data["contrast"] = neurovault_row["cognitive_contrast_cogatlas"].tolist()[0]
        meta_data["download"] = neurovault_row["file"].tolist()[0]
        meta_data["concept"] = concepts
        meta_data["concept_id"] = concepts_ids
        if neurovault_row["description"].tolist()[0]:
            try:
                description = str(neurovault_row["description"].tolist()[0]).encode("utf-8")
            except:
                description = ""
            # A pandas NaN stringifies to "nan"; treat it as no description
            if description != "nan":
                meta_data["description"] = description
            else:
                meta_data["description"] = ""
        else:
            meta_data["description"] = ""
        if len(meta_data["description"]) > 600:
            meta_data["description"] = "%s..." % meta_data["description"][0:600]
        output_file = "%s/ri_%s.json" %(base,meta_data["image_id"])
        filey = open(output_file,'wb')
        filey.write(json.dumps(meta_data, sort_keys=True,indent=4, separators=(',', ': ')))
        filey.close()

    ### Concepts
    for node in unique_nodes:
        # This is a concept node (no "node_" prefix, and "1" is the root)
        if not re.search("node_",node):
            if node != "1":
                relationship_table_row = relationship_table[relationship_table.id==node]
                concept = get_concept(id=node).json
                meta_single = {}
                # Same descendant expansion as in Step 1: strings are child
                # concepts to expand, ints are image ids.
                children_nodes = [relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x]==node]
                while len([x for x in children_nodes if not isinstance(x,int)]) > 0:
                    new_parent_nodes = [x for x in children_nodes if not isinstance(x,int)]
                    children_nodes = [x for x in children_nodes if x not in new_parent_nodes]
                    for new_parent in new_parent_nodes:
                        node_name = relationship_table.id[relationship_table.name==new_parent].tolist()[0]
                        children_nodes = children_nodes + [relationship_table.name.tolist()[x] for x in range(relationship_table.shape[0]) if relationship_table.parent.tolist()[x]==node_name]
                # Now only keep children that are images
                meta_single["images"] = images["thumbnail"][images.image_id.isin(children_nodes)].tolist()
                meta_single["image_list"] = children_nodes
                # Cognitive Atlas meta data
                meta_single["url"] = "http://www.cognitiveatlas.org/term/id/%s" %node
                meta_single["type"] = "concept"
                meta_single["thumbnail"] = "http://www.cognitiveatlas.org/images/logo-front.png"
                meta_single["concept"] = [relationship_table.name[relationship_table.id==node].tolist()[0]]
                meta_single["task"] = ""
                meta_single["contrast"] = []
                meta_single["download"] = "http://www.cognitiveatlas.org/rdf/id/%s" %node
                if concept[0]["definition_text"]:
                    meta_single["description"] = concept[0]["definition_text"].encode("utf-8")
                else:
                    meta_single["description"] = ""
                if len(meta_single["description"]) > 600:
                    meta_single["description"] = "%s..." % meta_single["description"][0:600]
                output_file = "%s/ri_%s.json" %(base,node)
                filey = open(output_file,'wb')
                filey.write(json.dumps(meta_single, sort_keys=True,indent=4, separators=(',', ': ')))
                filey.close()
# Bug with getting contrasts for images: if idx == 109: unique_concepts[idx] = [ "trm_567982752ff4a", "trm_4a3fd79d0afcf", "trm_5534111a8bc96", "trm_557b48a224b95", "trm_557b4a81a4a17", "trm_4a3fd79d0b64e", "trm_4a3fd79d0a33b", "trm_557b4a7315f1b", "trm_4a3fd79d0af71", "trm_557b4b56de455", "trm_557b4add1837e" ] elif idx == 118: unique_concepts[idx] = [ "trm_4a3fd79d0b642", "trm_4a3fd79d0a33b", "trm_557b4a7315f1b", "trm_4a3fd79d0af71", "trm_557b4b56de455" ] else: contrast = row[1].cognitive_contrast_cogatlas_id concepts = get_concept(contrast_id=contrast) concepts = numpy.unique(concepts.pandas.id).tolist() unique_concepts[idx] = concepts all_concepts = [] for image_id, concepts in unique_concepts.iteritems(): for concept in concepts: if concept not in all_concepts: all_concepts.append(concept) res = { "all_concepts": all_concepts, "unique_concepts": unique_concepts, "images": images }
# Let's make a concept by concept data frame contrast_lookup = pandas.read_csv("data/contrast_by_concept_binary_df.tsv",sep="\t",index_col=0) images = pandas.read_csv("data/contrast_defined_images_filtered.tsv",sep="\t",index_col=0) # source will be in rows, target in columns concept_kindof = pandas.DataFrame() concept_partof = pandas.DataFrame() concepts = contrast_lookup.columns.tolist() seen = [] while len(concepts) > 0: concept = concepts.pop(0) seen.append(concept) try: tmp = get_concept(id=concept).json[0] if 'relationships' in tmp: for relation in tmp["relationships"]: if relation['id'] not in seen and relation['id'] not in concepts: print "Adding concept %s" %(relation['id']) concepts.append(relation['id']) if relation['direction'] == "parent": if relation['relationship'] == 'kind of': concept_kindof.loc[tmp['id'],relation['id']] = 1 elif relation['relationship'] == 'part of': concept_partof.loc[tmp['id'],relation['id']] = 1 elif relation['direction'] == "child": if relation['relationship'] == 'kind of': concept_kindof.loc[relation['id'],tmp['id']] = 1 elif relation['relationship'] == 'part of': concept_partof.loc[relation['id'],tmp['id']] = 1
def pull_ontology(out_dir='auto', overwrite=False):
    """
    Download Cognitive Atlas ontology and combine Concepts, Tasks, and
    Disorders to create ID and relationship DataFrames.

    Parameters
    ----------
    out_dir : str
        Directory for the cached CSVs. 'auto' resolves to
        <resource_path>/ontology; anything else is made absolute.
    overwrite : bool
        If True, re-download from the API even when both cache files exist.

    Returns
    -------
    (id_df, rel_df) : tuple of pandas.DataFrame
        id_df: one row per id/name/alias; rel_df: columns
        input/output/rel_type describing ontology relationships.
    """
    if out_dir == 'auto':
        out_dir = op.join(get_resource_path(), 'ontology')
    else:
        out_dir = op.abspath(out_dir)

    ids_file = op.join(out_dir, 'cogat_ids.csv')
    rels_file = op.join(out_dir, 'cogat_relationships.csv')
    # Only hit the API if the cache is missing or a refresh was requested
    if overwrite or not all([op.isfile(f) for f in [ids_file, rels_file]]):
        concepts = get_concept(silent=True).pandas
        tasks = get_task(silent=True).pandas
        disorders = get_disorder(silent=True).pandas

        # Identifiers and aliases
        long_concepts = _longify(concepts)
        long_tasks = _longify(tasks)

        # Disorders currently lack aliases
        disorders['name'] = disorders['name'].str.lower()
        disorders = disorders.assign(alias=disorders['name'])
        disorders = disorders[['id', 'name', 'alias']]

        # Combine into id_df
        id_df = pd.concat((long_concepts, long_tasks, disorders), axis=0)
        id_df = _expand_df(id_df)
        id_df = id_df.replace('', np.nan)
        id_df = id_df.dropna(axis=0)
        id_df = id_df.reset_index(drop=True)

        # Relationships: every term relates to itself ('isSelf'), plus the
        # typed relationships reported by the API for each term.
        relationships = []
        for i, id_ in enumerate(concepts['id'].unique()):
            # Pause every 100 requests -- presumably to rate-limit the API;
            # confirm against Cognitive Atlas usage policy.
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, 'isSelf']
            relationships.append(row)
            concept = get_concept(id=id_, silent=True).json
            for rel in concept['relationships']:
                reltype = _get_concept_reltype(rel['relationship'],
                                               rel['direction'])
                if reltype is not None:
                    row = [id_, rel['id'], reltype]
                    relationships.append(row)

        for i, id_ in enumerate(tasks['id'].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, 'isSelf']
            relationships.append(row)
            task = get_task(id=id_, silent=True).json
            # Task/concept links are recorded in both directions
            for rel in task['concepts']:
                row = [id_, rel['concept_id'], 'measures']
                relationships.append(row)
                row = [rel['concept_id'], id_, 'measuredBy']
                relationships.append(row)

        for i, id_ in enumerate(disorders['id'].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, 'isSelf']
            relationships.append(row)
            disorder = get_disorder(id=id_, silent=True).json
            for rel in disorder['disorders']:
                # Normalize the API's 'ISA' spelling to camelCase 'isA'
                if rel['relationship'] == 'ISA':
                    rel_type = 'isA'
                else:
                    rel_type = rel['relationship']
                row = [id_, rel['id'], rel_type]
                relationships.append(row)

        rel_df = pd.DataFrame(columns=['input', 'output', 'rel_type'],
                              data=relationships)
        # Concept --> concept-class membership rows ('inCategory')
        ctp_df = concepts[['id', 'id_concept_class']]
        ctp_df = ctp_df.assign(rel_type='inCategory')
        ctp_df.columns = ['input', 'output', 'rel_type']
        # Drop concepts with no class assignment
        ctp_df['output'].replace('', np.nan, inplace=True)
        ctp_df.dropna(axis=0, inplace=True)

        rel_df = pd.concat((ctp_df, rel_df))
        rel_df = rel_df.reset_index(drop=True)
        # Cache for subsequent calls
        id_df.to_csv(ids_file, index=False)
        rel_df.to_csv(rels_file, index=False)
    else:
        id_df = pd.read_csv(ids_file)
        rel_df = pd.read_csv(rels_file)

    return id_df, rel_df
from cognitiveatlas.api import get_concept from nltk.corpus import stopwords from nltk.stem.porter import * from nltk.stem import * import pandas import numpy import json import nltk import re paragraphs = json.load(open("william_james.json","rb")) # Get all cognitive atlas concepts all_concepts = get_concept().json concepts = dict() for concept in all_concepts: concepts[concept["id"]] = str(concept["name"]) # Functions to parse text def remove_nonenglish_chars(text): return re.sub("[^a-zA-Z]", " ", text) def text2sentences(text,remove_non_english_chars=True): tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') if remove_non_english_chars: text = remove_nonenglish_chars(text) for s in tokenizer.tokenize(text): yield s def processText(text): '''combines text2sentences and sentence2words'''