def test_project_fields_not_found():
    # test that things don't break if the field isn't in the project
    rest_parser = RestParser(verbosity_level=0)
    rest_parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    field = 'not_real_field'

    project = rest_parser.run(
        BRAIN_VOL_FILES,
        "/projects/{}?fields={}".format(cmu_test_project_uuid, field)
    )

    print(project)
    keys = set(project)
    assert "error" in keys

def test_uri_subjects():
    global cmu_test_subject_uuid

    restParser = RestParser()
    restParser.setOutputFormat(RestParser.OBJECT_FORMAT)
    result = restParser.run(BRAIN_VOL_FILES,
                            '/subjects/{}'.format(cmu_test_subject_uuid))

    assert isinstance(result, dict)
    assert 'uuid' in result
    assert 'instruments' in result
    assert 'derivatives' in result
    assert cmu_test_subject_uuid == result['uuid']

def test_project_fields_not_found():
    # test that things don't break if the field isn't in the project
    rest_parser = RestParser(verbosity_level=0)
    rest_parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    field = 'not_real_field'

    project = rest_parser.run(
        BRAIN_VOL_FILES,
        "/projects/{}?fields={}".format(cmu_test_project_uuid, field))

    assert 'field_values' in project
    fv = project['field_values']
    assert isinstance(fv, list)
    assert len(fv) == 0

def test_project_fields_instruments():
    rest_parser = RestParser(verbosity_level=0)
    rest_parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    field = 'AGE_AT_SCAN'

    project = rest_parser.run(
        BRAIN_VOL_FILES,
        "/projects/{}?fields={}".format(cmu_test_project_uuid, field))

    assert 'field_values' in project
    fv = project['field_values']
    assert isinstance(fv, list)
    fields_used = set(i["field"] for i in fv)
    assert field in fields_used

def test_project_fields_deriv():
    rest_parser = RestParser(verbosity_level=0)
    rest_parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    field = 'fs_000003'

    project = rest_parser.run(
        BRAIN_VOL_FILES,
        "/projects/{}?fields={}".format(cmu_test_project_uuid, field)
    )

    # edited by DBK to account for only field values being returned
    assert len(project) > 0
    fv = project
    assert isinstance(fv, list)
    fields_used = set(i.label for i in fv)
    assert 'Brain Segmentation Volume (mm^3)' in fields_used

def test_odd_isabout_uris():
    rest_parser = RestParser(verbosity_level=0)
    rest_parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    field = 'http://www.cognitiveatlas.org/ontology/cogat.owl#CAO_00962'

    fields = rest_parser.run(BRAIN_VOL_FILES, "/projects?fields={}".format(field))

    # edited by DBK to account for only field values being returned
    assert len(fields) > 0
    print(fields)
    fv = fields
    assert isinstance(fv, list)
    fields_used = set(i.label for i in fv)
    assert 'ADOS_TOTAL' in fields_used

def test_project_fields_instruments():
    rest_parser = RestParser(verbosity_level=0)
    rest_parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    field = 'AGE_AT_SCAN'

    project = rest_parser.run(
        BRAIN_VOL_FILES,
        "/projects/{}?fields={}".format(cmu_test_project_uuid, field))

    # edited by DBK to account for only field values being returned
    assert len(project) > 0
    fv = project
    assert isinstance(fv, list)
    fields_used = set(i["field"] for i in fv)
    assert field in fields_used

def test_multiple_project_fields():
    rest_parser = RestParser(verbosity_level=0)
    rest_parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    field = 'fs_000003,ilx_0100400'  # ilx_0100400 is the 'isAbout' URI for age

    fields = rest_parser.run(BRAIN_VOL_FILES, "/projects?fields={}".format(field))

    # edited by DBK to account for only field values being returned
    assert len(fields) > 0
    print(fields)
    fv = fields
    assert isinstance(fv, list)
    fields_used = set(i.label for i in fv)
    assert 'Brain Segmentation Volume (mm^3)' in fields_used
    assert 'age at scan' in fields_used

def test_rest_sub_id():
    restParser = RestParser()
    restParser.setOutputFormat(RestParser.OBJECT_FORMAT)
    result = restParser.run(ALL_FILES, '/projects/{}'.format(cmu_test_project_uuid))

    sub_id = result['subjects']['subject id'][5]
    sub_uuid = result['subjects']['uuid'][5]

    result2 = restParser.run(ALL_FILES, '/subjects/{}'.format(sub_id))
    pp = pprint.PrettyPrinter()
    pp.pprint('/subjects/{}'.format(sub_id))

    # make sure we got the same UUID when looking up by sub id
    assert result2['uuid'] == sub_uuid
    assert len(result2['instruments']) > 0

def test_project_fields_instruments():
    rest_parser = RestParser(verbosity_level=0)
    proj_uuid = cmu_test_project_uuid
    rest_parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    field = 'age at scan'

    uri = "/projects/{}?fields={}".format(proj_uuid, field)
    project = rest_parser.run(BRAIN_VOL_FILES, uri)

    # edited by DBK to account for only field values being returned
    assert len(project) > 0
    fv = project
    assert isinstance(fv, list)
    fields_used = set(i.label for i in fv)
    assert field in fields_used

def query(nidm_file_list, cde_file_list, query_file, output_file,
          get_participants, get_instruments, get_instrument_vars,
          get_dataelements, get_brainvols, get_dataelements_brainvols,
          get_fields, uri, blaze, j, verbosity):
    """
    This function provides query support for NIDM graphs.
    """
    # query result list
    results = []

    # if there is a CDE file list, seed the CDE cache
    if cde_file_list:
        getCDEs(cde_file_list.split(","))

    if blaze:
        os.environ["BLAZEGRAPH_URL"] = blaze
        print("setting BLAZEGRAPH_URL to {}".format(blaze))

    if get_participants:
        df = GetParticipantIDs(nidm_file_list.split(','), output_file=output_file)
        if output_file is None:
            print(df.to_string())
        return df
    elif get_instruments:
        # first get all project UUIDs, then iterate and get instruments,
        # appending to the output dataframe
        project_list = GetProjectsUUID(nidm_file_list.split(','))
        count = 1
        for project in project_list:
            if count == 1:
                df = GetProjectInstruments(nidm_file_list.split(','), project_id=project)
                count += 1
            else:
                df = df.append(GetProjectInstruments(nidm_file_list.split(','),
                                                     project_id=project))
        # if an output file was specified, write the dataframe to it
        if output_file is not None:
            df.to_csv(output_file)
        else:
            print(df.to_string())
    elif get_instrument_vars:
        # first get all project UUIDs, then iterate and get instrument variables,
        # appending to the output dataframe
        project_list = GetProjectsUUID(nidm_file_list.split(','))
        count = 1
        for project in project_list:
            if count == 1:
                df = GetInstrumentVariables(nidm_file_list.split(','), project_id=project)
                count += 1
            else:
                df = df.append(GetInstrumentVariables(nidm_file_list.split(','),
                                                      project_id=project))
        # if an output file was specified, write the dataframe to it
        if output_file is not None:
            df.to_csv(output_file)
        else:
            print(df.to_string())
    elif get_dataelements:
        datael = GetDataElements(nidm_file_list=nidm_file_list)
        if output_file is not None:
            datael.to_csv(output_file)
        else:
            print(datael.to_string())
    elif get_fields:
        # fields-only query; we do it with the REST API
        restParser = RestParser(verbosity_level=int(verbosity))
        if output_file is not None:
            restParser.setOutputFormat(RestParser.OBJECT_FORMAT)
            df_list = []
        else:
            restParser.setOutputFormat(RestParser.CLI_FORMAT)
        # set up a fields-query URI for each NIDM file
        for nidm_file in nidm_file_list.split(","):
            # get the project UUID
            project = GetProjectsUUID([nidm_file])
            uri = "/projects/" + project[0].toPython().split("/")[-1] + \
                  "?fields=" + get_fields
            # get the fields output from each file and concatenate
            if output_file is None:
                # just print the results
                print(restParser.run([nidm_file], uri))
            else:
                df_list.append(pd.DataFrame(restParser.run([nidm_file], uri)))
        if output_file is not None:
            # concatenate the per-file dataframes and write them to a csv file
            df = pd.concat(df_list)
            df.to_csv(output_file)
    elif uri:
        restParser = RestParser(verbosity_level=int(verbosity))
        if j:
            restParser.setOutputFormat(RestParser.JSON_FORMAT)
        elif output_file is not None:
            restParser.setOutputFormat(RestParser.OBJECT_FORMAT)
        else:
            restParser.setOutputFormat(RestParser.CLI_FORMAT)
        df = restParser.run(nidm_file_list.split(','), uri)
        if output_file is not None:
            if j:
                with open(output_file, "w+") as f:
                    f.write(dumps(df))
            else:
                # convert the object result to a dataframe and write it out
                pd.DataFrame(df).to_csv(output_file)
        else:
            print(df)
    elif get_dataelements_brainvols:
        brainvol = GetBrainVolumeDataElements(nidm_file_list=nidm_file_list)
        if output_file is not None:
            brainvol.to_csv(output_file)
        else:
            print(brainvol.to_string())
    elif get_brainvols:
        brainvol = GetBrainVolumes(nidm_file_list=nidm_file_list)
        if output_file is not None:
            brainvol.to_csv(output_file)
        else:
            print(brainvol.to_string())
    elif query_file:
        df = sparql_query_nidm(nidm_file_list.split(','), query_file, output_file)
        if output_file is None:
            print(df.to_string())
        return df
    else:
        print("ERROR: No query parameter provided. See help:")
        print()
        os.system("pynidm query --help")
        exit(1)

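# For reference, a minimal programmatic sketch of what the --get_fields branch
# above does per file. It is an illustration, not part of the CLI: it only
# reuses calls already shown in this module (GetProjectsUUID, RestParser.run,
# pd.concat); the function name and its default field are hypothetical.
def example_fields_query(nidm_files, field="AGE_AT_SCAN"):
    parser = RestParser(verbosity_level=0)
    parser.setOutputFormat(RestParser.OBJECT_FORMAT)
    frames = []
    for nidm_file in nidm_files:
        # resolve the project UUID for this file, then request the field values
        project_uuid = GetProjectsUUID([nidm_file])[0].toPython().split("/")[-1]
        frames.append(pd.DataFrame(
            parser.run([nidm_file],
                       "/projects/{}?fields={}".format(project_uuid, field))))
    # one result set per file, concatenated, as in the branch above
    return pd.concat(frames)
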
def data_aggregation():  # all data from all of the files is collected
    """
    Collect the requested variables from every NIDM file in the list.
    """
    # query result list
    results = []
    if v:  # e.g. age,sex,DX_GROUP
        print("***********************************************************************************************************")
        command = "pynidm k-means -nl " + n + " -variables \"" + v + "\" " + \
                  "-k " + str(k_num) + " -m " + cm
        print("Your command was: " + command)
        if o is not None:
            f = open(o, "w")
            f.write("Your command was " + command)
            f.close()
        verbosity = 0
        restParser = RestParser(verbosity_level=int(verbosity))
        restParser.setOutputFormat(RestParser.OBJECT_FORMAT)
        global df_list  # used in dataparsing()
        df_list = []
        # set up a fields-query URI for each NIDM file
        global file_list
        file_list = n.split(",")
        df_list_holder = {}
        for i in range(len(file_list)):
            df_list_holder[i] = []
        df_holder = {}
        for i in range(len(file_list)):
            df_holder[i] = []
        global condensed_data_holder
        condensed_data_holder = {}
        for i in range(len(file_list)):
            condensed_data_holder[i] = []

        count = 0
        not_found_count = 0
        for nidm_file in file_list:
            # get the project UUID
            project = GetProjectsUUID([nidm_file])
            # split the model into its constituent variables
            # (the variable list is comma-separated)
            global var_list
            var_list = v.split(",")
            for i in range(len(var_list)):
                # remove any leading or trailing spaces
                var_list[i] = var_list[i].strip()
            global vars  # used in dataparsing()
            vars = ""
            for i in range(len(var_list) - 1, -1, -1):
                if "*" not in var_list[i]:
                    vars = vars + var_list[i] + ","
                else:
                    # drop interaction (star) terms from the columns about to be pulled
                    print("Interacting variables are not present in clustering models. They will be removed.")
            vars = vars[0:len(vars) - 1]
            uri = "/projects/" + project[0].toPython().split("/")[-1] + \
                  "?fields=" + vars
            # get the fields output from each file and concatenate
            df_list_holder[count].append(
                pd.DataFrame(restParser.run([nidm_file], uri)))
            df = pd.concat(df_list_holder[count])
            with tempfile.NamedTemporaryFile(delete=False) as temp:
                # dump the dataframe to a temporary csv
                df.to_csv(temp.name + '.csv')
                temp.close()
            with open(temp.name + '.csv') as csv_file:
                # read the csv back as a 2D list so individual cells are easy to address
                data = list(csv.reader(csv_file))
            var_list = vars.split(",")  # the list of independent variables
            # number of columns in the original dataframe
            numcols = (len(data) - 1) // (len(var_list))
            global condensed_data  # also used in linreg()
            # a 2D array: one header row plus one row per value
            condensed_data_holder[count] = [[0] * (len(var_list))]
            for i in range(numcols):
                condensed_data_holder[count].append([0] * (len(var_list)))
            for m in range(0, len(var_list)):
                end_url = var_list[m].split("/")
                if "/" in var_list[m]:
                    var_list[m] = end_url[len(end_url) - 1]
            for i in range(len(var_list)):
                # store the independent variable names in the first row
                condensed_data_holder[count][0][i] = var_list[i]
            numrows = 1      # begin at the first data row
            fieldcolumn = 0  # column holding the variable name in the original dataset
            valuecolumn = 0  # column holding the value
            datacolumn = 0   # column holding the dataElement name, if used instead of the field name
            aboutcolumn = 0  # column holding the isAbout URI
            namecolumn = 0   # column holding the label
            not_found_list = []
            for i in range(len(data[0])):
                if data[0][i] == 'sourceVariable':
                    fieldcolumn = i
                elif data[0][i] == 'source_variable':
                    fieldcolumn = i
                elif data[0][i] == 'isAbout':
                    aboutcolumn = i
                elif data[0][i] == 'label':
                    namecolumn = i
                elif data[0][i] == 'value':
                    valuecolumn = i
                elif data[0][i] == 'dataElement':
                    datacolumn = i
            # iterate through the dataset looking for each variable name, so the
            # values can be appended under the proper variables
            for i in range(len(condensed_data_holder[count][0])):
                for j in range(1, len(data)):
                    try:
                        if data[j][fieldcolumn] == condensed_data_holder[count][0][i]:
                            # matched on the source variable name
                            condensed_data_holder[count][numrows][i] = data[j][valuecolumn]
                            numrows = numrows + 1
                        elif data[j][aboutcolumn] == condensed_data_holder[count][0][i]:
                            # matched on the full isAbout URI
                            condensed_data_holder[count][numrows][i] = data[j][valuecolumn]
                            numrows = numrows + 1
                        elif condensed_data_holder[count][0][i] in data[j][aboutcolumn]:
                            # matched on the part of the URI after the last slash
                            condensed_data_holder[count][numrows][i] = data[j][valuecolumn]
                            numrows = numrows + 1
                        elif data[j][namecolumn] == condensed_data_holder[count][0][i]:
                            # matched on the label
                            condensed_data_holder[count][numrows][i] = data[j][valuecolumn]
                            numrows = numrows + 1
                        elif condensed_data_holder[count][0][i] == data[j][datacolumn]:
                            # matched on the dataElement name
                            condensed_data_holder[count][numrows][i] = data[j][valuecolumn]
                            numrows = numrows + 1
                    except IndexError:
                        numrows = numrows + 1
                numrows = 1  # reset to the first row for the next variable
            temp_list = condensed_data_holder[count]
            for j in range(len(temp_list[0]) - 1, 0, -1):
                # if a column was appended with 0 as its heading, remove the null column
                if temp_list[0][j] == "0" or temp_list[0][j] == "NaN":
                    for row in condensed_data_holder[count]:
                        row.pop(j)
            rowsize = len(condensed_data_holder[count][0])
            count1 = 0
            for i in range(0, rowsize):
                for row in condensed_data_holder[count]:
                    if row[i] == 0 or row[i] == "NaN" or row[i] == "0":
                        count1 = count1 + 1
                if count1 > len(condensed_data_holder[count]) - 2:
                    not_found_list.append(condensed_data_holder[count][0][i])
                count1 = 0
            for i in range(len(condensed_data_holder[count][0])):
                if " " in condensed_data_holder[count][0][i]:
                    condensed_data_holder[count][0][i] = \
                        condensed_data_holder[count][0][i].replace(" ", "_")
            for i in range(len(var_list)):
                if "/" in var_list[i]:
                    splitted = var_list[i].split("/")
                    var_list[i] = splitted[len(splitted) - 1]
                if " " in var_list[i]:
                    var_list[i] = var_list[i].replace(" ", "_")
            count = count + 1
            if len(not_found_list) > 0:
                print("***********************************************************************************************************")
                print()
                print("Your variables were " + v)
                print()
                print("The following variables were not found in " + nidm_file +
                      ". The model cannot run because this will skew the data. Try checking "
                      "your spelling or use nidm_query.py to see other possible variables.")
                if o is not None:
                    f = open(o, "a")
                    f.write("Your variables were " + v)
                    f.write("The following variables were not found in " + nidm_file +
                            ". The model cannot run because this will skew the data. Try checking "
                            "your spelling or use nidm_query.py to see other possible variables.")
                    f.close()
                for i in range(0, len(not_found_list)):
                    print(str(i + 1) + ". " + not_found_list[i])
                    if o is not None:
                        f = open(o, "a")
                        f.write(str(i + 1) + ". " + not_found_list[i])
                        f.close()
                for j in range(len(not_found_list) - 1, 0, -1):
                    not_found_list.pop(j)
                not_found_count = not_found_count + 1
                print()
        if not_found_count > 0:
            exit(1)
    else:
        print("ERROR: No query parameter provided. See help:")
        print()
        os.system("pynidm k-means --help")
        exit(1)

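# Usage sketch (an assumption, inferred from the command string printed above):
# data_aggregation() reads its inputs from module-level globals that the
# `pynidm k-means` CLI sets before calling it. With placeholder values:
#
#   n = "file1.ttl,file2.ttl"   # -nl: comma-separated NIDM file list
#   v = "age,sex,DX_GROUP"      # -variables: fields to cluster on
#   k_num = 3                   # -k: number of clusters
#   cm = "..."                  # -m: clustering method/metric
#   o = None                    # optional output file
#   data_aggregation()
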
def test_cli_rest_routes():
    rest_parser = RestParser(verbosity_level=0)
    rest_parser.setOutputFormat(RestParser.CLI_FORMAT)

    #
    # /projects
    #

    text = rest_parser.run(BRAIN_VOL_FILES, "/projects")
    project_uuid = assess_one_col_output(text)

    #
    # /statistics/projects/{}
    #

    txt_out = rest_parser.run(BRAIN_VOL_FILES,
                              "/statistics/projects/{}".format(project_uuid))
    lines = txt_out.strip().splitlines()
    assert re.search('^-+ +-+$', lines[0])
    lines = lines[1:]  # done testing line one, slice it off
    split_lines = [x.split() for x in lines]
    found_gender = found_age_max = found_age_min = found_title = False
    for split in split_lines:
        if len(split) > 0:  # skip blank lines between sections
            if re.search('title', split[0]):
                found_title = True
            if re.search('age_max', split[0]):
                found_age_max = True
            if re.search('age_min', split[0]):
                found_age_min = True
            if re.search('gender', split[0]):
                found_gender = True
    assert found_title
    assert found_age_max
    assert found_age_min
    assert found_gender

    #
    # /projects/{}/subjects
    #

    sub_text = rest_parser.run(BRAIN_VOL_FILES,
                               '/projects/{}/subjects'.format(project_uuid))
    subject_uuid = assess_one_col_output(sub_text)

    #
    # /projects/{}/subjects/{}
    #

    # the result should come in 3 sections: summary, derivatives, instruments
    inst_text = rest_parser.run(
        BRAIN_VOL_FILES,
        '/projects/{}/subjects/{}/'.format(project_uuid, subject_uuid))
    sections = inst_text.split("\n\n")

    # summary tests
    summary_lines = sections[0].strip().splitlines()[1:-1]  # first and last lines should be -----
    summary = dict()
    for line in summary_lines:
        parts = line.split()
        summary[parts[0]] = parts[1]
    inst_uuid = summary['instruments'].split(',')[0]
    deriv_uuid = summary['derivatives'].split(',')[0]
    assert is_uuid(inst_uuid)
    assert is_uuid(deriv_uuid)

    # derivatives tests
    deriv_lines = sections[1].strip().splitlines()
    deriv_headers = deriv_lines[0].split()
    heads = ['Derivative_UUID', 'Measurement', 'Label', 'Value', 'Datumtype']
    for i in range(len(heads)):
        assert re.search(heads[i], deriv_headers[i], re.IGNORECASE)
    d_uuid = deriv_lines[2].split()[0]
    assert is_uuid(d_uuid)
    assert d_uuid in summary['derivatives'].split(',')

    # instruments tests
    inst_lines = sections[2].strip().splitlines()
    inst_headers = inst_lines[0].split()
    heads = ['Instrument_UUID', 'Category', 'Value']
    for i in range(len(heads)):
        assert re.search(heads[i], inst_headers[i], re.IGNORECASE)
    i_uuid = inst_lines[2].split()[0]
    assert is_uuid(i_uuid)
    assert i_uuid in summary['instruments'].split(',')

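# The test above relies on two helpers defined elsewhere in this test module.
# The sketches below are hypothetical reconstructions inferred only from how
# the test uses them (is_uuid() as a boolean UUID check; assess_one_col_output()
# as "assert a one-column table of UUIDs and return the first"); the real
# definitions may differ.
def is_uuid(value):
    # hypothetical: loose match for a hex UUID, with or without dashes
    return re.match(r'^[0-9a-f-]{32,36}$', str(value), re.IGNORECASE) is not None

def assess_one_col_output(text):
    # hypothetical: drop '----' rule lines, require each remaining line to hold
    # exactly one UUID, and return the first one
    rows = [line.split() for line in text.strip().splitlines()
            if line.strip() and not re.match(r'^-+$', line.strip())]
    assert all(len(row) == 1 and is_uuid(row[0]) for row in rows)
    return rows[0][0]
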
def query(nidm_file_list, cde_file_list, query_file, output_file,
          get_participants, get_instruments, get_instrument_vars,
          get_dataelements, get_brainvols, get_dataelements_brainvols,
          uri, j, verbosity):
    """
    This function provides query support for NIDM graphs.
    """
    # query result list
    results = []

    # if there is a CDE file list, seed the CDE cache
    if cde_file_list:
        getCDEs(cde_file_list.split(","))

    if get_participants:
        df = GetParticipantIDs(nidm_file_list.split(','), output_file=output_file)
        if output_file is None:
            print(df.to_string())
        return df
    elif get_instruments:
        # first get all project UUIDs, then iterate and get instruments,
        # appending to the output dataframe
        project_list = GetProjectsUUID(nidm_file_list.split(','))
        count = 1
        for project in project_list:
            if count == 1:
                df = GetProjectInstruments(nidm_file_list.split(','), project_id=project)
                count += 1
            else:
                df = df.append(GetProjectInstruments(nidm_file_list.split(','),
                                                     project_id=project))
        # if an output file was specified, write the dataframe to it
        if output_file is not None:
            df.to_csv(output_file)
        else:
            print(df.to_string())
    elif get_instrument_vars:
        # first get all project UUIDs, then iterate and get instrument variables,
        # appending to the output dataframe
        project_list = GetProjectsUUID(nidm_file_list.split(','))
        count = 1
        for project in project_list:
            if count == 1:
                df = GetInstrumentVariables(nidm_file_list.split(','), project_id=project)
                count += 1
            else:
                df = df.append(GetInstrumentVariables(nidm_file_list.split(','),
                                                      project_id=project))
        # if an output file was specified, write the dataframe to it
        if output_file is not None:
            df.to_csv(output_file)
        else:
            print(df.to_string())
    elif get_dataelements:
        datael = GetDataElements(nidm_file_list=nidm_file_list)
        if output_file is not None:
            datael.to_csv(output_file)
        else:
            print(datael.to_string())
    elif uri:
        restParser = RestParser(verbosity_level=int(verbosity))
        if j:
            restParser.setOutputFormat(RestParser.JSON_FORMAT)
        else:
            restParser.setOutputFormat(RestParser.CLI_FORMAT)
        df = restParser.run(nidm_file_list.split(','), uri)
        print(df)
    elif get_dataelements_brainvols:
        brainvol = GetBrainVolumeDataElements(nidm_file_list=nidm_file_list)
        if output_file is not None:
            brainvol.to_csv(output_file)
        else:
            print(brainvol.to_string())
    elif get_brainvols:
        brainvol = GetBrainVolumes(nidm_file_list=nidm_file_list)
        if output_file is not None:
            brainvol.to_csv(output_file)
        else:
            print(brainvol.to_string())
    elif query_file:
        df = sparql_query_nidm(nidm_file_list.split(','), query_file, output_file)
        if output_file is None:
            print(df.to_string())
        return df
    else:
        print("ERROR: No query parameter provided. See help:")
        print()
        os.system("pynidm query --help")
        exit(1)