def create_matched_result(result_no, cde_id, unique_values, graphdb):
    """Assemble the nested result payload for one matched CDE.

    Combines the classified value domain, the CDE's long name, its DEC
    (data element concept), and the DEC's concept codes.

    Args:
        result_no: rank/position of this result in the caller's list.
        cde_id: CDE_ID property of the matched CDE node.
        unique_values: observed column values used to classify the value domain.
        graphdb: graph database handle passed through to utils.query_graph.

    Returns:
        dict with 'resultNumber' and a nested 'result' structure.
    """
    value_domain = create_matched_value_domain(cde_id, unique_values, graphdb)
    # CDE long name plus the DEC the CDE is categorised under.
    dec_query = "MATCH (n:CDE) - [:IS_CAT] - (d:DEC) WHERE n.CDE_ID = {0:d} RETURN n.CDE_LONG_NAME, d.DEC_ID, d.name".format(
        cde_id)
    cde_row = utils.query_graph(dec_query, graphdb).values()[0]
    cde_long_name, dec_id, dec_name = cde_row[0], cde_row[1], cde_row[2]
    # Concepts linked to the DEC either as property or as object class.
    concept_query = "MATCH (d:DEC) - [:IS_PROP] - (c:Concept) WHERE d.DEC_ID = {0:d} RETURN c.CODE as ConceptCode \n".format(
        dec_id)
    concept_query += "UNION ALL MATCH (d:DEC) - [:IS_OBJ] - (c:Concept) WHERE d.DEC_ID = {0:d} RETURN c.CODE as ConceptCode ".format(
        dec_id)
    unique_concepts = set(utils.query_graph(concept_query, graphdb).value())
    return {
        'resultNumber': result_no,
        'result': {
            'dataElement': {
                'id': cde_id,
                'name': cde_long_name
            },
            'dataElementConcept': {
                'id': dec_id,
                'name': dec_name,
                'conceptCodes':
                ["ncit:{0:s}".format(str(code)) for code in unique_concepts]
            },
            'valueDomain': value_domain
        }
    }
def score_enum_values(unique_values, cde_indices, g, MIN_SCORE=0):
    """Score how well each candidate CDE's permissible answers cover the
    observed column values.

    Builds a single Cypher query: for each unique value, a full-text search
    against both the answer-text index and the synonym name index yields 1
    if the value matched a permitted answer of a CDE (score above
    MIN_SCORE), else 0.  The outer query sums those hits per CDE and
    divides by the number of unique values, i.e. the fraction of observed
    values explained by that CDE's value domain.

    Args:
        unique_values: distinct observed values (already cleaned for
            full-text search — embedded quotes would break the query).
        cde_indices: internal node ids of the candidate CDE nodes.
        g: graph handle passed to utils.query_graph.
        MIN_SCORE: full-text score threshold for counting a hit.

    Returns:
        list of [cde_index, coverage_fraction] rows, best first.
    """
    query = "CALL {\n"
    # Seed every candidate with 0 so CDEs with no hits still appear.
    query += " MATCH (c:CDE) WHERE ID(c) IN [{0:s}] \n".format(",".join(
        [str(i) for i in cde_indices]))
    query += " RETURN ID(c) as cde_index, 0 AS g_max \n"
    for u in unique_values:
        query += " UNION ALL "
        query += " CALL {\n"
        # Branch 1: direct answer-text match for this value.
        query += " CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node as a, score ".format(
            str(u))
        query += " MATCH (a:AnswerText) - [:CAN_BE] - (ans:Answer) - [:PERMITS] - (c:CDE)\n "
        # query += " WHERE ID(c) IN [{0:s}] \n".format(",".join([str(i) for i in cde_indices]))
        query += " RETURN ID(c) as cde_index, CASE MAX(score) > {0:1.2f} WHEN TRUE THEN 1 ELSE 0 END as g".format(
            float(MIN_SCORE))
        query += " UNION ALL \n"
        # Branch 2: synonym -> concept -> answer match for this value.
        query += " CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node as s, score ".format(
            str(u))
        query += " MATCH (s:Synonym) - [:IS_CALLED] - (con:Concept) - [:EQUALS] - (ans:Answer) - [:PERMITS] - (c:CDE)\n "
        # query += " WHERE ID(c) IN [{0:s}] \n".format(",".join([str(i) for i in cde_indices]))
        query += " RETURN ID(c) as cde_index, CASE MAX(score) > {0:1.2f} WHEN TRUE THEN 1 ELSE 0 END as g".format(
            float(MIN_SCORE))
        query += " }\n"
        # Collapse the two branches: a value counts once per CDE at most.
        query += " RETURN cde_index, MAX(g) AS g_max \n"
    query += "}\n"
    # Coverage = matched values / total unique values.
    query += "RETURN cde_index,SUM(g_max) * 1.0 / {0:d}".format(
        len(unique_values))
    result = utils.query_graph(query, g)
    values = result.values()
    values.sort(key=lambda z: z[1], reverse=True)
    return values
def enumeration_ansindex_single_search(single_unique_value, g):
    """Full-text search one value against the answer-text index and return
    the CDEs whose permitted answers matched.

    Returns rows of [node id, CDE_ID, best full-text score], unsorted.
    """
    parts = [
        "CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node AS n, score\n".format(
            str(single_unique_value)),
        " MATCH (n:AnswerText) - [:CAN_BE] - (a:Answer) - [:PERMITS] - (c:CDE) \n",
        " RETURN ID(c), c.CDE_ID, MAX(score)",
    ]
    return utils.query_graph("".join(parts), g).values()
def nameindex_query_multiple(input_string_list, g, score_coef=1, MIN_SCORE=0):
    """Run one full-text search per input string against the name index and
    union the results, scaling each search's scores by its coefficient.

    Args:
        input_string_list: search strings (must be non-empty).
        g: graph handle for utils.query_graph.
        score_coef: scalar applied to every search, or a list with one
            coefficient per input string.
        MIN_SCORE: minimum (scaled) score; raw scores are filtered at
            MIN_SCORE / coef so the comparison happens pre-scaling.

    Returns:
        rows of [node id, best normalised score, node labels].

    Raises:
        ValueError: if score_coef is a list of the wrong length.
    """
    if isinstance(score_coef, list):
        if len(score_coef) != len(input_string_list):
            raise ValueError("nameindex query score coef length mismatch")
        coefs = score_coef
    else:
        coefs = [score_coef] * len(input_string_list)

    def _segment(term, coef):
        # One full-text sub-query with its score normalised by coef.
        piece = " CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node, score \n".format(
            str(term))
        piece += " WHERE score > {0:1.2f}\n".format(MIN_SCORE / coef)
        piece += " RETURN ID(node) as node_id, {0:1.2f} * score AS normal_score, LABELS(node) as node_labels\n".format(
            coef)
        return piece

    query = "CALL {\n" + _segment(input_string_list[0], coefs[0])
    for pos in range(1, len(input_string_list)):
        query += " UNION ALL" + _segment(input_string_list[pos], coefs[pos])
    query += "}\n"
    query += "RETURN node_id,MAX(normal_score),node_labels\n"
    return utils.query_graph(query, g).values()
def score_value_domain(submitted_vd, annotated_res, g):
    """Score agreement between a submitted value domain and an annotated one.

    For each annotated observed value, check that the submitted classification
    agrees on the comparison column ('conceptCode' for enumerated CDEs,
    'value' otherwise).  Returns 1 - mismatch fraction, or 0 when the CDE is
    unknown or the annotated value domain is empty.
    """
    annotated_vd = annotated_res['valueDomain']
    annotated_cde_id = annotated_res['dataElement']['id']
    query = 'MATCH (n:CDE) WHERE n.CDE_ID = {0:d} RETURN n.VALUE_DOMAIN_TYPE'.format(
        annotated_cde_id)
    domain_types = utils.query_graph(query, g).value()
    if len(domain_types) == 0:
        # CDE not found in the graph — nothing to compare against.
        return 0
    if len(annotated_vd) == 0:
        return 0
    # Enumerated domains are compared on concept code, others on raw value.
    check_column = 'conceptCode' if domain_types[0] == "Enumerated" else 'value'
    submitted_observed_values = [entry['observedValue'] for entry in submitted_vd]
    mismatch_count = 0
    for annotated_value_dict in annotated_vd:
        observed = annotated_value_dict['observedValue']
        if observed not in submitted_observed_values:
            # Annotated value never seen in the submission: counts as a miss.
            mismatch_count += 1
            continue
        submitted_value_dict = submitted_vd[
            submitted_observed_values.index(observed)]
        if (submitted_value_dict['permissibleValue'][check_column]
                != annotated_value_dict['permissibleValue'][check_column]):
            mismatch_count += 1
    return 1 - mismatch_count / len(annotated_vd)
def full_str_match_synonym(col_name, g):
    """Find Synonym nodes whose lowercase form exactly equals any expansion
    of the column name.

    Args:
        col_name: raw column name; expand_string_names produces the set of
            candidate lowercase strings to match.
        g: graph handle for utils.query_graph.

    Returns:
        list of matching synonym names (n.name values).
    """
    string_list = expand_string_names(col_name)
    # NOTE(review): values are interpolated directly into the Cypher string;
    # a name containing a quote would break the query — consider parameters.
    # NOTE(review): other queries in this file use n.name_lower; confirm the
    # Synonym schema really has an `n.lower` property here.
    where_clause_list = ["n.lower = '" + i + "'" for i in string_list]
    where_clause = " OR ".join(where_clause_list)
    # BUG FIX: original had no space before RETURN, yielding invalid Cypher
    # such as "... n.lower = 'x'RETURN n.name".
    query = "MATCH (n:Synonym) WHERE " + where_clause + " RETURN n.name"
    result = utils.query_graph(query, g)
    return result.value()
def get_CDEs(ft_result, update_column, g):
    """Resolve a full-text hit to its CDE rows.

    ft_result is a [node id, score, labels] row; the node's first label
    selects the post-processing query template registered for
    update_column in X_FT_STRUCTURE.
    """
    label = ft_result[2][0]
    postprocess = X_FT_STRUCTURE[update_column]['ft_postprocess_params']
    # The registered template builds the Cypher query from the node id.
    cde_query = postprocess[label]['query'](ft_result[0])
    return utils.query_graph(cde_query, g).values()
def enumeration_concept_single_search(single_unique_value, g):
    """Full-text search one value against the synonym name index and return
    CDEs reachable via synonym -> concept -> answer, best score first.

    Returns rows of [node id, CDE_ID, best full-text score].
    """
    parts = [
        "CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node AS n, score\n".format(
            str(single_unique_value)),
        " MATCH (n:Synonym) - [:IS_CALLED] - (con:Concept) - [:EQUALS] - (a:Answer) - [:PERMITS] - (c:CDE)\n",
        " RETURN ID(c), c.CDE_ID, MAX(score)",
    ]
    hits = utils.query_graph("".join(parts), g).values()
    hits.sort(key=lambda row: row[2], reverse=True)
    return hits
def create_answer_count_df(cde_indices, g):
    """Count permitted answers per CDE node, in batches of 1000 ids.

    BOOLEAN-datatype CDEs with no explicit Answer nodes are assigned a
    count of 2.  cde_indices is expected to support numpy-style
    .astype(), i.e. an ndarray of node ids.

    Returns:
        DataFrame with columns ['index', 'answer_count'].
    """
    frames = [pd.DataFrame(columns=['index', 'answer_count'])]
    for start in range(0, len(cde_indices), 1000):
        batch = cde_indices[start:start + 1000]
        id_csv = ",".join(batch.astype('int').astype('str'))
        q = "MATCH (n:CDE) - [:PERMITS] - (m:Answer) WHERE ID(n) IN [{0:s}] RETURN ID(n),COUNT(*)".format(
            id_csv)
        answer_counts = utils.query_graph(q, g).values()
        q2 = "MATCH (n:CDE) WHERE ID(n) IN [{0:s}] AND n.DATATYPE = 'BOOLEAN' RETURN ID(n),2".format(
            id_csv)
        boolean_counts = utils.query_graph(q2, g).values()
        # Keep the boolean fallback only for CDEs with no counted answers.
        counted_ids = [row[0] for row in answer_counts]
        boolean_counts = [row for row in boolean_counts
                          if row[0] not in counted_ids]
        frames.append(
            pd.DataFrame(answer_counts + boolean_counts,
                         columns=['index', 'answer_count']))
    return pd.concat(frames)
def classify_single_enum_value(col_value,cde_index,g, SEARCH_CUTOFF = 0):
    """Classify one observed value against the permissible answers of a
    single enumerated CDE.

    Two full-text searches are run: the value against the answer-text index
    (restricted to this CDE's answers) and against the synonym name index
    (restricted to synonyms of concepts equal to this CDE's answers).  The
    best-scoring hit picks the Answer; the best synonym of that Answer (by
    search score, then normalised Levenshtein distance to the value)
    supplies the reported permissible value and ncit concept code.

    Args:
        col_value: the observed cell value to classify.
        cde_index: internal node id of the CDE whose domain is searched.
        g: graph handle for utils.query_graph.
        SEARCH_CUTOFF: hits with full-text score <= this are discarded.

    Returns:
        dict {'observedValue': str, 'permissibleValue': {'value': ...,
        'conceptCode': 'ncit:...'|None}}; value is 'NOMATCH' when no hit
        clears the cutoff.
    """
    output_dict = {'observedValue':str(col_value),'permissibleValue':{}}
    # Search 1: value vs. answer text, limited to this CDE's answers.
    query = "CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node as a, score ".format(str(col_value))
    query += "MATCH (n:CDE) - [:PERMITS] - (ans:Answer) - [:CAN_BE] - (a:AnswerText) WHERE ID(n) = {0:d} ".format(cde_index)
    query += "RETURN ID(ans), 'Answer', score, a.name"
    result = utils.query_graph(query,g)
    answer_values = result.values()
    # Search 2: value vs. synonyms of concepts equal to this CDE's answers.
    query = "CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node as s, score ".format(str(col_value))
    query += "MATCH (n:CDE) - [:PERMITS] - (ans:Answer) - [:EQUALS] - (con:Concept) - [:IS_CALLED] - (s:Synonym) WHERE ID(n) = {0:d} ".format(cde_index)
    query += "RETURN ID(ans), 'Synonym', score, s.name, con.CODE"
    result = utils.query_graph(query,g)
    syn_values = result.values()
    # Pool both searches; rows are (answer_id, kind, score, name[, code]).
    all_results = answer_values + syn_values
    all_results = [i for i in all_results if i[2] > SEARCH_CUTOFF]
    if len(all_results) > 0:
        all_results.sort(key=lambda z: z[2],reverse=True)
        # The Answer owning the single best hit wins.
        ans_index = all_results[0][0]
        ans_results = [i for i in all_results if i[0] == ans_index]
        # Now we need to choose the best synonym
        synonyms = [i for i in ans_results if i[1] == 'Synonym']
        if len(synonyms) > 0:
            #Choose the best based on 1: search score, and 2: stringdist
            synonyms.sort(key=lambda z: (-z[2],stringdist.levenshtein_norm(str(z[3]).lower(),str(col_value).lower())))
            output_dict['permissibleValue']['value'] = str(synonyms[0][3])
            output_dict['permissibleValue']['conceptCode'] = 'ncit:' + str(synonyms[0][4])
        else:
            # Best hit came from answer text only: fetch the winning
            # answer's concept/synonym pairs and pick the closest synonym.
            query = "MATCH (a:Answer) - [:EQUALS] - (c:Concept) - [:IS_CALLED] - (s:Synonym) WHERE ID(a) = {0:d} RETURN c.CODE,s.name".format(ans_index)
            result = utils.query_graph(query,g)
            values = result.values()
            if len(values) > 0:
                values.sort(key = lambda z: stringdist.levenshtein_norm(str(col_value).lower(),str(z[1]).lower()))
                output_dict['permissibleValue']['value'] = str(values[0][1])
                output_dict['permissibleValue']['conceptCode'] = 'ncit:' + str(values[0][0])
            else:
                # Answer has no linked concept: report its text, no code.
                output_dict['permissibleValue']['value'] = str(ans_results[0][3])
                output_dict['permissibleValue']['conceptCode'] = None
    else:
        output_dict['permissibleValue']['value'] = 'NOMATCH'
        output_dict['permissibleValue']['conceptCode'] = None
    return output_dict
def enumeration_exact_search(unique_values_list, g):
    """Exact (lowercased) match of observed values against answer texts.

    Scores each CDE by the number of distinct matched values divided by the
    total number of submitted values.  Returns rows of
    [node id, CDE_ID, fraction], best first.
    """
    # Quote and lowercase once, deduplicating identical values.
    quoted_values = list({"'" + str(v).lower() + "'"
                          for v in unique_values_list})
    query = (
        "MATCH (n:AnswerText) - [:CAN_BE] - (m:Answer) - [:PERMITS] - (c:CDE) WHERE n.name_lower in [{0:s}] ".format(
            ",".join(quoted_values))
        + "WITH DISTINCT n.name_lower AS name_lower, c AS c_distinct "
        + "RETURN ID(c_distinct), c_distinct.CDE_ID, COUNT(*)*1.0/{0:d}".format(
            len(unique_values_list)))
    rows = utils.query_graph(query, g).values()
    rows.sort(key=lambda row: row[2], reverse=True)
    return rows
def create_matched_value_domain(cde_id, unique_values, graphdb):
    """Classify the observed values against every graph node carrying this
    CDE_ID and return the best-fitting value domain.

    Each candidate node's classification is scored by how many values failed
    ('NOMATCH'/'NONCONFORMING'); the candidate with the fewest failures wins.

    Args:
        cde_id: CDE_ID property to look up.
        unique_values: observed values to classify.
        graphdb: graph handle for utils.query_graph.

    Returns:
        list of classification dicts, or [] when there are no values or the
        CDE_ID matches no node.
    """
    if len(unique_values) == 0:
        return []
    q = "MATCH (n:CDE) WHERE n.CDE_ID = {0:d} RETURN ID(n)".format(cde_id)
    cde_node_indices = utils.query_graph(q, graphdb).value()
    candidates = []
    for cde_index in cde_node_indices:
        c = value_classifiers.classify_values(unique_values, cde_index,
                                              graphdb)
        # Lower is better: number of values the domain failed to explain.
        neg_score = len([
            i for i in c
            if i['permissibleValue']['value'] in ['NOMATCH', 'NONCONFORMING']
        ])
        candidates.append({'score': neg_score, 'vd': c})
    if not candidates:
        # BUG FIX: original indexed candidates[0] unconditionally and raised
        # IndexError when the CDE_ID matched no node in the graph.
        return []
    candidates.sort(key=lambda z: z['score'])
    return candidates[0]['vd']
def classify_values(col_values, cde_index, g):
    """Classify every observed value against one CDE's value domain.

    Picks the classifier from the CDE's attributes: enumerated domains go
    through classify_single_enum_value, display-formatted domains through
    classify_display_value, anything else through classify_datatype.

    Returns a list of classification dicts, one per input value, in order.
    """
    idx = int(cde_index)
    query = "MATCH (n:CDE) where ID(n) = {0:d} RETURN n.DATATYPE, n.VALUE_DOMAIN_TYPE, n.DISPLAY_FORMAT, n.CDE_ID".format(idx)
    attrs = utils.query_graph(query, g).values()[0]
    datatype, domain_type, display_format = attrs[0], attrs[1], attrs[2]
    # Select the classifier once, then apply it to every value.
    if domain_type == 'Enumerated':
        classify = lambda v: classify_single_enum_value(v, idx, g)
    elif display_format is not None:
        classify = lambda v: classify_display_value(v, display_format)
    else:
        classify = lambda v: classify_datatype(v, datatype)
    return [classify(v) for v in col_values]
def score_value_match(value_dict_list, cde_index, g):
    """Fraction of classified values that fit the CDE's value domain.

    Enumerated domains count anything that isn't 'NOMATCH'; non-enumerated
    domains count only values flagged 'CONFORMING'.

    Args:
        value_dict_list: classification dicts (from classify_values).
        cde_index: internal node id of the CDE.
        g: graph handle for utils.query_graph.

    Returns:
        float in [0, 1]; 0 for an empty input list.
    """
    if not value_dict_list:
        # Robustness fix: original divided by len(value_dict_list) and
        # raised ZeroDivisionError on an empty list.
        return 0
    query = "MATCH (n:CDE) WHERE ID(n) = {0:d} RETURN n.VALUE_DOMAIN_TYPE".format(
        int(cde_index))
    value_domain_type = utils.query_graph(query, g).value()[0]
    if value_domain_type == "Enumerated":
        correct = [
            i for i in value_dict_list
            if i['permissibleValue']['value'] != "NOMATCH"
        ]
    else:
        correct = [
            i for i in value_dict_list
            if i['permissibleValue']['value'] == "CONFORMING"
        ]
    # Both branches previously duplicated this arithmetic.
    return len(correct) / len(value_dict_list)
def enumeration_ansindex_search(value_list, g):
    """Score candidate CDEs by their answer-text matches across all values.

    Runs one answer-index full-text sub-query per distinct value and sums
    each CDE's best score per value, normalised by the number of distinct
    values.

    Args:
        value_list: observed values (deduplicated internally).
        g: graph handle for utils.query_graph.

    Returns:
        rows of [node id, CDE_ID, mean best score], best first (when more
        than one row); [] for empty input.
    """
    value_set = list(set(value_list))
    if not value_set:
        # Robustness fix: original indexed value_set[0] and raised
        # IndexError on an empty value list.
        return []
    query = "CALL {\n"
    for pos, v in enumerate(value_set):
        if pos:
            # BUG FIX: original sliced value_set[1:len(value_list)], which
            # only worked because len(value_list) >= len(value_set); the
            # intent is simply "all remaining distinct values".
            query += " UNION ALL\n"
        query += " CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node AS n, score\n".format(
            str(v))
        query += " MATCH (n:AnswerText) - [:CAN_BE] - (a:Answer) - [:PERMITS] - (c:CDE)\n"
        query += " RETURN ID(c) AS cde_index, c.CDE_ID as cde_id, MAX(score) AS max_score \n"
    query += "}\n"
    query += "RETURN cde_index, cde_id, SUM(max_score) * 1.0 / {0:d}".format(
        len(value_set))
    values = utils.query_graph(query, g).values()
    if len(values) > 1:
        values.sort(key=lambda z: z[2], reverse=True)
    return values
def build_initial_column_data(col_series,
                              g,
                              NAMEINDEX_SEARCH_REQD=25,
                              NAMEINDEX_CDE_REQD=5,
                              MIN_SCORE=0,
                              FOLLOW_ON_SEARCH_MIN_WORD_LEN=3,
                              PRINT_STATS=False):
    """Build the per-CDE feature DataFrame for one input column.

    Pipeline stages (each optionally timed via PRINT_STATS):
      1. Full-text search of the cleaned column name against the name
         index, post-processed per X_FT_STRUCTURE into one score column
         per feature.
      2. If the first search is too thin (fewer than NAMEINDEX_SEARCH_REQD
         hits, or fewer than NAMEINDEX_CDE_REQD CDE-ish hits), a follow-on
         search over derived substrings adds rows flagged
         secondary_search=1.
      3. Enumeration searches over the column's unique values
         (concept-based and answer-text-based score columns).
      4. Answer-count plausibility score (observed distinct values vs.
         expected answer count).
      5. Value-domain score: enumerated CDEs via score_enum_values,
         non-enumerated via display-format / datatype conformance checks.

    Args:
        col_series: pandas Series; .name is the column name being matched.
        g: graph handle for utils.query_graph.
        NAMEINDEX_SEARCH_REQD / NAMEINDEX_CDE_REQD: thresholds triggering
            the follow-on search.
        MIN_SCORE: minimum score for keeping enumeration-search rows.
        FOLLOW_ON_SEARCH_MIN_WORD_LEN: floor for substring length in the
            follow-on search.
        PRINT_STATS: print stage timings/row counts when True.

    Returns:
        DataFrame with one row per candidate CDE node ('index' column is
        the node id) and one numeric feature column per scoring stage;
        NaNs are zero-filled.
    """
    import time
    t = time.time()
    col_name = col_series.name
    # Normalise the column name (underscores, periods, camelCase splits)
    # into a full-text-safe search string.
    search_string = utils.clean_string_for_fulltext(
        utils.lower_upper_split(
            utils.period_replace(utils.underscore_replace(col_name))))
    df = pd.DataFrame(columns=['index', 'cde_id'])
    result_types = list(
        set([
            j for i in search_functions.X_FT_STRUCTURE
            for j in search_functions.X_FT_STRUCTURE[i]['ft_postprocess_params']
        ]))
    # NOTE(review): result_type_dict is built but never read below —
    # confirm it is dead and remove.
    result_type_dict = {
        i: [
            j for j in search_functions.X_FT_STRUCTURE
            if i in search_functions.X_FT_STRUCTURE[j]['ft_postprocess_params']
        ]
        for i in result_types
    }
    # Stage 1: primary name-index search.
    search_results = search_functions.nameindex_query_multiple([search_string],
                                                               g)
    if PRINT_STATS:
        print("Initial search complete, {0:d} results".format(
            len(search_results)))
        print("{0:1.2f} seconds.\n".format(time.time() - t))
    # One feature column per X_FT_STRUCTURE entry; each hit type is mapped
    # to CDE rows by its registered query and aggregated (max or sum).
    for col in search_functions.X_FT_STRUCTURE:
        search_score_df = pd.DataFrame(columns=['index', 'cde_id', col])
        for result_type in search_functions.X_FT_STRUCTURE[col][
                'ft_postprocess_params']:
            # Keep (node id, score) pairs whose first label matches.
            search_result_filtered = [(i[0], i[1]) for i in search_results
                                      if i[2][0] == result_type]
            search_result_df = pd.DataFrame(search_result_filtered,
                                            columns=['node_index', col])
            q = search_functions.X_FT_STRUCTURE[col]['ft_postprocess_params'][
                result_type]['query'](",".join([
                    str(node_index_int) for node_index_int in
                    search_result_df['node_index'].tolist()
                ]))
            agg_type = search_functions.X_FT_STRUCTURE[col][
                'ft_postprocess_params'][result_type]['aggregation']
            res = utils.query_graph(q, g)
            res_df = pd.DataFrame(res.values(),
                                  columns=['node_index', 'index', 'cde_id'])
            scored_results = pd.merge(res_df,
                                      search_result_df,
                                      how='left',
                                      on='node_index')
            if agg_type == 'max':
                agg_results = scored_results[['index', 'cde_id', col]].groupby(
                    by=['index', 'cde_id'], axis=0, as_index=False).max()
            elif agg_type == 'sum':
                agg_results = scored_results[['index', 'cde_id', col]].groupby(
                    by=['index', 'cde_id'], axis=0, as_index=False).sum()
            else:
                agg_results = pd.DataFrame(columns=['index', 'cde_id', col])
            search_score_df = pd.concat([search_score_df, agg_results])
        if search_score_df.shape[0] > 0:
            final_agg = search_score_df.groupby(by=['index', 'cde_id'],
                                                axis=0,
                                                as_index=False).max()
            df = pd.merge(df, final_agg, on=['index', 'cde_id'], how='outer')
        elif col not in df.columns.tolist():
            df[col] = 0
    df['secondary_search'] = [0] * df.shape[0]
    # NaN check via self-inequality (NaN != NaN).
    for col in search_functions.X_FT_STRUCTURE:
        df.loc[df[col] != df[col], col] = 0
    ordered_cols = [(col, search_functions.X_FT_STRUCTURE[col]['column_no'])
                    for col in search_functions.X_FT_STRUCTURE]
    ordered_cols.sort(key=lambda z: z[1])
    # NOTE(review): this keeps only the X_FT_STRUCTURE columns; 'index',
    # 'cde_id' and 'secondary_search' survive only if X_FT_STRUCTURE
    # declares them — confirm against X_FT_STRUCTURE's keys.
    df = df[[o[0] for o in ordered_cols]]
    # Stage 2: follow-on substring search when the primary search is thin.
    if (len(search_results) < NAMEINDEX_SEARCH_REQD) or (len([
            i for i in search_results
            if i[2][0] in ['CDE', 'CDE_Name', 'DEC', 'QuestionText']
    ]) < NAMEINDEX_CDE_REQD):
        new_df = pd.DataFrame(columns=['index', 'cde_id'])
        min_substr_length = max(
            int(np.floor(np.sqrt(len(search_string) * 1. / 2))),
            FOLLOW_ON_SEARCH_MIN_WORD_LEN)
        # Original string (coefficient 1) plus derived substrings with
        # their own score coefficients.
        new_search_strings = [(search_string, 1)
                              ] + search_functions.create_new_search_strings(
                                  search_string, g, min_substr_length)
        if PRINT_STATS:
            print("New Search Strings created, {0:d} strings".format(
                len(new_search_strings)))
            print("{0:1.2f} seconds.\n".format(time.time() - t))
        new_search_results = search_functions.nameindex_query_multiple([
            utils.clean_string_for_fulltext(s[0]) for s in new_search_strings
        ], g, [s[1] for s in new_search_strings])
        if PRINT_STATS:
            print("Follow-on search complete, {0:d} results".format(
                len(new_search_results)))
            print("{0:1.2f} seconds.\n".format(time.time() - t))
        # Same per-column scoring as stage 1, over the follow-on results.
        for col in search_functions.X_FT_STRUCTURE:
            search_score_df = pd.DataFrame(columns=['index', 'cde_id', col])
            for result_type in search_functions.X_FT_STRUCTURE[col][
                    'ft_postprocess_params']:
                search_result_filtered = [(i[0], i[1])
                                          for i in new_search_results
                                          if i[2][0] == result_type]
                search_result_df = pd.DataFrame(search_result_filtered,
                                                columns=['node_index', col])
                q = search_functions.X_FT_STRUCTURE[col][
                    'ft_postprocess_params'][result_type]['query'](",".join([
                        str(node_index_int) for node_index_int in
                        search_result_df['node_index'].tolist()
                    ]))
                agg_type = search_functions.X_FT_STRUCTURE[col][
                    'ft_postprocess_params'][result_type]['aggregation']
                res = utils.query_graph(q, g)
                res_df = pd.DataFrame(
                    res.values(), columns=['node_index', 'index', 'cde_id'])
                scored_results = pd.merge(res_df,
                                          search_result_df,
                                          how='left',
                                          on='node_index')
                if agg_type == 'max':
                    agg_results = scored_results[['index', 'cde_id',
                                                  col]].groupby(
                                                      by=['index', 'cde_id'],
                                                      axis=0,
                                                      as_index=False).max()
                elif agg_type == 'sum':
                    agg_results = scored_results[['index', 'cde_id',
                                                  col]].groupby(
                                                      by=['index', 'cde_id'],
                                                      axis=0,
                                                      as_index=False).sum()
                else:
                    agg_results = pd.DataFrame(
                        columns=['index', 'cde_id', col])
                search_score_df = pd.concat([search_score_df, agg_results])
            if search_score_df.shape[0] > 0:
                final_agg = search_score_df.groupby(by=['index', 'cde_id'],
                                                    axis=0,
                                                    as_index=False).max()
                new_df = pd.merge(new_df,
                                  final_agg,
                                  on=['index', 'cde_id'],
                                  how='outer')
            elif col not in new_df.columns.tolist():
                new_df[col] = 0
        new_df['secondary_search'] = [1] * new_df.shape[0]
        for col in search_functions.X_FT_STRUCTURE:
            new_df.loc[new_df[col] != new_df[col], col] = 0
        ordered_cols = [(col,
                         search_functions.X_FT_STRUCTURE[col]['column_no'])
                        for col in search_functions.X_FT_STRUCTURE]
        ordered_cols.sort(key=lambda z: z[1])
        new_df = new_df[[o[0] for o in ordered_cols]]
        # Only add CDEs not already found by the primary search.
        new_df = new_df[~new_df['index'].isin(df['index'])]
        df = pd.concat([df, new_df])
    if PRINT_STATS:
        print("Initial df created, {0:d} rows".format(df.shape[0]))
        print("{0:1.2f} seconds.\n".format(time.time() - t))
    # Stage 3: enumeration searches over the column's distinct values.
    unique_values = utils.col_unique_values(col_series)
    if len(unique_values) > 0:
        unique_values_clean = [
            utils.clean_string_for_fulltext(i) for i in unique_values
        ]
        enum_search1 = search_functions.enumeration_concept_search(
            unique_values_clean, g)
        enum_search1_df = pd.DataFrame(
            [es for es in enum_search1 if es[2] > MIN_SCORE],
            columns=['index', 'cde_id', 'enum_concept_search'])
        enum_search2 = search_functions.enumeration_ansindex_search(
            unique_values_clean, g)
        enum_search2_df = pd.DataFrame(
            [es for es in enum_search2 if es[2] > MIN_SCORE],
            columns=['index', 'cde_id', 'enum_answer_search'])
        df = pd.merge(df, enum_search1_df, on=['index', 'cde_id'], how='outer')
        df = pd.merge(df, enum_search2_df, on=['index', 'cde_id'], how='outer')
    else:
        df['enum_concept_search'] = 0
        df['enum_answer_search'] = 0
    if PRINT_STATS:
        print("Enum searches complete, {0:d} rows".format(df.shape[0]))
        print("{0:1.2f} seconds.\n".format(time.time() - t))
    # Stage 4: answer-count plausibility score.
    if df.shape[0] > 0:
        answer_count_df = search_functions.create_answer_count_df(
            df['index'].values, g)
        answer_count_df = pd.merge(df['index'],
                                   answer_count_df,
                                   on='index',
                                   how='outer')
        # print("Answer Count DF Created: {0:d} sec".format(int(time.time() - t)))
        # print("Answer Count DF size: {0:d}.\n".format(answer_count_df.shape[0]))
        n_ans = len(unique_values)
        # Count of non-NaN entries (NaN != NaN fails the comparison).
        n_lines = sum(col_series == col_series)
        # Missing answer counts default to the number of observed lines.
        answer_count_df.loc[
            answer_count_df['answer_count'] !=
            answer_count_df['answer_count'], 'answer_count'] = n_lines
        if answer_count_df.shape[0] > 0:
            answer_count_df = pd.DataFrame({
                'index':
                answer_count_df['index'],
                'answer_count_score':
                answer_count_df.apply(
                    lambda z: search_functions.nans_vs_nexp(n_ans, z[1]),
                    axis=1)
            })
        else:
            answer_count_df = pd.DataFrame({
                'index': [],
                'answer_count_score': []
            })
        df = pd.merge(df, answer_count_df, on='index', how='inner')
    if PRINT_STATS:
        print("Answer count complete, {0:d} rows".format(df.shape[0]))
        print("{0:1.2f} seconds.\n".format(time.time() - t))
    for c in df.columns:
        v = df[c] != df[c]
        if any(v):
            df.loc[v, c] = 0
    # Stage 5: value-domain conformance score.
    # NOTE(review): n_ans (and unique_values_clean) are only assigned inside
    # the df.shape[0] > 0 branch above — if df is empty this raises
    # NameError; confirm whether that path is reachable.
    if n_ans > 0:
        query = "MATCH (n:CDE) WHERE ID(n) IN [{0:s}] RETURN DISTINCT ID(n), n.DATATYPE, n.DISPLAY_FORMAT, n.VALUE_DOMAIN_TYPE".format(
            ",".join([str(i) for i in df['index'].values]))
        result = utils.query_graph(query, g)
        values = result.values()
        temp_df = pd.DataFrame(values,
                               columns=[
                                   'index', 'datatype', 'display_format',
                                   'value_domain_type'
                               ])
        enum_ids = list(temp_df['index'].loc[temp_df['value_domain_type'] ==
                                             'Enumerated'].values)
        enum_scores = search_functions.score_enum_values(
            unique_values_clean, enum_ids, g)
        enum_score_df = pd.DataFrame(enum_scores,
                                     columns=['index', 'value_score'])
        temp_df = pd.merge(temp_df, enum_score_df, on='index', how='left')
        # Non-enumerated CDEs: fraction of values conforming to the
        # declared display format, else to the declared datatype.
        for display_format in DISPLAY_FORMATS:
            temp_df.loc[(temp_df['value_domain_type'] == 'NonEnumerated')
                        & (temp_df['display_format'] == display_format),
                        'value_score'] = len([
                            j for j in unique_values if datachecks.
                            check_display_format(str(j), display_format)
                        ]) / len(unique_values)
        for datatype in DATATYPES:
            temp_df.loc[(temp_df['value_domain_type'] == 'NonEnumerated')
                        & (temp_df['display_format'].isnull())
                        & (temp_df['datatype'] == datatype),
                        'value_score'] = len([
                            j for j in unique_values
                            if datachecks.check_datatype(str(j), datatype)
                        ]) / len(unique_values)
        df = pd.merge(df,
                      temp_df[['index', 'value_score']],
                      on='index',
                      how='inner')
    else:
        df['value_score'] = 0
    if PRINT_STATS:
        print("Enum scores complete, {0:d} rows".format(df.shape[0]))
        print("{0:1.2f} seconds.\n".format(time.time() - t))
    df['index'] = df['index'].astype('int')
    # Final NaN sweep over every feature column.
    for col in df.columns:
        df.loc[df[col] != df[col], col] = 0
    return df
def get_enum_answers(cde_index, g):
    """Fetch the (node id, text) pairs of every AnswerText permitted by the
    CDE at the given internal node id."""
    q = ("MATCH (n:CDE) - [:PERMITS] - (a:Answer) - [:CAN_BE] - (at: AnswerText) "
         "WHERE ID(n) = {0:d} RETURN ID(at),at.name").format(cde_index)
    return utils.query_graph(q, g).values()
# NOTE(review): this top-level block appears to be a truncated duplicate of
# the per-column scoring loop inside build_initial_column_data — it ends
# mid-statement and references `search_results`, which is not defined at
# module level. It looks like a copy/paste remnant; confirm and delete.
for col in search_functions.X_FT_STRUCTURE:
    search_score_df = pd.DataFrame(columns=['index', 'cde_id', col])
    for result_type in search_functions.X_FT_STRUCTURE[col][
            'ft_postprocess_params']:
        search_result_filtered = [(i[0], i[1]) for i in search_results
                                  if i[2][0] == result_type]
        search_result_df = pd.DataFrame(search_result_filtered,
                                        columns=['node_index', col])
        q = search_functions.X_FT_STRUCTURE[col]['ft_postprocess_params'][
            result_type]['query'](",".join([
                str(node_index_int) for node_index_int in
                search_result_df['node_index'].tolist()
            ]))
        agg_type = search_functions.X_FT_STRUCTURE[col][
            'ft_postprocess_params'][result_type]['aggregation']
        res = utils.query_graph(q, g)
        res_df = pd.DataFrame(res.values(),
                              columns=['node_index', 'index', 'cde_id'])
        scored_results = pd.merge(res_df,
                                  search_result_df,
                                  how='left',
                                  on='node_index')
        if agg_type == 'max':
            agg_results = scored_results[['index', 'cde_id', col]].groupby(
                by=['index', 'cde_id'], axis=0, as_index=False).max()
        elif agg_type == 'sum':
            agg_results = scored_results[['index', 'cde_id', col]].groupby(
                by=['index', 'cde_id'], axis=0, as_index=False).sum()
        search_score_df = pd.concat([search_score_df, agg_results])
    if search_score_df.shape[0] > 0:
        final_agg = search_score_df.groupby(by=['index', 'cde_id'],
def get_de_concepts(cde_id, g):
    """Concept codes attached (by any relationship) to the DEC of the CDE
    with the given CDE_ID."""
    q = "MATCH (n:CDE) - [:IS_CAT] - (d:DEC) - [] - (c:Concept) WHERE n.CDE_ID = {0:d} RETURN c.CODE".format(
        cde_id)
    return utils.query_graph(q, g).value()
def get_de_concepts_list(cde_index_list, g):
    """Batched variant of get_de_concepts keyed on internal node ids.

    Returns rows of [node id, collected concept codes] per CDE.
    """
    id_csv = ",".join(str(i) for i in cde_index_list)
    query = ("MATCH (n:CDE) - [:IS_CAT] - (d:DEC) - [] - (c:Concept) WHERE ID(n) IN [{0:s}] ".format(
        id_csv) + "RETURN ID(n),COLLECT(c.CODE)")
    return utils.query_graph(query, g).values()
def find_synonyms(input_str, g):
    """Return the lowercase names of Synonym nodes that occur as substrings
    of the (lowercased) input string."""
    needle = str(input_str).lower()
    query = "MATCH (n:Synonym) WHERE \"{0:s}\" CONTAINS n.name_lower RETURN n.name_lower".format(
        needle)
    return utils.query_graph(query, g).value()
def nameindex_query(input_string, g):
    """Single full-text search against the name index.

    Returns rows of [node id, score, node labels].
    """
    query = ("CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") "
             "YIELD node, score RETURN ID(node), score, LABELS(node)").format(
                 str(input_string))
    return utils.query_graph(query, g).values()