Esempio n. 1
0
def create_matched_result(result_no, cde_id, unique_values, graphdb):
    """Assemble one matched-result dict for a CDE.

    Combines the CDE's long name, its DEC (id, name, NCIt concept codes)
    and the value domain classified from the column's unique values.
    """
    value_domain = create_matched_value_domain(cde_id, unique_values, graphdb)
    cde_query = "MATCH (n:CDE) - [:IS_CAT] - (d:DEC) WHERE n.CDE_ID = {0:d} RETURN n.CDE_LONG_NAME, d.DEC_ID, d.name".format(cde_id)
    cde_data = utils.query_graph(cde_query, graphdb).values()[0]
    dec_id = cde_data[1]
    # Concepts linked to the DEC either as property or as object class.
    concept_query = (
        "MATCH (d:DEC) - [:IS_PROP] - (c:Concept) WHERE d.DEC_ID = {0:d} RETURN c.CODE as ConceptCode \n".format(dec_id)
        + "UNION ALL MATCH (d:DEC) - [:IS_OBJ] - (c:Concept) WHERE d.DEC_ID = {0:d} RETURN c.CODE as ConceptCode ".format(dec_id)
    )
    concept_codes = utils.query_graph(concept_query, graphdb).value()
    distinct_codes = list(set(concept_codes))
    return {
        'resultNumber': result_no,
        'result': {
            'dataElement': {
                'id': cde_id,
                'name': cde_data[0]
            },
            'dataElementConcept': {
                'id': dec_id,
                'name': cde_data[2],
                'conceptCodes': ["ncit:{0:s}".format(str(code)) for code in distinct_codes]
            },
            'valueDomain': value_domain
        }
    }
Esempio n. 2
0
def score_enum_values(unique_values, cde_indices, g, MIN_SCORE=0):
    """Score candidate CDEs by how many unique column values match them.

    For each value a nested Cypher subquery searches the answer-text index
    and the synonym name index; a value contributes 1 to a CDE when its
    best search score exceeds MIN_SCORE. Returns rows of
    [cde_index, matched_fraction] sorted by fraction descending.
    """
    # Seed subquery: emit every candidate CDE with g_max = 0 so CDEs with
    # no value hits still appear in the outer aggregation.
    query = "CALL {\n"
    query += " MATCH (c:CDE) WHERE ID(c) IN [{0:s}] \n".format(",".join(
        [str(i) for i in cde_indices]))
    query += " RETURN ID(c) as cde_index, 0 AS g_max \n"
    for u in unique_values:
        query += " UNION ALL "
        # Per-value nested subquery; g is a 0/1 indicator per CDE.
        query += " CALL {\n"
        # Branch 1: direct hit on the answer-text full-text index.
        query += " CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node as a, score ".format(
            str(u))
        query += " MATCH (a:AnswerText) - [:CAN_BE] - (ans:Answer) - [:PERMITS] - (c:CDE)\n "
        # query += " WHERE ID(c) IN [{0:s}] \n".format(",".join([str(i) for i in cde_indices]))
        query += " RETURN ID(c) as cde_index, CASE MAX(score) > {0:1.2f} WHEN TRUE THEN 1 ELSE 0 END as g".format(
            float(MIN_SCORE))
        query += " UNION ALL \n"
        # Branch 2: hit via a concept synonym on the name index.
        query += " CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node as s, score ".format(
            str(u))
        query += " MATCH (s:Synonym) - [:IS_CALLED] - (con:Concept) - [:EQUALS] - (ans:Answer) - [:PERMITS] - (c:CDE)\n "
        # query += " WHERE ID(c) IN [{0:s}] \n".format(",".join([str(i) for i in cde_indices]))
        query += " RETURN ID(c) as cde_index, CASE MAX(score) > {0:1.2f} WHEN TRUE THEN 1 ELSE 0 END as g".format(
            float(MIN_SCORE))
        query += " }\n"
        query += " RETURN cde_index, MAX(g) AS g_max \n"
    query += "}\n"
    # Average the per-value indicator over all unique values.
    query += "RETURN cde_index,SUM(g_max) * 1.0 / {0:d}".format(
        len(unique_values))
    result = utils.query_graph(query, g)
    values = result.values()
    values.sort(key=lambda z: z[1], reverse=True)
    return values
Esempio n. 3
0
def enumeration_ansindex_single_search(single_unique_value, g):
    """Full-text search one value on the answer-text index.

    Returns rows of (CDE node id, CDE_ID, max search score).
    """
    cypher = "CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node AS n, score\n".format(
        str(single_unique_value))
    cypher += " MATCH (n:AnswerText) - [:CAN_BE] - (a:Answer) - [:PERMITS] - (c:CDE) \n"
    cypher += " RETURN ID(c), c.CDE_ID, MAX(score)"
    return utils.query_graph(cypher, g).values()
Esempio n. 4
0
def nameindex_query_multiple(input_string_list, g, score_coef=1, MIN_SCORE=0):
    """Run one UNION'd full-text "nameindex" search for several strings.

    Each string's scores are multiplied by its coefficient; hits below
    MIN_SCORE / coef are filtered inside the query. Returns rows of
    (node id, max normalized score, node labels).

    Args:
        input_string_list: non-empty list of search strings.
        g: graph connection passed through to utils.query_graph.
        score_coef: single coefficient applied to every string, or a list
            with one coefficient per string.
        MIN_SCORE: minimum (pre-normalization) score threshold.

    Raises:
        ValueError: if a coefficient list's length does not match the
            string list, or if input_string_list is empty.
    """
    if isinstance(score_coef, list):
        if len(score_coef) != len(input_string_list):
            raise ValueError("nameindex query score coef length mismatch")
        score_coef_list = score_coef
    else:
        score_coef_list = [score_coef] * len(input_string_list)
    if not input_string_list:
        # Original code failed here with a bare IndexError; make it explicit.
        raise ValueError("nameindex query requires at least one search string")
    # Original duplicated the first iteration's statements before the loop;
    # a single loop builds the identical query text.
    query = "CALL {\n"
    for i, input_string in enumerate(input_string_list):
        if i > 0:
            query += " UNION ALL"
        query += " CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node, score \n".format(
            str(input_string))
        query += " WHERE score > {0:1.2f}\n".format(MIN_SCORE / score_coef_list[i])
        query += " RETURN ID(node) as node_id, {0:1.2f} * score AS normal_score, LABELS(node) as node_labels\n".format(
            score_coef_list[i])
    query += "}\n"
    query += "RETURN node_id,MAX(normal_score),node_labels\n"
    result = utils.query_graph(query, g)
    return result.values()
Esempio n. 5
0
def score_value_domain(submitted_vd, annotated_res, g):
    """Fraction of annotated value-domain entries agreeing with the submission.

    For enumerated CDEs the comparison key is 'conceptCode', otherwise
    'value'. Returns 0 when the CDE has no VALUE_DOMAIN_TYPE row or the
    annotated value domain is empty.
    """
    annotated_vd = annotated_res['valueDomain']
    annotated_cde_id = annotated_res['dataElement']['id']
    query = 'MATCH (n:CDE) WHERE n.CDE_ID = {0:d} RETURN n.VALUE_DOMAIN_TYPE'.format(
        annotated_cde_id)
    domain_types = utils.query_graph(query, g).value()
    if len(domain_types) == 0 or len(annotated_vd) == 0:
        return 0
    observed_values = [entry['observedValue'] for entry in submitted_vd]
    compare_key = 'conceptCode' if domain_types[0] == "Enumerated" else 'value'
    mismatches = 0
    for annotated_entry in annotated_vd:
        observed = annotated_entry['observedValue']
        if observed not in observed_values:
            mismatches += 1
            continue
        submitted_entry = submitted_vd[observed_values.index(observed)]
        if (submitted_entry['permissibleValue'][compare_key]
                != annotated_entry['permissibleValue'][compare_key]):
            mismatches += 1
    return 1 - mismatches / len(annotated_vd)
Esempio n. 6
0
def full_str_match_synonym(col_name, g):
    """Exact (lower-cased) synonym lookup for a column name's expansions.

    Builds an OR'd WHERE clause over every expansion of col_name and
    returns the matching synonym names.
    """
    string_list = expand_string_names(col_name)
    where_clause_list = ["n.lower = '" + i + "'" for i in string_list]
    where_clause = " OR ".join(where_clause_list)
    # BUG FIX: the original concatenated the clause directly into "RETURN",
    # yielding invalid Cypher like "... n.lower = 'x'RETURN n.name".
    query = "MATCH (n:Synonym) WHERE " + where_clause + " RETURN n.name"
    result = utils.query_graph(query, g)
    return result.value()
Esempio n. 7
0
def get_CDEs(ft_result, update_column, g):
    """Resolve one full-text hit to CDE rows via the configured query.

    ft_result is a (node_index, score, labels) row; the first label picks
    the post-processing query from X_FT_STRUCTURE for update_column.
    """
    node_type = ft_result[2][0]
    build_query = X_FT_STRUCTURE[update_column]['ft_postprocess_params'][node_type]['query']
    return utils.query_graph(build_query(ft_result[0]), g).values()
Esempio n. 8
0
def enumeration_concept_single_search(single_unique_value, g):
    """Full-text search one value on the synonym name index.

    Reaches CDEs via Synonym -> Concept -> Answer and returns rows of
    (CDE node id, CDE_ID, max score), best score first.
    """
    cypher = "CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node AS n, score\n".format(
        str(single_unique_value))
    cypher += " MATCH (n:Synonym) - [:IS_CALLED] - (con:Concept) - [:EQUALS] - (a:Answer) - [:PERMITS] - (c:CDE)\n"
    cypher += " RETURN ID(c), c.CDE_ID, MAX(score)"
    rows = utils.query_graph(cypher, g).values()
    return sorted(rows, key=lambda row: row[2], reverse=True)
Esempio n. 9
0
def create_answer_count_df(cde_indices, g):
    """Count permitted answers per CDE node, 1000 node ids per query batch.

    CDEs with DATATYPE 'BOOLEAN' and no counted Answer rows get the fixed
    count 2 that the second query returns. Returns a DataFrame with
    columns ['index', 'answer_count'].
    """
    frames = [pd.DataFrame(columns=['index', 'answer_count'])]
    for start in range(0, len(cde_indices), 1000):
        batch = cde_indices[start:min(start + 1000, len(cde_indices))]
        id_csv = ",".join(batch.astype('int').astype('str'))
        q = "MATCH (n:CDE) - [:PERMITS] - (m:Answer) WHERE ID(n) IN [{0:s}] RETURN ID(n),COUNT(*)".format(
            id_csv)
        answer_counts = utils.query_graph(q, g).values()
        q2 = "MATCH (n:CDE) WHERE ID(n) IN [{0:s}] AND n.DATATYPE = 'BOOLEAN' RETURN ID(n),2".format(
            id_csv)
        boolean_counts = utils.query_graph(q2, g).values()
        # Keep the boolean fallback only for ids without a real answer count.
        counted_ids = [row[0] for row in answer_counts]
        boolean_counts = [row for row in boolean_counts if row[0] not in counted_ids]
        frames.append(
            pd.DataFrame(answer_counts + boolean_counts,
                         columns=['index', 'answer_count']))
    return pd.concat(frames)
Esempio n. 10
0
def classify_single_enum_value(col_value,cde_index,g, SEARCH_CUTOFF = 0):
    """Map one observed value onto a permissible value of an enumerated CDE.

    Searches the answer-text index and the synonym name index, both
    restricted to answers the CDE node permits, and returns
    {'observedValue': str, 'permissibleValue': {'value', 'conceptCode'}}.
    'value' is 'NOMATCH' and 'conceptCode' is None when no hit scores
    above SEARCH_CUTOFF.
    """
    output_dict = {'observedValue':str(col_value),'permissibleValue':{}}
    # Direct answer-text matches for answers this CDE permits.
    query = "CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node as a, score ".format(str(col_value))
    query += "MATCH (n:CDE) - [:PERMITS] - (ans:Answer) - [:CAN_BE] - (a:AnswerText) WHERE ID(n) = {0:d} ".format(cde_index)
    query += "RETURN ID(ans), 'Answer', score, a.name"
    result = utils.query_graph(query,g)
    answer_values = result.values()
    # Synonym matches reached through concepts the permitted answers equal.
    query = "CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node as s, score ".format(str(col_value))
    query += "MATCH (n:CDE) - [:PERMITS] - (ans:Answer) - [:EQUALS] - (con:Concept) - [:IS_CALLED] - (s:Synonym) WHERE ID(n) = {0:d} ".format(cde_index)
    query += "RETURN ID(ans), 'Synonym', score, s.name, con.CODE"
    result = utils.query_graph(query,g)
    syn_values = result.values()
    all_results = answer_values + syn_values
    # Rows are (answer_id, kind, score, name[, concept_code]); drop weak hits.
    all_results = [i for i in all_results if i[2] > SEARCH_CUTOFF]
    if len(all_results) > 0:
        # The best-scoring row decides which Answer node wins.
        all_results.sort(key=lambda z: z[2],reverse=True)
        ans_index = all_results[0][0]
        ans_results = [i for i in all_results if i[0] == ans_index]
        # Now we need to choose the best synonym
        synonyms = [i for i in ans_results if i[1] == 'Synonym']
        if len(synonyms) > 0:  #Choose the best based on 1: search score, and 2: stringdist
            synonyms.sort(key=lambda z: (-z[2],stringdist.levenshtein_norm(str(z[3]).lower(),str(col_value).lower())))
            output_dict['permissibleValue']['value'] = str(synonyms[0][3])
            output_dict['permissibleValue']['conceptCode'] = 'ncit:' + str(synonyms[0][4])
        else:
            # Winning hit was an answer text: fetch that answer's concept
            # synonyms and pick the one closest to the observed value.
            query = "MATCH (a:Answer) - [:EQUALS] - (c:Concept) - [:IS_CALLED] - (s:Synonym) WHERE ID(a) = {0:d} RETURN c.CODE,s.name".format(ans_index)
            result = utils.query_graph(query,g)
            values = result.values()
            if len(values) > 0:
                values.sort(key = lambda z: stringdist.levenshtein_norm(str(col_value).lower(),str(z[1]).lower()))
                output_dict['permissibleValue']['value'] = str(values[0][1])
                output_dict['permissibleValue']['conceptCode'] = 'ncit:' + str(values[0][0])
            else:
                # Answer has no linked concept: fall back to the raw answer text.
                output_dict['permissibleValue']['value'] = str(ans_results[0][3])
                output_dict['permissibleValue']['conceptCode'] = None
    else:
        output_dict['permissibleValue']['value'] = 'NOMATCH'
        output_dict['permissibleValue']['conceptCode'] = None
    return output_dict
Esempio n. 11
0
def enumeration_exact_search(unique_values_list, g):
    """Exact case-insensitive match of values against AnswerText nodes.

    Scores each CDE by the fraction of input values whose lower-cased
    form equals one of its answer texts; returns
    (CDE node id, CDE_ID, coverage) rows, highest coverage first.
    """
    quoted_values = list(set("'" + str(v).lower() + "'" for v in unique_values_list))
    cypher = "MATCH (n:AnswerText) - [:CAN_BE] - (m:Answer) - [:PERMITS] - (c:CDE) WHERE n.name_lower in [{0:s}] ".format(
        ",".join(quoted_values))
    cypher += "WITH DISTINCT n.name_lower AS name_lower, c AS c_distinct "
    cypher += "RETURN ID(c_distinct), c_distinct.CDE_ID, COUNT(*)*1.0/{0:d}".format(
        len(unique_values_list))
    rows = utils.query_graph(cypher, g).values()
    rows.sort(key=lambda row: row[2], reverse=True)
    return rows
Esempio n. 12
0
def create_matched_value_domain(cde_id, unique_values, graphdb):
    """Classify the column's unique values against every node for this CDE_ID.

    Each candidate node's classification is scored by how many values came
    back NOMATCH/NONCONFORMING; the value domain with the fewest failures
    wins. Returns [] when there are no unique values.
    """
    if len(unique_values) == 0:
        return []
    q = "MATCH (n:CDE) WHERE n.CDE_ID = {0:d} RETURN ID(n)".format(cde_id)
    node_ids = utils.query_graph(q, graphdb).value()
    scored = []
    for node_id in node_ids:
        classified = value_classifiers.classify_values(unique_values, node_id,
                                                       graphdb)
        failures = len([
            entry for entry in classified
            if entry['permissibleValue']['value'] in ['NOMATCH', 'NONCONFORMING']
        ])
        scored.append({'score': failures, 'vd': classified})
    scored.sort(key=lambda candidate: candidate['score'])
    return scored[0]['vd']
Esempio n. 13
0
def classify_values(col_values, cde_index, g):
    """Classify each column value against one CDE's value domain.

    Dispatches on the CDE's attributes: enumerated domains use the
    per-value enum search, non-enumerated ones check the display format
    when present, otherwise the datatype. Returns one dict per value.
    """
    node_id = int(cde_index)
    query = "MATCH (n:CDE) where ID(n) = {0:d} RETURN n.DATATYPE, n.VALUE_DOMAIN_TYPE, n.DISPLAY_FORMAT, n.CDE_ID".format(node_id)
    attributes = utils.query_graph(query, g).values()[0]
    datatype, domain_type, display_format = attributes[0], attributes[1], attributes[2]
    if domain_type == 'Enumerated':
        return [classify_single_enum_value(v, node_id, g) for v in col_values]
    if display_format is not None:
        return [classify_display_value(v, display_format) for v in col_values]
    return [classify_datatype(v, datatype) for v in col_values]
Esempio n. 14
0
def score_value_match(value_dict_list, cde_index, g):
    """Fraction of classified values that matched the CDE's value domain.

    Enumerated domains count every value that found a permissible value
    (not 'NOMATCH'); other domains count only values flagged 'CONFORMING'.
    """
    query = "MATCH (n:CDE) WHERE ID(n) = {0:d} RETURN n.VALUE_DOMAIN_TYPE".format(
        int(cde_index))
    value_domain_type = utils.query_graph(query, g).value()[0]
    enumerated = value_domain_type == "Enumerated"
    matched = 0
    for entry in value_dict_list:
        v = entry['permissibleValue']['value']
        if (v != "NOMATCH") if enumerated else (v == "CONFORMING"):
            matched += 1
    return matched / len(value_dict_list)
Esempio n. 15
0
def enumeration_ansindex_search(value_list, g):
    """Score CDEs by summed best answer-index match over distinct values.

    Builds one UNION'd subquery per distinct value, sums each CDE's best
    score per value and divides by the number of distinct values. Returns
    (CDE node id, CDE_ID, average score) rows, highest first.
    """
    distinct_values = list(set(value_list))

    def subquery(v):
        # One full-text search segment for a single value.
        s = " CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node AS n, score\n".format(
            str(v))
        s += " MATCH (n:AnswerText) - [:CAN_BE] - (a:Answer) - [:PERMITS] - (c:CDE)\n"
        s += " RETURN ID(c) AS cde_index, c.CDE_ID as cde_id, MAX(score) AS max_score \n"
        return s

    query = "CALL {\n" + subquery(distinct_values[0])
    for v in distinct_values[1:]:
        query += " UNION ALL\n" + subquery(v)
    query += "}\n"
    query += "RETURN cde_index, cde_id, SUM(max_score) * 1.0 / {0:d}".format(
        len(distinct_values))
    rows = utils.query_graph(query, g).values()
    if len(rows) > 1:
        rows.sort(key=lambda row: row[2], reverse=True)
    return rows
Esempio n. 16
0
def build_initial_column_data(col_series,
                              g,
                              NAMEINDEX_SEARCH_REQD=25,
                              NAMEINDEX_CDE_REQD=5,
                              MIN_SCORE=0,
                              FOLLOW_ON_SEARCH_MIN_WORD_LEN=3,
                              PRINT_STATS=False):
    """Build the candidate-CDE score DataFrame for one input column.

    Runs a full-text name search on the cleaned column name, scores the
    hits per X_FT_STRUCTURE score column, optionally runs a follow-on
    search with expanded search strings when the first one is too thin,
    then adds enumeration-match, answer-count and value-format scores.
    Returns a DataFrame with one row per candidate CDE node ('index')
    and one column per feature score.
    NOTE(review): the exact row shapes returned by the search_functions
    helpers are inferred from how they are unpacked here — confirm
    against their definitions.
    """
    import time
    t = time.time()
    col_name = col_series.name
    # Normalize the column name (underscores/periods -> spaces, camelCase
    # split) and strip characters that would break the full-text parser.
    search_string = utils.clean_string_for_fulltext(
        utils.lower_upper_split(
            utils.period_replace(utils.underscore_replace(col_name))))
    df = pd.DataFrame(columns=['index', 'cde_id'])
    # Every node-label result type used anywhere in X_FT_STRUCTURE ...
    result_types = list(
        set([
            j for i in search_functions.X_FT_STRUCTURE for j in
            search_functions.X_FT_STRUCTURE[i]['ft_postprocess_params']
        ]))
    # ... and the reverse map result type -> score columns using it.
    # NOTE(review): result_type_dict is never read below — looks like dead
    # code; confirm before removing.
    result_type_dict = {
        i: [
            j for j in search_functions.X_FT_STRUCTURE
            if i in search_functions.X_FT_STRUCTURE[j]['ft_postprocess_params']
        ]
        for i in result_types
    }
    search_results = search_functions.nameindex_query_multiple([search_string],
                                                               g)
    if PRINT_STATS:
        print("Initial search complete, {0:d} results".format(
            len(search_results)))
        print("{0:1.2f} seconds.\n".format(time.time() - t))
    # Score the initial hits: per score column, post-process each result
    # type's hits with its configured query, then aggregate (max or sum).
    for col in search_functions.X_FT_STRUCTURE:
        search_score_df = pd.DataFrame(columns=['index', 'cde_id', col])
        for result_type in search_functions.X_FT_STRUCTURE[col][
                'ft_postprocess_params']:
            # Keep (node_index, score) for hits whose first label matches.
            search_result_filtered = [(i[0], i[1]) for i in search_results
                                      if i[2][0] == result_type]
            search_result_df = pd.DataFrame(search_result_filtered,
                                            columns=['node_index', col])
            q = search_functions.X_FT_STRUCTURE[col]['ft_postprocess_params'][
                result_type]['query'](",".join([
                    str(node_index_int) for node_index_int in
                    search_result_df['node_index'].tolist()
                ]))
            agg_type = search_functions.X_FT_STRUCTURE[col][
                'ft_postprocess_params'][result_type]['aggregation']
            res = utils.query_graph(q, g)
            res_df = pd.DataFrame(res.values(),
                                  columns=['node_index', 'index', 'cde_id'])
            # Attach each resolved CDE row to its originating hit's score.
            scored_results = pd.merge(res_df,
                                      search_result_df,
                                      how='left',
                                      on='node_index')
            if agg_type == 'max':
                agg_results = scored_results[['index', 'cde_id', col]].groupby(
                    by=['index', 'cde_id'], axis=0, as_index=False).max()
            elif agg_type == 'sum':
                agg_results = scored_results[['index', 'cde_id', col]].groupby(
                    by=['index', 'cde_id'], axis=0, as_index=False).sum()
            else:
                agg_results = pd.DataFrame(columns=['index', 'cde_id', col])
            search_score_df = pd.concat([search_score_df, agg_results])
        if search_score_df.shape[0] > 0:
            final_agg = search_score_df.groupby(by=['index', 'cde_id'],
                                                axis=0,
                                                as_index=False).max()
            df = pd.merge(df, final_agg, on=['index', 'cde_id'], how='outer')
        elif col not in df.columns.tolist():
            df[col] = 0
    df['secondary_search'] = [0] * df.shape[0]
    # x != x is only true for NaN: zero-fill missing scores.
    for col in search_functions.X_FT_STRUCTURE:
        df.loc[df[col] != df[col], col] = 0
    ordered_cols = [(col, search_functions.X_FT_STRUCTURE[col]['column_no'])
                    for col in search_functions.X_FT_STRUCTURE]
    ordered_cols.sort(key=lambda z: z[1])
    df = df[[o[0] for o in ordered_cols]]
    # Follow-on search when the first pass returned too few hits overall
    # or too few CDE-like hits.
    if (len(search_results) < NAMEINDEX_SEARCH_REQD) or (len([
            i for i in search_results
            if i[2][0] in ['CDE', 'CDE_Name', 'DEC', 'QuestionText']
    ]) < NAMEINDEX_CDE_REQD):
        new_df = pd.DataFrame(columns=['index', 'cde_id'])
        min_substr_length = max(
            int(np.floor(np.sqrt(len(search_string) * 1. / 2))),
            FOLLOW_ON_SEARCH_MIN_WORD_LEN)
        # The original search string keeps coefficient 1; expansions carry
        # their own coefficients from create_new_search_strings.
        new_search_strings = [(search_string, 1)
                              ] + search_functions.create_new_search_strings(
                                  search_string, g, min_substr_length)
        if PRINT_STATS:
            print("New Search Strings created, {0:d} strings".format(
                len(new_search_strings)))
            print("{0:1.2f} seconds.\n".format(time.time() - t))
        new_search_results = search_functions.nameindex_query_multiple([
            utils.clean_string_for_fulltext(s[0]) for s in new_search_strings
        ], g, [s[1] for s in new_search_strings])
        if PRINT_STATS:
            print("Follow-on search complete, {0:d} results".format(
                len(new_search_results)))
            print("{0:1.2f} seconds.\n".format(time.time() - t))
        # Same scoring pass as above, applied to the follow-on results.
        for col in search_functions.X_FT_STRUCTURE:
            search_score_df = pd.DataFrame(columns=['index', 'cde_id', col])
            for result_type in search_functions.X_FT_STRUCTURE[col][
                    'ft_postprocess_params']:
                search_result_filtered = [(i[0], i[1])
                                          for i in new_search_results
                                          if i[2][0] == result_type]
                search_result_df = pd.DataFrame(search_result_filtered,
                                                columns=['node_index', col])
                q = search_functions.X_FT_STRUCTURE[col][
                    'ft_postprocess_params'][result_type]['query'](",".join([
                        str(node_index_int) for node_index_int in
                        search_result_df['node_index'].tolist()
                    ]))
                agg_type = search_functions.X_FT_STRUCTURE[col][
                    'ft_postprocess_params'][result_type]['aggregation']
                res = utils.query_graph(q, g)
                res_df = pd.DataFrame(
                    res.values(), columns=['node_index', 'index', 'cde_id'])
                scored_results = pd.merge(res_df,
                                          search_result_df,
                                          how='left',
                                          on='node_index')
                if agg_type == 'max':
                    agg_results = scored_results[['index', 'cde_id',
                                                  col]].groupby(
                                                      by=['index', 'cde_id'],
                                                      axis=0,
                                                      as_index=False).max()
                elif agg_type == 'sum':
                    agg_results = scored_results[['index', 'cde_id',
                                                  col]].groupby(
                                                      by=['index', 'cde_id'],
                                                      axis=0,
                                                      as_index=False).sum()
                else:
                    agg_results = pd.DataFrame(
                        columns=['index', 'cde_id', col])
                search_score_df = pd.concat([search_score_df, agg_results])
            if search_score_df.shape[0] > 0:
                final_agg = search_score_df.groupby(by=['index', 'cde_id'],
                                                    axis=0,
                                                    as_index=False).max()
                new_df = pd.merge(new_df,
                                  final_agg,
                                  on=['index', 'cde_id'],
                                  how='outer')
            elif col not in new_df.columns.tolist():
                new_df[col] = 0
        new_df['secondary_search'] = [1] * new_df.shape[0]
        for col in search_functions.X_FT_STRUCTURE:
            new_df.loc[new_df[col] != new_df[col], col] = 0
        ordered_cols = [(col,
                         search_functions.X_FT_STRUCTURE[col]['column_no'])
                        for col in search_functions.X_FT_STRUCTURE]
        ordered_cols.sort(key=lambda z: z[1])
        new_df = new_df[[o[0] for o in ordered_cols]]
        # Keep only follow-on candidates the initial search did not find.
        new_df = new_df[~new_df['index'].isin(df['index'])]
        df = pd.concat([df, new_df])
    if PRINT_STATS:
        print("Initial df created, {0:d} rows".format(df.shape[0]))
        print("{0:1.2f} seconds.\n".format(time.time() - t))
    unique_values = utils.col_unique_values(col_series)
    if len(unique_values) > 0:
        unique_values_clean = [
            utils.clean_string_for_fulltext(i) for i in unique_values
        ]
        # Two enumeration searches: via concept synonyms and via answer text.
        enum_search1 = search_functions.enumeration_concept_search(
            unique_values_clean, g)
        enum_search1_df = pd.DataFrame(
            [es for es in enum_search1 if es[2] > MIN_SCORE],
            columns=['index', 'cde_id', 'enum_concept_search'])
        enum_search2 = search_functions.enumeration_ansindex_search(
            unique_values_clean, g)
        enum_search2_df = pd.DataFrame(
            [es for es in enum_search2 if es[2] > MIN_SCORE],
            columns=['index', 'cde_id', 'enum_answer_search'])
        df = pd.merge(df, enum_search1_df, on=['index', 'cde_id'], how='outer')
        df = pd.merge(df, enum_search2_df, on=['index', 'cde_id'], how='outer')
    else:
        df['enum_concept_search'] = 0
        df['enum_answer_search'] = 0
    if PRINT_STATS:
        print("Enum searches complete, {0:d} rows".format(df.shape[0]))
        print("{0:1.2f} seconds.\n".format(time.time() - t))
    if df.shape[0] > 0:
        answer_count_df = search_functions.create_answer_count_df(
            df['index'].values, g)
        answer_count_df = pd.merge(df['index'],
                                   answer_count_df,
                                   on='index',
                                   how='outer')
        # print("Answer Count DF Created: {0:d} sec".format(int(time.time() - t)))
        # print("Answer Count DF size: {0:d}.\n".format(answer_count_df.shape[0]))
        n_ans = len(unique_values)
        # Count of non-NaN entries (NaN != NaN) in the column.
        n_lines = sum(col_series == col_series)
        # CDEs with no answer count default to the column's row count.
        answer_count_df.loc[
            answer_count_df['answer_count'] != answer_count_df['answer_count'],
            'answer_count'] = n_lines
        if answer_count_df.shape[0] > 0:
            # Compare observed distinct values vs expected answer count.
            answer_count_df = pd.DataFrame({
                'index':
                answer_count_df['index'],
                'answer_count_score':
                answer_count_df.apply(
                    lambda z: search_functions.nans_vs_nexp(n_ans, z[1]),
                    axis=1)
            })
        else:
            answer_count_df = pd.DataFrame({
                'index': [],
                'answer_count_score': []
            })
        df = pd.merge(df, answer_count_df, on='index', how='inner')
        if PRINT_STATS:
            print("Answer count complete, {0:d} rows".format(df.shape[0]))
            print("{0:1.2f} seconds.\n".format(time.time() - t))
        # Zero-fill NaNs across every column before value scoring.
        for c in df.columns:
            v = df[c] != df[c]
            if any(v):
                df.loc[v, c] = 0
        if n_ans > 0:
            query = "MATCH (n:CDE) WHERE ID(n) IN [{0:s}] RETURN DISTINCT ID(n), n.DATATYPE, n.DISPLAY_FORMAT, n.VALUE_DOMAIN_TYPE".format(
                ",".join([str(i) for i in df['index'].values]))
            result = utils.query_graph(query, g)
            values = result.values()
            temp_df = pd.DataFrame(values,
                                   columns=[
                                       'index', 'datatype', 'display_format',
                                       'value_domain_type'
                                   ])
            # Enumerated CDEs: value_score from the enum full-text scorer.
            enum_ids = list(temp_df['index'].loc[temp_df['value_domain_type']
                                                 == 'Enumerated'].values)
            enum_scores = search_functions.score_enum_values(
                unique_values_clean, enum_ids, g)
            enum_score_df = pd.DataFrame(enum_scores,
                                         columns=['index', 'value_score'])
            temp_df = pd.merge(temp_df, enum_score_df, on='index', how='left')
            # Non-enumerated with a display format: fraction of values
            # passing the format check.
            for display_format in DISPLAY_FORMATS:
                temp_df.loc[(temp_df['value_domain_type'] == 'NonEnumerated') &
                            (temp_df['display_format'] == display_format),
                            'value_score'] = len([
                                j for j in unique_values if datachecks.
                                check_display_format(str(j), display_format)
                            ]) / len(unique_values)
            # Non-enumerated without a display format: fraction passing
            # the datatype check.
            for datatype in DATATYPES:
                temp_df.loc[(temp_df['value_domain_type'] == 'NonEnumerated') &
                            (temp_df['display_format'].isnull()) &
                            (temp_df['datatype'] == datatype),
                            'value_score'] = len([
                                j for j in unique_values
                                if datachecks.check_datatype(str(j), datatype)
                            ]) / len(unique_values)
            df = pd.merge(df,
                          temp_df[['index', 'value_score']],
                          on='index',
                          how='inner')
        else:
            df['value_score'] = 0
        if PRINT_STATS:
            print("Enum scores complete, {0:d} rows".format(df.shape[0]))
            print("{0:1.2f} seconds.\n".format(time.time() - t))
        df['index'] = df['index'].astype('int')
    # Final NaN -> 0 sweep over every column.
    for col in df.columns:
        df.loc[df[col] != df[col], col] = 0
    return df
Esempio n. 17
0
def get_enum_answers(cde_index, g):
    """Return (AnswerText node id, text) pairs permitted by one CDE node."""
    cypher = "MATCH (n:CDE) - [:PERMITS] - (a:Answer) - [:CAN_BE] - (at: AnswerText) WHERE ID(n) = {0:d} RETURN ID(at),at.name".format(
        cde_index)
    return utils.query_graph(cypher, g).values()
Esempio n. 18
0
 # NOTE(review): truncated excerpt — this fragment duplicates the
 # search-scoring loop from build_initial_column_data and is cut off
 # mid-statement at the final groupby call; it is not runnable as-is.
 for col in search_functions.X_FT_STRUCTURE:
     search_score_df = pd.DataFrame(columns=['index', 'cde_id', col])
     for result_type in search_functions.X_FT_STRUCTURE[col][
             'ft_postprocess_params']:
         # Keep (node_index, score) for hits whose first label matches.
         search_result_filtered = [(i[0], i[1]) for i in search_results
                                   if i[2][0] == result_type]
         search_result_df = pd.DataFrame(search_result_filtered,
                                         columns=['node_index', col])
         q = search_functions.X_FT_STRUCTURE[col]['ft_postprocess_params'][
             result_type]['query'](",".join([
                 str(node_index_int) for node_index_int in
                 search_result_df['node_index'].tolist()
             ]))
         agg_type = search_functions.X_FT_STRUCTURE[col][
             'ft_postprocess_params'][result_type]['aggregation']
         res = utils.query_graph(q, g)
         res_df = pd.DataFrame(res.values(),
                               columns=['node_index', 'index', 'cde_id'])
         scored_results = pd.merge(res_df,
                                   search_result_df,
                                   how='left',
                                   on='node_index')
         if agg_type == 'max':
             agg_results = scored_results[['index', 'cde_id', col]].groupby(
                 by=['index', 'cde_id'], axis=0, as_index=False).max()
         elif agg_type == 'sum':
             agg_results = scored_results[['index', 'cde_id', col]].groupby(
                 by=['index', 'cde_id'], axis=0, as_index=False).sum()
         search_score_df = pd.concat([search_score_df, agg_results])
     if search_score_df.shape[0] > 0:
         final_agg = search_score_df.groupby(by=['index', 'cde_id'],
Esempio n. 19
0
def get_de_concepts(cde_id, g):
    """Concept codes attached to the DEC of the CDE with this CDE_ID."""
    cypher = "MATCH (n:CDE) - [:IS_CAT] - (d:DEC) - [] - (c:Concept) WHERE n.CDE_ID = {0:d} RETURN c.CODE".format(
        cde_id)
    return utils.query_graph(cypher, g).value()
Esempio n. 20
0
def get_de_concepts_list(cde_index_list, g):
    """Per-node lists of DEC concept codes for many CDE node ids at once."""
    id_csv = ",".join(str(i) for i in cde_index_list)
    cypher = "MATCH (n:CDE) - [:IS_CAT] - (d:DEC) - [] - (c:Concept) WHERE ID(n) IN [{0:s}] ".format(id_csv)
    cypher += "RETURN ID(n),COLLECT(c.CODE)"
    return utils.query_graph(cypher, g).values()
Esempio n. 21
0
def find_synonyms(input_str, g):
    """Synonyms whose lower-cased name is a substring of the input string."""
    lowered = str(input_str).lower()
    cypher = "MATCH (n:Synonym) WHERE \"{0:s}\" CONTAINS n.name_lower RETURN n.name_lower".format(lowered)
    return utils.query_graph(cypher, g).value()
Esempio n. 22
0
def nameindex_query(input_string, g):
    """Full-text search the "nameindex"; return (node id, score, labels) rows."""
    cypher = ("CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node, score "
              "RETURN ID(node), score, LABELS(node)").format(str(input_string))
    return utils.query_graph(cypher, g).values()