def main(work_dir=None, model='rf', set_of_classes=(0, 1, 2, 3)):
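    # Assumes module-level imports: sys, logging, pathlib.Path, pandas as pd,
    # collections.defaultdict, itertools.combinations, plus the project helpers
    # used below (get_ML_parameters, combine_list_dfs, normalize_df_columns,
    # rearrange_for_testing, rfecv_classifier, rfe_classifier, set_up_classifier,
    # CalculatePerformance, save_to_json).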
    while work_dir is None or not Path(work_dir).exists():
        print("Unable to locate directory.")
        work_dir = input("Please enter working directory: ")

    # folder with features split by section
    work_dir = Path(work_dir)
    DATA_DIR = work_dir / 'section_fm'
    GOLD_FILE = work_dir / 'GOLD_multiclass.csv'

    ML_param_file = work_dir / 'data' / 'ML_model_settings' / 'ML_default_settings.json'
    if ML_param_file.exists():
        params = get_ML_parameters(use_default=False, dict_path=ML_param_file)
    else:
        params = get_ML_parameters(use_default=True)


    logging.info("Loading Data from: " + str(DATA_DIR))

    pathlist = DATA_DIR.glob('*.csv')

    fm_by_section = {}
    lionc = []  # list of section names found
    sections_written = defaultdict(bool)  # defaults to False

    for path in pathlist:
        section_name = path.stem
        lionc.append(section_name)
        fm_by_section[section_name] = pd.read_csv(path, index_col=0)
        fm_by_section[section_name].fillna(0, inplace=True)

    if not lionc:
        logging.error("No files found at: " + str(DATA_DIR))
        sys.exit(1)


    # load gold
    gold = pd.read_csv(GOLD_FILE, index_col=0)
    gold.fillna(0, inplace=True)
    # task columns are all gold columns except the train/test split flags
    tasks = [x for x in gold if x not in ['test', 'train']]

    frac_features_for_running_f1 = 0.01

    # set the following to choose between RFECV and RFE
    run_f1_with_rfecv = True
    logging.info("frac_features_for_running_f1: " + str(frac_features_for_running_f1) + " with CV?: " + str(run_f1_with_rfecv))
    no_feature_elim = False  # if run_f1_with_rfecv is False, can optionally skip feature elimination entirely

    logging.info("list of sections found:")
    logging.info(str(lionc))
    logging.info("model to run: " + str(model))

    rfecv_top_features = {}
    NUM_SECT_TO_COMBINE = len(lionc)  # add all sections
    sect_combinations = combinations(lionc, NUM_SECT_TO_COMBINE)
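    # NUM_SECT_TO_COMBINE == len(lionc), so combinations() yields exactly one
    # tuple containing every section and the loop below runs once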

    for combo in sect_combinations:

        section_list = [fm_by_section[section] for section in combo]

        merged = combine_list_dfs(section_list)

        merged = normalize_df_columns(merged, 0, tf=lambda x: x ** (1 / 3))  # cube-root transform

        # initial pass to obtain the full feature list for the report below
        train, test, features = rearrange_for_testing(merged, gold)

        f1_avg, f1_macro_avg = 0, 0

        print("features:", len(features))
        output_label_line = '%s %8s %8s %8s %8s %8s %8s' % ("Morbidity Results", "P-micro", "P-macro", "R-micro", "R-macro", "F1-micro", "F1-macro")
        logging.info(output_label_line)



        for task in tasks:
            train, test, features = rearrange_for_testing(merged, gold, task, set_of_classes)
            # optional feature filtering: drop two-character feature names
            features = [f for f in features if len(f) != 2]
            # features = [f for f in features if f[-1] != 'n']

            if run_f1_with_rfecv:
                preds, feat_important, num_feat = rfecv_classifier(model, train_data=train[features], train_class=train[task], test_data=test[features], CV_=3, fraction_feat_to_keep=frac_features_for_running_f1, LM_params=params, save_model=True)
                rfecv_top_features[task] = feat_important
            elif no_feature_elim:
                clf = set_up_classifier(model, 0, LM_params=params)
                clf.fit(train[features], train[task])
                preds = clf.predict(test[features])
            else:
                preds, feat_important, num_feat = rfe_classifier(model, train_data=train[features], train_class=train[task].astype(int), test_data=test[features], CV_=10, fraction_feat_to_keep=frac_features_for_running_f1, LM_params=params)

            # metric order matches the header logged above:
            # P-micro, P-macro, R-micro, R-macro, F1-micro, F1-macro
            results = CalculatePerformance.calculate_metrics(list(test[task]), list(preds), set_of_classes, output_type='values')
            f1 = results[4]
            f1_macro = results[5]

            f1_avg += f1 / len(tasks)
            f1_macro_avg += f1_macro / len(tasks)

            logging.info("task: " + str(task) + ' ' + CalculatePerformance.calculate_metrics(list(test[task]), list(preds), set_of_classes, output_type='text').strip())

    file_name = work_dir / 'models' / 'top_features.json'
    save_to_json(rfecv_top_features, file_name)

    logging.info("Averages: f1: %.6f, f1_macro: %.6f" % (f1_avg, f1_macro_avg))
Example 2
def main(resource_bundle_dir):
    from pathlib import Path
    import operator
    from collections import defaultdict
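    # save_to_json (used below) is assumed to be a project helper available at module level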

    snomed_savefile = 'snomed_found.json'
    rxnorm_savefile = 'rxcui_found.json'

    rxnorm_count = defaultdict(int)
    rxnorm_text = {}

    snomed_count = defaultdict(int)
    snomed_text = {}

    print("Loading data from: " + resource_bundle_dir)
    pathlist = Path(resource_bundle_dir).glob('*.json')
    for path in pathlist:

        with open(str(path), 'r', encoding='UTF-8') as fp:
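            # naive line-by-line scan rather than full JSON parsing; assumes the
            # bundle is pretty-printed with one "key": "value" pair per line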
            line = fp.readline()
            while line:
                line = line.strip()
                if line == '"system": "http://www.nlm.nih.gov/research/umls/rxnorm",':
                    line = fp.readline().strip()
                    if line[0:9] == '"code": "':
                        code = line[9:].rstrip('",')  # strip closing quote and optional trailing comma
                        rxnorm_count[code] += 1
                        # try to get the display text within the next few lines
                        for _ in range(3):
                            line = fp.readline().strip()
                            if line[0:9] == '"text": "':
                                rxnorm_text[code] = line[9:].rstrip('",')
                                break
                if line == '"system": "http://snomed.info/sct",':
                    line = fp.readline().strip()
                    if line[0:9] == '"code": "':
                        code = line[9:-1]
                        snomed_count[code] += 1
                        #try to get text
                        for _ in range(3):
                            line = fp.readline().strip()
                            if line[0:6] == '"text"':
                                text = line[9:-1]
                                snomed_text[code] = text
                                break

                line = fp.readline()

    rxnorm_count = sorted(rxnorm_count.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

    rxcui_description = {}
    for key, _ in rxnorm_count:
        rxcui_description[key] = rxnorm_text.get(key, '')  # display text may not have been found

    snomed_count = sorted(snomed_count.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

    snomed_description = {}
    for key, _ in snomed_count:
        snomed_description[key] = snomed_text.get(key, '')  # display text may not have been found

    print("\nrx_norm_list:")
    print("rx Entries: ", len(rxnorm_count))
    print(rxnorm_count)
    print(rxcui_description)
    print("\nsnomed_list:")
    print("Snomed Entries: ", len(snomed_count))
    print(snomed_count)
    print(snomed_description)

    # save data
    save_to_json(snomed_description, snomed_savefile, indent=4)
    save_to_json(rxcui_description, rxnorm_savefile, indent=4)

    file_ascii = 'snomed_found.txt'
    with open(file_ascii, 'w') as write_plain:
        for key, value in snomed_description.items():
            write_plain.write(str(key) + ': ' + str(value) + '\n')
Example 3
def main(working_dir, depth=10):
    # base_uri is assumed to be a module-level setting for the SNOMED CT server
    if depth > 0 and len(base_uri) == 0:
        exit("Please set the base URL for the SNOMED CT server in: " + __file__)

    log_settings(filename="SNOMED_LOOKUP.log", level=logging.INFO, filemode='w', stdout=True)
    working_dir = Path(working_dir)
    logging.info("Starting Snomed Lookup")

    # save progress after every N base SNOMED CT lookups in case the run is interrupted
    SAVE_PROGRESS_EVERY_N = 10  # set to -1 to disable intermediate saves

    # dictionary mapping SNOMED CT codes to the original text description of each code;
    # these are the SNOMED entries to look up
    snomed_file_to_lookup = working_dir / 'data' / 'snomed_found.json'
    logging.info("Loading data from: " + str(snomed_file_to_lookup))

    save_snomed_to_parents_file = working_dir / 'data' / 'snomed_parents_inferred.json'
    snomed_ancestor_file = working_dir / 'data' / 'snomed_ancestor_inferred.json'
    snomed_code_descriptions_from_query = working_dir / 'data' / 'snomed_description_from_query.json'

    snomed_to_description = load_dict_json(snomed_file_to_lookup)
    sctid_to_parents = load_dict_json(save_snomed_to_parents_file, create_local_if_not_found=True)
    snomed_to_ancestors = load_dict_json(snomed_ancestor_file, create_local_if_not_found=True)
    sctid_to_desc = load_dict_json(snomed_code_descriptions_from_query, create_local_if_not_found=True)

    list_of_snomed_to_lookup = list(snomed_to_description.keys())


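    # Stage 1: look up a display name for each SNOMED CT code (cached in sctid_to_desc)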
    count = 0
    for sctid in list_of_snomed_to_lookup:
        if sctid not in sctid_to_desc:
            name = query_snomed_name(sctid)
            print(sctid, name)
            sctid_to_desc[sctid] = name
        count += 1
        # save results once in a while, and after the final entry
        if SAVE_PROGRESS_EVERY_N > 0 and (count % SAVE_PROGRESS_EVERY_N == 0 or count == len(list_of_snomed_to_lookup)):
            save_to_json(sctid_to_desc, snomed_code_descriptions_from_query, indent=4)
            print("Saving results: ", count, "/", len(list_of_snomed_to_lookup), " entries processed.")



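    # Stage 2: walk each code's parent hierarchy up to `depth` levels to collect ancestors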
    count = 0
    for sctid in list_of_snomed_to_lookup:
        list_of_ancestors = list(set(get_snomed_ancestors(sctid, sctid_to_parents, sctid_to_desc, depth=depth, query_depth=depth)))
        if list_of_ancestors:
            snomed_to_ancestors[sctid] = list_of_ancestors
        count += 1
        if SAVE_PROGRESS_EVERY_N > 0 and count % SAVE_PROGRESS_EVERY_N == 0:  # save results once in a while
            save_to_json(sctid_to_parents, save_snomed_to_parents_file, indent=4)
            save_to_json(snomed_to_ancestors, snomed_ancestor_file, indent=4)
            save_to_json(sctid_to_desc, snomed_code_descriptions_from_query, indent=4)
            print("Saving results: ", count, "/", len(list_of_snomed_to_lookup), " entries processed.")

    # save final results
    save_to_json(sctid_to_parents, save_snomed_to_parents_file, indent=4, print_save_loc=True)
    save_to_json(snomed_to_ancestors, snomed_ancestor_file, indent=4, print_save_loc=True)
    save_to_json(sctid_to_desc, snomed_code_descriptions_from_query, indent=4, print_save_loc=True)
Example 4
def main(data_dir=None, work_dir=None):
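    # Assumes module-level imports/definitions: os, logging, pathlib.Path,
    # collections.defaultdict, collections.OrderedDict, operator.itemgetter,
    # a mean() helper (e.g. statistics.mean), the re_loinc / re_fhir_rsc regexes,
    # INCL_ADDTL_CODES, and the *Entry classes and helper functions used below.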
    while data_dir is None or not Path(data_dir).exists():
        print("Unable to locate directory.")
        data_dir = input(
            "Please enter data directory (FHIR JSON Resource Bundle): ")
    while work_dir is None or not Path(work_dir).exists():
        print("Unable to locate directory.")
        work_dir = input("Please enter working directory: ")

    data_dir = Path(data_dir)
    work_dir = Path(work_dir)
    pathlist = data_dir.glob('*.json')

    log_settings(filename="json_based_reader.log", filemode='w')

    os.makedirs(work_dir / "output", exist_ok=True)
    os.makedirs(work_dir / "data", exist_ok=True)  # summary and found-term files are written here later
    print("Trying to load data from: " + str(data_dir))
    print("Working Directory: " + str(work_dir))

    lionc_words_record = defaultdict(list)
    lionc_characters_record = defaultdict(list)
    lionc_snomed_count_record = defaultdict(list)
    lionc_rxnorm_count_record = defaultdict(list)

    # track all the snomed ct, rxcui encountered
    sct_to_desc = {}
    rxcui_to_desc = {}

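    # each JSON file is one FHIR resource bundle: entry[0] is expected to hold the
    # Composition with its sections; the remaining entries are clinical resources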
    for path in pathlist:
        path_in_str = str(path)
        report = load_dict_json(path_in_str)

        resource_to_section = {}
        section_to_resource = defaultdict(list)
        code_counts = defaultdict(int)
        code_negation_counts = defaultdict(int)

        lionc_words = defaultdict(int)
        lionc_characters = defaultdict(int)
        lionc_snomed_count = defaultdict(int)
        lionc_rxnorm_count = defaultdict(int)

        try:
            sections_and_references = report['entry'][0]['resource']['section']
        except KeyError:
            # fall back to an empty section list and a default section code
            sections_and_references = []
            resource_to_section = defaultdict(lambda: '00000-0')

        # Read the first entry (the Composition), which defines the LOINC-coded sections and their uuid references
        for lionc in sections_and_references:
            lionc_code = lionc['code']['coding'][0]['code']
            if not re_loinc.match(lionc_code):
                lionc_code = '00000-0'

            lionc_text = lionc['text']['div']
            word_char_count = text_word_counter(lionc_text)
            lionc_words[lionc_code] += word_char_count[0]
            lionc_characters[lionc_code] += word_char_count[1]

            if 'entry' in lionc:
                for item in lionc['entry']:  # references
                    reference = re_fhir_rsc.findall(item['reference'])[0]
                    resource_to_section[reference] = lionc_code
                    section_to_resource[lionc_code].append(reference)

        for i in range(1, len(report['entry'])):
            try:
                a_resource = report['entry'][i]['resource']
                resource_type = a_resource['resourceType']
                uuid = a_resource['id']
            except Exception as e:
                print(type(e))
                continue  # malformed entry; skip so stale values are not reused below

            # Add new resource types if necessary.  Code locations need to be manually defined.
            try:
                if resource_type == 'Condition':
                    cct = ConditionEntry(a_resource)
                elif resource_type == 'FamilyMemberHistory':
                    cct = FamilyHistoryEntry(a_resource)
                elif resource_type == 'Medication':
                    cct = MedicationEntry(a_resource)
                elif resource_type == 'MedicationStatement':
                    cct = MedicationStatementEntry(a_resource)
                elif resource_type == 'Procedure':
                    cct = ProcedureEntry(a_resource)
                else:
                    print(resource_type, "was not included.")
                    continue  # no handler for this resource type
            except KeyError as err:
                logging.info(err)
                logging.info("code value (rxcui/sct) not found in file:" +
                             path_in_str)
                logging.info(str(a_resource))
                continue  # skip entries whose codes could not be parsed

            # print(cct.return_codes())
            section = find_section_for_uuid(cct.uuid, resource_to_section)
            snomed_rxn_counts = cct.code_type_counts()
            lionc_snomed_count[section] += snomed_rxn_counts[0]
            lionc_rxnorm_count[section] += snomed_rxn_counts[1]

            combined_section_with_code, negation_status = entry_to_codes(
                cct,
                resource_to_section,
                sct_to_desc=sct_to_desc,
                rxcui_to_desc=rxcui_to_desc,
                incl_addtl_codes=INCL_ADDTL_CODES)
            for code in combined_section_with_code:
                code_counts[code] += 1
                if negation_status:
                    code_negation_counts[code] += 1

        for lionc in section_to_resource:
            # save to results for all records
            lionc_words_record[lionc].append(lionc_words[lionc])
            lionc_characters_record[lionc].append(lionc_characters[lionc])
            lionc_snomed_count_record[lionc].append(lionc_snomed_count[lionc])
            lionc_rxnorm_count_record[lionc].append(lionc_rxnorm_count[lionc])

        code_counts = OrderedDict(
            sorted(code_counts.items(), key=itemgetter(1), reverse=True))

        # output file (csv format, named after the original file)
        file_name = path.stem
        if file_name.find('.') > 0:
            file_name = file_name[:file_name.find('.')]  # handle multi-suffix names, e.g. "name.json.gz"
        output_path = work_dir / 'output' / (file_name + '.txt')
        with open(output_path, 'w') as output:
            output.write("code,count,negation\n")
            for k, v in code_counts.items():
                text = k + "," + str(v) + "," + str(
                    code_negation_counts[k]) + "\n"
                output.write(text)

    # after all records processed
    with open(work_dir / 'data' / 'RB_Section_Summary.txt', 'w') as fp:
        for lionc in lionc_snomed_count_record:
            words = lionc_words_record[lionc]
            chars = lionc_characters_record[lionc]
            scts = lionc_snomed_count_record[lionc]
            rxnorms = lionc_rxnorm_count_record[lionc]
            line1 = "%10s" * 5 % (str(lionc), 'words', 'chars', '#snomed',
                                  '#rxnorm')
            line2 = ("%10s" + "%10.3f" * 4) % ('', mean(words), mean(chars),
                                               mean(scts), mean(rxnorms))
            print(line1)
            print(line2)
            fp.write(line1 + '\n')
            fp.write(line2 + '\n')

    # save found terms
    save_to_json(sct_to_desc,
                 work_dir / 'data' / 'snomed_found.json',
                 indent=4)
    save_to_json(rxcui_to_desc,
                 work_dir / 'data' / 'rxcui_found.json',
                 indent=4)
Example 5
def main(working_dir,
         find_ingreds=True,
         find_ATC=True,
         output_ATC_count=False):
    print("Starting RxNorm code lookup")
    working_dir = Path(working_dir)
    print("Loading Data from: " + str(working_dir))

    global cache_cui_to_ingredients, cache_cui_to_atc, manual_ingredient_entries
    rxnorm_savefile = working_dir / 'data' / 'rxcui_found.json'
    save_atc_file = working_dir / 'data' / 'rxcui_atc.json'
    ingredient_dict_file = working_dir / 'data' / 'rxcui_ingredient.json'
    manual_ingredient_entries_file = working_dir / 'data' / 'rxcui_ingred_manual_entries.json'
    ingredients_name_file = working_dir / 'data' / "rxcui_ingredient_names.json"
    rxcui_name_file = working_dir / 'data' / "rxcui_names.json"

    cache_cui_to_ingredients = load_dict_json(ingredient_dict_file,
                                              create_local_if_not_found=True)
    cache_cui_to_atc = load_dict_json(save_atc_file,
                                      create_local_if_not_found=True)
    manual_ingredient_entries = load_dict_json(manual_ingredient_entries_file,
                                               create_local_if_not_found=True)
    if not manual_ingredient_entries:
        print("Manual entries for expired/retired RxCUIs can be added at:\n" +
              str(manual_ingredient_entries_file))
    ingredients_name = load_dict_json(ingredients_name_file,
                                      create_local_if_not_found=True)

    #rxcui_name = load_dict_json(rxcui_name_file, create_local_if_not_found=True)
    rxcui_name = {}

    # load rxcui to search for
    rxcui_to_lookup = load_dict_json(rxnorm_savefile,
                                     create_local_if_not_found=False)

    rxcui_to_atc = {}
    rxcui_to_ingredients = {}

    # count = 0
    # for rxcui in rxcui_to_lookup.keys():
    #     count += 1
    #     rxcui_name[rxcui] = query_rxnorm_name(rxcui)
    #     if count % 100 == 0 or count == len(rxcui_to_lookup):
    #         print("Working:", count)
    #         save_to_json(rxcui_name, rxcui_name_file, indent=4)
    #

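    # Stage 1: resolve each RxCUI to its ingredient RxCUIs (results cached to disk)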
    if find_ingreds:
        count = 0
        for rxcui in rxcui_to_lookup.keys():
            rxcui_to_ingredients[rxcui] = get_rxnorm_ingredients(rxcui)
            count += 1
            if count % 100 == 0 or count == len(rxcui_to_lookup):
                save_to_json(rxcui_to_ingredients,
                             ingredient_dict_file,
                             indent=4)
                save_to_json(ingredients_name, ingredients_name_file, indent=4)
                print("Stage 1/4: Ingredients Lookup: ", count, "/",
                      len(rxcui_to_lookup), " entries processed.")

        # retry missing entries via text search, since RxCUIs are sometimes retired
        for rxcui, ingredients in rxcui_to_ingredients.items():
            if ingredients:
                continue
            if rxcui in rxcui_to_lookup:
                original_text_from_FHIR = str(rxcui_to_lookup[rxcui])
            else:
                print("Can not find rxcui: ", rxcui)
                continue

            search_results = get_rxnorm_ingredients_using_multisearch(
                original_text_from_FHIR)
            search_results = list(set(search_results))
            if search_results:
                rxcui_to_ingredients[rxcui] = search_results
                print("Rxcui term found: ", rxcui, original_text_from_FHIR,
                      search_results)
            else:
                print("Rxcui term could NOT be found:",
                      original_text_from_FHIR)

    save_to_json(rxcui_to_ingredients, ingredient_dict_file, indent=4)
    save_to_json(ingredients_name, ingredients_name_file, indent=4)

    # resolve ingredients of ingredients (one level deeper) and merge the results
    for rxcui, ingredients in rxcui_to_ingredients.items():
        new_ingredients = []
        for ingredient in ingredients:
            new_ingredients = new_ingredients + get_rxnorm_ingredients(
                ingredient)
        rxcui_to_ingredients[rxcui] = list(set(ingredients + new_ingredients))

    # find ATC codes
    if find_ATC:
        count = 0
        for code, ingredients in rxcui_to_ingredients.items():
            ATC = get_rxnorm_ATC(code)

            if not ATC:
                for ingredient in ingredients:
                    ATC = ATC + get_rxnorm_ATC(ingredient)

            if ATC:
                rxcui_to_atc[code] = list(set(ATC))
            else:
                print("ATC could not be found for ", code,
                      '(%d/%d)' % (count, len(rxcui_to_ingredients)))
            count += 1
            if count % 100 == 0 or count == len(rxcui_to_ingredients):
                save_to_json(rxcui_to_atc, save_atc_file, indent=4)
                print("Stage 4/4: ATC: ", count, "/",
                      len(rxcui_to_ingredients), " entries processed.")

    #save data
    if find_ingreds:
        save_to_json(rxcui_to_ingredients,
                     ingredient_dict_file,
                     indent=4,
                     print_save_loc=True)
        save_to_json(ingredients_name,
                     ingredients_name_file,
                     indent=4,
                     print_save_loc=True)
    if find_ATC:
        save_to_json(rxcui_to_atc,
                     save_atc_file,
                     indent=4,
                     print_save_loc=True)

    if output_ATC_count:
        ATC_struct = defaultdict(int)
        for key, ATCs in rxcui_to_atc.items():
            if not ATCs:
                continue
            for ATC in ATCs:
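                # prefixes of length 1, 3, 4, and 5 correspond to the first four
                # ATC levels (anatomical, therapeutic, pharmacological, chemical subgroup)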
                for subcode in [ATC[0:1], ATC[0:3], ATC[0:4], ATC[0:5]]:
                    ATC_struct[subcode] += 1

        ATC_struct = sorted(ATC_struct.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        print("ATC count:")
        print(ATC_struct)