Example #1
0
def get_dataset(data_pickle_path, word_dict_path, predict_dict_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    print all_events[0]
    print len(word_dict), len(predict_dict), len(all_events)

    feature_dict = DictDoubleMap(list(word_dict))
    pred_dict = DictDoubleMap(list(predict_dict))

    feature_matrix = np.zeros((len(all_events), len(word_dict)))
    result_matrix = np.zeros((len(all_events), len(predict_dict)))

    for i_iter, event_line in enumerate(all_events):
        for event_item in event_line[0]:
            feature_matrix[i_iter][feature_dict.get_index_by_word(event_item)] += 1
        for pred_item in event_line[1]:
            result_matrix[i_iter][pred_dict.get_index_by_word(pred_item)] = 1

        if i_iter % 1000 == 0:
            print 'complete {0} of {1}'.format(i_iter, len(all_events))
    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), Path+'/data-repository/', 'feature2index.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), Path+'/data-repository/', 'predict2index.csv')
        CsvUtility.write_array2csv(feature_matrix, Path+'/data-repository/', 'feature_matrix.csv')
        CsvUtility.write_array2csv(result_matrix, Path+'/data-repository/', 'result_matrix.csv')

    return feature_matrix, result_matrix
def get_revert_labtest():
    labtest_df = pd.read_csv(os.path.join(Path,
                                          'MIMICIII_data/D_LABITEMS.csv'),
                             dtype=str)
    item_df = CsvUtility.read_pickle(
        Path + '/data-repository/lab_item_over.pkl', 'r')
    print item_df[:5]
    print type(list(item_df.index)[0])
    print labtest_df.shape
    print labtest_df[:5]
    print labtest_df.dtypes
    print labtest_df.describe()
    labtest_dict = labtest_df[['ITEMID', 'LABEL']]
    print labtest_dict.shape
    labtest_dict = labtest_dict.dropna()
    print labtest_dict.shape
    labtest_dict = labtest_dict.drop_duplicates()
    print labtest_dict.shape
    print labtest_dict[:5]
    # labtest_dict.to_csv("../data-repository/labtest_dict.csv", index=None)

    labtest_list = labtest_dict.values
    print labtest_list[:5]
    # print np.array(list(item_df.index), dtype=str)
    revert_labtest_dict = {}
    for i in range(len(labtest_list)):
        if labtest_list[i][0] in np.array(list(item_df.index), dtype=str):
            temp_str = remove_bracket_from_str(labtest_list[i][1])
            temp_str = remove_quotation_from_str(temp_str)
            temp_str = temp_str.replace(",", " ").strip().lower()
            revert_labtest_dict[temp_str] = labtest_list[i][0]

    print revert_labtest_dict
    print len(revert_labtest_dict)
    CsvUtility.write_dict2csv(dict(revert_labtest_dict),
                              Path + "/data-repository",
                              "revert_labtest_dict.csv")
            # NOTE(review): this span is a fragment of a larger function whose
            # beginning is not visible here -- `doc2word_list`, `new_docs`,
            # `i_count`, `start_time`, `file_path_list`, `used_entity_count`,
            # `doc2entity` and `args` are all defined outside this view.
            # Rebuild the current document as a comma-joined string in which
            # each entity key is repeated `value` times.
            new_literature = ""
            for (key, value) in doc2word_list.entity_count.items():
                for i in range(value):
                    new_literature += key + ","
            new_docs.append(new_literature)
        i_count += 1
        # Progress report every 5 files with elapsed wall-clock time.
        if i_count % 5 == 0:
            end_time = clock()
            print('\rFile Completed {0} of {1}... Spend {2} s'.format(
                i_count, len(file_path_list), (end_time - start_time)))
            start_time = end_time

    # print "vocabulary size : ", len(vocabulary_count)
    print "using entity size : ", len(used_entity_count)
    print "num of docs having entity : ", len(doc2entity)
    # CsvUtility.write_dict2csv(raw_dict=vocabulary_count, csv_path=args.output_path,
    # file_name='literature_vocabulary.csv')
    # Persist the entity usage counts, the doc->entity mapping, and the
    # rewritten documents under the configured output paths.
    CsvUtility.write_dict2csv(raw_dict=used_entity_count,
                              csv_path=args.output_path,
                              file_name='used_entity.csv')
    CsvUtility.write_dict2csv(raw_dict=doc2entity,
                              csv_path=args.output_path,
                              file_name='doc2entity.csv')
    CsvUtility.write_list2csv(new_docs,
                              csv_path=args.out_filter_file_path,
                              file_name='new_docs.csv')
    print '******************************************************************************'
#test code

#python select_relate_literature.py ../data-repository/literature_doc ../data-repository ../data-repository/new_literature entity_dict.csv
def get_revert_prescription():
    prescription_df = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/PRESCRIPTIONS.csv'),
                                  dtype=str)
    drug_df = CsvUtility.read_pickle(
        Path + '/data-repository/prescription_drug_over.pkl', 'r')
    # print type(list(drug_df.index)[0])
    # print np.array(list(drug_df.index), dtype=str)
    print prescription_df.shape
    print prescription_df[:5]
    print prescription_df.dtypes
    print prescription_df.describe()
    prescription_dict = prescription_df[[
        'FORMULARY_DRUG_CD', 'DRUG', 'DRUG_NAME_POE', 'DRUG_NAME_GENERIC'
    ]]
    print prescription_dict.shape
    prescription_dict = prescription_dict.dropna()
    print prescription_dict.shape
    prescription_dict = prescription_dict.drop_duplicates()
    print prescription_dict.shape

    # print prescription_dict[:5]
    # prescription_dict.to_csv("../data-repository/prescription_dict.csv", index=None)

    stop_char = ['(', ')', '/', '/"', '-']
    stop_str = {
        "*nf*", "a", "b", "of", "and", "by", "to", "or", "the", "in", "with",
        "not", "classified", "for", "on", "from", "without", "as", "other",
        "than", "more", "at", "one", "all", "its", "may", "after", "any", "d",
        "be", "into", "their", "which", "an", "ec", "c", "e", "f", "g", "h",
        "i", "j", "k", "l", "m", "n", "o", "p", "q", "i", "s", "t", "u", "v",
        "w", "x", "y", "z", "vs.", "mg", "extended-release", ""
    }
    revert_prescrip_dict = {}
    prescrip_list = prescription_dict.values
    print prescrip_list[:5]
    for i in range(len(prescrip_list)):
        if prescrip_list[i][0] in list(drug_df.index):
            word_list_tmp = []
            prescrip_str = remove_bracket_from_str(prescrip_list[i][1])
            for stop_c in stop_char:
                prescrip_str = prescrip_str.replace(stop_c, ' ').strip()
            for word_tmp in prescrip_str.split(" "):
                tmp = word_tmp.lower()
                if len(tmp) > 0 and any(char.isalpha() for char in tmp):
                    if tmp.endswith("mg") and len(tmp) > 2 and is_number(
                            tmp[:-2]):
                        pass
                    elif tmp not in stop_str:
                        word_list_tmp.append(tmp.strip())
            words = " ".join(word_list_tmp).strip()
            if len(words) > 0 and words not in revert_prescrip_dict:
                revert_prescrip_dict[words] = prescrip_list[i][0]

            word_list_tmp = []
            prescrip_str = remove_bracket_from_str(prescrip_list[i][2])
            for stop_c in stop_char:
                prescrip_str = prescrip_str.replace(stop_c, ' ').strip()
            for word_tmp in prescrip_str.split(" "):
                tmp = word_tmp.lower()
                if len(tmp) > 0 and any(char.isalpha() for char in tmp):
                    if tmp.endswith("mg") and len(tmp) > 2 and is_number(
                            tmp[:-2]):
                        pass
                    elif tmp not in stop_str:
                        word_list_tmp.append(tmp.strip())
            words = " ".join(word_list_tmp).strip()
            if len(words) > 0 and words not in revert_prescrip_dict:
                revert_prescrip_dict[words] = prescrip_list[i][0]
    print revert_prescrip_dict
    print len(revert_prescrip_dict)

    CsvUtility.write_dict2csv(dict(revert_prescrip_dict),
                              Path + "/data-repository",
                              'revert_prescription_dict.csv')
def get_revert_diagnoses_procedures():
    # Build a reverse mapping from cleaned ICD LONG_TITLE text to ICD9 code,
    # covering both diagnoses and procedures, and dump the mapping plus a
    # sorted word-frequency table to data-repository.
    word_count = {}
    # Tokens dropped from long titles before building the reverse key.
    stop_list = {
        "of", "and", "by", "to", "or", "the", "in", "with", "not",
        "classified", "for", "on", "from", "without", "as", "other", "than",
        "more", "at", "one", "all", "a", "its", "may", "after", "any", "d",
        "be", "into", "their", "which", "an", "*nf", "nf*", "but", "but", "",
        "-", "c", "c-c", "w", "e", "o", "b", "m", "g", "s", "h", "t-t", "un",
        "ve", "k", "u", "j", "t", "n"
    }
    diagnoses_df = CsvUtility.read_pickle(
        Path + '/data-repository/icd_diagnoses_over.pkl', 'r')
    procedures_df = CsvUtility.read_pickle(
        Path + '/data-repository/icd_procedures_over.pkl', 'r')
    data_diagnoses = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/D_ICD_DIAGNOSES.csv'),
                                 dtype=str)[["ICD9_CODE", "LONG_TITLE"]]
    data_procedures = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/D_ICD_PROCEDURES.csv'),
                                  dtype=str)[["ICD9_CODE", "LONG_TITLE"]]
    data_diagnoses.set_index(["ICD9_CODE"], inplace=True)
    data_procedures.set_index(["ICD9_CODE"], inplace=True)
    print diagnoses_df[:5]
    print diagnoses_df.shape
    print procedures_df[:5]
    print procedures_df.shape
    print data_diagnoses[:5]
    print data_diagnoses.shape
    print data_procedures[:5]
    print data_procedures.shape

    # Keep only the ICD codes present in the *_over pickles (inner join on
    # the ICD9_CODE index).
    merge_diagnoses = pd.merge(diagnoses_df,
                               data_diagnoses,
                               how='inner',
                               left_index=True,
                               right_index=True)
    print merge_diagnoses[:10]
    print merge_diagnoses.shape

    merge_procedures = pd.merge(procedures_df,
                                data_procedures,
                                how='inner',
                                left_index=True,
                                right_index=True)
    print merge_procedures[:10]
    print merge_procedures.shape

    # Combine the diagnoses and procedures dataframes row-wise.
    ICD_merge = pd.concat([merge_diagnoses, merge_procedures], axis=0)
    print ICD_merge[:5]

    icd_merge_list = np.array(ICD_merge.reset_index(), dtype=str)
    print icd_merge_list[:5]
    revert_diagnoses_procedures = {}
    for i in range(len(icd_merge_list)):
        # Column 2 is LONG_TITLE; strip everything except letters and '-'
        # from each word and drop stop words.  NOTE(review): re.sub runs
        # twice per word here (filter + map) -- left as-is to keep this
        # block byte-identical.
        wordlist = [
            re.sub("[^a-zA-Z-]", "", x.lower())
            for x in icd_merge_list[i][2].split(' ')
            if re.sub("[^a-zA-Z-]", "", x.lower()) not in stop_list
        ]
        revert_diagnoses_procedures[" ".join(wordlist)] = icd_merge_list[i][0]
        for word in wordlist:
            word_count[
                word] = word_count[word] + 1 if word in word_count else 1
    CsvUtility.write_dict2csv(revert_diagnoses_procedures,
                              Path + '/data-repository/',
                              'revert_diagnoses_procedures.csv')
    # CsvUtility.write_text2csv(word_count, '../data-repository/', 'revert_ICD_word_dict.csv')
    # Dump word frequencies, most frequent first.
    with open(Path + "/data-repository/revert_ICD_word_dict.csv", 'w') as w:
        for (key, value) in sorted(word_count.items(),
                                   key=lambda s: s[1],
                                   reverse=True):
            w.write(key + "," + str(value) + "\n")
    # NOTE(review): the lines below reference `file_contend`, which is not
    # defined anywhere in this function -- this tail appears to be a fragment
    # pasted from another function and will raise NameError if reached.
    # Confirm against the original source before relying on this function.
    doc_maps = []
    for doc in file_contend:
        doc_maps.extend([[doc[2], doc[1]]])
    # print len(doc_maps), len(doc_maps[0])
    return doc_maps


if __name__ == '__main__':
    # get_good_docs('../data-repository/result/jack_1.csv', 10, 2)
    file_list = Directory.folder_process(Path + '/data-repository/result_0.8')

    merge_dict = dict({})
    doc_map = []
    for file_path in file_list:
        dict_tmp = get_good_docs(file_path, 80, 10)
        print 'this dict len : ', len(dict_tmp)
        merge_dict.update(dict_tmp)
        print 'after the merge : ', len(merge_dict)
        doc_map.extend(get_docs_frequence_kind_map(file_path=file_path))
    # draw_pl(x_y=doc_map, type='o')
    # print merge_dict
    texts = [[word for word in doc.split(' ')] for doc in merge_dict.values()]
    # pprint(texts[:5])
    dictionary = corpora.Dictionary(texts)
    dictionary.save(Path +
                    '/data-repository/available_word_in_literature.dict')
    print dictionary

    CsvUtility.write_dict2csv(merge_dict, Path + '/data-repository',
                              'selected_docs4LDA.csv')