def get_events_together():
    columns_name = [
        'hadm_id', 'subject_id', 'charttime', 'event_type', 'event'
    ]

    diag_columns = [
        'HADM_ID', 'SUBJECT_ID', 'DISCHTIME', 'DIAGNOSIS', 'ICD9_CODE'
    ]
    diag_events = get_all_diagnoses_event()
    diag_events = diag_events.ix[:, diag_columns]
    diag_events.columns = columns_name
    print diag_events[:5]

    lab_events = get_lab_event()
    lab_columns = ['HADM_ID', 'SUBJECT_ID', 'CHARTTIME', 'FLAG', 'ITEMID']
    lab_events = lab_events.ix[:, lab_columns]
    lab_events.columns = columns_name
    print lab_events[:5]

    medic_events = get_medication_event()
    medic_columns = [
        'HADM_ID', 'SUBJECT_ID', 'STARTDATE', 'DRUG_TYPE', 'FORMULARY_DRUG_CD'
    ]
    medic_events = medic_events.ix[:, medic_columns]
    medic_events.columns = columns_name
    print medic_events[:5]

    all_events = pd.concat([diag_events, lab_events, medic_events],
                           ignore_index=True)
    print all_events[:5]
    print all_events[-5:]
    print all_events.shape

    CsvUtility.write2pickle(path.join(Path, 'data-repository/allevents.pkl'),
                            all_events, 'w')
def filter_all_event():
    all_events_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    all_events_df['icd9_3'] = ''
    print all_events_df[:5]
    print all_events_df.shape
    # diagnoses_events = all_events_df[all_events_df['event_type'] == 'diagnosis']
    # print diagnoses_events[:5]
    # print diagnoses_events.shape
    # diagnoses_set = set(list(pd.read_csv('../data-repository/merge_diagnoses_dict.csv', header=None).index))
    # print len(diagnoses_set)
    # i=0
    # for index_iter in diagnoses_events.index:
    #     icd_code = diagnoses_events.ix[index_iter, 'event']
    #     assert len(icd_code) >= 3
    #     if len(icd_code) >= 3:
    #         if icd_code[:3] in diagnoses_set:
    #             all_events_df.ix[index_iter, 'icd9_3'] = all_events_df.ix[index_iter, 'event'][:3]
    #         else:
    #             all_events_df.drop(index_iter, axis=0, inplace=True)
    #     sys.stdout.write('\rROW {0} of {1}...'.format(i, diagnoses_events.shape[0]))
    #     i += 1
    # all_events_df.index = np.array(range(all_events_df.shape[0]))
    print all_events_df[:5]
    print all_events_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/all_events_icd9.pkl'), all_events_df,
        'w')
def load_corpus(all_path=Path+'/data-repository/', train_perc=0.7):
    x, y = get_dataset(all_path + 'after_instance.pkl', all_path + 'event_instance_dict.pkl',
                       all_path + 'predict_diags_dict.pkl')
    train_size = int(x.shape[0] * train_perc)

    # shuffle the train set
    idx = np.random.permutation(x.shape[0])
    x_train = x[idx]
    y_train = y[idx]
    CsvUtility.write_array2csv(idx, Path+'/data-repository/', 'random_idx.csv')
    return x_train[:train_size], y_train[:train_size], x_train[train_size:], y_train[train_size:], idx
def reload_corpus(all_path=Path+'/data-repository/', train_perc=0.7, shuffle=False):
    x = CsvUtility.read_array_from_csv(all_path, 'feature_matrix.csv')
    y = CsvUtility.read_array_from_csv(all_path, 'result_matrix.csv')
    train_size = int(x.shape[0] * train_perc)
    # shuffle the train set
    if shuffle:
        idx = np.random.permutation(x.shape[0])
        CsvUtility.write_array2csv(idx, Path+'/data-repository/', 'random_idx.csv')
    else:
        idx = CsvUtility.read_array_from_csv(all_path, 'random_idx.csv')
    x_train = x[idx]
    y_train = y[idx]
    return x_train[:train_size], y_train[:train_size], x_train[train_size:], y_train[train_size:], idx
def get_drug_over(file_name, over_num):
    drug_df = pd.read_csv(os.path.join(Path, file_name),
                          dtype=str)[['HADM_ID', 'FORMULARY_DRUG_CD']]
    print drug_df[:5]
    print drug_df.shape
    drug_df.drop_duplicates(inplace=True)
    print drug_df.shape
    drug_count = drug_df['FORMULARY_DRUG_CD'].value_counts()
    drug_df = pd.DataFrame(drug_count[drug_count > over_num])
    drug_df.columns = ['COUNT']
    drug_df.index.name = 'FORMULARY_DRUG_CD'
    print drug_df[:5]
    print 'size:', drug_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/prescription_drug_over.pkl'), drug_df,
        'w')
def get_medication_event():
    medication_df = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/PRESCRIPTIONS.csv'))[[
            'SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'DRUG_TYPE',
            'FORMULARY_DRUG_CD'
        ]]

    # print medication_df[:5]
    medication_df['DRUG_TYPE'] = ['prescription'] * medication_df.shape[0]
    # print medication_df[:5]
    # print medication_df.shape
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')
    # drug_df = CsvUtility.read_pickle('../data-repository/prescription_drug_over.pkl', 'r')
    prescription_list = np.array(
        pd.read_csv(path.join(Path,
                              'data-repository/revert_prescription_dict.csv'),
                    index_col=[0],
                    header=None,
                    dtype=str)).flatten()
    medication_df = medication_df[
        medication_df['SUBJECT_ID'].isin(
            np.array(list(sub_df.index), dtype=str))
        & medication_df['FORMULARY_DRUG_CD'].isin(prescription_list)]
    # medication_df ['icd9_3'] = [''] * medication_df.shape[0]
    print medication_df.shape
    print len(set(list(medication_df['FORMULARY_DRUG_CD'])))
    return medication_df
def get_lab_event():
    labevent_df = pd.read_csv(
        os.path.join(Path, 'MIMICIII_data/LABEVENTS.csv'),
        dtype=str)[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'ITEMID', 'FLAG']]
    labevent_df = labevent_df[labevent_df['FLAG'] == 'abnormal']
    labevent_df['FLAG'] = ['labevent'] * labevent_df.shape[0]
    # labevent_df['SUBJECT_ID'] = labevent_df['SUBJECT_ID'].astype('str')
    # labevent_df['HADM_ID'] = labevent_df['HADM_ID'].astype('str')
    print labevent_df[-5:]
    print labevent_df.shape
    print labevent_df.dtypes
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')

    # item_df = CsvUtility.read_pickle('../data-repository/lab_item_over.pkl', 'r')
    labtest_list = np.array(
        pd.read_csv(path.join(Path, 'data-repository/revert_labtest_dict.csv'),
                    index_col=[0],
                    header=None,
                    dtype=str)).flatten()
    print labtest_list
    print len(labtest_list)
    labevent_df = labevent_df[
        labevent_df['SUBJECT_ID'].isin(np.array(list(sub_df.index), dtype=str))
        & labevent_df['ITEMID'].isin(labtest_list)]
    # labevent_df['icd9_3'] = [''] * labevent_df.shape[0]
    print labevent_df.shape
    print len(set(list(labevent_df['ITEMID'])))
    return labevent_df
def subject_admission_over(filename, over_num):
    admission_df = pd.read_csv(os.path.join(Path, filename), dtype=str)
    # print admission_df[:5]
    # admission_df.filter(items=['SUBJECT_ID'], like=)
    print admission_df[:5]
    print admission_df.shape
    admission_df.drop_duplicates(inplace=True)
    print admission_df.shape
    sub_vc = admission_df['SUBJECT_ID'].value_counts()
    sub_df = pd.DataFrame(sub_vc[sub_vc > over_num])
    sub_df.columns = ['COUNT']
    sub_df.index.name = 'SUBJECT_ID'
    print sub_df[:5]
    print 'size: ', sub_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), sub_df,
        'w')
def icd_procedures_over(filename, over_num):
    procedures_df = pd.read_csv(os.path.join(Path, filename),
                                dtype=str)[['HADM_ID', 'ICD9_CODE']]
    print procedures_df[:5]
    print procedures_df.shape
    procedures_df.drop_duplicates(inplace=True)
    print procedures_df.shape
    procedure_count = procedures_df['ICD9_CODE'].value_counts()
    print procedure_count
    procedure_df = pd.DataFrame(procedure_count[procedure_count > over_num])
    procedure_df.columns = ['COUNT']
    procedure_df.index.name = 'ICD9_CODE'
    print procedure_df[:5]
    print 'size:', procedure_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/icd_procedures_over.pkl'),
        procedure_df, 'w')
def get_lab_item_over(file_name, over_num):
    labevent_df = pd.read_csv(os.path.join(Path, file_name),
                              dtype=str)[['HADM_ID', 'ITEMID', 'FLAG']]
    print labevent_df[:5]
    print labevent_df.shape
    labevent_df = labevent_df[labevent_df['FLAG'] == 'abnormal']
    print labevent_df.shape
    labevent_df.drop_duplicates(inplace=True)
    print labevent_df.shape
    item_count = labevent_df['ITEMID'].value_counts()
    item_df = pd.DataFrame(item_count[item_count > over_num])
    item_df.columns = ['COUNT']
    item_df.index.name = 'ITEMID'
    print item_df[:5]
    print 'size:', item_df.shape
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/lab_item_over.pkl'), item_df, 'w')
def doc2sentences(self, rows_text, token):
    sentences_list = CsvUtility.text2sentence(raw_text=rows_text,
                                              token=token,
                                              stop_words=self.stop_word,
                                              stem_word=False)
    # for sentence in sentences_list:
    #     for word in sentence:
    #         self.word_count[word] = self.word_count[word] + 1 if word in self.word_count else 1
    return sentences_list
def get_revert_labtest():
    labtest_df = pd.read_csv(os.path.join(Path,
                                          'MIMICIII_data/D_LABITEMS.csv'),
                             dtype=str)
    item_df = CsvUtility.read_pickle(
        Path + '/data-repository/lab_item_over.pkl', 'r')
    print item_df[:5]
    print type(list(item_df.index)[0])
    print labtest_df.shape
    print labtest_df[:5]
    print labtest_df.dtypes
    print labtest_df.describe()
    labtest_dict = labtest_df[['ITEMID', 'LABEL']]
    print labtest_dict.shape
    labtest_dict = labtest_dict.dropna()
    print labtest_dict.shape
    labtest_dict = labtest_dict.drop_duplicates()
    print labtest_dict.shape
    print labtest_dict[:5]
    # labtest_dict.to_csv("../data-repository/labtest_dict.csv", index=None)

    labtest_list = labtest_dict.values
    print labtest_list[:5]
    # print np.array(list(item_df.index), dtype=str)
    revert_labtest_dict = {}
    for i in range(len(labtest_list)):
        if labtest_list[i][0] in np.array(list(item_df.index), dtype=str):
            temp_str = remove_bracket_from_str(labtest_list[i][1])
            temp_str = remove_quotation_from_str(temp_str)
            temp_str = temp_str.replace(",", " ").strip().lower()
            revert_labtest_dict[temp_str] = labtest_list[i][0]

    print revert_labtest_dict
    print len(revert_labtest_dict)
    CsvUtility.write_dict2csv(dict(revert_labtest_dict),
                              Path + "/data-repository",
                              "revert_labtest_dict.csv")
def get_simple_inference_penalty(net):
    # get loss from gamma with lda model
    # gamma = get_topicdist_lda(Path+'/data-repository/selected_docs4LDA.csv', 20)
    gamma = CsvUtility.read_array_from_csv(Path+'/data-repository', 'topicdist_result.csv')
    penalty = Variable(torch.FloatTensor([0.0]))
    gammas = Variable(torch.from_numpy(gamma)).float()
    latent_neuron_topics = np.array([])
    for para_iter, para in enumerate(net.parameters()):
        if para_iter == 0:
            latent_neuron_topics = para.abs().mm(gammas)
            # print 'latent_neuron_topics : ', latent_neuron_topics
            latent_neuron_topics = latent_neuron_topics / (latent_neuron_topics.sum(dim=1).view(-1, 1))

            # print 'Norm latent_neuron_topics : ', latent_neuron_topics
            penalty = Variable(torch.FloatTensor([1.0])) / (latent_neuron_topics.max(dim=1)[0].sum())

    return penalty, latent_neuron_topics.data.numpy()
def get_revert_diagnoses_procedures():
    word_count = {}
    stop_list = {
        "of", "and", "by", "to", "or", "the", "in", "with", "not",
        "classified", "for", "on", "from", "without", "as", "other", "than",
        "more", "at", "one", "all", "a", "its", "may", "after", "any", "d",
        "be", "into", "their", "which", "an", "*nf", "nf*", "but", "but", "",
        "-", "c", "c-c", "w", "e", "o", "b", "m", "g", "s", "h", "t-t", "un",
        "ve", "k", "u", "j", "t", "n"
    }
    diagnoses_df = CsvUtility.read_pickle(
        Path + '/data-repository/icd_diagnoses_over.pkl', 'r')
    procedures_df = CsvUtility.read_pickle(
        Path + '/data-repository/icd_procedures_over.pkl', 'r')
    data_diagnoses = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/D_ICD_DIAGNOSES.csv'),
                                 dtype=str)[["ICD9_CODE", "LONG_TITLE"]]
    data_procedures = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/D_ICD_PROCEDURES.csv'),
                                  dtype=str)[["ICD9_CODE", "LONG_TITLE"]]
    data_diagnoses.set_index(["ICD9_CODE"], inplace=True)
    data_procedures.set_index(["ICD9_CODE"], inplace=True)
    print diagnoses_df[:5]
    print diagnoses_df.shape
    print procedures_df[:5]
    print procedures_df.shape
    print data_diagnoses[:5]
    print data_diagnoses.shape
    print data_procedures[:5]
    print data_procedures.shape

    merge_diagnoses = pd.merge(diagnoses_df,
                               data_diagnoses,
                               how='inner',
                               left_index=True,
                               right_index=True)
    print merge_diagnoses[:10]
    print merge_diagnoses.shape

    merge_procedures = pd.merge(procedures_df,
                                data_procedures,
                                how='inner',
                                left_index=True,
                                right_index=True)
    print merge_procedures[:10]
    print merge_procedures.shape

    # combine the diagnoses and procedures DataFrames
    ICD_merge = pd.concat([merge_diagnoses, merge_procedures], axis=0)
    print ICD_merge[:5]

    icd_merge_list = np.array(ICD_merge.reset_index(), dtype=str)
    print icd_merge_list[:5]
    revert_diagnoses_procedures = {}
    for i in range(len(icd_merge_list)):
        wordlist = [
            re.sub("[^a-zA-Z-]", "", x.lower())
            for x in icd_merge_list[i][2].split(' ')
            if re.sub("[^a-zA-Z-]", "", x.lower()) not in stop_list
        ]
        revert_diagnoses_procedures[" ".join(wordlist)] = icd_merge_list[i][0]
        for word in wordlist:
            word_count[
                word] = word_count[word] + 1 if word in word_count else 1
    CsvUtility.write_dict2csv(revert_diagnoses_procedures,
                              Path + '/data-repository/',
                              'revert_diagnoses_procedures.csv')
    # CsvUtility.write_text2csv(word_count, '../data-repository/', 'revert_ICD_word_dict.csv')
    with open(Path + "/data-repository/revert_ICD_word_dict.csv", 'w') as w:
        for (key, value) in sorted(word_count.items(),
                                   key=lambda s: s[1],
                                   reverse=True):
            w.write(key + "," + str(value) + "\n")
def mlp_lda(penalty_rate=100):

    # Mimic Dataset
    print 'loading data...'
    train_x, train_y, test_x, test_y, idx = load_corpus()
    print 'loading ready...'
    print 'shape of train x:', train_x.shape
    print 'shape of train y:', train_y.shape
    print 'shape of test x:', test_x.shape
    print 'shape of test y:', test_y.shape

    # Hyper Parameters
    input_size = len(train_x[0])
    hidden_size = 128
    num_classes = 80
    num_epochs = 20
    batchsize = 10
    learning_rate = 0.001

    net = Net(input_size, hidden_size, num_classes)

    # Loss and Optimizer
    criterion = nn.MSELoss(size_average=False)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    print 'parameter size :'
    for para in net.parameters():
        print para.size()
        # print para

    train_dataset = Data.TensorDataset(data_tensor=torch.from_numpy(train_x),
                                       target_tensor=torch.from_numpy(train_y))
    train_loader = Data.DataLoader(dataset=train_dataset,
                                   batch_size=batchsize,
                                   shuffle=True,
                                   num_workers=2)
    test_dataset = Data.TensorDataset(data_tensor=torch.from_numpy(test_x),
                                      target_tensor=torch.from_numpy(test_y))
    test_loader = Data.DataLoader(dataset=test_dataset,
                                  batch_size=batchsize,
                                  shuffle=False,
                                  num_workers=2)

    # Train the Model
    neuron_topics = np.array([])
    for epoch in range(num_epochs):
        running_loss = 0.0
        count_instance = 0
        for i, data_iter in enumerate(train_loader, 0):
            # Convert numpy array to torch Variable
            input_train_x, input_train_y = data_iter
            inputs = Variable(input_train_x).float()
            targets = Variable(input_train_y).float()

            # get the penalty from lda model
            penalty, neuron_topics = get_simple_inference_penalty(net)
            #penalty = 0

            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            #print 'criterion loss : ', loss
            #print 'penalty loss: ', penalty
            loss = loss + penalty_rate * penalty
            # print 'penalty loss : ', (penalty_rate * penalty).data.numpy()
            loss.backward()
            optimizer.step()

            running_loss += loss.data[0] / num_classes
            count_instance += 1

            # print loss.data
            if (i + 1) % 100 == 1:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                      (epoch + 1, num_epochs, i + 1, train_x.shape[0] /
                       batchsize, running_loss / count_instance))
                running_loss = 0.0
                count_instance = 0

        CsvUtility.write_array2csv(neuron_topics, Path + '/data-repository',
                                   'neuron_topics_' + str(epoch) + '.csv')
    print 'finish training'

    # Test the Model
    res = []
    test_loss = 0.0
    test_count = 0
    for data_iter in test_loader:
        input_test_x, input_test_y = data_iter

        outputs = net(Variable(input_test_x).float())
        targets = Variable(input_test_y).float()
        # _, predicted = torch.max(outputs.data, 1)
        predicted = outputs.data

        st_loss = criterion(outputs, targets)
        test_loss += st_loss.data[0] / num_classes
        test_count += 1
        res.extend(list(predicted.numpy()))

    # save the first parameter
    paras = net.parameters()
    for i, para4 in enumerate(paras, 0):
        if i == 0:
            para4save = para4.data.numpy()

            print 'the first parameter: ', para4save.shape
            CsvUtility.write_array2csv(para4save, Path + '/data-repository',
                                       'temp_parameter.csv')

    # get the precision of test data
    print 'result shape:', len(res), len(res[0])
    print 'test loss:', test_loss / test_count

    auc_list, _ = get_auc_list(test_y, res)
    print 'AUC List:'
    print auc_list
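# Note: the Net class and get_auc_list are not defined in this listing. Below is a
# minimal sketch under the assumption that Net is a plain two-layer feed-forward
# network with sigmoid outputs (one per diagnosis) and that get_auc_list computes
# per-label ROC AUC with scikit-learn; the real definitions may differ.
import numpy as np
import torch.nn as nn
from sklearn.metrics import roc_auc_score


class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # input -> hidden (ReLU) -> output (sigmoid, one unit per diagnosis)
        out = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(out))


def get_auc_list(true_y, pred_y):
    # per-label ROC AUC; labels absent from the test split yield NaN
    true_y = np.asarray(true_y)
    pred_y = np.asarray(pred_y)
    auc_list = []
    for col in range(true_y.shape[1]):
        if len(np.unique(true_y[:, col])) < 2:
            auc_list.append(float('nan'))
        else:
            auc_list.append(roc_auc_score(true_y[:, col], pred_y[:, col]))
    return auc_list, np.nanmean(auc_list)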
def get_final_word_dict():
    MIMIC_word_dict = list(
        CsvUtility.read_pickle(
            Path + '/data-repository/event_instance_dict.pkl', 'r'))
    print MIMIC_word_dict[:10]
    print len(MIMIC_word_dict)
    diag_num = 0
    lab_num = 0
    drug_num = 0
    other_num = 0
    new_MIMIC_dict = {}

    for item in MIMIC_word_dict:
        if item.startswith("d_"):
            diag_num += 1
        elif item.startswith("l_"):
            lab_num += 1
        elif item.startswith("p_"):
            drug_num += 1
        else:
            other_num += 1
            print item
        new_MIMIC_dict[item[2:]] = item
    new_MIMIC_dict_df = pd.DataFrame.from_dict(dict(new_MIMIC_dict),
                                               orient='index')
    show_df(new_MIMIC_dict_df, 10)

    print 'diagnoses number :', diag_num, 'labtest number:', lab_num, 'drug number:', drug_num, 'other number:', other_num

    revert_diag_proce_df = pd.read_csv(
        Path + '/data-repository/revert_diagnoses_procedures.csv',
        header=None,
        dtype=str)
    revert_labtest_df = pd.read_csv(Path +
                                    '/data-repository/revert_labtest_dict.csv',
                                    header=None,
                                    dtype=str)
    revert_prescrip_df = pd.read_csv(
        Path + '/data-repository/revert_prescription_dict.csv',
        header=None,
        dtype=str)

    show_df(revert_diag_proce_df, 10)
    show_df(revert_labtest_df, 10)
    show_df(revert_prescrip_df, 10)

    concat_dict = pd.concat(
        [revert_diag_proce_df, revert_labtest_df, revert_prescrip_df],
        axis=0,
        ignore_index=True)
    show_df(concat_dict, 20)
    concat_dict.set_index(keys=[1], inplace=True)
    show_df(concat_dict, 10)
    print len(set(list(concat_dict.index)))

    merge_df = pd.merge(new_MIMIC_dict_df,
                        concat_dict,
                        how='left',
                        left_index=True,
                        right_index=True)
    show_df(merge_df, 10)

    print len(set(list(merge_df.index)))
    print len(merge_df['0_x'].unique())
    print len(merge_df['0_y'].unique())

    merge_df = merge_df.drop_duplicates()
    show_df(merge_df)
    merge_df.to_csv(Path + '/data-repository/entity_dict.csv',
                    header=None,
                    index=None)
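# Note: show_df is not defined in this listing; a minimal sketch of the assumed
# helper used above to peek at a DataFrame.
def show_df(df, head_num=5):
    print df[:head_num]
    print df.shape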
def get_sequence():
    print 'reading.....'
    all_events = CsvUtility.read_pickle('../data-repository/allevents.pkl',
                                        'r')
    print all_events.shape
    all_events.dropna(axis=0,
                      how='any',
                      subset=['subject_id', 'charttime', 'event', 'hadm_id'],
                      inplace=True)
    print all_events.shape
    print 'changing the order......'
    all_events = all_events.ix[:, [
        'subject_id', 'charttime', 'event_type', 'event', 'icd9_3', 'hadm_id'
    ]]
    print all_events.dtypes
    all_events = all_events.astype({'hadm_id': 'int64'})
    print all_events.dtypes
    print 'sorting ......'
    all_events.sort_values(
        by=['subject_id', 'hadm_id', 'charttime', 'event_type', 'event'],
        inplace=True)
    print all_events[:10]
    rows = np.array(all_events)

    prev_time = None
    prev_subject = None
    prev_hadm_id = None
    # temp diagnoses in each admission
    diags = set()
    # temp event sequence in each admission
    temp_event_seq = []
    event_seq = []
    # event sequence for each person
    all_seq = []
    # map the time to the events in all_seq
    all_days = []
    # whole set of events
    unique_events = set()
    # whole diagnoses count dict
    diag_count = defaultdict(lambda: 0)
    # get the static feature of a patient
    p_features = set_p_features()
    # count the length of sequence
    seq_len = 0
    seq_max = 0
    seq_min = 100000
    for i in rows[0]:
        print type(i)
    for i, row in enumerate(rows):
        # print i, row
        if row[2] == "diagnosis":
            event = row[2][:1] + "_" + str(row[4])
            if not row[2].startswith("E"):
                diag_count[event] += 1
        else:
            event = row[2][:1] + "_" + str(row[3])

        if row[0] is None or row[1] is None or row[5] is None:
            print 'delete None:', row
            continue
        elif type(row[1]) != str and math.isnan(row[1]):
            print 'delete nan:', row
            continue

        elif prev_time is None or prev_subject is None:
            print 'first event'
            pass

        elif (row[0] != prev_subject) or (NLP_Utility.strtime2datetime(
                row[1]) > prev_time + datetime.timedelta(365)):
            print 'change sequence', row, ' pre: ', prev_subject, prev_time
            if len(diags) > 0 and len(event_seq) > 4:
                # pre, suf = calculate_window(event_seq + temp_event_seq, all_days)
                # all_seq.append([p_features, event_seq, temp_event_seq, diags, pre, suf])
                temp_event_seq = [x for x in temp_event_seq if x not in diags]
                for item in event_seq:
                    unique_events.add(item)
                for item in temp_event_seq:
                    unique_events.add(item)
                all_days.append(len(temp_event_seq))
                all_seq.append([
                    p_features[prev_hadm_id], event_seq, temp_event_seq,
                    all_days, diags
                ])
                print '!!!__!!!', prev_subject
                print len(event_seq) + len(temp_event_seq), len(all_days), sum(
                    all_days)
                seq_len += len(all_days)
                seq_max = seq_max if seq_max > len(all_days) else len(all_days)
                seq_min = seq_min if seq_min < len(all_days) else len(all_days)
            diags = set()
            event_seq = []
            temp_event_seq = []
            all_days = []
        elif prev_hadm_id != row[5]:
            print 'change temp sequence:', row, ' prev: ', prev_hadm_id
            all_days.append(len(temp_event_seq))
            event_seq += temp_event_seq
            temp_event_seq = []
            diags = set()
        elif NLP_Utility.strtime2datetime(row[1]) != prev_time:
            # print 'just change time: ', prev_time, rows[1]
            all_days.append(len(temp_event_seq))
            event_seq += temp_event_seq
            temp_event_seq = []

        # print 'adding ....'
        temp_event_seq.append(event)

        prev_time = datetime.datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
        prev_subject = row[0]
        prev_hadm_id = row[5]

        if row[2] == "diagnosis":
            diags.add(event)

        if i % 10000 == 0:
            print 'complete {0} of {1}'.format(i, len(rows))

    # Write down the vocabulary used and the diagnoses that we want to predict
    predicted_diags = [
        y[0]
        for y in sorted(diag_count.items(), key=lambda x: x[1], reverse=True)
        [:num_pred_diag]
    ]

    # uniq = open('../data-repository/vocab', 'w')
    # uniq.write(' '.join(unique_events) + '\n')
    # uniq.write(' '.join(predicted_diags))
    # uniq.close()
    print len(all_seq)
    print all_seq[0]
    after_del_sequence = []
    for instance in all_seq:
        fil_diag = [diag for diag in instance[-1] if diag in predicted_diags]
        if len(fil_diag) > 0:
            after_del_sequence.append(instance)
            after_del_sequence[-1][-1] = fil_diag
    print 'num of seq: ', len(after_del_sequence)
    print 'max/min of seq: ', seq_max, seq_min
    print 'mean of seq: ', seq_len / len(after_del_sequence)
    CsvUtility.write2pickle('../data-repository/after_sequence.pickle',
                            after_del_sequence, 'w')
    CsvUtility.write2pickle('../data-repository/event_dict.pickle',
                            unique_events, 'w')

    print '************************************************************'

    #######################################################################################################

    def get_diag_sequence():
        pass
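# Note: NLP_Utility is not defined in this listing. A minimal sketch, assuming
# strtime2datetime is a thin wrapper around datetime.strptime with the MIMIC-III
# timestamp format used elsewhere in this code ('%Y-%m-%d %H:%M:%S').
import datetime


class NLP_Utility(object):
    @staticmethod
    def strtime2datetime(time_str):
        return datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')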
def write_filtered_file(self):
    return CsvUtility.write_key_value_times(
        self.entity_count, self.out_filter_file_path,
        'F_' + os.path.basename(self.doc_path))
            new_literature = ""
            for (key, value) in doc2word_list.entity_count.items():
                for i in range(value):
                    new_literature += key + ","
            new_docs.append(new_literature)
        i_count += 1
        if i_count % 5 == 0:
            end_time = clock()
            print('\rFile Completed {0} of {1}... Spend {2} s'.format(
                i_count, len(file_path_list), (end_time - start_time)))
            start_time = end_time

    # print "vocabulary size : ", len(vocabulary_count)
    print "using entity size : ", len(used_entity_count)
    print "num of docs having entity : ", len(doc2entity)
    # CsvUtility.write_dict2csv(raw_dict=vocabulary_count, csv_path=args.output_path,
    # file_name='literature_vocabulary.csv')
    CsvUtility.write_dict2csv(raw_dict=used_entity_count,
                              csv_path=args.output_path,
                              file_name='used_entity.csv')
    CsvUtility.write_dict2csv(raw_dict=doc2entity,
                              csv_path=args.output_path,
                              file_name='doc2entity.csv')
    CsvUtility.write_list2csv(new_docs,
                              csv_path=args.out_filter_file_path,
                              file_name='new_docs.csv')
    print '******************************************************************************'
#test code

#python select_relate_literature.py ../data-repository/literature_doc ../data-repository ../data-repository/new_literature entity_dict.csv
def get_gamma_lda(docs_path, topic_num):
    selected_docs = pd.read_csv(docs_path, header=None, index_col=[0]).values
    print 'number of docs:', selected_docs.shape
    # print selected_docs[:5]
    texts = [[word for word in doc[0].split(' ')] for doc in selected_docs]
    # pprint(texts[:5])
    dictionary = corpora.Dictionary(texts)
    dictionary.save_as_text(Path+'/data-repository/available_word_in_literature.csv')
    print dictionary
    # print dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # print corpus[:5]
    # print len(corpus)
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=topic_num,
                                update_every=1, chunksize=1000, passes=1)

    # lda_model.print_topics(10, 10)
    # gamma = lda_model.get_topics().T
    gamma = lda_model.state.get_lambda()
    gamma = (gamma / gamma.sum(axis=0)).T
    print "shape of gamma :", gamma.shape
    CsvUtility.write_array2csv(gamma, Path+'/data-repository', 'gamma_from_LDA.csv')
    pprint(lda_model.show_topics(10, 10))

    # resize gamma: the LDA dictionary contains fewer words than there are features,
    # so insert zero rows to expand gamma to shape (feature_size, topic_number)

    gamma_id2word = {}
    with open(Path+'/data-repository/available_word_in_literature.csv') as file:
        line_num = file.readline()
        # print line_num
        lines_contend = file.readlines()
        for line_n in lines_contend:
            line = line_n.split("\t")
            # print line
            if len(line) > 1:
                gamma_id2word[int(line[0])] = line[1]
    print 'original gamma size: ', len(gamma_id2word)
    id_list = gamma_id2word.keys()
    # print np.array(id_list).max()

    feature_word2id = {}
    feature_index = pd.read_csv(Path+'/data-repository/feature2index.csv',
                                header=None, index_col=None)
    # print feature_index.shape
    # print feature_index[:5]
    f_i = np.array(feature_index)
    # print f_i.shape, f_i[:, 1].max()
    # np.zeros((feature_index.shape[0], gamma_data.shape[1]))
    for i in range(f_i.shape[0]):
        feature_word2id[f_i[i][0]] = int(f_i[i][1])
    print 'new feature size: ', len(feature_word2id)

    change_index_result = np.zeros((feature_index.shape[0], gamma.shape[1]))
    for i in range(gamma.shape[0]):
        new_index = feature_word2id[gamma_id2word[i]]
        for j in range(gamma.shape[1]):
            change_index_result[new_index][j] += gamma[i][j]
        if i % 1000 == 0:
            print i, 'line'
    print change_index_result[:5]
    print 'after changing the size of result: ', change_index_result.shape
    CsvUtility.write_array2csv(change_index_result, Path+'/data-repository',
                               'gamma_result.csv')
    return change_index_result
def get_all_diagnoses_event():
    diagnoses_df = pd.read_csv(path.join(Path,
                                         'MIMICIII_data/DIAGNOSES_ICD.csv'),
                               dtype=str)
    procedures_df = pd.read_csv(path.join(Path,
                                          'MIMICIII_data/PROCEDURES_ICD.csv'),
                                dtype=str)
    print procedures_df[:5]
    print procedures_df.shape
    print diagnoses_df[:5]
    print diagnoses_df.shape
    diagnoses_df = pd.concat([diagnoses_df, procedures_df], axis=0)
    print diagnoses_df[:5]
    print diagnoses_df.shape
    admission_df = pd.read_csv(os.path.join(Path,
                                            'MIMICIII_data/ADMISSIONS.csv'),
                               dtype=str)
    # print admission_df[:5]
    diagnoses_event = pd.merge(
        diagnoses_df[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']],
        admission_df[['HADM_ID', 'DISCHTIME', 'DIAGNOSIS']],
        'left',
        on='HADM_ID')
    diagnoses_event['DIAGNOSIS'] = ['diagnosis'] * diagnoses_event.shape[0]
    print diagnoses_event[:10]
    print diagnoses_event.shape
    # print diagnoses_event.dtypes
    # print type(diagnoses_event.ix[0, 0])
    # new update:
    # here icd_diagnoses_over is useless, because revert_diagnoses_dict already applies the "over" threshold to limit the dict
    # icd_df = CsvUtility.read_pickle('../data-repository/icd_diagnoses_over.pkl', 'r')
    diagnoses_list = np.array(
        pd.read_csv(path.join(
            Path, 'data-repository/revert_diagnoses_procedures.csv'),
                    index_col=[0],
                    header=None).values).flatten()
    # print diagnoses_list
    # print len(diagnoses_list)
    sub_df = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/subject_admission_over.pkl'), 'r')
    diagnoses_event = diagnoses_event[
        diagnoses_event['SUBJECT_ID'].isin(
            np.array(list(sub_df.index), dtype=str))
        & diagnoses_event['ICD9_CODE'].isin(diagnoses_list)]
    print diagnoses_event.shape
    print diagnoses_event[:10]
    ######################################
    # print 'additional process'
    # np_diagnoses_event = np.array(diagnoses_event)
    # new_diagnoses_event = []
    #
    # for i in range(len(np_diagnoses_event)):
    #     if np_diagnoses_event[i][2] != np.NaN and len(np_diagnoses_event[i][2]) >= 3 and np_diagnoses_event[i][2][:3] in diagnoses_set:
    #         new_line = []
    #         new_line.extend(np_diagnoses_event[i])
    #         new_line.append(np_diagnoses_event[i][2][:3])
    #         if re.match('^V.*', np_diagnoses_event[i][2]):
    #             new_line[4] = 'condition'
    #         if re.match('^7[89]\d.*', np_diagnoses_event[i][2]):
    #             new_line[4] = 'symptom'
    #         new_diagnoses_event.append(new_line)
    #     if i % 10000 == 0:
    #         print i
    # new_columns = list(diagnoses_event.columns)
    # new_columns.append('icd9_3')
    # print new_columns
    # print new_diagnoses_event[:5]
    # diagnoses_event = pd.DataFrame(new_diagnoses_event)
    # diagnoses_event.columns = new_columns

    ######################################
    ######################################
    # just add the 'condition' and 'symptom' labels and do not use icd9_3 anymore
    print "new additional processing ..."
    np_diagnosis_events = np.array(diagnoses_event)
    new_diagnosis_events = []
    for i in range(len(np_diagnosis_events)):
        new_diagnosis_events.append(np_diagnosis_events[i])
        if re.match('^V.*', np_diagnosis_events[i][2]):
            new_diagnosis_events[-1][4] = 'condition'
        elif re.match('^7[89]\d.*', np_diagnosis_events[i][2]):
            new_diagnosis_events[-1][4] = 'symptom'
        if i % 10000 == 0:
            print "processing the ", i, "line"
    new_columns = list(diagnoses_event.columns)
    print new_columns
    diagnoses_event = pd.DataFrame(new_diagnosis_events, dtype=str)
    diagnoses_event.columns = new_columns
    ######################################

    print diagnoses_event[:10]
    print diagnoses_event.shape
    print len(set(list(diagnoses_event['ICD9_CODE'])))
    return diagnoses_event
def get_instance(time_before_diag=90):
    print 'reading.....'
    all_events = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    print all_events.shape
    all_events.dropna(axis=0, how='any', inplace=True)
    print all_events.shape
    print 'changing the order......'
    all_events = all_events.ix[:, [
        'subject_id', 'charttime', 'event_type', 'event', 'hadm_id'
    ]]
    print all_events.dtypes
    # all_events = all_events.astype({'hadm_id': 'int64'})
    # print all_events.dtypes
    all_events['subject_id'] = all_events['subject_id'].astype('int64')
    for rr in all_events.ix[0, :]:
        print type(rr)
    print 'sorting ......'
    all_events.sort_values(
        by=['subject_id', 'charttime', 'event_type', 'event'], inplace=True)
    print all_events[:10]
    rows = np.array(all_events, dtype=str)

    prev_time = None
    prev_subject = None
    # temp diagnoses in each time
    tem_diags = set()
    # temp event sequence in each time
    temp_event_seq = []
    # event sequence for each person
    event_seq = []
    # map the time for each person
    event_days = []
    # first time for each person
    base_time = None
    # all instance
    all_seq = []
    # whole set of events
    unique_events = set()
    # whole diagnoses count dict
    diag_count = defaultdict(lambda: 0)
    # count the length of instance
    seq_max = 0
    seq_min = 100000
    for i in rows[0]:
        print type(i)
    for i, row in enumerate(rows):
        # print i, row
        # if row[2] == "diagnosis":
        #     event = row[2][:1] + "_" + str(row[4])
        # else:
        #     event = row[2][:1] + "_" + str(row[3])
        event = row[2][:1] + "_" + str(row[3])

        # if type(row[1]) != str and math.isnan(row[1]):
        #     print 'delete nan:', row
        #     continue
        if prev_time is None or prev_subject is None:
            print 'first event'
            base_time = NLP_Utility.strtime2datetime(row[1])
        elif row[0] != prev_subject or NLP_Utility.strtime2datetime(
                row[1]) != prev_time:
            if len(tem_diags) > 0:
                # why exclude the diagnoses?
                # temp_event_seq = [x for x in temp_event_seq if x not in tem_diags]
                this_days = (prev_time - base_time).days
                find_days = this_days - time_before_diag if this_days >= time_before_diag else 0
                start_position = get_first_index(event_days, find_days)
                t_event_seq = []
                # for i_pos in range(start_position, len(event_days)):
                #     t_event_seq.append(event_seq[i_pos])
                # unique_events.add(event_seq[i_pos])
                t_event_seq += event_seq[start_position:]
                # print len(event_seq[start_position:])
                # for test_event in event_seq[start_position:]:
                #     if test_event.startswith("p_"):
                #         print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
                # for item in temp_event_seq:
                #     # t_event_seq.append(item)
                #     unique_events.add(item)
                all_seq.append([t_event_seq, list(tem_diags)])
                for iter_diag in tem_diags:
                    diag_count[iter_diag] = diag_count[iter_diag] + 1
                seq_max = seq_max if seq_max > len(t_event_seq) else len(
                    t_event_seq)
                seq_min = seq_min if seq_min < len(t_event_seq) else len(
                    t_event_seq)
            if row[0] != prev_subject:
                # print 'change patient ', row, ' pre: ', prev_subject, row[0]
                event_seq = []
                event_days = []
                base_time = NLP_Utility.strtime2datetime(row[1])
            else:
                # print 'change time ', row, ' pre: ', prev_time, row[1]
                event_seq += temp_event_seq
                # print prev_time
                # print base_time
                # print type((prev_time - base_time).days)
                event_days += [(prev_time - base_time).days
                               ] * len(temp_event_seq)
            tem_diags = set()
            temp_event_seq = []
        # print 'adding ....'
        temp_event_seq.append(event)
        prev_time = datetime.datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
        prev_subject = row[0]
        if row[2] == "diagnosis":
            tem_diags.add(event)

        if i % 10000 == 0:
            print 'complete {0} of {1}'.format(i, len(rows))

    # Write down the vocabulary used and the diagnoses that we want to predict
    predicted_diags = [
        y[0]
        for y in sorted(diag_count.items(), key=lambda x: x[1], reverse=True)
        [:num_pred_diag]
    ]
    print 'num of seq: ', len(all_seq)
    print all_seq[0]
    after_del_sequence = []
    for instance in all_seq:
        fil_diag = [diag for diag in instance[-1] if diag in predicted_diags]
        # if len(fil_diag) > 0:
        for item in instance[0]:
            unique_events.add(item)
        after_del_sequence.append(instance)
        after_del_sequence[-1][-1] = fil_diag
        for diag in fil_diag:
            unique_events.add(diag)
    print 'after limiting to the predicted diagnoses, num of seq: ', len(
        after_del_sequence)
    print 'max/min of seq: ', seq_max, seq_min
    print 'number of unique items:', len(unique_events)
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/after_instance.pkl'),
        after_del_sequence, 'w')
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/event_instance_dict.pkl'),
        unique_events, 'w')
    CsvUtility.write2pickle(
        path.join(Path, 'data-repository/predict_diags_dict.pkl'),
        predicted_diags, 'w')
    print '************************************************************'
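# Note: get_first_index is not defined in this listing. A minimal sketch, assuming
# event_days is a non-decreasing list of day offsets (one per event) and the helper
# returns the first index whose offset is >= find_days, i.e. the start of the
# look-back window used in get_instance above.
import bisect


def get_first_index(event_days, find_days):
    return bisect.bisect_left(event_days, find_days)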
def get_revert_prescription():
    prescription_df = pd.read_csv(os.path.join(
        Path, 'MIMICIII_data/PRESCRIPTIONS.csv'),
                                  dtype=str)
    drug_df = CsvUtility.read_pickle(
        Path + '/data-repository/prescription_drug_over.pkl', 'r')
    # print type(list(drug_df.index)[0])
    # print np.array(list(drug_df.index), dtype=str)
    print prescription_df.shape
    print prescription_df[:5]
    print prescription_df.dtypes
    print prescription_df.describe()
    prescription_dict = prescription_df[[
        'FORMULARY_DRUG_CD', 'DRUG', 'DRUG_NAME_POE', 'DRUG_NAME_GENERIC'
    ]]
    print prescription_dict.shape
    prescription_dict = prescription_dict.dropna()
    print prescription_dict.shape
    prescription_dict = prescription_dict.drop_duplicates()
    print prescription_dict.shape

    # print prescription_dict[:5]
    # prescription_dict.to_csv("../data-repository/prescription_dict.csv", index=None)

    stop_char = ['(', ')', '/', '/"', '-']
    stop_str = {
        "*nf*", "a", "b", "of", "and", "by", "to", "or", "the", "in", "with",
        "not", "classified", "for", "on", "from", "without", "as", "other",
        "than", "more", "at", "one", "all", "its", "may", "after", "any", "d",
        "be", "into", "their", "which", "an", "ec", "c", "e", "f", "g", "h",
        "i", "j", "k", "l", "m", "n", "o", "p", "q", "i", "s", "t", "u", "v",
        "w", "x", "y", "z", "vs.", "mg", "extended-release", ""
    }
    revert_prescrip_dict = {}
    prescrip_list = prescription_dict.values
    print prescrip_list[:5]
    for i in range(len(prescrip_list)):
        if prescrip_list[i][0] in list(drug_df.index):
            word_list_tmp = []
            prescrip_str = remove_bracket_from_str(prescrip_list[i][1])
            for stop_c in stop_char:
                prescrip_str = prescrip_str.replace(stop_c, ' ').strip()
            for word_tmp in prescrip_str.split(" "):
                tmp = word_tmp.lower()
                if len(tmp) > 0 and any(char.isalpha() for char in tmp):
                    if tmp.endswith("mg") and len(tmp) > 2 and is_number(
                            tmp[:-2]):
                        pass
                    elif tmp not in stop_str:
                        word_list_tmp.append(tmp.strip())
            words = " ".join(word_list_tmp).strip()
            if len(words) > 0 and words not in revert_prescrip_dict:
                revert_prescrip_dict[words] = prescrip_list[i][0]

            word_list_tmp = []
            prescrip_str = remove_bracket_from_str(prescrip_list[i][2])
            for stop_c in stop_char:
                prescrip_str = prescrip_str.replace(stop_c, ' ').strip()
            for word_tmp in prescrip_str.split(" "):
                tmp = word_tmp.lower()
                if len(tmp) > 0 and any(char.isalpha() for char in tmp):
                    if tmp.endswith("mg") and len(tmp) > 2 and is_number(
                            tmp[:-2]):
                        pass
                    elif tmp not in stop_str:
                        word_list_tmp.append(tmp.strip())
            words = " ".join(word_list_tmp).strip()
            if len(words) > 0 and words not in revert_prescrip_dict:
                revert_prescrip_dict[words] = prescrip_list[i][0]
    print revert_prescrip_dict
    print len(revert_prescrip_dict)

    CsvUtility.write_dict2csv(dict(revert_prescrip_dict),
                              Path + "/data-repository",
                              'revert_prescription_dict.csv')
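# Note: is_number is not defined in this listing; a minimal sketch of the assumed
# helper used above to drop '500mg'-style dose tokens.
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False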
def get_dataset(data_pickle_path, word_dict_path, predict_dict_path, save=False):
    all_events = CsvUtility.read_pickle(data_pickle_path, 'r')
    word_dict = CsvUtility.read_pickle(word_dict_path, 'r')
    predict_dict = CsvUtility.read_pickle(predict_dict_path, 'r')
    print all_events[0]
    print len(word_dict), len(predict_dict), len(all_events)

    feature_dict = DictDoubleMap(list(word_dict))
    pred_dict = DictDoubleMap(list(predict_dict))

    feature_matrix = np.zeros((len(all_events), len(word_dict)))
    result_matrix = np.zeros((len(all_events), len(predict_dict)))

    for i_iter, event_line in enumerate(all_events):
        for event_item in event_line[0]:
            feature_matrix[i_iter][feature_dict.get_index_by_word(event_item)] += 1
        for pred_item in event_line[1]:
            result_matrix[i_iter][pred_dict.get_index_by_word(pred_item)] = 1

        if i_iter % 1000 == 0:
            print 'complete {0} of {1}'.format(i_iter, len(all_events))
    if save:
        CsvUtility.write_dict2csv(feature_dict.get_word2index(), Path+'/data-repository/', 'feature2index.csv')
        CsvUtility.write_dict2csv(pred_dict.get_word2index(), Path+'/data-repository/', 'predict2index.csv')
        CsvUtility.write_array2csv(feature_matrix, Path+'/data-repository/', 'feature_matrix.csv')
        CsvUtility.write_array2csv(result_matrix, Path+'/data-repository/', 'result_matrix.csv')

    return feature_matrix, result_matrix
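# Note: DictDoubleMap is not defined in this listing. A minimal sketch, assuming it
# is a two-way mapping between words and matrix column indices that matches the
# calls in get_dataset above; the real class may differ.
class DictDoubleMap(object):
    def __init__(self, word_list):
        self.index2word = list(word_list)
        self.word2index = dict((w, i) for i, w in enumerate(self.index2word))

    def get_index_by_word(self, word):
        return self.word2index[word]

    def get_word_by_index(self, index):
        return self.index2word[index]

    def get_word2index(self):
        return self.word2index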
from baseline_method.multi_logistic_model import MultiLogistic
from load_data import load_corpus, reload_corpus
from utility.csv_utility import CsvUtility

if __name__ == '__main__':
    print 'loading data...'
    train_x, train_y, test_x, test_y, idx = reload_corpus()
    print 'loading ready...'
    multi_logs = MultiLogistic(len(train_y[0]))
    print 'training...'
    multi_logs.training(training_x=train_x, training_y=train_y)
    print 'testing...'
    re_auc, re_list = multi_logs.testing(testing_x=test_x, testing_y=test_y)
    print re_auc[:-1]
    print re_auc[-1]
    CsvUtility.write2pickle('../data-repository/model_multilogisticCV.pkl',
                            [idx, re_auc, re_list], 'w')
    # except Exception:
    #     pass

    # first step
    print "prepare the dict of subject(patient), diagnosis, medication, labtest by limit minimal count number"
    subject_admission_over('MIMICIII_data/ADMISSIONS.csv', 1)
    print "============================================================================="
    icd_diagnoses_over('MIMICIII_data/DIAGNOSES_ICD.csv', 5)
    print "============================================================================="
    icd_procedures_over('MIMICIII_data/PROCEDURES_ICD.csv', 5)
    print "============================================================================="
    get_lab_item_over('MIMICIII_data/LABEVENTS.csv', 10)
    print "============================================================================="
    get_drug_over('MIMICIII_data/PRESCRIPTIONS.csv', 10)
    print "============================================================================="
    # get_all_diagnoses_event()
    # get_lab_event()
    # get_medication_event()

    # third step
    get_events_together()
    all_events = CsvUtility.read_pickle(
        path.join(Path, 'data-repository/allevents.pkl'), 'r')
    for i in all_events.ix[0, :]:
        print i
        print type(i)
    # filter_all_event()
    print '******************************************************************************'

# python select_relate_literature.py '../data-repository/BMC_Musuloskelet_Disord' '../data-repository' 'merge_diagnoses_word_dict.csv'
    doc_maps = []
    for doc in file_contend:
        doc_maps.extend([[doc[2], doc[1]]])
    # print len(doc_maps), len(doc_maps[0])
    return doc_maps


if __name__ == '__main__':
    # get_good_docs('../data-repository/result/jack_1.csv', 10, 2)
    file_list = Directory.folder_process(Path + '/data-repository/result_0.8')

    merge_dict = dict({})
    doc_map = []
    for file_path in file_list:
        dict_tmp = get_good_docs(file_path, 80, 10)
        print 'this dict len : ', len(dict_tmp)
        merge_dict.update(dict_tmp)
        print 'after the merge : ', len(merge_dict)
        doc_map.extend(get_docs_frequence_kind_map(file_path=file_path))
    # draw_pl(x_y=doc_map, type='o')
    # print merge_dict
    texts = [[word for word in doc.split(' ')] for doc in merge_dict.values()]
    # pprint(texts[:5])
    dictionary = corpora.Dictionary(texts)
    dictionary.save(Path +
                    '/data-repository/available_word_in_literature.dict')
    print dictionary

    CsvUtility.write_dict2csv(merge_dict, Path + '/data-repository',
                              'selected_docs4LDA.csv')