Example #1
def get_sentences():
    # Build one "sentence" per record: the ICD-10 code followed by the
    # tokenized short description, so codes and description words can be
    # trained in one embedding space (e.g. word2vec-style).
    data = load_description()
    ids = lmap(lambda x: x['icd10_code'].strip(), data)
    input2 = lmap(lambda x: x['short_desc'], data)
    desc_tokens = lmap(nltk.word_tokenize, input2)

    sentences = []
    for code, tokens in zip(ids, desc_tokens):
        sentences.append([code] + tokens)
    return sentences
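The snippets rely on small project helpers (lmap, flatten, tokenize, load_description, AP_from_binary) that are not shown on this page; lmap and flatten are presumably thin wrappers along these lines:

# Assumed helper definitions (not part of the original code):
def lmap(func, iterable):
    # map() that eagerly materializes a list, as the snippets expect.
    return list(map(func, iterable))


def flatten(list_of_lists):
    # Concatenate a list of lists into one flat list.
    return [item for sub in list_of_lists for item in sub]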
Example #2
def extract_data(data, word2idx):
    # Return (stripped ICD-10 codes, order numbers, word-index token lists).
    def tokens_to_idx(tokens):
        return [word2idx[t] for t in tokens]

    ids = lmap(lambda x: x['order_number'], data)
    icd10_codes = lmap(lambda x: x['icd10_code'], data)
    desc_list = lmap(lambda x: x['short_desc'], data)
    train_tokens = lmap(tokens_to_idx, tokenize(desc_list))
    icd10_codes = lmap(lambda x: x.strip(), icd10_codes)

    return icd10_codes, ids, train_tokens
Example #3
def fit_and_tokenize(input2):
    # Fit a Keras tokenizer on the texts, then lazily encode the word-split
    # texts as sequences of integer ids.
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(input2)
    input2 = lmap(tf.keras.preprocessing.text.text_to_word_sequence, input2)
    print(input2[0])  # debug: first tokenized text
    enc_text = tokenizer.texts_to_sequences_generator(input2)
    return enc_text, tokenizer
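A hedged usage sketch (load_description and the 'short_desc' field come from the other examples); note that texts_to_sequences_generator is lazy and single-pass:

data = load_description()
texts = lmap(lambda x: x['short_desc'], data)
enc_text, tokenizer = fit_and_tokenize(texts)
first_sequence = next(enc_text)         # consume the lazy generator
vocab_size = len(tokenizer.word_index)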
Example #4
def build_voca(data):
    # Collect the vocabulary of all short-description tokens and assign each
    # word a unique index.
    short_desc_list = lmap(lambda x: x['short_desc'], data)
    all_text = tokenize(short_desc_list)
    voca = set(flatten(all_text))
    n_output_voca = len(voca)
    word2idx = {word: idx for idx, word in enumerate(voca)}
    return n_output_voca, word2idx
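A hedged usage sketch showing how build_voca composes with extract_data from Example #2:

data = load_description()
n_output_voca, word2idx = build_voca(data)
icd10_codes, ids, train_tokens = extract_data(data, word2idx)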
Example #5
def do_eval(w2v):
    # Evaluate the embeddings: for each ICD-10 code, its nearest non-code
    # neighbours should be the words of its short description (report MAP).
    data = load_description()

    ids = set(lmap(lambda x: x['icd10_code'].strip(), data))
    ap_list = []
    for e in data[:1000]:
        word = e['icd10_code'].strip()
        terms = nltk.word_tokenize(e['short_desc'])
        # Keep only neighbours that are not themselves ICD-10 codes.
        ranked_list = [
            w for w in w2v.most_similar(word, topn=50) if w[0] not in ids
        ]

        def is_correct(w_pair):
            return w_pair[0] in terms

        AP = AP_from_binary(lmap(is_correct, ranked_list), len(terms))
        ap_list.append(AP)

    print("MAP", sum(ap_list) / len(ap_list))
Example #6
def input_fn(data, tokenizer, max_seq):
    # Build a tf.data pipeline of {code id, padded description token ids}.
    input2 = lmap(lambda x: x['short_desc'], data)
    ids = lmap(lambda x: x['order_number'], data)

    code_ids = [[e] for e in ids]

    input2 = lmap(tf.keras.preprocessing.text.text_to_word_sequence, input2)
    enc_text = tokenizer.texts_to_sequences_generator(input2)

    def fit_to_length(e):
        # Truncate/pad each token sequence to exactly max_seq entries.
        e = e[:max_seq]
        e = e + (max_seq - len(e)) * [0]
        return [e]

    enc_desc_tokens = lmap(fit_to_length, enc_text)
    dataset = tf.data.Dataset.from_tensor_slices({
        str_code_id: code_ids,
        str_desc_tokens: enc_desc_tokens,
    })
    dataset = dataset.shuffle(1000)
    dataset = dataset.repeat()
    return dataset
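str_code_id and str_desc_tokens are not defined in this snippet; they are presumably module-level feature-name constants, along these lines (the string values are illustrative):

# Assumed feature-name constants used as keys of the dataset dict.
str_code_id = "code_id"
str_desc_tokens = "desc_tokens"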
Example #7
def train_loop(args):
    # Build a tf.estimator pipeline for the code/description embedding model.
    model_dir = os.path.join(model_path, "w2v_model")
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 1000
    max_seq = 1
    batch_size = int(args.batch_size)
    epochs = int(args.epochs)
    lr = float(args.lr)
    print("learning rate", lr)
    print("Batch_size", batch_size)
    enc_text, tokenizer = fit_and_tokenize(input2)
    # Read the vocabulary size from the word_index dict directly
    # (get_config() serializes word_index to a JSON string).
    n_output_voca = len(tokenizer.word_index)

    random.shuffle(data)
    train_size = int(0.1 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]

    step_per_epoch = int(len(train_data) / batch_size)
    max_step = step_per_epoch * epochs
    config = tf.estimator.RunConfig().replace(
        keep_checkpoint_max=1,
        log_step_count_steps=10,
        save_checkpoints_steps=step_per_epoch)
    tf_logging = logging.getLogger('tensorflow')
    tf_logging.setLevel(logging.DEBUG)
    print("Building_estimator")
    estimator = tf.estimator.Estimator(model_dir=model_dir,
                                       model_fn=build_model_fn(
                                           lr, dim, max_seq, n_input_voca,
                                           n_output_voca),
                                       config=config)

    print("start training")
    # estimator.train(
    #     input_fn=lambda :input_fn(train_data, tokenizer, max_seq),
    #     steps=max_step
    # )
    estimator.predict(
        input_fn=lambda: input_fn(train_data, tokenizer, max_seq),
        steps=max_step)
Example #8
def work(args):
    # Keras variant: build the model, attach the custom loss, and fit on the
    # tf.data pipeline from input_fn.
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 1000
    max_seq = 1
    batch_size = int(args.batch_size)
    epochs = int(args.epochs)
    lr = float(args.lr)
    print("learning rate", lr)
    print("Batch_size", batch_size)
    enc_text, tokenizer = fit_and_tokenize(input2)

    random.shuffle(data)
    train_size = int(0.1 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]

    step_per_epoch = int(len(train_data) / batch_size)

    train_dataset = input_fn(train_data, tokenizer, max_seq)
    val_dataset = input_fn(val_data, tokenizer, max_seq)
    # As in train_loop, take the vocabulary size from word_index directly.
    n_output_voca = len(tokenizer.word_index)
    loss, model = build_model(dim, max_seq, n_input_voca, n_output_voca)
    model.add_loss(loss)
    #model = multi_gpu_model(model, 4, cpu_relocation=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr,
                                         beta_1=0.9,
                                         beta_2=0.999,
                                         amsgrad=False)

    model.compile(optimizer=optimizer)
    # Keras does not accept batch_size together with a tf.data Dataset;
    # the dataset itself determines the batching.
    model.fit(
        train_dataset,
        # validation_data=val_dataset,
        # validation_steps=3000,
        epochs=epochs,
        steps_per_epoch=step_per_epoch)
    model.save('my_model.h5')
Example #9
def work2():
    # Pure-numpy variant: build each code's embedding (W1) as the sum of its
    # description-word embeddings (W2), then score retrieval with AP.
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 100
    max_seq = 30

    all_text = tokenize(input2)
    voca = set(flatten(all_text))

    word2idx = {word: idx for idx, word in enumerate(voca)}

    def tokens_to_idx(tokens):
        return [word2idx[t] for t in tokens]

    random.shuffle(data)
    train_size = int(0.9 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]
    train_ids = lmap(lambda x: x['order_number'], train_data)
    icd10_codes = lmap(lambda x: x['icd10_code'], train_data)
    train_desc = lmap(lambda x: x['short_desc'], train_data)
    train_tokens = lmap(tokens_to_idx, train_desc)

    n_output_voca = len(voca)
    print("n_output_voca", n_output_voca)
    # Random word embeddings (W2) and zero-initialized code embeddings (W1).
    W2 = np.random.normal(0, 1, [n_output_voca + 1, dim])
    W1 = np.zeros([n_input_voca + 1, dim])

    icd10_codes = lmap(lambda x: x.strip(), icd10_codes)

    add_subword = False

    code_id_to_code = {}
    code_to_code_id = {}
    for code_id, icd10_code, text_seq in zip(train_ids, icd10_codes,
                                             train_tokens):
        for idx in text_seq:
            W1[code_id] += W2[idx]

        code_id_to_code[code_id] = icd10_code
        code_to_code_id[icd10_code] = code_id

        l = len(icd10_code)
        if add_subword:
            for j in range(1, l - 1):
                substr = icd10_code[:j]
                if substr in code_id_to_code:
                    W1[code_id] += W1[code_to_code_id[substr]]

    # L2-normalize the word embeddings for cosine similarity.
    new_w2 = [W2[i] / np.linalg.norm(W2[i]) for i in range(n_output_voca)]

    all_voca = []
    for code_id in train_ids:
        icd10_code = code_id_to_code[code_id]
        word = icd10_code
        emb = W1[code_id]
        all_voca.append((word, emb))

    print("Testing")
    AP_list = []
    for code_id, text_seq in zip(train_ids, train_tokens):
        a = W1[code_id] / np.linalg.norm(W1[code_id])
        sims = []
        for j in range(n_output_voca):
            b = new_w2[j]
            e = j, np.dot(a, b)
            sims.append(e)

        # Rank all words by similarity to the code embedding; the top 50 are
        # the retrieved candidates.
        sims.sort(key=lambda x: x[1], reverse=True)
        ranked_list = sims[:50]
        terms = text_seq

        def is_correct(w_pair):
            return w_pair[0] in terms

        AP = AP_from_binary(lmap(is_correct, ranked_list), len(terms))
        AP_list.append(AP)
        print("1")
        if len(AP_list) > 100:
            break

    print(sum(AP_list) / len(AP_list))
Example #10
def encode_data(vocabulary_set, text_list):
    # text_to_word_sequence returns a plain list and has no .encode; the
    # intended encoder is presumably tfds.features.text.TokenTextEncoder
    # (from tensorflow_datasets, imported as tfds).
    encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
    return lmap(encoder.encode, text_list)
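A hedged usage sketch, reusing the vocabulary built as in Example #4 (tokenize, flatten, and load_description are the same project helpers):

data = load_description()
texts = lmap(lambda x: x['short_desc'], data)
voca = set(flatten(tokenize(texts)))
encoded = encode_data(voca, texts)   # list of integer-id sequences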