Example #1
import sys
import time

import numpy as np

start = time.perf_counter()  # time.clock() was removed in Python 3.8

if code_type not in ['ICD', 'CHOP', 'DRG']:
    print("Code type has to be one of ICD|CHOP|DRG")
    sys.exit(-2)

print("Reading catalog")
reader = CSVReader(catalog, ',')
descriptions_de = {}
dataset = reader.read_from_file()
for record in dataset:
    # key: code type + normalized code (dots stripped, upper-cased), e.g. 'ICD_A000'
    descriptions_de[code_type + '_' + record['code'].replace('.', '').upper()] = record['text_de']

print("Reading vectors and tokens..")

vector_by_token = read_vectors(vector_file)
res = read_code_vectors(vector_by_token, token_file)
vectors_by_codes = res['vectors']
tokens_by_codes = res['tokens']

code_vocab = []
for code in vectors_by_codes.keys():
    if code.startswith(code_type):
        code_vocab.append(code)

vector_size = vectors_by_codes[code_vocab[0]][0].shape[0]

print("Vector size is " + str(vector_size))

average_vector_by_code = np.zeros((len(code_vocab), vector_size), dtype=np.float32)
 
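The excerpt stops right after allocating average_vector_by_code. A minimal sketch of how its rows could be filled, assuming each vectors_by_codes[code] is a list of equally sized 1-D NumPy arrays (which the .shape[0] access above suggests); this continuation is not part of the original example:

# Hypothetical continuation: one averaged vector per code, in code_vocab order.
# Assumes vectors_by_codes[code] holds equally sized 1-D token vectors.
for i, code in enumerate(code_vocab):
    average_vector_by_code[i] = np.mean(vectors_by_codes[code], axis=0)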

Example #2

def run(config):
    base_folder = config['base_folder']
    
    language_options = ["only-fr-descriptions", "only-it-descriptions",
                        "only-de-fr-descriptions", "only-de-it-descriptions",
                        "only-fr-it-descriptions", "only-de-fr-it-descriptions"]
    if sum(config[option] for option in language_options) > 1:
        print("You can't select more than one language option.")
        return
    
    if not config['skip-word2vec']:
        print("Tokenize catalogs..")
        if not os.path.exists(base_folder + 'tokenization'):
            os.makedirs(base_folder + 'tokenization')
        tokenize_catalogs(config)
        
        print("Vectorize catalogs..")
        if not os.path.exists(base_folder + 'vectorization'):
            os.makedirs(base_folder + 'vectorization')
        word2vec_trainset = config['all-tokens']
        if config['use-training-data-for-word2vec']:
            create_word2vec_training_data(config['training-set-word2vec'], config['all-tokens'], 
                                          base_folder + 'vectorization/train.txt',
                                          do_shuffle=config['shuffle-word2vec-traindata'],
                                          use_n_times=config['num-shuffles'],
                                          use_demographic_tokens=config['use_demographic_tokens'])
            word2vec_trainset = base_folder + 'vectorization/train.txt'
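        # word2vec flags (standard Google word2vec CLI): -binary 0 writes
        # text-format vectors, -cbow 1/0 selects CBOW vs. skip-gram, -size
        # sets the embedding dimensionality, -min-count 1 keeps every token.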
        call(["word2vec", "-train", word2vec_trainset,
              "-binary", "0",
              "-cbow", '1' if config['word2vec-cbow'] else '0',
              "-output", config['all-vectors'],
              "-size", str(config['word2vec-dim-size']),
              "-save-vocab", config['word2vec-vocab'],
              "-min-count", "1",
              "-threads", str(config['num-cores'])])
    
    print("\nRead vectors. Assign vectors to codes..")
    # one vector for each token in the vocabulary
    vector_by_token = read_vectors(config['all-vectors'])
    vocab = vector_by_token.keys()
    
    if config['store-everything']:
        with open(config['all-vectors'] + '.json', 'w') as f:
            json.dump({k: v.tolist() for k, v in vector_by_token.items()}, f, indent=4, sort_keys=True)

    res = read_code_vectors(vector_by_token, config['all-tokens'])
    
    # for each code a list of vectors of its tokens
    vectors_by_code = res['vectors']
    # for each code a list of its tokens
    tokens_by_code = res['tokens']
    # for each code a vector that is the normalized sum of all vectors from all tokens from this code.
    vector_by_code = res['vector_by_code']
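    # (presumably: v = sum of the code's token vectors, stored as v / ||v||)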
    
    if config['store-everything']:
        with open(config['code-vectors'], 'w') as f:
            json.dump({k: v.tolist() for k, v in vectors_by_code.items()}, f, sort_keys=True)
        with open(config['code-tokens'], 'w') as f:
            json.dump(tokens_by_code, f, indent=4, sort_keys=True)
    
    if not os.path.exists(base_folder + 'classification'):
        os.makedirs(base_folder + 'classification')
    
    reader = SequencePCReader(config['training-set'])
    reader.tokens_by_code = tokens_by_code
    reader.vocab = vocab
    reader.use_demographic_tokens = config['use_demographic_tokens']
    reader.use_all_tokens = config['use-all-tokens-in-embedding']

    reader.read_from_file(vectors_by_code, 'los', drg_out_file=config['training-set-drgs'], demo_variables_to_use=config['demo-variables'])
    codes = reader.data
    targets = reader.targets
    demo_data = reader.demo_data
    drgs = reader.drgs
    y = np.asarray(targets, dtype=np.float32)

    codes_train, codes_test, demo_train, demo_test, y_train, y_test, drgs_train, drgs_test = train_test_split(
        codes, demo_data, y, drgs, test_size=0.33, random_state=42)
    calculate_drg_baseline(y_train, y_test, drgs_train, drgs_test)
    output_dim = 1
           
    print("Training data dimensionality: " + str(len(codes)) + " | " + str(len(codes[0])))
    print('Train LSTM Neural Net with Embedding..')
    vocab = reader.vocab
    codes_train = keras.preprocessing.sequence.pad_sequences(codes_train, maxlen=config['maxlen'], dtype='int', truncating='pre')
    codes_test = keras.preprocessing.sequence.pad_sequences(codes_test, maxlen=config['maxlen'], dtype='int', truncating='pre')
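    # keras pads at the front by default (padding='pre'); truncating='pre'
    # drops the oldest entries of sequences longer than maxlen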
                 
    model, _ = train_and_evaluate_lstm_with_embedding(
        config, codes_train, codes_test, demo_train, demo_test,
        y_train, y_test, output_dim, 'los', vocab,
        vector_by_token, vector_by_code)


    predictions = model.predict({'codes_input':codes_test, 'demo_input':demo_test}, verbose=0)
   
    error = predictions[:,0] - y_test
    
    plot_histogram(config, y_test, 'ACTUAL_LOS')
    plot_histogram(config, predictions[:,0], 'PREDICTED_LOS')
    plot_histogram(config, error, 'ERROR')
    
    mse = np.square(error).mean()
    mape = np.abs(error / y_test).mean()
    mae = np.abs(error).mean()
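    # mape here is a fraction, not a percentage, and is undefined if y_test contains zeros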

    if config['store-everything']:
        joblib.dump(model, base_folder + 'classification/' + config['classifier'] + '.pkl')
    
    print('Total test MAPE: ' + str(mape))
    print('Total test MSE: ' + str(mse))
    print('Total test MAE: ' + str(mae))
    return mse
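
For orientation, a sketch of the configuration this regression variant of run() expects. The keys are exactly the ones the function body reads; the values are invented placeholders, not defaults from the original project:

# Hypothetical config; keys taken from the config lookups in run() above,
# values made up for illustration.
config = {
    'base_folder': 'output/',
    'only-fr-descriptions': 0, 'only-it-descriptions': 0,
    'only-de-fr-descriptions': 0, 'only-de-it-descriptions': 0,
    'only-fr-it-descriptions': 0, 'only-de-fr-it-descriptions': 0,
    'skip-word2vec': False,
    'all-tokens': 'output/tokenization/tokens.txt',
    'all-vectors': 'output/vectorization/vectors.txt',
    'use-training-data-for-word2vec': False,
    'training-set-word2vec': 'data/train_word2vec.csv',
    'shuffle-word2vec-traindata': False,
    'num-shuffles': 1,
    'use_demographic_tokens': False,
    'word2vec-cbow': True,
    'word2vec-dim-size': 50,
    'word2vec-vocab': 'output/vectorization/vocab.txt',
    'num-cores': 4,
    'store-everything': False,
    'code-vectors': 'output/vectorization/code_vectors.json',
    'code-tokens': 'output/vectorization/code_tokens.json',
    'training-set': 'data/train.csv',
    'training-set-drgs': 'data/train_drgs.csv',
    'demo-variables': [],
    'use-all-tokens-in-embedding': False,
    'maxlen': 100,
    'classifier': 'lstm-embedding',
}
mse = run(config)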

Example #3

def run(config):
    base_folder = config['base_folder']
    
    language_options = ["only-fr-descriptions", "only-it-descriptions",
                        "only-de-fr-descriptions", "only-de-it-descriptions",
                        "only-fr-it-descriptions", "only-de-fr-it-descriptions"]
    if sum(config[option] for option in language_options) > 1:
        print("You can't select more than one language option.")
        return
    
    if not config['skip-word2vec']:
        print("Tokenize catalogs..")
        if not os.path.exists(base_folder + 'tokenization'):
            os.makedirs(base_folder + 'tokenization')
        tokenize_catalogs(config)
        
        print("Vectorize catalogs..")
        if not os.path.exists(base_folder + 'vectorization'):
            os.makedirs(base_folder + 'vectorization')
        word2vec_trainset = config['all-tokens']
        if config['use-training-data-for-word2vec']:
            create_word2vec_training_data(config['training-set-word2vec'], config['all-tokens'], 
                                          base_folder + 'vectorization/train.txt',
                                          do_shuffle=config['shuffle-word2vec-traindata'],
                                          use_n_times=config['num-shuffles'],
                                          use_demographic_tokens=config['use_demographic_tokens'])
            word2vec_trainset = base_folder + 'vectorization/train.txt'
        call(["word2vec", "-train", word2vec_trainset,
              "-binary", "0",
              "-cbow", '1' if config['word2vec-cbow'] else '0',
              "-output", config['all-vectors'],
              "-size", str(config['word2vec-dim-size']),
              "-save-vocab", config['word2vec-vocab'],
              "-min-count", "1",
              "-threads", str(config['num-cores'])])
    
    print("\nRead vectors. Assign vectors to codes..")
    # one vector for each token in the vocabulary
    vector_by_token = read_vectors(config['all-vectors'])
    vocab = vector_by_token.keys()
    
    if config['store-everything']:
        with open(config['all-vectors'] + '.json', 'w') as f:
            json.dump({k: v.tolist() for k, v in vector_by_token.items()}, f, indent=4, sort_keys=True)

    res = read_code_vectors(vector_by_token, config['all-tokens'])
    
    # for each code a list of vectors of its tokens
    vectors_by_code = res['vectors']
    # for each code a list of its tokens
    tokens_by_code = res['tokens']
    # for each code a vector that is the normalized sum of all vectors from all tokens from this code.
    vector_by_code = res['vector_by_code']
    
    if config['store-everything']:
        with open(config['code-vectors'], 'w') as f:
            json.dump({k: v.tolist() for k, v in vectors_by_code.items()}, f, sort_keys=True)
        with open(config['code-tokens'], 'w') as f:
            json.dump(tokens_by_code, f, indent=4, sort_keys=True)
    
    if not os.path.exists(base_folder + 'classification'):
        os.makedirs(base_folder + 'classification')
    total_score = 0.0 
    tasks = ['pdx', 'sdx', 'srg', 'drg']   
    for task in tasks:
        print('\n==== ' + task + ' ====')
        reader = None
        if config['classifier'] == 'lstm':
            reader = SequenceVectorizedPCReader(config['training-set'])
        elif config['classifier'] == 'lstm-embedding':
            reader = SequencePCReader(config['training-set'])
            reader.tokens_by_code = tokens_by_code
            reader.vocab = vocab
            reader.use_demographic_tokens = config['use_demographic_tokens']
            reader.use_all_tokens = config['use-all-tokens-in-embedding']
        else:
            reader = FlatVectorizedPCReader(config['training-set'])
        reader.read_from_file(vectors_by_code, task, drg_out_file=config['training-set-drgs'], demo_variables_to_use=config['demo-variables'])
        codes = reader.data
        targets = reader.targets
        excludes = reader.excludes
        demo_data = reader.demo_data
        classes = sorted(set(targets))  # sorted so the class order is reproducible
        class_index = {c: i for i, c in enumerate(classes)}
        y = np.zeros(len(codes), dtype=np.uint)
        for i, target in enumerate(targets):
            y[i] = class_index[target]  # dict lookup instead of O(n) list.index
        codes_train, codes_test, demo_train, demo_test, y_train, y_test, _, targets_test, _, excludes_test = train_test_split(
            codes, demo_data, y, targets, excludes, test_size=0.33, random_state=42)
        output_dim = len(classes)
        print('Number of classes: ' + str(output_dim))
        
        model, score = None, 0
        if config['classifier'] == 'random-forest':
            print("Training data dimensionality: " + str(codes.shape))
            print('Train Random Forest for ' + reader.code_type + ' classification task..')
            model, score = train_and_evaluate_random_forest(config, codes_train, codes_test, y_train, y_test)
        elif config['classifier'] == 'ffnn':
            print("Training data dimensionality: " + str(codes.shape))
            print('Train Feed Forward Neural Net for ' + reader.code_type + ' classification task..')
            model, scaler, score = train_and_evaluate_ffnn(config, codes_train, codes_test, y_train, y_test, output_dim, task)
            score = adjust_score(model, scaler, codes_test, classes, targets_test, excludes_test)
            plot_oracle(config, task, model, scaler, codes_test, classes, targets_test, excludes_test)
            plot_classification_confidence_histograms(config, task, model, scaler, codes_test, classes, targets_test, excludes_test)
        elif config['classifier'] == 'lstm':
            print("Training data dimensionality: " + str(len(codes)) + " | " + str(len(codes[0])) + " | " + str(len(codes[0][0])))
            print('Train LSTM Neural Net for ' + reader.code_type + ' classification task..')
            model, scaler, score = train_and_evaluate_lstm(config, codes_train, codes_test, y_train, y_test, output_dim, task)
            codes_test = pad_sequences(codes_test, maxlen=config['maxlen'], dim=len(codes_train[0][0]))
            score = adjust_score(model, scaler, codes_test, classes, targets_test, excludes_test)
        elif config['classifier'] == 'lstm-embedding':
            print("Training data dimensionality: " + str(len(codes)) + " | " + str(len(codes[0])))
            print('Train LSTM Neural Net with Embedding for ' + reader.code_type + ' classification task..')
            vocab = reader.vocab
            codes_train = keras.preprocessing.sequence.pad_sequences(codes_train, maxlen=config['maxlen'], dtype='int', truncating='pre')
            codes_test = keras.preprocessing.sequence.pad_sequences(codes_test, maxlen=config['maxlen'], dtype='int', truncating='pre')
                 
            model, score = train_and_evaluate_lstm_with_embedding(
                config, codes_train, codes_test, demo_train, demo_test,
                y_train, y_test, output_dim, task, vocab,
                vector_by_token, vector_by_code)
            input_test = {'codes_input':codes_test, 'demo_input':demo_test}
            score = adjust_score(model, None, input_test, classes, targets_test, excludes_test)
            plot_oracle(config, task, model, None, input_test, classes, targets_test, excludes_test)
            plot_classification_confidence_histograms(config, task, model, None, input_test, classes, targets_test, excludes_test)

        total_score += score
        if config['store-everything']:
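            # note: this path depends only on the classifier name, so each
            # task's dump overwrites the previous one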
            joblib.dump(model, base_folder + 'classification/' + config['classifier'] + '.pkl')
    
    total_score /= len(tasks)
    print('Total average score over all tasks: ' + str(total_score))
    return total_score
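
The bookkeeping above works on integer class indices. A minimal sketch of mapping model outputs back to the original target labels, assuming the classifier emits one probability per class (this decoding step is not part of the excerpt):

# Hypothetical decoding: argmax over the class probabilities, then map the
# indices back through the same `classes` list that was used for encoding.
probs = model.predict(input_test, verbose=0)
predicted_labels = [classes[i] for i in np.argmax(probs, axis=1)]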