Example #1
def view_tongue_data_augment():

    patient_tongue_dir = config['root_path'] + \
        config['original_path'] + 'tongue_9585'
    tongue_zhiliao_path = config['root_path'] + \
        config['original_path'] + 'tongue_zhiliao.list'
    yaopin_path = config['root_path'] + \
        config['original_path'] + 'yaopin.vocab'

    tongue_ids, tongue_image_arrays, tongue_yaofangs, tongue_image_shape = patient_tongue_generator.loadDatafromFile(
        patient_tongue_dir, tongue_zhiliao_path, image_normal_size=(224, 224))

    # fetch max(id) in yaopin.vocab as nb_yao
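    # each line of yaopin.vocab is assumed to be "<id> <herb name>" (first token = numeric id)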
    with open(yaopin_path, 'r') as yaopin_file:
        nb_yao = max(
            int(line.split(' ')[0]) for line in yaopin_file.readlines())

    total_tongue_x, total_y = tongue2text_gen.data_tensorization(
        tongue_image_arrays, tongue_yaofangs, tongue_image_shape, nb_yao)

    datagen = image_augment.image_augment_gen()
    augmented_x, augmented_y = image_augment.data_tensoration_augment(
        datagen, total_tongue_x, total_y)

    print(np.shape(augmented_x))
    print(np.shape(augmented_y))
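
The augmentation helpers above come from the project's own image_augment module and are not shown here. As a minimal sketch, assuming image_augment_gen() wraps a standard Keras ImageDataGenerator (all parameter values below are illustrative assumptions, not taken from the source):

def image_augment_gen_sketch():
    from keras.preprocessing.image import ImageDataGenerator
    # hypothetical configuration: mild geometric jitter for tongue photos
    return ImageDataGenerator(
        rotation_range=10,       # rotate up to 10 degrees
        width_shift_range=0.1,   # shift horizontally by up to 10% of width
        height_shift_range=0.1,  # shift vertically by up to 10% of height
        zoom_range=0.1,          # zoom in/out by up to 10%
        horizontal_flip=True)    # mirror left-right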
Example #2
def tfidf_weights_test():

    patient_tongue_dir = config['root_path'] + \
        config['original_path'] + 'tongue_9585'
    tongue_zhiliao_path = config['root_path'] + \
        config['original_path'] + 'tongue_zhiliao.list'
    yaopin_path = config['root_path'] + \
        config['original_path'] + 'yaopin.vocab'

    tongue_ids, tongue_image_arrays, tongue_yaofangs, tongue_image_shape = patient_tongue_generator.loadDatafromFile(
        patient_tongue_dir, tongue_zhiliao_path, image_normal_size=(224, 224))

    yaofangs_corpus = tfidf.list2corpus(tongue_yaofangs)
    word, weight = tfidf.get_tf_idf(yaofangs_corpus)

    weight = list(weight)

    # scan for the smallest positive tf-idf weight
    # (named min_weight to avoid shadowing the builtin min)
    min_weight = float('inf')
    for row in weight:
        for w in row:
            if 0.0 < w < min_weight:
                min_weight = w
    print(min_weight)
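
For reference, the same scan collapses to a NumPy one-liner, assuming weight is a dense 2-D matrix as returned by tfidf.get_tf_idf above:

weight_arr = np.asarray(weight)
print(weight_arr[weight_arr > 0.0].min())  # smallest positive tf-idf weight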
Example #3
def train_predict_tongue2text_basic_gen(train_new=True):
    '''
    @param train_new: if True, train a new model and replace the stored model on disk
    '''

    patient_tongue_dir = config['root_path'] + \
        config['original_path'] + 'tongue_9585'
    tongue_zhiliao_path = config['root_path'] + \
        config['original_path'] + 'tongue_zhiliao.list'
    yaopin_path = config['root_path'] + \
        config['original_path'] + 'yaopin.vocab'

    # tongue_ids: [01012045534615_1_4_7, ...]
    # tongue_image_array: [np.array(pixels matrix of image), np.array(pixels matrix of image2), ...]
    # tongue_yaofangs: [ [0,1,2,3], [4,5,6,7], ... ]
    tongue_ids, tongue_image_arrays, tongue_yaofangs, tongue_image_shape = patient_tongue_generator.loadDatafromFile(
        patient_tongue_dir, tongue_zhiliao_path, image_normal_size=(224, 224))

    # fetch max(id) in yaopin.vocab as nb_yao
    with open(yaopin_path, 'r') as yaopin_file:
        nb_yao = max(
            int(line.split(' ')[0]) for line in yaopin_file.readlines())


    # _use_data_augment = True  # set True to use image data augmentation (only feasible on the large-memory server 225)
    _use_data_augment = False
    '''
    Train a new gen_model and store it on disk;
    the new model overwrites the old one.
    '''
    # store the keras layer framework (optional; gen_frame_path may be None)
    frame_name = 'tongue2text_cnnmlp_9585_act(bi)_t3_100it.json'
    gen_frame_path = config['root_path'] + \
        config['cache_path'] + 'keras/' + frame_name

    train_on_batch = False  # whether to train with train_on_batch
    if train_new:
        _ = patient_tongue_generator.tongue_basic_gen_trainer(
            tongue_image_arrays,
            tongue_yaofangs,
            tongue_image_shape,
            nb_yao,
            gen_model_path=gen_frame_path,
            train_on_batch=train_on_batch,
            use_data_augment=_use_data_augment)
    '''
    Load the trained gen_model from disk;
    the reloaded model is used for evaluation and prediction directly,
    skipping retraining to save time.
    '''
    trained_gen_model = tongue2text_gen.loadStoredModel(
        gen_frame_path,
        gen_frame_path.replace('.json', '.h5'),
        compile_info={
            'recompile': True,
            'aux_output': False,
            'use_tfidf_tensor': False,
        })

    # test
    # gen_output: [ [0.8, 0.4, ...], [...], [...], ... ]
    gen_output = patient_tongue_generator.basic_gen_predictor_test(
        tongue_image_arrays, tongue_yaofangs, tongue_image_shape, nb_yao,
        trained_gen_model)
    #     print(gen_output[0])

    # yaopin_dict: {0:'麻黄',1:'桂枝',...}
    yaopin_dict = patient_tongue_generator.load_yaopin_dict(yaopin_path)
    #     print(yaopin_dict)

    #     test_tongue_ids = tongue_ids[: 500]
    #     test_yaofangs = tongue_yaofangs[: 500]

    #     test_tongue_ids = tongue_ids[2000 : 2500]
    #     test_yaofangs = tongue_yaofangs[2000 : 2500]

    #     test_tongue_ids = tongue_ids[4000 : 4500]
    #     test_yaofangs = tongue_yaofangs[4000 : 4500]

    #     test_tongue_ids = tongue_ids[6000 : 6500]
    #     test_yaofangs = tongue_yaofangs[6000 : 6500]

    test_tongue_ids = tongue_ids[-500:]
    test_yaofangs = tongue_yaofangs[-500:]
    '''evaluation criteria'''
    precisions = []
    recalls = []
    errors = []
    for i, output in enumerate(gen_output):
        # print test data label info:
        print('%d. \npatient tongue id: %s' % (i, test_tongue_ids[i]))
        print('label yaofang:')
        yaofang_label = patient_tongue_generator.sample_yaofang(
            test_yaofangs[i], yaopin_dict)
        print(' '.join(yaofang_label))

        #         output_index = patient_tongue_generator.dynamic_threshold_outputfilter(output)
        output_index = patient_tongue_generator.threshold_outputfilter(output)
        #         print('predicted yaofang ids: {0}'.format(output_index))
        yaofang_output = patient_tongue_generator.sample_yaofang(
            output_index, yaopin_dict)
        print('predicted yaofang:')
        print(' '.join(yaofang_output) + '\n')

        precision, recall, error = generator_eval.evaluator(
            test_yaofangs[i], output_index)
        precisions.append(precision)
        recalls.append(recall)
        errors.append(error)
        print('------Score: precision: %f, recall: %f, error: %f' %
              (precision, recall, error))

    print(
        '------Average Score: average precision: %f, average recall: %f, error: %f'
        % (np.average(precisions), np.average(recalls), np.average(errors)))
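
tongue2text_gen.loadStoredModel takes a .json frame path plus the matching .h5 weights path, which is the standard Keras store/reload pattern. A minimal sketch of that pattern, assuming the frame was written with model.to_json(); the compile settings are placeholders, not the project's actual ones:

from keras.models import model_from_json

def load_stored_model_sketch(frame_path, record_path):
    # rebuild the layer framework from the stored JSON
    with open(frame_path, 'r') as frame_file:
        model = model_from_json(frame_file.read())
    # restore the trained weights from the matching HDF5 record
    model.load_weights(record_path)
    # recompile before evaluating/predicting (placeholder loss/optimizer)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model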
Example #4
def train_predict_tongue2text_cnn2_withlda_gen(train_new=True):
    '''
    @param train_new: if True, train a new model and replace the stored model on disk
    '''

    patient_tongue_dir = config['root_path'] + \
        config['original_path'] + 'tongue_9585'
    tongue_zhiliao_path = config['root_path'] + \
        config['original_path'] + 'tongue_zhiliao.list'
    yaopin_path = config['root_path'] + \
        config['original_path'] + 'yaopin.vocab'

    tongue_ids, tongue_image_arrays, tongue_yaofangs, tongue_image_shape = patient_tongue_generator.loadDatafromFile(
        patient_tongue_dir, tongue_zhiliao_path, image_normal_size=(224, 224))

    # fetch max(id) in yaopin.vocab as nb_yao
    with open(yaopin_path, 'r') as yaopin_file:
        nb_yao = max(
            int(line.split(' ')[0]) for line in yaopin_file.readlines())

    # _use_tfidf_tensor = True  # set True to use the tf-idf tensor
    _use_tfidf_tensor = False

    # _use_data_augment = True  # set True to use image data augmentation (only feasible on the large-memory server 225)
    _use_data_augment = False
    '''
    TODO: store the model and load it from disk.

    Train a new gen_model with LDA and store it on disk;
    the new model overwrites the old one.
    '''
    lda_model_name = 'tongue_9585_gensim_lda.topic'
    lda_model_path = config['root_path'] + \
        config['cache_path'] + 'nlp/' + lda_model_name
    # _lda_replace = True  # True on the first run; False afterwards unless the LDA model must be rebuilt
    _lda_replace = False

    if _use_tfidf_tensor:
        frame_name = 'tongue2text_cnn2passmlp_lda_9585_act(tfidf)_t3_100it.json'
    else:
        frame_name = 'tongue2text_cnn2passmlp_lda_9585_act(bi)_t3_100it.json'
    gen_frame_path = config['root_path'] + \
        config['cache_path'] + 'keras/' + frame_name

    if train_new:
        _ = patient_tongue_generator.tongue_gen_withlda_trainer(
            tongue_image_arrays,
            tongue_yaofangs,
            tongue_image_shape,
            nb_yao,
            lda_model_path,
            gen_model_path=gen_frame_path,
            lda_replace=_lda_replace,
            use_tfidf_tensor=_use_tfidf_tensor,
            use_data_augment=_use_data_augment)
    '''
    Load the trained gen_model from disk;
    the reloaded model is used for evaluation and prediction directly,
    skipping retraining to save time.
    '''
    trained_gen_model = tongue2text_gen.loadStoredModel(
        gen_frame_path,
        gen_frame_path.replace('.json', '.h5'),
        compile_info={
            'recompile': True,
            'aux_output': True,
            'use_tfidf_tensor': _use_tfidf_tensor,
        })

    # test
    # gen_output: [ [0.8, 0.4, ...], [...], [...], ... ]
    gen_output_list = patient_tongue_generator.gen_withlda_predictor_test(
        tongue_image_arrays,
        tongue_yaofangs,
        tongue_image_shape,
        nb_yao,
        trained_gen_model,
        lda_model_path,
        use_tfidf_tensor=_use_tfidf_tensor)
    # keep only the gen_output; drop the aux_output
    gen_output = gen_output_list[0]
    del gen_output_list
    #     print(gen_output[0])

    # yaopin_dict: {0:'麻黄',1:'桂枝',...}
    yaopin_dict = patient_tongue_generator.load_yaopin_dict(yaopin_path)
    #     print(yaopin_dict)

    #     test_tongue_ids = tongue_ids[: 500]
    #     test_yaofangs = tongue_yaofangs[: 500]

    #     test_tongue_ids = tongue_ids[2000 : 2500]
    #     test_yaofangs = tongue_yaofangs[2000 : 2500]

    #     test_tongue_ids = tongue_ids[4000 : 4500]
    #     test_yaofangs = tongue_yaofangs[4000 : 4500]

    #     test_tongue_ids = tongue_ids[6000 : 6500]
    #     test_yaofangs = tongue_yaofangs[6000 : 6500]

    test_tongue_ids = tongue_ids[-500:]
    test_yaofangs = tongue_yaofangs[-500:]
    '''evaluation criteria'''
    precisions = []
    recalls = []
    errors = []
    for i, output in enumerate(gen_output):
        # print test data label info:
        print('%d. \npatient tongue id: %s' % (i, test_tongue_ids[i]))
        print('label yaofang:')
        yaofang_label = patient_tongue_generator.sample_yaofang(
            test_yaofangs[i], yaopin_dict)
        print(' '.join(yaofang_label))

        #         output_index = patient_tongue_generator.dynamic_threshold_outputfilter(output)
        output_index = patient_tongue_generator.threshold_outputfilter(output)
        #         print('predicted yaofang ids: {0}'.format(output_index))
        yaofang_output = patient_tongue_generator.sample_yaofang(
            output_index, yaopin_dict)
        print('predicted yaofang:')
        print(' '.join(yaofang_output) + '\n')

        precision, recall, error = generator_eval.evaluator(
            test_yaofangs[i], output_index)
        precisions.append(precision)
        recalls.append(recall)
        errors.append(error)
        print('------Score: precision: %f, recall: %f, error: %f' %
              (precision, recall, error))

    print(
        '------Average Score: average precision: %f, average recall: %f, error: %f'
        % (np.average(precisions), np.average(recalls), np.average(errors)))
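
patient_tongue_generator.threshold_outputfilter is used in the examples above to turn the per-herb probability vector into a list of herb ids. A minimal sketch of such a filter, assuming a fixed cut-off (the 0.5 default is an assumption, not taken from the source):

def threshold_outputfilter_sketch(output, threshold=0.5):
    # keep the indices (herb ids) whose predicted probability exceeds the threshold
    return [i for i, proba in enumerate(output) if proba > threshold]
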
def train_predict_tongue2text_sklearn_gen(step=0):
    '''
    @param step: 0: train the keras model; 1: load the keras model, train the sk_classifier, and test
    '''

    patient_tongue_dir = config['root_path'] + \
        config['original_path'] + 'tongue_9585'
    tongue_zhiliao_path = config['root_path'] + \
        config['original_path'] + 'tongue_zhiliao.list'
    yaopin_path = config['root_path'] + \
        config['original_path'] + 'yaopin.vocab'

    # tongue_ids: [01012045534615_1_4_7, ...]
    # tongue_image_array: [np.array(pixels matrix of image), np.array(pixels matrix of image2), ...]
    # tongue_yaofangs: [ [0,1,2,3], [4,5,6,7], ... ]
    tongue_ids, tongue_image_arrays, tongue_yaofangs, tongue_image_shape = patient_tongue_generator.loadDatafromFile(
        patient_tongue_dir, tongue_zhiliao_path, image_normal_size=(224, 224))

    # fetch max(id) in yaopin.vocab as nb_yao
    with open(yaopin_path, 'r') as yaopin_file:
        nb_yao = max(int(line.split(' ')[0])
                     for line in yaopin_file.readlines())

    frame_name = 'test_tongue2text_cnn2mlp_9585_act(bi)_t3_100it.json'
    if step == 0:
        '''
        Train a new sklearn_gen_model.
        @todo: store the keras feature-extraction model and the sklearn generator
            on disk together (with matching names)
        '''
        trained_gen_model = patient_tongue_generator.tongue_sklearn_gen_keras_trainer(
            tongue_image_arrays, tongue_yaofangs, tongue_image_shape, nb_yao)
        # store the keras layer framework (optional)
        gen_frame_path = config['root_path'] + \
            config['cache_path'] + 'keras/' + frame_name
        tongue2text_sklearn_gen.storageKerasModel(
            model=trained_gen_model, frame_path=gen_frame_path)

    if step == 1:
        '''
        Load the keras model from disk first, then train the sklearn classifier.
        '''
        gen_frame_path = config['root_path'] + \
            config['cache_path'] + 'keras/' + frame_name
        gen_record_path = gen_frame_path.replace('.json', '.h5')

        print('load keras model from disk...')
        trained_tongue_gen_model = tongue2text_sklearn_gen.loadStoredKerasModel(
            gen_frame_path, gen_record_path, recompile=True)
        trained_tongue_gen_classifier = patient_tongue_generator.tongue_sklearn_gen_sk_trainer(
            tongue_image_arrays, tongue_yaofangs, tongue_image_shape, nb_yao, trained_tongue_gen_model)
        '''
        Test the trained sklearn classifier-generator.
        '''
        _proba_predict = True  # whether the sklearn classifier-generator outputs probabilities (proba) or labels
        gen_output = patient_tongue_generator.sklearn_gen_predictor_test(
            tongue_image_arrays, tongue_yaofangs, tongue_image_shape,
            nb_yao, trained_tongue_gen_model, trained_tongue_gen_classifier,
            proba_predict=_proba_predict)
        print(gen_output[0])

        # yaopin_dict: {0:'麻黄',1:'桂枝',...}
        yaopin_dict = patient_tongue_generator.load_yaopin_dict(yaopin_path)
    #     print(yaopin_dict)

        test_tongue_ids = tongue_ids[:200]
        test_yaofangs = tongue_yaofangs[:200]
        '''the evaluation criterion '''
        precisions = []
        recalls = []
        errors = []
        for i, output in enumerate(gen_output):
            # print test data label info:
            print('%d. \npatient tongue id: %s' % (i, test_tongue_ids[i]))
            print('label yaofang:')
            yaofang_label = patient_tongue_generator.sample_yaofang(
                test_yaofangs[i], yaopin_dict)
            print(' '.join(yaofang_label))

            if not _proba_predict:
                output_index = patient_tongue_generator.label_outputfilter(
                    output)
            else:
                output_index = patient_tongue_generator.threshold_outputfilter(
                    output)
    #         print('predicted yaofang ids: {0}'.format(output_index))
            yaofang_output = patient_tongue_generator.sample_yaofang(
                output_index, yaopin_dict)
            print('predicted yaofang:')
            print(' '.join(yaofang_output) + '\n')

            precision, recall, error = generator_eval.evaluator(
                test_yaofangs[i], output_index)
            precisions.append(precision)
            recalls.append(recall)
            errors.append(error)
            print('------Score: precision: %f, recall: %f, error: %f' %
                  (precision, recall, error))

        print('------Average Score: average precision: %f, average recall: %f, error: %f' %
              (np.average(precisions), np.average(recalls), np.average(errors)))
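
generator_eval.evaluator compares a label prescription against a predicted one and returns (precision, recall, error). A minimal set-based sketch under the usual definitions; the error term here is a guess (share of predicted herbs missing from the label), not necessarily the project's actual formula:

def evaluator_sketch(label_ids, predicted_ids):
    # hypothetical reimplementation: set-based precision/recall over herb ids
    label_set, pred_set = set(label_ids), set(predicted_ids)
    hits = len(label_set & pred_set)
    precision = hits / len(pred_set) if pred_set else 0.0
    recall = hits / len(label_set) if label_set else 0.0
    error = 1.0 - precision  # assumption: error = fraction of wrong predictions
    return precision, recall, error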