Example #1
    def process_data(self):
        batches = []
        patient_dict = {}
        for index, patient in enumerate(self.patients):
            # map patient id to its index
            patient_dict['pid_' + patient['pid']] = index
            # sort the patient's visits by admission date
            sorted_visits = sorted(patient['visits'],
                                   key=lambda visit: visit['admsn_dt'])
            no_visits = len(sorted_visits)

            # generate one sample per patient: the padded list of visits
            # (concept codes) and the mortality label of the last visit
            ls_visits = []
            label = [int(sorted_visits[-1]['Death'])]
            for visit in sorted_visits:
                # copy so padding does not mutate the source visit record
                codes = list(visit['DXs'])
                if not self.dx_only:
                    codes.extend(visit['CPTs'])

                # pad each visit's codes to a fixed length
                if len(codes) < self.max_len_visit:
                    codes.extend([0] * (self.max_len_visit - len(codes)))
                ls_visits.append(codes)

            # pad the visit list to a fixed number of visits
            if no_visits < self.max_visits:
                for _ in range(self.max_visits - no_visits):
                    ls_visits.append([0] * self.max_len_visit)
            batches.append([
                np.array(ls_visits, dtype=np.int32),
                np.array(label, dtype=np.int32)
            ])

        b_patients = []
        b_label = []
        for batch in batches:
            b_patients.append(batch[0])
            b_label.append(batch[1])

        save_file({
            'patient': b_patients,
            'label': b_label
        }, join(cfg.dataset_dir, 'mortality_' + cfg.data_source + '.pickle'))

        dict_file = join(cfg.dataset_dir, 'mimic3_patient_dict')
        print('patient dict file location: ', dict_file)
        with open(dict_file + '.json', 'w') as fp:
            json.dump(patient_dict, fp)

        return b_patients, b_label
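For clarity, here is the two-level padding scheme above in isolation, with toy data and stand-in values for the `self.max_len_visit` and `self.max_visits` attributes (all values made up):

import numpy as np

max_len_visit, max_visits = 4, 3  # stand-ins for the class attributes
visits = [[101, 102], [103, 104, 105]]  # toy concept codes for two visits

ls_visits = []
for codes in visits:
    # pad each visit's code list to a fixed length
    ls_visits.append(codes + [0] * (max_len_visit - len(codes)))
# pad the visit list to a fixed number of visits
ls_visits += [[0] * max_len_visit] * (max_visits - len(visits))

print(np.array(ls_visits, dtype=np.int32).shape)  # (3, 4)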
Example #2
    def save_preprocess_file(self):
        random.shuffle(self.fixed_samples)
        dev_rate = 0.1
        train_len = int(len(self.fixed_samples) * (1 - dev_rate))
        train = self.fixed_samples[:train_len]
        dev = self.fixed_samples[train_len:]
        save_file(train, pjoin(self.data_dir, 'train.json'), mode='json')
        save_file(dev, pjoin(self.data_dir, 'dev.json'), mode='json')
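Note that `random.shuffle` above is unseeded, so the train/dev split differs between runs. A sketch of the same 90/10 split made reproducible with an explicit seed (the seed value is arbitrary):

import random

samples = list(range(100))  # toy stand-in for self.fixed_samples
rng = random.Random(42)     # fixed seed: identical split on every run
rng.shuffle(samples)

dev_rate = 0.1
train_len = int(len(samples) * (1 - dev_rate))
train, dev = samples[:train_len], samples[train_len:]
print(len(train), len(dev))  # 90 10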
Example #3
    def save_preprocess_file(self):
        train_samples = []
        dev_samples = []
        test_samples = []
        # data_type: 1 = train, 2 = dev, anything else = test
        for sample in self.samples:
            if sample['data_type'] == 1:
                train_samples.append(sample)
            elif sample['data_type'] == 2:
                dev_samples.append(sample)
            else:
                test_samples.append(sample)

        save_file(train_samples,
                  pjoin(self.data_dir, 'train.json'),
                  mode='json')
        save_file(dev_samples, pjoin(self.data_dir, 'dev.json'), mode='json')
        save_file(test_samples, pjoin(self.data_dir, 'test.json'), mode='json')
Example #4
def train(cfg):
    # ======= data ===========
    loaded_data = load_file(cfg['processed_path'], 'processed_datasets', mode='pickle')
    if loaded_data is None or not cfg['load_preproc']:
        train_data_obj = Dataset(cfg['train_data_path'], 'train', bpe_path=cfg['bpe_data_dir'])
        dev_data_obj = Dataset(cfg['dev_data_path'], 'dev', train_data_obj.bpe_enc)
        test_data_obj = Dataset(cfg['test_data_path'], 'test', train_data_obj.bpe_enc)
        dev_data_obj.sort_example()
        test_data_obj.sort_example()
        save_file({'train_data_obj': train_data_obj, 'dev_data_obj': dev_data_obj, 'test_data_obj': test_data_obj},
                  cfg['processed_path'])
    else:
        train_data_obj = loaded_data['train_data_obj']
        dev_data_obj = loaded_data['dev_data_obj']
        test_data_obj = loaded_data['test_data_obj']

    # 1. drop sentences longer than max_sent_len (50)
    train_data_obj.filter_example(cfg['max_sent_len'])

    # ========= build network ======
    if cfg['model_class'] is None:
        print('Did not find the model, please check '
              '1) module name 2) class name and 3) implementation of get_default_model_paramters(). exit!')
        exit()

    with tf.variable_scope("model") as scp:
        model = cfg['model_class'](
            cfg, train_data_obj.bpe_enc.n_vocab, 3,
            train_data_obj.bpe_enc.get_idx_from_token(Dataset.CLASSIFY_TOKEN),
            scp.name
        )

    # ======= build session =======
    graph_handler = GraphHandler(model, cfg)
    evaluator = Evaluator(model, cfg)
    performance_record = PerformanceRecoder(cfg['ckpt_dir'], cfg['save_model'], cfg['save_num'])

    sess = graph_handler.initialize()
    model.load_openai_pretrained_transformer_model(
        sess, cfg['pretrained_transformer_dir'])

    # ======== begin to train =======
    loss_task_ma, loss_lm_ma, accuracy_ma = MovingAverage(), MovingAverage(), MovingAverage()
    for example_batch, batch_num, data_round, idx_b in train_data_obj.generate_batch_iter(
            cfg['train_batch_size'], cfg['n_steps']):
        global_step_val = sess.run(model.global_step) + 1
        step_out = model.step(sess, example_batch, cfg['summary_period'])
        loss_task_ma(step_out['task_loss'])
        accuracy_ma(step_out['accuracy'])
        if 'lm_loss' in step_out:  # lm_loss may be absent, as the logging guard below implies
            loss_lm_ma(step_out['lm_loss'])
        graph_handler.add_summary(step_out['summary'], global_step_val)
        if global_step_val % 100 == 0:
            log.info('data round: %d: %d/%d, global step:%d -- loss: %.4f, accu: %.4f' %
                     (data_round, idx_b, batch_num, global_step_val,
                      loss_task_ma.value, accuracy_ma.value))
            if 'lm_loss' in step_out:
                log.info('\tauxiliary language model perplexity: %.4f' % math.exp(loss_lm_ma.value))

        # eval
        if global_step_val % cfg['eval_period'] == 0:
            dev_res = evaluator.get_evaluation(sess, dev_data_obj, global_step_val)
            log.info('==> for dev, loss: %.4f, accuracy: %.4f' %
                     (dev_res['loss'], dev_res['accuracy']))
            criterion_metric = dev_res['accuracy']

            if not test_data_obj.no_label:
                test_res = evaluator.get_evaluation(sess, test_data_obj, global_step_val)
                log.info('~~> for test, loss: %.4f, accuracy: %.4f' %
                         (test_res['loss'], test_res['accuracy']))

            is_in_top, deleted_step = performance_record.update_top_list(global_step_val, criterion_metric, sess)
            if is_in_top:  # get prediction for non-labeled test data
                if test_data_obj.no_label and global_step_val > 0.4 * cfg['n_steps']:
                    evaluator.get_evaluation(sess, test_data_obj, global_step_val, write_predict=True)

            # todo: time count

    log.info(str(performance_record.top_list))
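Example #4 creates `MovingAverage()` objects, updates them by calling them with new values, and reads `.value`; the class itself is not shown. A minimal sketch consistent with that call pattern, assuming an exponential moving average (the decay constant is made up):

class MovingAverage:
    # exponential moving average; decay=0.99 is an assumed default
    def __init__(self, decay=0.99):
        self.decay = decay
        self._value = None

    def __call__(self, new_value):
        # the first update seeds the average; later updates blend in new values
        if self._value is None:
            self._value = float(new_value)
        else:
            self._value = (self.decay * self._value +
                           (1. - self.decay) * float(new_value))
        return self._value

    @property
    def value(self):
        return self._value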
Example #5
def train():
    output_model_params()
    loadFile = True
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_data_path, 'train')
        dev_data_obj = Dataset(cfg.dev_data_path, 'dev', train_data_obj.dicts)
        save_file(
            {
                'train_data_obj': train_data_obj,
                'dev_data_obj': dev_data_obj
            }, cfg.processed_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']

    emb_mat_token, emb_mat_glove = train_data_obj.emb_mat_token, train_data_obj.emb_mat_glove
    output_cls_num = len(
        train_data_obj.dicts['sub_cls']) if cfg.fine_grained else len(
            train_data_obj.dicts['cls'])

    with tf.variable_scope(network_type) as scope:
        if network_type in model_set:
            model = Model(emb_mat_token,
                          emb_mat_glove,
                          len(train_data_obj.dicts['token']),
                          len(train_data_obj.dicts['char']),
                          train_data_obj.max_lens['token'],
                          output_cls_num,
                          scope=scope.name)

    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)
    performRecoder = PerformRecoder(3)

    if cfg.gpu_mem is None:
        # no memory fraction given: let TensorFlow grow GPU memory as needed
        gpu_options = tf.GPUOptions(allow_growth=True)
        graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                      allow_soft_placement=True)

    else:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem)
        graph_config = tf.ConfigProto(gpu_options=gpu_options)
    # graph_config.gpu_options.allow_growth = True
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # begin training
    steps_per_epoch = int(
        math.ceil(1.0 * train_data_obj.sample_num / cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch

    global_step = 0
    for sample_batch, batch_num, data_round, idx_b in train_data_obj.generate_batch_sample_iter(
            num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0
        loss, summary, train_op = model.step(sess,
                                             sample_batch,
                                             get_summary=if_get_summary)

        if global_step % 100 == 0:
            _logger.add('data round: %d: %d/%d, global step:%d -- loss: %.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))

        if if_get_summary:
            graphHandler.add_summary(summary, global_step)

        # Occasional evaluation
        if global_step % (cfg.eval_period or steps_per_epoch) == 0:
            # ---- dev ----
            dev_loss, dev_accu = evaluator.get_evaluation(
                sess, dev_data_obj, global_step)
            _logger.add('==> for dev, loss: %.4f, accuracy: %.4f' %
                        (dev_loss, dev_accu))

            is_in_top, deleted_step = performRecoder.update_top_list(
                global_step, dev_accu, sess)

        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        # if this_epoch_time is not None and mean_epoch_time is not None:
        #     _logger.add('##> this epoch time: %f, mean epoch time: %f' % (this_epoch_time, mean_epoch_time))
    do_analyse_qc(_logger.path)
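`PerformRecoder` (and `PerformanceRecoder` in Example #4) is not shown either. A sketch of a top-k tracker consistent with its call sites, i.e. `update_top_list(global_step, dev_accu, sess)` returning `(is_in_top, deleted_step)` and the `best_result` attribute used in Example #10; the real class presumably also saves and deletes checkpoints through `sess`, which is elided here:

class PerformRecoder:
    def __init__(self, top_k):
        self.top_k = top_k
        self.top_list = []  # (step, accuracy) pairs, best first

    @property
    def best_result(self):
        return self.top_list[0][1] if self.top_list else 0.

    def update_top_list(self, global_step, accuracy, sess):
        # insert the new result, keep the list sorted by accuracy,
        # and drop the worst entry once the list exceeds top_k
        self.top_list.append((global_step, accuracy))
        self.top_list.sort(key=lambda t: t[1], reverse=True)
        deleted_step = (self.top_list.pop()[0]
                        if len(self.top_list) > self.top_k else None)
        is_in_top = any(step == global_step for step, _ in self.top_list)
        return is_in_top, deleted_step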
Example #6
def train():
    output_model_params()
    loadFile = True
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_data_path, 'train')
        dev_data_obj = Dataset(cfg.dev_data_path,
                               'dev',
                               dicts=train_data_obj.dicts)
        test_data_obj = Dataset(cfg.test_data_path,
                                'test',
                                dicts=train_data_obj.dicts)

        save_file(
            {
                'train_data_obj': train_data_obj,
                'dev_data_obj': dev_data_obj,
                'test_data_obj': test_data_obj
            }, cfg.processed_path)

        train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    train_data_obj.filter_data()
    dev_data_obj.filter_data()
    test_data_obj.filter_data()

    emb_mat_token, emb_mat_glove = train_data_obj.emb_mat_token, train_data_obj.emb_mat_glove

    with tf.variable_scope(cfg.base_name) as scope:
        model = Model(emb_mat_token, emb_mat_glove,
                      len(train_data_obj.dicts['token']),
                      len(train_data_obj.dicts['char']),
                      train_data_obj.max_lens['token'], scope.name)
    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)
    performRecoder = PerformRecoder(cfg.save_num)

    if cfg.gpu_mem is None:
        # no memory fraction given: let TensorFlow grow GPU memory as needed
        gpu_options = tf.GPUOptions(allow_growth=True)
        graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                      allow_soft_placement=True)

    elif cfg.gpu_mem < 1.:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem)
        graph_config = tf.ConfigProto(gpu_options=gpu_options)
    else:
        gpu_options = tf.GPUOptions()
        graph_config = tf.ConfigProto(gpu_options=gpu_options)

    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # begin training
    steps_per_epoch = int(
        math.ceil(1.0 * train_data_obj.sample_num / cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch

    global_step = 0

    for sample_batch, batch_num, data_round, idx_b in train_data_obj.generate_batch_sample_iter(
            num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0
        loss, summary = model.step(sess,
                                   sample_batch,
                                   get_summary=if_get_summary,
                                   global_step_value=global_step)
        if global_step % 100 == 0:
            _logger.add(
                'data round: %d: %d/%d, global step:%d -- loss_sl: %.4f, loss_rl: %.4f' %
                (data_round, idx_b, batch_num, global_step, loss[0], loss[1]))

        if if_get_summary:
            graphHandler.add_summary(summary, global_step)

        # Occasional evaluation
        evaluation = False
        if cfg.model_dir_suffix == 'test':
            if global_step % (cfg.eval_period or steps_per_epoch) == 0:
                evaluation = True
        elif is_base_training:
            if global_step > cfg.num_steps - 350000 and (
                    global_step % (cfg.eval_period or steps_per_epoch) == 0):
                evaluation = True
        else:
            if global_step % (cfg.eval_period or steps_per_epoch) == 0:
                if cfg.load_model:
                    evaluation = True
                else:
                    if global_step > 250000:
                        evaluation = True
        if evaluation:
            # ---- dev ----
            dev_loss, dev_accu, dev_perc = evaluator.get_evaluation(
                sess, dev_data_obj, global_step)
            _logger.add(
                '==> for dev, loss: %.4f %.4f, perc: %.4f, accuracy: %.4f' %
                (dev_loss[0], dev_loss[1], dev_perc, dev_accu))
            # ---- test ----
            test_loss, test_accu, test_perc = evaluator.get_evaluation(
                sess, test_data_obj, global_step)
            _logger.add(
                '~~> for test, loss: %.4f %.4f, perc: %.4f, accuracy: %.4f' %
                (test_loss[0], test_loss[1], test_perc, test_accu))

            is_in_top, deleted_step = performRecoder.update_top_list(
                global_step, dev_accu, sess)

        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        if this_epoch_time is not None and mean_epoch_time is not None:
            _logger.add('##> this epoch time: %f, mean epoch time: %f' %
                        (this_epoch_time, mean_epoch_time))

        if is_base_training and global_step >= 200000 and global_step % 50000 == 0 and cfg.save_model:
            graphHandler.save(sess, global_step)

    _logger.writeToFile()
    do_analyse_snli_rl(_logger.path)
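The nested `evaluation` gate in Example #6 can be folded into one helper. A sketch preserving the same branch logic (the names mirror the example's config fields and its module-level `is_base_training` flag):

def should_evaluate(global_step, cfg, steps_per_epoch, is_base_training):
    # evaluate only on an eval-period boundary
    if global_step % (cfg.eval_period or steps_per_epoch) != 0:
        return False
    if cfg.model_dir_suffix == 'test':
        return True
    if is_base_training:
        # only evaluate near the end of base training
        return global_step > cfg.num_steps - 350000
    # otherwise: evaluate immediately when resuming from a checkpoint,
    # else only after a warm-up period
    return cfg.load_model or global_step > 250000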
Example #7
def multi_test():
    assert cfg.load_path is not None
    output_model_params()
    loadFile = True
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_data_path, 'train')
        dev_data_obj = Dataset(cfg.dev_data_path,
                               'dev',
                               dicts=train_data_obj.dicts)
        test_data_obj = Dataset(cfg.test_data_path,
                                'test',
                                dicts=train_data_obj.dicts)

        save_file(
            {
                'train_data_obj': train_data_obj,
                'dev_data_obj': dev_data_obj,
                'test_data_obj': test_data_obj
            }, cfg.processed_path)

        train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    train_data_obj.filter_data()
    dev_data_obj.filter_data()
    test_data_obj.filter_data()

    emb_mat_token, emb_mat_glove = train_data_obj.emb_mat_token, train_data_obj.emb_mat_glove

    with tf.variable_scope(cfg.base_name) as scope:
        model = Model(emb_mat_token, emb_mat_glove,
                      len(train_data_obj.dicts['token']),
                      len(train_data_obj.dicts['char']),
                      train_data_obj.max_lens['token'], scope.name)
    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)

    if cfg.gpu_mem is None:
        # no memory fraction given: let TensorFlow grow GPU memory as needed
        gpu_options = tf.GPUOptions(allow_growth=True)
        graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                      allow_soft_placement=True)

    elif cfg.gpu_mem < 1.:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem)
        graph_config = tf.ConfigProto(gpu_options=gpu_options)
    else:
        gpu_options = tf.GPUOptions()
        graph_config = tf.ConfigProto(gpu_options=gpu_options)

    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    repeat_num = 10
    time_counter = TimeCounter()

    for t in range(repeat_num):
        # ---- test ----
        test_loss, test_accu, test_perc = evaluator.get_evaluation(
            sess, test_data_obj, None, time_counter=time_counter)
        _logger.add(
            '==> for test, loss: %.4f %.4f, perc: %.4f, accuracy: %.4f' %
            (test_loss[0], test_loss[1], test_perc, test_accu))
        print(time_counter.update_data_round(t + 1))
Example #8
    def save_dict(self, path):
        save_file(self.dicts, path, 'token and char dict data', 'pickle')
Example #9
def test():
    output_model_params()
    loadFile = True
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        raw_data = RawDataProcessor(cfg.data_dir)
        train_data_list = raw_data.get_data_list('train')
        dev_data_list = raw_data.get_data_list('dev')
        test_data_list = raw_data.get_data_list('test')

        train_data_obj = Dataset(train_data_list, 'train')
        dev_data_obj = Dataset(dev_data_list, 'dev', train_data_obj.dicts)
        test_data_obj = Dataset(test_data_list, 'test', train_data_obj.dicts)

        save_file(
            {
                'train_data_obj': train_data_obj,
                'dev_data_obj': dev_data_obj,
                'test_data_obj': test_data_obj
            }, cfg.processed_path)
        train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    train_data_obj.filter_data(True)
    dev_data_obj.filter_data(True)
    test_data_obj.filter_data(True)

    emb_mat_token, emb_mat_glove = train_data_obj.emb_mat_token, train_data_obj.emb_mat_glove

    with tf.variable_scope(network_type) as scope:
        if network_type in model_set:
            model = Model(emb_mat_token, emb_mat_glove,
                          len(train_data_obj.dicts['token']),
                          len(train_data_obj.dicts['char']),
                          train_data_obj.max_lens['token'], scope.name)

    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)

    if cfg.gpu_mem is None:
        # no memory fraction given: let TensorFlow grow GPU memory as needed
        gpu_options = tf.GPUOptions(allow_growth=True)
        graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                      allow_soft_placement=True)

    else:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem)
        graph_config = tf.ConfigProto(gpu_options=gpu_options)
    # graph_config.gpu_options.allow_growth = True
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # todo: test model

    # ---- dev ----
    dev_loss, dev_accu, dev_sent_accu = evaluator.get_evaluation(
        sess, dev_data_obj, 1)
    _logger.add(
        '==> for dev, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
        (dev_loss, dev_accu, dev_sent_accu))
    # ---- test ----
    test_loss, test_accu, test_sent_accu = evaluator.get_evaluation(
        sess, test_data_obj, 1)
    _logger.add(
        '~~> for test, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
        (test_loss, test_accu, test_sent_accu))

    # ---- train ----
    train_loss, train_accu, train_sent_accu = evaluator.get_evaluation(
        sess, train_data_obj, 1)
    _logger.add(
        '--> for train, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
        (train_loss, train_accu, train_sent_accu))
Example #10
def train():
    n_fold_val = 10
    output_model_params()

    loadFile = True
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        data_obj = Dataset(cfg.data_path, cfg.dataset_type)
        save_file({'data_obj': data_obj}, cfg.processed_path)
    else:
        data_obj = data['data_obj']

    data_obj.split_dataset_to_blocks(n_fold_val)

    # for block len
    if cfg.block_len is None and cfg.context_fusion_method == 'block':
        _logger.add()
        _logger.add('calculating block length for dataset')
        statistic = data_obj.get_statistic()
        expected_n = statistic['mean'] + statistic['std'] * math.sqrt(
            2. * math.log(1. * cfg.train_batch_size))
        dy_block_len = math.ceil(math.pow(2 * expected_n,
                                          1.0 / 3)) + 1  # fixme: change length
        cfg.block_len = dy_block_len
        _logger.add('block length is %d' % dy_block_len)

    emb_mat_token, emb_mat_glove = data_obj.emb_mat_token, data_obj.emb_mat_glove
    output_cls_num = data_obj.class_num
    steps_per_epoch = int(
        math.ceil(1.0 * data_obj.sample_num / cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch

    dev_performance_list = []
    for n_th_fold in range(n_fold_val):
        time_accu_recorder = TimeAccuRecorder(data_obj.dataset_type, n_th_fold,
                                              cfg.answer_dir)

        g = tf.Graph()
        with g.as_default():
            with tf.variable_scope("%s_%s" %
                                   (cfg.dataset_type, network_type)) as scope:
                if network_type in model_set:
                    model = Model(emb_mat_token,
                                  emb_mat_glove,
                                  len(data_obj.dicts['token']),
                                  len(data_obj.dicts['char']),
                                  data_obj.max_lens['token'],
                                  output_cls_num,
                                  scope=scope.name)
                else:
                    raise RuntimeError('unknown network type: %s' %
                                       network_type)

                graphHandler = GraphHandler(model)
                evaluator = Evaluator(model)
                performRecoder = PerformRecoder(1)

                if cfg.gpu_mem is None:
                    # no memory fraction given: let TF grow GPU memory as needed
                    gpu_options = tf.GPUOptions(allow_growth=True)
                    graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                                  allow_soft_placement=True)

                else:
                    gpu_options = tf.GPUOptions(
                        per_process_gpu_memory_fraction=cfg.gpu_mem)
                    graph_config = tf.ConfigProto(gpu_options=gpu_options)
                # graph_config.gpu_options.allow_growth = True
                sess = tf.Session(config=graph_config)
                graphHandler.initialize(sess)

                global_step = 0
                for sample_batch, batch_num, data_round, idx_b in \
                        data_obj.generate_batch_sample_iter(n_th_fold, num_steps):
                    global_step = sess.run(model.global_step) + 1
                    if_get_summary = global_step % (cfg.log_period
                                                    or steps_per_epoch) == 0
                    loss, summary, train_op = model.step(
                        sess, sample_batch, get_summary=if_get_summary)
                    # if global_step % 100 == 0:
                    _logger.add('cross validation index: %d' % n_th_fold)
                    _logger.add(
                        'data round: %d: %d/%d, global step:%d -- loss: %.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))

                    if if_get_summary:
                        graphHandler.add_summary(summary, global_step)

                    # Occasional evaluation
                    if global_step % (cfg.eval_period or steps_per_epoch) == 0:
                        # ---- dev ----
                        dev_loss, dev_accu = evaluator.get_evaluation(
                            sess, data_obj, n_th_fold, global_step)
                        _logger.add('==> for dev, loss: %.4f, accuracy: %.4f' %
                                    (dev_loss, dev_accu))

                        # record time vs. accuracy
                        time_accu_recorder.add_data(
                            cfg.time_counter.global_training_time, dev_accu)
                        is_in_top, deleted_step = performRecoder.update_top_list(
                            global_step, dev_accu, sess)
                    this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
                        data_round)
                    # if this_epoch_time is not None and mean_epoch_time is not None:
                    #     _logger.add('##> this epoch time: %f, mean epoch time: %f' % (this_epoch_time, mean_epoch_time))
                dev_performance_list.append(performRecoder.best_result)
                _logger.add("%d th x val accuracy is %.4f" %
                            (n_th_fold, performRecoder.best_result))
                time_accu_recorder.save_to_file()

    if len(dev_performance_list) > 0:
        dev_performance_array = np.array(dev_performance_list)
        xval_average = np.mean(dev_performance_array)
        xval_std = np.std(dev_performance_array)
    else:
        xval_average = 0
        xval_std = 0
    dev_performance_list_str = [str(elem) for elem in dev_performance_list]
    _logger.add("all accuracies: %s" % ', '.join(dev_performance_list_str))
    _logger.add('%d fold cross validation average accuracy is %f, standard deviation is %f' %
                (n_fold_val, xval_average, xval_std))
    _logger.writeToFile()
Example #11
def train():
    output_model_params()
    loadFile = True
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'data', 'pickle')
    if not ifLoad or not loadFile:
        data_object = Dataset(cfg.train_data_path, cfg.dev_data_path)
        data_object.save_dict(cfg.dict_path)
        save_file({'data_obj': data_object}, cfg.processed_path)
    else:
        data_object = data['data_obj']

    emb_mat_token, emb_mat_glove = data_object.emb_mat_token, data_object.emb_mat_glove

    with tf.variable_scope(network_type) as scope:
        if network_type in model_set:
            model = Model(emb_mat_token, emb_mat_glove,
                          len(data_object.dicts['token']),
                          len(data_object.dicts['char']),
                          data_object.max_lens['token'], scope.name)

    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)
    performRecoder = PerformRecoder(5)

    if cfg.gpu_mem is None:
        # no memory fraction given: let TensorFlow grow GPU memory as needed
        gpu_options = tf.GPUOptions(allow_growth=True)
    elif cfg.gpu_mem < 1.:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem, allow_growth=True)
    else:
        gpu_options = tf.GPUOptions()
    graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                  allow_soft_placement=True)
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # begin training
    steps_per_epoch = int(
        math.ceil(1.0 * len(data_object.digitized_train_data_list) /
                  cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch

    global_step = 0
    # debug or not
    if cfg.debug:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    for sample_batch, batch_num, data_round, idx_b in Dataset.generate_batch_sample_iter(
            data_object.digitized_train_data_list, num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0
        loss, summary, train_op = model.step(sess,
                                             sample_batch,
                                             get_summary=if_get_summary)

        if global_step % 10 == 0:
            _logger.add('data round: %d: %d/%d, global step:%d -- loss: %.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))

        if if_get_summary:
            graphHandler.add_summary(summary, global_step)

        # Occasional evaluation
        if global_step % (cfg.eval_period or steps_per_epoch) == 0:
            # ---- dev ----
            dev_loss, dev_accu = evaluator.get_evaluation(
                sess, data_object.digitized_dev_data_list, 'dev', global_step)
            _logger.add('==> for dev, loss: %.4f, accuracy: %.4f' %
                        (dev_loss, dev_accu))
            # ---- test ----
            if cfg.test_data_name is not None:
                test_loss, test_accu = evaluator.get_evaluation(
                    sess, data_object.digitized_test_data_list, 'test',
                    global_step)
                _logger.add('~~> for test, loss: %.4f, accuracy: %.4f' %
                            (test_loss, test_accu))

            is_in_top, deleted_step = performRecoder.update_top_list(
                global_step, dev_accu, sess)
        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        if this_epoch_time is not None and mean_epoch_time is not None:
            _logger.add('##> this epoch time: %f, mean epoch time: %f' %
                        (this_epoch_time, mean_epoch_time))
Example #12
def train():
    output_model_params()
    loadFile = True
    ifLoad, data = load_file(cfg.processed_path, 'processed data', 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_dataset_path, data_type='train')
        dev_data_obj = Dataset(cfg.dev_dataset_path,
                               data_type='dev',
                               dicts=train_data_obj.dicts)
        save_file({
            'train_data': train_data_obj,
            'dev_data': dev_data_obj
        }, cfg.processed_path)
    else:
        train_data_obj = data['train_data']
        dev_data_obj = data['dev_data']

    train_data_obj.filter_data()
    emb_mat_token, emb_mat_glove = train_data_obj.emb_mat_token, train_data_obj.emb_mat_glove

    # for block len
    if cfg.block_len is None and cfg.context_fusion_method == 'block':
        _logger.add()
        _logger.add('calculating block length for dataset')
        statistic = train_data_obj.get_statistic()
        expected_n = statistic['mean'] + statistic['std'] * math.sqrt(
            2. * math.log(1. * cfg.train_batch_size))
        dy_block_len = math.ceil(math.pow(2 * expected_n,
                                          1.0 / 3)) + 1  # fixme: change length
        cfg.block_len = dy_block_len
        _logger.add('block length is %d' % dy_block_len)

    with tf.variable_scope(network_type) as scope:
        model = Model(emb_mat_token, emb_mat_glove,
                      len(train_data_obj.dicts['token']),
                      cfg.word_embedding_length, cfg.hidden_units_num,
                      scope.name)

    graph_handler = GraphHandler(model)
    evaluator = Evaluator(model)
    perform_recoder = PerformRecoder(3)

    if cfg.gpu_mem is None:
        # no memory fraction given: let TensorFlow grow GPU memory as needed
        gpu_options = tf.GPUOptions(allow_growth=True)
        graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                      allow_soft_placement=True)

    else:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem)
        graph_config = tf.ConfigProto(gpu_options=gpu_options)
    sess = tf.Session(config=graph_config)

    graph_handler.initialize(sess)

    # begin training
    steps_per_epoch = int(
        math.ceil(1.0 * train_data_obj.sample_num / cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch

    global_step = 0
    for sample_batch, batch_num, data_round, idx_b in train_data_obj.generate_batch_sample_iter(
            num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0

        loss, summary, train_op = model.step(sess,
                                             sample_batch,
                                             get_summary=if_get_summary)
        if global_step % 100 == 0:
            _logger.add('data round: %d: %d/%d, global step:%d -- loss: %.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))

        if if_get_summary:
            graph_handler.add_summary(summary, global_step)

        # period evaluation
        if (global_step > (cfg.num_steps - 20000) or cfg.model_dir_suffix == 'test') and \
                global_step % (cfg.eval_period or steps_per_epoch) == 0:
            # ---- dev ----
            dev_loss, dev_accu = evaluator.get_evaluation(
                sess, dev_data_obj, global_step)
            _logger.add('==> for dev, loss: %.4f, accuracy: %.4f' %
                        (dev_loss, dev_accu))
            is_in_top, deleted_step = perform_recoder.update_top_list(
                global_step, dev_accu, sess)

        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        if this_epoch_time is not None and mean_epoch_time is not None:
            _logger.add('##> this epoch time: %f, mean epoch time: %f' %
                        (this_epoch_time, mean_epoch_time))
    _logger.writeToFile()
    do_analysis_squad_sim(_logger.path)
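For concreteness, the block-length heuristic above worked through with made-up sentence-length statistics (mean 20, std 8, batch size 64):

import math

mean, std, batch = 20., 8., 64
expected_n = mean + std * math.sqrt(2. * math.log(1. * batch))  # ~43.07
dy_block_len = math.ceil(math.pow(2 * expected_n, 1.0 / 3)) + 1
print(dy_block_len)  # 6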
Example #13
def train():
    output_model_params()
    loadFile = True
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        train_data_obj = Dataset(cfg.train_data_path, 'train')
        dev_data_obj = Dataset(cfg.dev_data_path,
                               'dev',
                               dicts=train_data_obj.dicts)
        test_data_obj = Dataset(cfg.test_data_path,
                                'test',
                                dicts=train_data_obj.dicts)

        save_file(
            {
                'train_data_obj': train_data_obj,
                'dev_data_obj': dev_data_obj,
                'test_data_obj': test_data_obj
            }, cfg.processed_path)

        train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    train_data_obj.filter_data()
    dev_data_obj.filter_data()
    test_data_obj.filter_data()

    emb_mat_token, emb_mat_glove = train_data_obj.emb_mat_token, train_data_obj.emb_mat_glove

    with tf.variable_scope(network_type) as scope:
        model = Model(emb_mat_token, emb_mat_glove,
                      len(train_data_obj.dicts['token']),
                      len(train_data_obj.dicts['char']),
                      train_data_obj.max_lens['token'], scope.name)

    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)
    performRecoder = PerformRecoder(5)

    sess = tf.Session(config=sess_config_gene(cfg.gpu_mem))

    graphHandler.initialize(sess)

    # begin training
    steps_per_epoch = int(
        math.ceil(1.0 * train_data_obj.sample_num / cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch

    global_step = 0
    for sample_batch, batch_num, data_round, idx_b in train_data_obj.generate_batch_sample_iter(
            num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0
        loss, summary, train_op = model.step(sess,
                                             sample_batch,
                                             get_summary=if_get_summary)
        if global_step % 100 == 0:
            _logger.add('data round: %d: %d/%d, global step:%d -- loss: %.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))

        if if_get_summary:
            graphHandler.add_summary(summary, global_step)

        # # occasional saving
        # if global_step % (cfg.save_period or steps_per_epoch) == 0:
        #     graphHandler.save(sess, global_step)

        # Occasional evaluation
        if (global_step > (cfg.num_steps - 300000) or cfg.model_dir_prefix == 'test') and \
                global_step % (cfg.eval_period or steps_per_epoch) == 0:
            # ---- dev ----
            dev_loss, dev_accu = evaluator.get_evaluation(
                sess, dev_data_obj, global_step)
            _logger.add('==> for dev, loss: %.4f, accuracy: %.4f' %
                        (dev_loss, dev_accu))
            # ---- test ----
            test_loss, test_accu = evaluator.get_evaluation(
                sess, test_data_obj, global_step)
            _logger.add('~~> for test, loss: %.4f, accuracy: %.4f' %
                        (test_loss, test_accu))
            is_in_top, deleted_step = performRecoder.update_top_list(
                global_step, dev_accu, sess)

        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        if this_epoch_time is not None and mean_epoch_time is not None:
            _logger.add('##> this epoch time: %f, mean epoch time: %f' %
                        (this_epoch_time, mean_epoch_time))

            # if global_step % (cfg.save_period or steps_per_epoch) != 0:
            #     graphHandler.save(sess, global_step)
    _logger.writeToFile()
    do_analyse_snli(_logger.path)
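`sess_config_gene` is not shown in Example #13; a sketch that reproduces the inline GPU-option convention of the surrounding examples (`None` means grow-as-needed, a fraction below 1 caps per-process GPU memory):

import tensorflow as tf

def sess_config_gene(gpu_mem):
    if gpu_mem is None:
        # no fraction given: let TensorFlow grow GPU memory as needed
        gpu_options = tf.GPUOptions(allow_growth=True)
        return tf.ConfigProto(gpu_options=gpu_options,
                              allow_soft_placement=True)
    if gpu_mem < 1.:
        # cap this process at the given fraction of GPU memory
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem)
        return tf.ConfigProto(gpu_options=gpu_options)
    return tf.ConfigProto(gpu_options=tf.GPUOptions())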
Example #14
def train():
    output_model_params()
    loadFile = True
    ifLoad, data = False, None
    if loadFile:
        ifLoad, data = load_file(cfg.processed_path, 'processed data',
                                 'pickle')
    if not ifLoad or not loadFile:
        raw_data = RawDataProcessor(cfg.data_dir)
        train_data_list = raw_data.get_data_list('train')
        dev_data_list = raw_data.get_data_list('dev')
        test_data_list = raw_data.get_data_list('test')

        train_data_obj = Dataset(train_data_list, 'train')
        dev_data_obj = Dataset(dev_data_list, 'dev', train_data_obj.dicts)
        test_data_obj = Dataset(test_data_list, 'test', train_data_obj.dicts)

        save_file(
            {
                'train_data_obj': train_data_obj,
                'dev_data_obj': dev_data_obj,
                'test_data_obj': test_data_obj
            }, cfg.processed_path)
        train_data_obj.save_dict(cfg.dict_path)
    else:
        train_data_obj = data['train_data_obj']
        dev_data_obj = data['dev_data_obj']
        test_data_obj = data['test_data_obj']

    train_data_obj.filter_data(cfg.only_sentence, cfg.fine_grained)
    dev_data_obj.filter_data(True, cfg.fine_grained)
    test_data_obj.filter_data(True, cfg.fine_grained)

    # for block len
    if cfg.block_len is None and cfg.context_fusion_method == 'block':
        _logger.add()
        _logger.add('calculating block length for dataset')
        statistic = train_data_obj.get_statistic()
        expected_n = statistic['mean'] + statistic['std'] * math.sqrt(
            2. * math.log(1. * cfg.train_batch_size))
        dy_block_len = math.ceil(math.pow(2 * expected_n,
                                          1.0 / 3)) + 1  # fixme: change length
        cfg.block_len = dy_block_len
        _logger.add('block length is %d' % dy_block_len)

    emb_mat_token, emb_mat_glove = train_data_obj.emb_mat_token, train_data_obj.emb_mat_glove

    with tf.variable_scope(network_type) as scope:
        if network_type in model_set:
            model = Model(emb_mat_token, emb_mat_glove,
                          len(train_data_obj.dicts['token']),
                          len(train_data_obj.dicts['char']),
                          train_data_obj.max_lens['token'], scope.name)

    graphHandler = GraphHandler(model)
    evaluator = Evaluator(model)
    performRecoder = PerformRecoder(3)

    if cfg.gpu_mem is None:
        # no memory fraction given: let TensorFlow grow GPU memory as needed
        gpu_options = tf.GPUOptions(allow_growth=True)
        graph_config = tf.ConfigProto(gpu_options=gpu_options,
                                      allow_soft_placement=True)

    else:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_mem)
        graph_config = tf.ConfigProto(gpu_options=gpu_options)
    # graph_config.gpu_options.allow_growth = True
    sess = tf.Session(config=graph_config)
    graphHandler.initialize(sess)

    # begin training
    steps_per_epoch = int(
        math.ceil(1.0 * train_data_obj.sample_num / cfg.train_batch_size))
    num_steps = cfg.num_steps or steps_per_epoch * cfg.max_epoch

    global_step = 0
    for sample_batch, batch_num, data_round, idx_b in train_data_obj.generate_batch_sample_iter(
            num_steps):
        global_step = sess.run(model.global_step) + 1
        if_get_summary = global_step % (cfg.log_period or steps_per_epoch) == 0
        loss, summary, train_op = model.step(sess,
                                             sample_batch,
                                             get_summary=if_get_summary)
        if global_step % 100 == 0:
            _logger.add('data round: %d: %d/%d, global step:%d -- loss: %.4f' %
                        (data_round, idx_b, batch_num, global_step, loss))

        if if_get_summary:
            graphHandler.add_summary(summary, global_step)

        # Occasional evaluation
        if global_step % (cfg.eval_period or steps_per_epoch) == 0:
            # ---- dev ----
            dev_loss, dev_accu, dev_sent_accu = evaluator.get_evaluation(
                sess, dev_data_obj, global_step)
            _logger.add(
                '==> for dev, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f'
                % (dev_loss, dev_accu, dev_sent_accu))
            # ---- test ----
            test_loss, test_accu, test_sent_accu = evaluator.get_evaluation(
                sess, test_data_obj, global_step)
            _logger.add(
                '~~> for test, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f'
                % (test_loss, test_accu, test_sent_accu))
            # ---- train ----
            # train_loss, train_accu, train_sent_accu = evaluator.get_evaluation(
            #     sess, train_data_obj, global_step
            # )
            # _logger.add('--> for train, loss: %.4f, accuracy: %.4f, sentence accuracy: %.4f' %
            #             (train_loss, train_accu, train_sent_accu))
            is_in_top, deleted_step = performRecoder.update_top_list(
                global_step, dev_accu, sess)
            if is_in_top and global_step > 30000:  # todo-ed: delete me to run normally
                # evaluator.get_evaluation_file_output(sess, dev_data_obj, global_step, deleted_step)
                evaluator.get_evaluation_file_output(sess, test_data_obj,
                                                     global_step, deleted_step)
        this_epoch_time, mean_epoch_time = cfg.time_counter.update_data_round(
            data_round)
        if this_epoch_time is not None and mean_epoch_time is not None:
            _logger.add('##> this epoch time: %f, mean epoch time: %f' %
                        (this_epoch_time, mean_epoch_time))
            # finally save
            # if global_step % (cfg.save_period or steps_per_epoch) != 0:
            #     graphHandler.save(sess, global_step)
    do_analyse_sst(_logger.path)
Example #15
    def process_data(self):
        batches = []
        for patient in self.patients:
            visits = patient['visits']

            selected_visits = sorted(visits,
                                     key=lambda visit: visit['admsn_dt'])
            selected_codes = []

            # reduced window; note that re-seeding RandomState(1) on every
            # iteration produces the same reduced_window for every patient
            if self.is_reduced_window:
                reduced_window = random.RandomState(1).randint(
                    self.skip_window)
            else:
                reduced_window = 0
            # actual window
            actual_window = self.skip_window - reduced_window

            # concatenate all of one patient's codes together; each entry is [code, date]
            for s_visit in selected_visits:
                dt = datetime.datetime.strptime(s_visit['admsn_dt'], "%Y%m%d")
                codes = s_visit['DXs']
                if not self.dx_only:
                    codes.extend(s_visit['CPTs'])
                for code in codes:
                    selected_codes.append([code, dt])

            # subsample codes based on their frequency in the dataset
            if self.is_sample:
                sampled_codes = [
                    code for code in selected_codes
                    if self.word_sample[code[0]] > random.rand() * 2**32
                ]
            else:
                sampled_codes = selected_codes

            # generate batch samples
            for pos, word in enumerate(sampled_codes):

                # now go over all words from the actual window, predicting each one in turn
                start = max(0, pos - actual_window)
                window_pos = enumerate(
                    sampled_codes[start:(pos + actual_window + 1)], start)

                context_indices = [[word2[0], (word2[1] - word[1]).days]
                                   for pos2, word2 in window_pos
                                   if (word2[0] is not None and pos2 != pos)]

                context_len = len(context_indices)
                if context_len > 0:
                    # pad if the context is shorter than twice the actual window
                    if context_len < 2 * actual_window:
                        for i in range(2 * actual_window - context_len):
                            context_indices.append([0, 0])

                    intervals = np.zeros(
                        (2 * actual_window, 2 * actual_window))
                    for i in range(2 * actual_window):
                        for j in range(2 * actual_window):
                            if i > j:
                                code_i = context_indices[i][0]
                                code_j = context_indices[j][0]
                                interval_i = context_indices[i][1]
                                interval_j = context_indices[j][1]
                                if code_i > 0 and code_j > 0:
                                    intervals[i, j] = np.abs(interval_i -
                                                             interval_j) + 1
                                    intervals[j, i] = np.abs(interval_i -
                                                             interval_j) + 1

                    batches.append([
                        np.array(context_indices, dtype=np.int32), intervals,
                        np.array([word[0]], dtype=np.int32)
                    ])

        contexts = []
        intervals = []
        labels = []
        for batch in batches:
            contexts.append(batch[0])
            intervals.append(batch[1])
            labels.append(batch[2])

        save_file(
            {
                'contexts': contexts,
                'intervals': intervals,
                'labels': labels
            }, cfg.processed_path)
        return contexts, intervals, labels
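The nested loop that fills `intervals` in Example #15 is O(w^2) in Python; it can be vectorized with NumPy broadcasting. A sketch assuming `context_indices` is already padded to 2 * actual_window entries as above:

import numpy as np

def interval_matrix(context_indices):
    # pairwise |day_i - day_j| + 1 for real codes; 0 for padding
    # entries (code 0) and on the diagonal, as in the loop version
    ctx = np.asarray(context_indices, dtype=np.int64)
    codes, days = ctx[:, 0], ctx[:, 1]
    mat = np.abs(days[:, None] - days[None, :]) + 1
    mask = (codes > 0)[:, None] & (codes > 0)[None, :]
    np.fill_diagonal(mask, False)
    return np.where(mask, mat, 0).astype(np.float64)

print(interval_matrix([[5, 0], [7, 3], [0, 0]]))
# [[0. 4. 0.]
#  [4. 0. 0.]
#  [0. 0. 0.]]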