Example #1
def prep_train_validate_data_no_smote(data_dir, num_folds, fold_id, batch_size):
    data_loader = SeqDataLoader(data_dir, num_folds, fold_id, classes=5)

    X_train, y_train, X_test, y_test = data_loader.load_data()

    # Add a singleton middle dimension: (N, features) -> (N, 1, features)
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    y_train = y_train.reshape(y_train.shape[0], 1)
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    y_test = y_test.reshape(y_test.shape[0], 1)

    # Trim both sets so their lengths are exact multiples of batch_size.
    total_training_length = (X_train.shape[0] // batch_size) * batch_size
    total_test_length = (X_test.shape[0] // batch_size) * batch_size

    X_train = X_train[:total_training_length]
    y_train = y_train[:total_training_length]
    X_test = X_test[:total_test_length]
    y_test = y_test[:total_test_length]

    return X_train, y_train, X_test, y_test
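
A minimal usage sketch for the function above (the path and fold settings here are hypothetical, not from the original source); after the call, both splits have lengths that are exact multiples of batch_size, so every mini-batch is full:

X_train, y_train, X_test, y_test = prep_train_validate_data_no_smote(
    data_dir='./data/folds',  # hypothetical path
    num_folds=20,
    fold_id=0,
    batch_size=32)
assert X_train.shape[0] % 32 == 0 and X_test.shape[0] % 32 == 0
print(X_train.shape)  # (N, 1, n_features) with N a multiple of 32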
Example #2
def run_program(hparams, FLAGS):
    # args: hyper-parameters and the FLAGS object
    # load dataset
    num_folds = FLAGS.num_folds
    data_dir = FLAGS.data_dir
    if '13' in data_dir:
        data_version = 2013
    else:
        n_oversampling = 30000  # SMOTE target count per minority class for the 2018 data
        data_version = 2018

    output_dir = FLAGS.output_dir
    classes = FLAGS.classes
    n_classes = len(classes)

    path, channel_ename = os.path.split(data_dir)
    traindata_dir = os.path.join(
        os.path.abspath(os.path.join(data_dir, os.pardir)), 'traindata/')
    print(str(datetime.now()))

    def evaluate_model(hparams, X_test, y_test, classes):
        acc_track = []
        n_classes = len(classes)
        y_true = []
        y_pred = []
        alignments_alphas_all = []  # (batch_num,B,max_time_step,max_time_step)
        for batch_i, (source_batch, target_batch) in enumerate(
                batch_data(X_test, y_test, hparams.batch_size)):
            # if source_batch.shape[1] != hparams.max_time_step:
            #     print ("Num of steps is: ", source_batch.shape[1])
            # try:
            pred_outputs_ = sess.run(pred_outputs,
                                     feed_dict={
                                         inputs: source_batch,
                                         keep_prob_: 1.0
                                     })

            alignments_alphas = sess.run(dec_states.alignment_history.stack(),
                                         feed_dict={
                                             inputs: source_batch,
                                             dec_inputs: target_batch[:, :-1],
                                             keep_prob_: 1.0
                                         })

            # acc_track.append(np.mean(dec_input == target_batch))
            pred_outputs_ = pred_outputs_[:, :hparams.max_time_step]  # remove the last prediction (<EOD>)
            target_batch_ = target_batch[:, 1:-1]  # remove the first <SOD> and the last <EOD>
            acc_track.append(pred_outputs_ == target_batch_)

            alignments_alphas = alignments_alphas.transpose((1, 0, 2))
            alignments_alphas = alignments_alphas[:, :hparams.max_time_step]
            alignments_alphas_all.append(alignments_alphas)

            _y_true = target_batch_.flatten()
            _y_pred = pred_outputs_.flatten()

            y_true.extend(_y_true)
            y_pred.extend(_y_pred)

        cm = confusion_matrix(y_true, y_pred, labels=range(n_classes))
        ck_score = cohen_kappa_score(y_true, y_pred)
        acc_avg, acc, f1_macro, f1, sensitivity, specificity, PPV = evaluate_metrics(
            cm, classes)
        # print ("batch_i: {}").format(batch_i)
        print(
            'Average Accuracy -> {:>6.4f}, Macro F1 -> {:>6.4f} and Cohen\'s Kappa -> {:>6.4f} on test set'
            .format(acc_avg, f1_macro, ck_score))
        for index_ in range(n_classes):
            print(
                "\t{} rhythm -> Sensitivity: {:1.4f}, Specificity: {:1.4f}, Precision (PPV): {:1.4f}, F1 : {:1.4f} Accuracy: {:1.4f}"
                .format(classes[index_], sensitivity[index_],
                        specificity[index_], PPV[index_], f1[index_],
                        acc[index_]))
        print(
            "\tAverage -> Sensitivity: {:1.4f}, Specificity: {:1.4f}, Precision (PPV): {:1.4f}, F1-score: {:1.4f}, Accuracy: {:1.4f}"
            .format(np.mean(sensitivity), np.mean(specificity), np.mean(PPV),
                    np.mean(f1), np.mean(acc)))

        return acc_avg, f1_macro, ck_score, y_true, y_pred, alignments_alphas_all

    def count_parameters():
        print('# of Params: ',
              np.sum([np.prod(v.get_shape().as_list())
                      for v in tf.trainable_variables()]))

    # folds = [4,5,6,7]
    # # folds = [8,9,10,11]
    # # folds = [12,13,14,15]
    # # folds = [16,17,18,19]
    # folds = [8]
    # for fold_idx in folds:
    for fold_idx in range(num_folds):
        start_time_fold_i = time.time()
        data_loader = SeqDataLoader(data_dir,
                                    num_folds,
                                    fold_idx,
                                    classes=classes)
        X_train, y_train, X_test, y_test = data_loader.load_data(
            seq_len=hparams.max_time_step)

        # preprocessing
        char2numY = dict(zip(classes, range(len(classes))))
        pre_f1_macro = 0

        # <SOD> marks the start of decoding and <EOD> marks the end of decoding
        char2numY['<SOD>'] = len(char2numY)
        char2numY['<EOD>'] = len(char2numY)
        num2charY = dict(zip(char2numY.values(), char2numY.keys()))

        # over-sampling (SMOTE): first flatten the sequences back to individual epochs
        X_train = np.reshape(X_train,
                             [X_train.shape[0] * X_train.shape[1], -1])
        y_train = y_train.flatten()

        if data_version == 2018:
            # For the 2018 data, first under-sample the over-represented classes (W and N2).
            under_sample_len = 35000  # 30000
            Ws = np.where(y_train == char2numY['W'])[0]
            len_W = len(Ws)
            permute = np.random.permutation(len_W)
            len_r = max(len_W - under_sample_len, 0)
            permute = permute[:len_r]
            y_train = np.delete(y_train, Ws[permute], axis=0)
            X_train = np.delete(X_train, Ws[permute], axis=0)

            under_sample_len = 35000  # 40000
            N2s = np.where(y_train == char2numY['N2'])[0]
            len_N2 = len(N2s)
            permute = np.random.permutation(len_N2)
            len_r = max(len_N2 - under_sample_len, 0)
            permute = permute[:len_r]
            y_train = np.delete(y_train, N2s[permute], axis=0)
            X_train = np.delete(X_train, N2s[permute], axis=0)
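            # Illustrative example (hypothetical numbers, not from the original source):
            # if the split contains 50,000 'W' epochs, len_r = 50,000 - 35,000 = 15,000,
            # so 15,000 randomly chosen 'W' epochs are deleted and 35,000 are kept;
            # if a class already has fewer than 35,000 epochs, len_r = 0 and nothing is removed.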

        nums = []
        for cl in classes:
            nums.append(len(np.where(y_train == char2numY[cl])[0]))

        if not os.path.exists(traindata_dir):
            os.mkdir(traindata_dir)
        fname = os.path.join(
            traindata_dir,
            'trainData_' + channel_ename + '_SMOTE_all_10s_f' + str(fold_idx) + '.npz')

        if os.path.isfile(fname):
            # reuse the previously resampled (SMOTE) training data for this fold
            X_train, y_train, _ = data_loader.load_npz_file(fname)

        else:
            if data_version == 2013:
                n_osamples = nums[2] - 7000
                ratio = {
                    0: n_osamples if nums[0] < n_osamples else nums[0],
                    1: n_osamples if nums[1] < n_osamples else nums[1],
                    2: nums[2],
                    3: n_osamples if nums[3] < n_osamples else nums[3],
                    4: n_osamples if nums[4] < n_osamples else nums[4]
                }

            if data_version == 2018:
                ratio = {
                    0: n_oversampling if nums[0] < n_oversampling else nums[0],
                    1: n_oversampling if nums[1] < n_oversampling else nums[1],
                    2: nums[2],
                    3: n_oversampling if nums[3] < n_oversampling else nums[3],
                    4: n_oversampling if nums[4] < n_oversampling else nums[4]
                }
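            # Note (illustrative, not from the original source): with this older
            # imbalanced-learn API, `ratio` maps each class index to the desired number
            # of samples after resampling; SMOTE synthesizes minority-class epochs up to
            # those counts, while the majority class (index 2 here) keeps its original
            # count nums[2].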

            # ratio = {0: 40000 if nums[0] < 40000 else nums[0], 1: 27000 if nums[1] < 27000 else nums[1], 2: nums[2],
            #          3: 30000 if nums[3] < 30000 else nums[3], 4: 27000 if nums[4] < 27000 else nums[4]}
            # Note: older imbalanced-learn API; newer releases use sampling_strategy=
            # and fit_resample() instead of ratio= and fit_sample().
            sm = SMOTE(random_state=12, ratio=ratio)
            # sm = RandomUnderSampler(random_state=12, ratio=ratio)
            X_train, y_train = sm.fit_sample(X_train, y_train)
            data_loader.save_to_npz_file(X_train, y_train,
                                         data_loader.sampling_rate, fname)

        # Trim to an exact multiple of max_time_step, then restore the sequence shape.
        X_train = X_train[:(X_train.shape[0] // hparams.max_time_step) *
                          hparams.max_time_step, :]
        y_train = y_train[:(X_train.shape[0] // hparams.max_time_step) *
                          hparams.max_time_step]

        X_train = np.reshape(X_train, [-1, X_test.shape[1], X_test.shape[2]])
        y_train = np.reshape(y_train, [-1, y_test.shape[1]])

        # shuffle the training data
        permute = np.random.permutation(len(y_train))
        X_train = np.asarray(X_train)
        X_train = X_train[permute]
        y_train = y_train[permute]

        # Prepend '<SOD>' and append '<EOD>' to every label sequence (both training and test sets).
        y_train = [[char2numY['<SOD>']] + [y_ for y_ in seq] + [char2numY['<EOD>']]
                   for seq in y_train]
        y_train = np.array(y_train)

        y_test = [[char2numY['<SOD>']] + [y_ for y_ in seq] + [char2numY['<EOD>']]
                  for seq in y_test]
        y_test = np.array(y_test)
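        # Illustrative example (assuming classes = ['W', 'N1', 'N2', 'N3', 'REM'], so
        # char2numY = {'W': 0, 'N1': 1, 'N2': 2, 'N3': 3, 'REM': 4, '<SOD>': 5, '<EOD>': 6}):
        # a label sequence [0, 2, 2, 4] becomes [5, 0, 2, 2, 4, 6].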

        print('The training set after oversampling: ', classes)
        for cl in classes:
            print(cl, len(np.where(y_train == char2numY[cl])[0]))

        # training and testing the model
        if not os.path.exists(FLAGS.checkpoint_dir):
            os.mkdir(FLAGS.checkpoint_dir)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        loss_track = []
        with tf.Graph().as_default(), tf.Session() as sess:

            # Placeholders
            inputs = tf.placeholder(
                tf.float32, [None, hparams.max_time_step, hparams.input_depth],
                name='inputs')
            targets = tf.placeholder(tf.int32, (None, None), 'targets')
            dec_inputs = tf.placeholder(tf.int32, (None, None),
                                        'decoder_inputs')
            keep_prob_ = tf.placeholder(tf.float32, name='keep')

            # model
            logits, pred_outputs, loss, optimizer, dec_states = build_whole_model(
                hparams, char2numY, inputs, targets, dec_inputs, keep_prob_)
            count_parameters()
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            saver = tf.train.Saver()
            print(str(datetime.now()))
            # ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            ckpt_name = "model_fold{:02d}.ckpt".format(fold_idx)
            ckpt_exist = False
            for file in os.listdir(FLAGS.checkpoint_dir):
                if file.startswith(ckpt_name):
                    ckpt_exist = True
            ckpt_name = os.path.join(FLAGS.checkpoint_dir, ckpt_name)

            # if ckpt and ckpt.model_checkpoint_path:
            # if os.path.isfile(ckpt_name):
            if ckpt_exist:
                # # Restore
                # ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
                # saver.restore(session, os.path.join(checkpoint_dir, ckpt_name))
                # saver.restore(sess, tf.train.latest_checkpoint(FLAGS.checkpoint_dir))

                saver.restore(sess, ckpt_name)

                # or 'load meta graph' and restore weights
                # saver = tf.train.import_meta_graph(ckpt_name+".meta")
                # saver.restore(session,tf.train.latest_checkpoint(checkpoint_dir))
                evaluate_model(hparams, X_test, y_test, classes)
            else:

                for epoch_i in range(hparams.epochs):
                    start_time = time.time()
                    # train_acc = []
                    y_true = []
                    y_pred = []
                    for batch_i, (source_batch, target_batch) in enumerate(
                            batch_data(X_train, y_train, hparams.batch_size)):

                        # _, batch_loss, batch_logits, alignments_alphas = sess.run([optimizer, loss, logits,dec_states.alignment_history.stack()],
                        #     feed_dict = {inputs: source_batch,
                        #                  dec_inputs: target_batch[:, :-1],
                        #                  targets: target_batch[:, 1:],keep_prob_: 0.5} #,
                        #                                        )

                        _, batch_loss, batch_logits = sess.run(
                            [optimizer, loss, logits],
                            feed_dict={
                                inputs: source_batch,
                                dec_inputs: target_batch[:, :-1],
                                targets: target_batch[:, 1:],
                                keep_prob_: 0.5
                            })
                        loss_track.append(batch_loss)
                        # alignments_alphas = alignments_alphas.transpose((1, 0, 2))
                        # alignments_alphas = alignments_alphas[:, :hparams.max_time_step]
                        # train_acc.append(batch_logits.argmax(axis=-1) == target_batch[:,1:])
                        y_pred_ = batch_logits[:, :hparams.max_time_step].argmax(axis=-1)
                        y_true_ = target_batch[:, 1:-1]

                        # input_tags - word representation of input sequence, use None to skip
                        # output_tags - word representation of output sequence, use None to skip
                        # i - index of input element in batch
                        # input_tags = [[num2charY[i] for i in seq] for seq in y_true_]
                        # output_tags = [[num2charY[i] for i in seq] for seq in y_pred_]
                        # plot_attention(alignments_alphas[1, :, :], input_tags[1], output_tags[1])

                        y_true.extend(y_true_)
                        y_pred.extend(y_pred_)
                    # accuracy = np.mean(train_acc)
                    y_true = np.asarray(y_true)
                    y_pred = np.asarray(y_pred)
                    y_true = y_true.flatten()
                    y_pred = y_pred.flatten()
                    n_examples = len(y_true)
                    cm = confusion_matrix(y_true,
                                          y_pred,
                                          labels=range(len(char2numY) - 2))
                    accuracy = np.mean(y_true == y_pred)
                    mf1 = f1_score(y_true, y_pred, average="macro")
                    ck_score = cohen_kappa_score(y_true, y_pred)

                    print(
                        'Epoch {:3} Loss: {:>6.3f} Accuracy: {:>6.4f} F1-score: {:>6.4f} Cohen\'s Kappa: {:>6.4f} Epoch duration: {:>6.3f}s'
                        .format(epoch_i, np.mean(batch_loss), accuracy, mf1,
                                ck_score,
                                time.time() - start_time))
                    if (epoch_i + 1) % hparams.test_step == 0:
                        acc_avg, f1_macro, ck_score, y_true, y_pred, alignments_alphas_all = evaluate_model(
                            hparams, X_test, y_test, classes)

                        # save the better model based on the f1 score
                        if np.nan_to_num(f1_macro) > pre_f1_macro:
                            print('Loss {:.4f} after {} epochs (batch_size={})'.format(
                                loss_track[-1], epoch_i + 1, hparams.batch_size))
                            pre_f1_macro = f1_macro
                            ckpt_name = "model_fold{:02d}.ckpt".format(fold_idx)
                            save_path = os.path.join(FLAGS.checkpoint_dir, ckpt_name)
                            saver.save(sess, save_path)
                            print("The best model (till now) saved in path: %s" % save_path)

                            # Save the best-so-far predictions and attention weights
                            save_dict = {
                                "y_true": y_true,
                                "y_pred": y_pred,
                                "ck_score": ck_score,
                                # keep just the first 200 batches of attention weights; the full array is huge
                                "alignments_alphas_all": alignments_alphas_all[:200],
                            }
                            filename = "output_" + channel_ename + "_fold{:02d}.npz".format(fold_idx)
                            save_path = os.path.join(output_dir, filename)
                            np.savez(save_path, **save_dict)
                            print("The best results (till now) saved in path: %s" % save_path)

                # plt.plot(loss_track)
                # plt.show()
                # print 'Classes: ', classes

            print(str(datetime.now()))
            print('Fold{} took: {:>6.3f}s'.format(fold_idx, time.time() - start_time_fold_i))
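
Both the training loop and evaluate_model above iterate over batch_data(...), a helper that is not shown in these examples. A minimal sketch of what such a generator could look like, assuming it simply yields aligned (source, target) mini-batches and drops the final partial batch (an assumption, not the original implementation):

import numpy as np

def batch_data_sketch(x, y, batch_size):
    # Hypothetical stand-in for the batch_data helper used above: yield aligned
    # (source, target) mini-batches of exactly batch_size items each; the final
    # partial batch is dropped, matching how the arrays are trimmed elsewhere.
    n_batches = len(x) // batch_size
    for i in range(n_batches):
        start = i * batch_size
        yield np.asarray(x[start:start + batch_size]), np.asarray(y[start:start + batch_size])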
Example #3
max_time_step = 1  # 10,  # 5 3 second best 10# 40 # 100

channel_ename = ""  # CHANGE THIS
path = os.path.split(args.data_dir)
output_dir = "traindata_" + args.sampling
traindata_dir = os.path.join(args.data_dir, output_dir)
if not os.path.exists(traindata_dir):
    os.makedirs(traindata_dir)

print(str(datetime.now()))


for fold_idx in range(num_folds):
    start_time_fold_i = time.time()
    data_loader = SeqDataLoader(args.data_dir, num_folds, fold_idx, classes=classes)
    X_train, y_train, X_test, y_test = data_loader.load_data(seq_len=max_time_step)

    # preprocessing
    char2numY = dict(zip(classes, range(len(classes))))

##    # <SOD> is a token to show start of decoding  and <EOD> is a token to indicate end of decoding
##    char2numY['<SOD>'] = len(char2numY)
##    char2numY['<EOD>'] = len(char2numY)
##    num2charY = dict(zip(char2numY.values(), char2numY.keys()))


    # over-sampling: SMOTE:
    X_train = np.reshape(X_train, [X_train.shape[0] * X_train.shape[1], -1])
    y_train = y_train.flatten()

    nums = []

    # (a separate code fragment follows: the tail of a hyper-parameter definition)
    use_beamsearch_decode=False,
    max_time_step=1,  # 10,  # 5 3 second best 10# 40 # 100
    output_max_length=10 + 2,  # max_time_step + 1
    akara2017=True,
    test_step=5  # evaluate on the test set every test_step epochs
)
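
# A minimal sketch (not from the original source) of a container these keyword
# arguments could belong to. Only attribute access such as hparams.max_time_step and
# hparams.batch_size is assumed, matching how hparams is used in Example #2; the
# original project may construct it differently (e.g. tf.contrib.training.HParams).
class HParamsSketch(object):
    def __init__(self, **kwargs):
        # store every keyword argument as an attribute
        self.__dict__.update(kwargs)

hparams_sketch = HParamsSketch(
    epochs=120,            # hypothetical value
    batch_size=20,         # hypothetical value
    input_depth=3000,      # hypothetical value (samples per epoch)
    max_time_step=10,
    output_max_length=10 + 2,
    akara2017=True,
    use_beamsearch_decode=False,
    test_step=5,
)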

channel_ename = ""  # CHANGE THIS
path = os.path.split(data_dir)
traindata_dir = os.path.join(os.path.abspath(os.path.join(data_dir, os.pardir)), 'traindata_eog/')
print(str(datetime.now()))

for fold_idx in range(num_folds):
    start_time_fold_i = time.time()
    data_loader = SeqDataLoader(data_dir, num_folds, fold_idx, classes=classes)
    X_train, y_train, X_test, y_test = data_loader.load_data(seq_len=hparams["max_time_step"])

    # preprocessing
    char2numY = dict(zip(classes, range(len(classes))))
    pre_f1_macro = 0

    # <SOD> marks the start of decoding and <EOD> marks the end of decoding
    char2numY['<SOD>'] = len(char2numY)
    char2numY['<EOD>'] = len(char2numY)
    num2charY = dict(zip(char2numY.values(), char2numY.keys()))

    # over-sampling: SMOTE:
    # X_train = np.reshape(X_train, [X_train.shape[0] * X_train.shape[1], -1])
    # y_train = y_train.flatten()

    if data_version == 2018: