Example #1
def main():
    # rs = np.random.RandomState(SEED)
    # Get the outbreaks and shocks data, then loop through the admin2 regions
    df_outbreaks = utils.get_outbreaks()
    df_shocks = utils.get_shocks_data()
    df_risk_all = pd.read_excel(f'input/risk/{FILENAME_ZIMBABWE}')
    df_performance_all = utils.get_df_performance_all()
    # Get adm2 present in risks file
    adm2_shortlist = utils.get_adm2_shortlist(df_risk_all)
    # Plot for outbreaks and shocks
    fig, axs = plt.subplots(len(adm2_shortlist), 1, figsize=(10, 10))
    for iadm2, (admin2_pcode, admin2_name) in enumerate(adm2_shortlist):
        print(f'Analyzing admin region {admin2_name}')
        df_outbreak = df_outbreaks[df_outbreaks['admin2Pcode'] == admin2_pcode]
        df_shock = df_shocks[df_shocks['pcode'] == admin2_pcode]
        # Make the fake data
        # df_risk = utils.generate_fake_risk(rs, START_DATE, END_DATE)
        # Get risk from Zimbabwe data
        df_risk = utils.get_risk_df(df_risk_all, admin2_name)
        # Get outbreak date indices
        df_risk['outbreak'] = df_risk['date'].isin(
            df_outbreak['Outbreak month'])
        real_outbreaks = df_risk[df_risk['outbreak']].index.values
        # Get shocks
        shocks, df_risk = utils.get_shocks(df_shock, df_risk)
        # Get detections per threshold
        df_performance = utils.loop_over_thresholds(df_risk['risk'],
                                                    real_outbreaks)
        df_performance = utils.calculate_f1(df_performance)
        # Add it to the full frame
        df_performance_all = (pd.concat(
            [df_performance[['thresh', 'TP', 'FP', 'FN']],
             df_performance_all]).groupby(['thresh']).sum().reset_index())
        # Make plots
        plot_utils.plot_adm2(df_risk, df_performance, real_outbreaks, shocks,
                             admin2_pcode, admin2_name)
        # Plot shocks / outbreaks
        plot_utils.plot_shocks_and_outbreaks(
            axs[iadm2],
            real_outbreaks,
            shocks,
            admin2_name,
            df_risk,
            show_x_axis=(iadm2 == len(adm2_shortlist) - 1))
        # TODO: evaluate the best threshold value and calculate the overall value of precision and recall
    # Save the shocks / outbreaks figure
    fig.savefig('plots/outbreaks_shocks.png')
    plt.close(fig)
    # Calculate overall performance
    df_performance_all = utils.calculate_f1(df_performance_all)
    # Confusion matrix
    fig, ax = plt.subplots()
    plot_utils.plot_confusion_matrix(df_performance_all, ax)
    fig.savefig('plots/full_confusion_matrix.png')
    plt.close()
    # Performance
    fig, ax = plt.subplots()
    plot_utils.plot_performance(df_performance_all, ax)
    fig.savefig('plots/full_performance.png')
    plt.close()
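
The example above assumes that utils.calculate_f1 adds precision, recall, and F1 columns to a per-threshold performance frame. A minimal sketch of such a helper, assuming the frame carries the thresh/TP/FP/FN counts seen in the concat above (the implementation is an assumption for illustration, not the project's actual code):

import pandas as pd

def calculate_f1(df_performance: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical sketch: derive precision, recall, and F1 from raw counts.
    tp, fp, fn = df_performance['TP'], df_performance['FP'], df_performance['FN']
    df_performance['precision'] = tp / (tp + fp)   # thresholds with zero counts yield NaN/inf
    df_performance['recall'] = tp / (tp + fn)
    df_performance['f1'] = (2 * df_performance['precision'] * df_performance['recall'] /
                            (df_performance['precision'] + df_performance['recall']))
    return df_performance
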
Example #2
    def __init__(self, words_per_document, num_classes, vocabulary_size,
                 embedding_size, filter_sizes, num_filters, l2_reg_lambda,
                 train_size, batch_size):
        # Placeholders
        self.emb_place_holder = tf.placeholder(tf.float32,
                                               [None, embedding_size],
                                               name="emb_place_holder")
        self.x_place_holder = tf.placeholder(tf.int32,
                                             [None, words_per_document],
                                             name="x")
        self.y_place_holder = tf.placeholder(tf.float32, [None, num_classes],
                                             name="labels")
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")
        self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
        self.decay_rate = tf.placeholder(tf.float32, name="decay_rate")
        l2_loss = tf.constant(0.0, name="l2_loss")

        # First layer: embedding lookup, output shape [batch_size, words_per_document, embedding_size]
        with tf.device('/cpu:0'):
            self.W = tf.Variable(
                tf.constant(0.0, shape=[vocabulary_size, embedding_size]),
                trainable=True,
                name="W")
            embedding_init = self.W.assign(value=self.emb_place_holder)
            self.embedded_chars = tf.nn.embedding_lookup(
                embedding_init, self.x_place_holder)
            self.embedded_chars_expanded = tf.expand_dims(
                self.embedded_chars, -1)

        # Second layer: convolutions with filter sizes/counts such as [3, 128], [4, 128], [5, 128]
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1),
                                name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]),
                                name="b")
                conv = tf.nn.conv2d(self.embedded_chars_expanded,
                                    W,
                                    strides=[1, 1, 1, 1],
                                    padding="VALID",
                                    name="conv")
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="h")
                # Maxpooling layer
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, words_per_document - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="pool")

                tf.summary.histogram("weights", W)
                tf.summary.histogram("biases", b)
                tf.summary.histogram("activations", h)
                pooled_outputs.append(pooled)

        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat,
                                        self.dropout_keep_prob)
            tf.summary.histogram("dropout", self.h_drop)

        # Output layer: xw_plus_b computes matmul(x, weights) + biases
        with tf.name_scope("output"):
            W = tf.Variable(
                tf.truncated_normal([num_filters_total, num_classes], stddev=0.1),
                name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
            l2_loss += tf.nn.l2_loss(W, name="l2_loss")
            l2_loss += tf.nn.l2_loss(b, name="l2_loss")
            tf.summary.histogram("l2", l2_loss)
            tf.summary.histogram("weigths", W)
            tf.summary.histogram("biases", b)

        # Loss function: cross entropy between the predictions and the labels
        with tf.name_scope("loss"):
            cross_entropy_r = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.y_place_holder)
            self.cross_entropy = tf.reduce_mean(
                cross_entropy_r,
                name="cross_entropy") + (l2_reg_lambda * l2_loss)
            self.stream_loss, self.stream_loss_update = tf.contrib.metrics.streaming_mean(
                self.cross_entropy)
            tf.summary.scalar("loss_tr", self.stream_loss)
            # tf.summary.scalar("loss_tr", self.cross_entropy)

        # Train step (learning rate, e.g. 0.0004, is fed through the placeholder)
        # minimize() is equivalent to compute_gradients() followed by apply_gradients()
        with tf.name_scope("train"):
            self.global_step_ = tf.Variable(0,
                                            name="global_step",
                                            trainable=False)
            lr = tf.train.exponential_decay(self.learning_rate,
                                            self.global_step_ * batch_size,
                                            train_size, self.decay_rate)
            self.train_step = tf.train.AdamOptimizer(lr).minimize(
                self.cross_entropy,
                global_step=self.global_step_,
                name="train_oper")

        with tf.name_scope("accuracy"):
            correct_prediction = tf.equal(self.predictions,
                                          tf.argmax(self.y_place_holder, 1),
                                          name="correct_prediction")
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction,
                                                   tf.float32),
                                           name="accuracy")
            self.stream_accuracy, self.stream_accuracy_update = tf.contrib.metrics.streaming_mean(
                self.accuracy)
            tf.summary.scalar("accuracy", self.stream_accuracy)
            # tf.summary.scalar("accuracy", self.accuracy)

        with tf.name_scope("confussion_matrix"):
            labels_max = tf.argmax(self.y_place_holder, 1, name="label_max")
            # self.matrix = tf.contrib.metrics.confusion_matrix(labels_max, self.predictions, num_classes=2, name="matrix")
            self.matrix = tf.confusion_matrix(labels_max,
                                              self.predictions,
                                              num_classes=2,
                                              name="matrix")
            true_positive = self.matrix[1, 1]
            true_negative = self.matrix[0, 0]
            false_positive = self.matrix[0, 1]
            false_negative = self.matrix[1, 0]
            self.precision_mini_batch = utils.calculate_precision(
                true_positive, false_positive)
            self.recall_mini_batch = utils.calculate_recall(
                true_positive, false_negative)
            self.f1_score_min_batch = utils.calculate_f1(
                self.precision_mini_batch, self.recall_mini_batch)

            self.stream_precision, self.stream_precision_update = tf.contrib.metrics.streaming_mean(
                self.precision_mini_batch)
            self.stream_recall, self.stream_recall_update = tf.contrib.metrics.streaming_mean(
                self.recall_mini_batch)
            self.stream_f1, self.stream_f1_update = tf.contrib.metrics.streaming_mean(
                self.f1_score_min_batch)
            tf.summary.scalar("Precision", self.stream_precision)
            tf.summary.scalar("Recall", self.stream_recall)
            tf.summary.scalar("F1", self.stream_f1)

            # tf.summary.scalar("Precision", self.precision_mini_batch)
            # tf.summary.scalar("Recall", self.recall_mini_batch)
            # tf.summary.scalar("F1", self.f1_score_min_batch)

        # if should_load:
        #     log("Data processing ok load network...")
        #     saver = tf.train.Saver()
        #     try:
        #         saver.restore(sess, checkpoint_file_path)
        #     except Exception as e:
        #         log("Not able to load file")

        # summaries
        self.summary = tf.summary.merge_all()

        self.accuracy_validation_placeholder = tf.placeholder(
            tf.float32, name="acc_val_placeholder")
        self.loss_validation_placeholder = tf.placeholder(
            tf.float32, name="loss_val_placeholder")
        self.precision_validation_placeholder = tf.placeholder(
            tf.float32, name="prec_val_placeholder")
        self.recall_validation_placeholder = tf.placeholder(
            tf.float32, name="recall_val_placeholder")
        self.f1_validation_placeholder = tf.placeholder(
            tf.float32, name="f1_val_placeholder")

        with tf.name_scope("validation"):
            self.acc_validation_mean = tf.reduce_mean(
                self.accuracy_validation_placeholder)
            self.loss_validation_mean = tf.reduce_mean(
                self.loss_validation_placeholder)
            self.prec_validation_mean = tf.reduce_mean(
                self.precision_validation_placeholder)
            self.recall_validation_mean = tf.reduce_mean(
                self.recall_validation_placeholder)
            self.f1_validation_mean = tf.reduce_mean(
                self.f1_validation_placeholder)

            loss_val = tf.summary.scalar("loss_val", self.loss_validation_mean)
            accuracy_val = tf.summary.scalar("accuracy_val",
                                             self.acc_validation_mean)
            precission_val = tf.summary.scalar("Precision_val",
                                               self.prec_validation_mean)
            recall_val = tf.summary.scalar("Recall_val",
                                           self.recall_validation_mean)
            f1_val = tf.summary.scalar("F1_val", self.f1_validation_mean)
            self.summary_val = tf.summary.merge(
                [loss_val, accuracy_val, precission_val, recall_val, f1_val])
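
The stream_* tensors above come from tf.contrib.metrics.streaming_mean, which keeps its running mean in TensorFlow local variables: the *_update ops accumulate per-batch values and the paired value tensors read the accumulated mean back. A minimal usage sketch, assuming TensorFlow 1.x, an instance `model` of the class whose __init__ is shown above, and a hypothetical iterable `batches` of feed dicts that cover all of the model's placeholders (as in Example #4's loop); none of these names come from the snippet itself:

import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())  # streaming metrics live in local variables
    for feed_dict in batches:  # hypothetical iterable of feed dicts
        sess.run([model.train_step,
                  model.stream_loss_update,
                  model.stream_accuracy_update],
                 feed_dict=feed_dict)
    # After the update ops have run, read the accumulated means (no feeds needed).
    mean_loss, mean_acc = sess.run([model.stream_loss, model.stream_accuracy])
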
Example #3
            data_placeholder: test_data,
            labels_placeholder: test_labels
        }) / 2
        auroc = auroc_tensor.eval(feed_dict={
            data_placeholder: test_data,
            labels_placeholder: test_labels
        })
        aupr = aupr_tensor.eval(feed_dict={
            data_placeholder: test_data,
            labels_placeholder: test_labels
        })
        # classification_accuracy = runner.evaluate_model(accuracy_tensor, batch_size,
        #                                                 feed_vars=(data_placeholder, labels_placeholder),
        #                                                 feed_data=pt.train.feed_numpy(test_batch_num, test_data, test_labels),
        #                                                 print_every=500)
        f1 = calculate_f1(precision, recall)
        if best_f1 < f1:
            best_f1 = f1
            update_best_results(accuracy, precision, recall, f1, aupr, auroc,
                                epoch, best_f1_results)
        if best_accuracy < accuracy:
            best_accuracy = accuracy
            update_best_results(accuracy, precision, recall, f1, aupr, auroc,
                                epoch, best_accuracy_results)
        print('Accuracy after %d epoch %g%%' % (epoch + 1, accuracy * 100))
        print('F1 after %d epoch %g%%' % (epoch + 1, f1 * 100))
    print('Train size is {}. Best accuracy is: {}'.format(
        num_lab, best_accuracy_results))
    print('Train size is {}. Best F1 is: {}'.format(num_lab, best_f1_results))
    print('==================================')
end = time.time()
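
Examples #2 through #4 call utils.calculate_precision, utils.calculate_recall, and calculate_f1 on scalar counts taken from a 2x2 confusion matrix. A plausible sketch of these helpers, shown here for plain Python/NumPy scalars with a small epsilon to guard against division by zero (an assumption about their behaviour, not the project's definitions; the project's actual helpers presumably also accept the TensorFlow tensors used in Example #2):

def calculate_precision(true_positive, false_positive, eps=1e-9):
    # precision = TP / (TP + FP)
    return true_positive / (true_positive + false_positive + eps)

def calculate_recall(true_positive, false_negative, eps=1e-9):
    # recall = TP / (TP + FN)
    return true_positive / (true_positive + false_negative + eps)

def calculate_f1(precision, recall, eps=1e-9):
    # F1 is the harmonic mean of precision and recall
    return 2 * precision * recall / (precision + recall + eps)
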
Example #4
def testing_step(checkpoint_dir, batch_size, x_test, y_test, vocab_inv_emb_dset, LOG_FILE, parameters, early_stop_log):
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()

    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            # all_vars = graph.get_operations()
            x_placeholder = graph.get_operation_by_name("x").outputs[0]
            y_placeholder = graph.get_operation_by_name("labels").outputs[0]
            embedding_placeholder = graph.get_operation_by_name("emb_place_holder").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            accuracies = graph.get_operation_by_name("accuracy/accuracy").outputs[0]
            loss = graph.get_operation_by_name("loss/cross_entropy").outputs[0]
            label_match = graph.get_operation_by_name("confussion_matrix/label_max").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]
            l2_loss = graph.get_operation_by_name("l2_loss").outputs[0]

            testing_data = (Dataset.from_tensor_slices((x_test, y_test))
                            .shuffle(buffer_size=10)
                            .batch(parameters.batch_size)
                            .make_initializable_iterator())
            next_element_testing = testing_data.get_next()

            epochs = 1
            step = 0
            accuracies_test = []
            losses_test = []
            precisions_test = []
            recalls_test = []
            f1_scores_test = []
            for actual_epoch in range(epochs):
                sess.run(testing_data.initializer)
                while True:
                    try:
                        batch_testing = sess.run(next_element_testing)

                        feed_dict = {x_placeholder: batch_testing[0], y_placeholder: batch_testing[1],
                                     embedding_placeholder: vocab_inv_emb_dset, dropout_keep_prob: 1.0}

                        acc_batch, loss_batch, label_op, pred_op, l2_loss_op = \
                            sess.run([accuracies, loss, label_match, predictions, l2_loss], feed_dict)

                        matrix_batch = confusion_matrix(label_op, pred_op)

                        true_positive = matrix_batch[1, 1]
                        true_negative = matrix_batch[0, 0]
                        false_positive = matrix_batch[0, 1]
                        false_negative = matrix_batch[1, 0]
                        precision_mini_batch = utils.calculate_precision(true_positive, false_positive)
                        recall_mini_batch = utils.calculate_recall(true_positive, false_negative)
                        f1_score_min_batch = utils.calculate_f1(precision_mini_batch, recall_mini_batch)

                        accuracies_test.append(acc_batch)
                        losses_test.append(loss_batch)
                        precisions_test.append(precision_mini_batch)
                        recalls_test.append(recall_mini_batch)
                        f1_scores_test.append(f1_score_min_batch)

                        log("Step " + str(step) + "(epoch " + str(epochs) + ")" + "Test accuracy: " + str(
                            acc_batch) +
                            " test loss: " + str(loss_batch) + " test precission: " + str(precision_mini_batch) +
                            " test recall: " +   str(recall_mini_batch) + "test F1: " + str(f1_score_min_batch), LOG_FILE)

                    except tf.errors.OutOfRangeError:
                        avg_accuracy = np.mean(accuracies_test)
                        avg_losses = np.mean(losses_test)
                        avg_precision = np.mean(precisions_test)
                        avg_recall = np.mean(recalls_test)
                        avg_f1 = np.mean(f1_scores_test)
                        log(str(parameters), LOG_FILE)
                        log("Final results, test accuracy: " + str(avg_accuracy) + " test loss: " + str(avg_losses) +
                            " test precission: " + str(avg_precision) + " test recall: " + str(
                            avg_recall) + " test f1: "
                            + str(avg_f1), LOG_FILE)
                        log("End training dataset epoch: " + str(early_stop_log), LOG_FILE)
                        break
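
Note that confusion_matrix here is sklearn's sklearn.metrics.confusion_matrix. If a test batch happens to contain only one class, the returned matrix is 1x1 and the [1, 1] indexing above raises an IndexError. A defensive variant (a suggestion, not part of the original code) pins the label set explicitly so the matrix is always 2x2:

from sklearn.metrics import confusion_matrix

# Force a 2x2 matrix even when a batch contains a single class.
matrix_batch = confusion_matrix(label_op, pred_op, labels=[0, 1])
true_negative, false_positive, false_negative, true_positive = matrix_batch.ravel()
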
Example #5
def event_tagger():
    # Read event data
    en_train = read_event_data('en/train.txt')
    en_dev = read_event_data('en/dev.txt')
    en_test = read_event_data('en/test.txt')

    it_train = read_event_data('it/train.txt')
    it_dev = read_event_data('it/dev.txt')
    it_test = read_event_data('it/test.txt')

    print('English TimeML:', len(en_train), len(en_dev), len(en_test))
    print('Italian News:', len(it_train), len(it_dev), len(it_test))

    tags = list(set(word_label[1] for sent in it_train for word_label in sent))
    print(len(tags))

    # By convention, the 0'th slot is reserved for padding.
    tags = ["<pad>"] + tags

    tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    idx2tag = {idx: tag for idx, tag in enumerate(tags)}

    print(tag2idx)
    print(idx2tag)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              do_lower_case=False)

    model = Net(vocab_size=len(tag2idx), device=device)
    model.to(device)
    model = nn.DataParallel(model)

    # One fine-tuning step
    train_dataset = EventDataset(en_train, tokenizer, tag2idx)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=8,
                                 shuffle=True,
                                 num_workers=1,
                                 collate_fn=pad)

    eval_dataset = EventDataset(it_test, tokenizer, tag2idx)

    test_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=8,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=pad)

    criterion = nn.CrossEntropyLoss(ignore_index=0)

    num_epoch = 1
    base_lr = 0.001
    decay_factor = 0.2
    discriminative_fine_tuning = True
    gradual_unfreezing = False

    # params order top to bottom
    group_to_discriminate = ['classifier', 'bert']
    no_decay = ['bias', 'LayerNorm.weight']

    if discriminative_fine_tuning:
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay) and 'bert' not in n],
                'layers': [n for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay) and 'bert' not in n],
                'lr': 0.001,
                'name': 'classifier.decay',
                'weight_decay': 0.01,
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay) and 'bert' not in n],
                'layers': [n for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay) and 'bert' not in n],
                'lr': 0.001,
                'name': 'classifier.no_decay',
                'weight_decay': 0.0,
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay) and 'bert' in n],
                'layers': [n for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay) and 'bert' in n],
                'lr': 0.00002,
                'name': 'bert.decay',
                'weight_decay': 0.01,
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay) and 'bert' in n],
                'layers': [n for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay) and 'bert' in n],
                'lr': 0.00002,
                'name': 'bert.no_decay',
                'weight_decay': 0.0,
            },
        ]
    else:
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.01,
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]

    optimizer = AdamW(optimizer_grouped_parameters)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=len(train_iter) *
                                     num_epoch // 10,
                                     t_total=len(train_iter) * num_epoch)

    for e in range(num_epoch):
        unfreeze = (e == 0)

        if discriminative_fine_tuning and gradual_unfreezing:
            for pg in optimizer.param_groups:
                layers = ''
                for layer in pg['layers']:
                    layers += layer + ';'
                # print('epoch: {}, Layers: {}'.format(e, layers))
                if 'bert' in pg['name']:
                    for param in pg['params']:
                        param.requires_grad = unfreeze

        loss = train(model, train_iter, optimizer, scheduler, criterion)
        acc = eval(model, test_iter, idx2tag)

        print("epoch: {}, loss: {}".format(e, loss))
        print("epoch: {}, acc: {}".format(e, acc))
    '''
    ## Second fine-tuning step (epoch=1)
    
    train_dataset = EventDataset(it_train, tokenizer, tag2idx)
    for e in range(num_epoch):
        unfreeze = (True, False)[e != 0]

        if discriminative_fine_tuning and gradual_unfreezing:
            for pg in optimizer.param_groups:
                layers = ''
                for layer in pg['layers']:
                    layers += layer + ';'
                # print('epoch: {}, Layers: {}'.format(e, layers))
                if 'bert' in pg['name']:
                    for param in pg['params']:
                        param.requires_grad = unfreeze

        loss = train(model, train_iter, optimizer, scheduler, criterion)
        acc = eval(model, test_iter, idx2tag)

        print("epoch: {}, loss: {}".format(e, loss))
        print("epoch: {}, acc: {}".format(e, acc))
    '''

    calculate_acc()
    calculate_f1()
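
The closing calculate_acc() and calculate_f1() calls take no arguments, so they presumably read back the predictions written out during eval. Purely as an illustration (hypothetical helpers under that assumption, not the project's implementations), token-level accuracy and macro F1 over parallel lists of gold and predicted tags could be computed like this:

from sklearn.metrics import accuracy_score, f1_score

def token_accuracy(gold_tags, pred_tags):
    # Fraction of tokens whose predicted tag matches the gold tag.
    return accuracy_score(gold_tags, pred_tags)

def token_macro_f1(gold_tags, pred_tags):
    # Unweighted mean of per-tag F1 scores.
    return f1_score(gold_tags, pred_tags, average="macro")
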