def runTest(file1, version, model, mode='words'):
    """Train or evaluate ``model`` on a SemEval 2016 stance-detection file and print the results.

        The file is read with ``readfile``, cleaned with ``preprocesstweets`` and
        turned into features by ``getTfidfRepresentation``; the label found at
        index 7 of every preprocessed record is integer-encoded and then one-hot
        encoded into 3 classes.

        Parameters
        ----------
        file1 :
            input passed straight to ``readfile`` together with ``version``
            (previously documented as "a list with text tokens on index (0) and
            hashtags list on index (1)" -- TODO confirm against ``readfile``)

        version : int
            0: Training dataset, 1: Test dataset, 2:Other domain dataset

        model :
            object exposing ``summary``, ``fit``, ``evaluate`` and ``predict``
            (presumably a compiled Keras model with 3 outputs -- TODO confirm)

        mode : str
            choose either (words) or (hashtags)

        """
    indata = readfile(file1, version)
    data = preprocesstweets(indata,
                            ignoreNONE=False,
                            version=version,
                            lowerCase=True)
    tfidfAdded = getTfidfRepresentation(data, version, mode)
    labels = [d[7] for d in data]
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels)
    print(encoder.classes_)
    if version == 0:
        # Hold out 20% of the training file; the model is fitted on the rest.
        x_train, x_test, y_train, y_test = train_test_split(tfidfAdded,
                                                            y,
                                                            test_size=0.2)
        y_test = np_utils.to_categorical(y_test, num_classes=3)
        y_train = np_utils.to_categorical(y_train, num_classes=3)
        print(x_train.shape[1])
        # summary() prints the table itself and returns None, so wrapping it
        # in print() only appended a stray "None" line to the output.
        model.summary()
        model.fit(x_train,
                  y_train,
                  epochs=10,
                  verbose=2,
                  validation_data=(x_test, y_test))
        _, acc = model.evaluate(x_test, y_test, verbose=0)
        ypred = model.predict(x_test)
        # NOTE(review): both figures below are computed on the held-out 20%
        # split even though the labels say "Training".
        print('Training Accuracy: %f' % (acc * 100))
        print('Training F-Score: ', f1(y_test, ypred) * 100)
    elif version in (1, 2):
        # Pure evaluation: the model is expected to be fitted already.
        y = np_utils.to_categorical(y, num_classes=3)
        _, acc = model.evaluate(tfidfAdded, y)
        ypred = model.predict(tfidfAdded)
        otherdomain = ''
        if version == 2:
            otherdomain = '(other domain)'
        print('TEST Accuracy ' + otherdomain + ': %f' % ((acc * 100)))
        print('TEST F-Score ' + otherdomain + ': ', (f1(y, ypred) * 100))
Example #2
0
    def link(self, m1, m2, hypothetical=False, beta=1):
        """Merge the clusters of mentions ``m1`` and ``m2``, or only score that merge.

        ``m1 == -1`` means "no link": nothing is merged and the current score
        from ``get_f1`` is returned when ``hypothetical`` is set, else ``None``.

        Otherwise the two clusters are concatenated, ``p_den`` grows by one for
        each of them that held a single mention, and ``update_b3`` is called on
        the merged cluster.  With ``hypothetical=True`` the resulting
        ``evaluation.f1`` value is returned and the four counters are restored
        to their previous values; with ``hypothetical=False`` the antecedent
        maps, the cluster list and ``mention_to_cluster`` are updated and
        ``None`` is returned.
        """
        if m1 == -1:
            if hypothetical:
                return self.get_f1(beta=beta)
            return None

        first = self.mention_to_cluster[m1]
        second = self.mention_to_cluster[m2]
        assert first != second
        merged = first + second
        # Snapshot of the counters so a hypothetical merge can be undone.
        saved = (self.p_num, self.r_num, self.p_den, self.r_den)

        for cluster in (first, second):
            if len(cluster) == 1:
                self.p_den += 1
        self.update_b3(merged, hypothetical=hypothetical)

        if not hypothetical:
            # Commit the merge to every bookkeeping structure.
            self.ana_to_ant[m2] = m1
            self.ant_to_anas[m1].append(m2)
            self.clusters.remove(first)
            self.clusters.remove(second)
            self.clusters.append(merged)
            for mention in merged:
                self.mention_to_cluster[mention] = merged
            return None

        score = evaluation.f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=beta)
        self.p_num, self.r_num, self.p_den, self.r_den = saved
        return score
Example #3
0
    def link(self, m1, m2, hypothetical=False, beta=1):
        """Merge the clusters of mentions ``m1`` and ``m2``, or only score that merge.

        The whole call is timed under the ``"link"`` timer.  The timer is
        stopped in a ``finally`` clause so that every exit path -- including
        the early ``m1 == -1`` return, which previously left the timer
        running, and a failing assertion -- balances the ``timer.start`` call.

        ``m1 == -1`` means "no link": nothing is merged and the current score
        from ``get_f1`` is returned when ``hypothetical`` is set, else ``None``.

        Otherwise the two clusters are concatenated, ``p_den`` grows by one for
        each of them that held a single mention, and ``update_b3`` is called on
        the merged cluster.  With ``hypothetical=True`` the resulting
        ``evaluation.f1`` value is returned and the four counters are restored;
        with ``hypothetical=False`` the merge is committed and ``None`` is
        returned.
        """
        timer.start("link")
        try:
            if m1 == -1:
                return self.get_f1(beta=beta) if hypothetical else None

            c1, c2 = self.mention_to_cluster[m1], self.mention_to_cluster[m2]
            assert c1 != c2
            new_c = c1 + c2
            # Snapshot of the counters so a hypothetical merge can be undone.
            p_num, r_num, p_den, r_den = self.p_num, self.r_num, self.p_den, self.r_den

            if len(c1) == 1:
                self.p_den += 1
            if len(c2) == 1:
                self.p_den += 1
            self.update_b3(new_c, hypothetical=hypothetical)

            if hypothetical:
                f1 = evaluation.f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=beta)
                self.p_num, self.r_num, self.p_den, self.r_den = p_num, r_num, p_den, r_den
                return f1

            # Commit the merge to every bookkeeping structure.
            self.ana_to_ant[m2] = m1
            self.ant_to_anas[m1].append(m2)
            self.clusters.remove(c1)
            self.clusters.remove(c2)
            self.clusters.append(new_c)
            for m in new_c:
                self.mention_to_cluster[m] = new_c
            return None
        finally:
            timer.stop("link")
Example #4
0
def main():
    """Train and score ``Predictor`` with repeated 5-fold cross-validation.

    Parses the command line, builds a ``CVDataLoader`` and then, for each of
    ``manytimes_n`` repetitions and each of the 5 folds, trains a fresh
    ``Predictor`` with RMSprop and ``BCEWithLogitsLoss``.  Every 20 epochs the
    model is scored on the fold's test split and the metrics of the best
    checkpoint accepted so far become the fold's result.  A loss curve is
    written to ``figure/`` per fold, up to 10 sufficiently good models are
    saved under ``<data_dir>/model`` and the metrics averaged over all folds
    are printed at the end.

    Raises
    ------
    RuntimeError
        If a fold never produces an accepted checkpoint (training f1(neg)
        stayed at 0 at every evaluation), because no fold metrics exist then.
    """
    # reading in
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='data/sampling',
                        help='determine the base dir of the dataset document')
    parser.add_argument("--sample_n",
                        default=1000,
                        type=int,
                        help='starting image index of preprocessing')
    parser.add_argument("--evidence_n",
                        default=20,
                        type=int,
                        help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n",
                        default=3,
                        type=int,
                        help='how many resampled replications')
    parser.add_argument("--image_split",
                        action='store_true',
                        help='if use image_split')
    parser.add_argument("--batch_size",
                        default=50,
                        type=int,
                        help="batch size")
    parser.add_argument("--stage_two",
                        action='store_true',
                        help='if only use stage two patients')
    parser.add_argument("--changhai",
                        action='store_true',
                        help='if use additional data')
    args = parser.parse_args()

    feature_size = 32
    #gpu = "cuda:0"
    gpu = None
    # 5-folds cross validation
    dataloader = CVDataLoader(args, gpu, feature_size)

    n_epoch = 800
    lr = 0.0005
    weight_decay = 0.008 if args.stage_two else 0.005
    manytimes_n = 8  # number of times the whole 5-fold CV is repeated

    if not os.path.isdir('figure'):
        os.mkdir('figure')
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))

    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []  # f1 computed with negative=True
    f1_folds_pos = []  # f1 computed with the default arguments
    model_count = 0

    loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.8))

    for _ in range(manytimes_n):  # averaging
        for i in range(5):
            train_history = []
            test_history = []
            # Best-checkpoint state.  Reset for every fold so that a fold
            # can never silently report the previous fold's values.
            minimum_loss = None
            auc_fold = None
            acc_fold = None
            c_index_fold = None
            f1_fold = None
            f1_fold_pos = None
            early_stop_count = 0

            model = Predictor(evidence_size=args.evidence_n,
                              layers=(100, 50, 1),
                              feature_size=feature_size)
            if gpu:
                model = model.to(gpu)
            optimizer = torch.optim.RMSprop(model.parameters(),
                                            lr=lr,
                                            weight_decay=weight_decay)

            dataloader.set_fold(i)
            X_test, Y_test, df_test = dataloader.get_test()
            print('starting fold %d' % i)

            for epoch in range(n_epoch):
                # batch input
                for X_train_batch, Y_train_batch, df_train_batch in dataloader:
                    result = model(X_train_batch)
                    loss = loss_function(result, Y_train_batch)
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

                # Training metrics below are computed on the LAST batch of
                # the epoch only (``result``/``loss`` come from that batch).
                X_train, Y_train, df_train = X_train_batch, Y_train_batch, df_train_batch

                if epoch % 20 == 0:
                    # Evaluation needs no gradients; plain floats are kept so
                    # the history does not hold on to autograd graphs and can
                    # be turned into a numpy array for plotting.
                    with torch.no_grad():
                        result_test = model(X_test)
                        loss_test = loss_function(result_test, Y_test).item()
                    loss_train = loss.item()
                    acc_train, acc_test = accuracy(result, Y_train), accuracy(
                        result_test, Y_test)
                    auc_train, auc_test = auc(result, Y_train), auc(
                        result_test, Y_test)
                    if args.changhai:
                        c_index_train, c_index_test = 0, 0
                    else:
                        c_index_train, c_index_test = c_index(
                            result, df_train), c_index(result_test, df_test)
                    recall_train, recall_test = recall(result,
                                                       Y_train), recall(
                                                           result_test, Y_test)
                    precision_train, precision_test = precision(
                        result, Y_train), precision(result_test, Y_test)
                    f1_train_pos, f1_test_pos = f1(result, Y_train), f1(
                        result_test, Y_test)
                    f1_train, f1_test = f1(result, Y_train,
                                           negative=True), f1(result_test,
                                                              Y_test,
                                                              negative=True)
                    train_history.append((epoch, loss_train, acc_train,
                                          auc_train, c_index_train))
                    test_history.append(
                        (epoch, loss_test, acc_test, auc_test, c_index_test))
                    if epoch % 40 == 0:
                        print(
                            "%s epoch:%d loss:%.3f/%.3f acc:%.3f/%.3f auc:%.3f/%.3f c_index:%.3f/%.3f recall:%.3f/%.3f prec:%.3f/%.3f f1:%.3f/%.3f f1(neg):%.3f/%.3f"
                            % (time.strftime(
                                '%m.%d %H:%M:%S', time.localtime(
                                    time.time())), epoch, loss_train,
                               loss_test, acc_train, acc_test, auc_train,
                               auc_test, c_index_train, c_index_test,
                               recall_train, recall_test, precision_train,
                               precision_test, f1_train_pos, f1_test_pos,
                               f1_train, f1_test))
                    # early stop
                    if minimum_loss is None or minimum_loss * 0.995 > loss_test:
                        # A model with zero training f1(neg) is never accepted
                        # as a checkpoint; skip the rest of this evaluation.
                        if f1_train == 0:
                            continue
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    elif auc_test > auc_fold and auc_test > 0.5 and acc_test >= acc_fold:
                        # Loss did not improve enough, but AUC rose without
                        # accuracy dropping: still accept the checkpoint.
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    else:
                        early_stop_count += 1
                    if early_stop_count > 2 and epoch > 100:
                        if args.stage_two:
                            if auc_fold > 0.55:
                                print('early stop at epoch %d' % epoch)
                                break
                        elif early_stop_count > 3:
                            print('early stop at epoch %d' % epoch)
                            break
                    if epoch > 500:
                        # NOTE(review): this rebuilds the optimizer at every
                        # evaluation after epoch 500 (discarding its running
                        # state) with the same reduced lr each time -- confirm
                        # this is the intended schedule.
                        optimizer = torch.optim.RMSprop(
                            model.parameters(),
                            lr * 0.6,
                            weight_decay=weight_decay * 1.2)

            if minimum_loss is None:
                # Previously this surfaced later as a NameError/TypeError or,
                # worse, reused the previous fold's c-index and f1 values.
                raise RuntimeError(
                    'fold %d never produced an accepted checkpoint' % i)

            train_history = np.array(train_history)
            test_history = np.array(test_history)
            acc_folds.append(acc_fold)
            auc_folds.append(auc_fold)
            f1_folds.append(f1_fold)
            f1_folds_pos.append(f1_fold_pos)
            c_index_folds.append(c_index_fold)
            # Column 0 is the epoch, column 1 the loss.
            plt.plot(train_history[:, 0], train_history[:, 1], label='train')
            plt.plot(test_history[:, 0], test_history[:, 1], label='test')
            plt.legend()
            plt.savefig('figure/sample_%d_fold%d.png' % (args.sample_n, i))
            plt.cla()
            if acc_fold > 0.7 and auc_fold > 0.6 and model_count < 10:
                model.save(args.data_dir + "/model/model_%d" % model_count)
                model_count += 1
            print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
                  (acc_fold, auc_fold, c_index_fold, f1_fold))
            if gpu:
                del dataloader.X_train, dataloader.Y_train, dataloader.X_test, dataloader.Y_test
                del X_test, Y_test, X_train, Y_train, model, optimizer
                torch.cuda.empty_cache()

    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / 5 / manytimes_n, sum(auc_folds) / 5 / manytimes_n,
           sum(c_index_folds) / 5 / manytimes_n, sum(f1_folds_pos) / 5 /
           manytimes_n, sum(f1_folds) / 5 / manytimes_n))
Example #5
0
 def get_f1(self, beta=1):
     """Return ``evaluation.f1`` of the current precision/recall counters for the given ``beta``."""
     counters = (self.p_num, self.p_den, self.r_num, self.r_den)
     return evaluation.f1(*counters, beta=beta)
def main():
    """Main function for training and testing.

    Builds the data loaders, model, criterion and trainer from the parsed
    options, optionally resuming from a checkpoint, and then runs one of
    three modes:

    * ``opt.valOnly``  -- validate the loaded epoch and exit;
    * ``opt.testOnly`` -- test the loaded epoch, print NCE / PR / F-1 / ROC
      statistics binned by posterior, by sequence length and by predicted
      confidence (calibration, ECE), save plots and ``result.npz``, and exit;
    * otherwise        -- train from ``start_epoch`` to ``opt.nEpochs``,
      checkpointing after every validated epoch, followed by a final test run
      unless ``opt.debug`` is set.
    """
    # Parse command line arguments and cache
    opt = opts.Opts().args
    utils.savecmd(opt.resume, sys.argv)

    utils.print_color_msg("==> Setting up data loader")
    train_loader, val_loader, test_loader = dataloader.create(opt)

    # Load checkpoint if specified, None otherwise
    utils.print_color_msg("==> Checking checkpoints")
    checkpoint = checkpoints.load(opt)

    utils.print_color_msg("==> Setting up model and criterion")
    model, optim_state = init.setup(opt, checkpoint)
    loss_fn = criterion.setup(opt, checkpoint)

    utils.print_color_msg("==> Loading trainer")
    trainer = train.create_trainer(model, loss_fn, opt, optim_state)

    best_loss = float('Inf')
    val_loss = float('Inf')
    start_epoch = max([1, opt.epochNum])
    if checkpoint is not None:
        start_epoch = checkpoint['epoch'] + 1
        best_loss = checkpoint['loss']
        print("".ljust(4) + "Previous best loss: " +
              utils.color_msg('%.5f' % best_loss))

    if opt.valOnly:
        assert start_epoch > 1, "There must be at least one epoch"
        utils.print_color_msg("==> Validation:")
        print("".ljust(4) + "=> Epoch %i" % (start_epoch - 1))
        trainer.val(val_loader, start_epoch - 1)
        sys.exit()

    if opt.testOnly:
        assert start_epoch > 1, "There must be at least one epoch"
        utils.print_color_msg("==> Testing:")
        print("".ljust(4) + "=> Epoch %i" % (start_epoch - 1))
        _, prediction, reference, post, seq_length = trainer.test(
            test_loader, start_epoch - 1)

        # The trainer's predictions are passed through a sigmoid before scoring.
        prediction = F.sigmoid(torch.Tensor(prediction)).numpy()
        nce = evaluation.nce(reference, prediction)
        precision, recall, area, threshold = evaluation.pr(
            reference, prediction)
        # "_bl" values score the input posterior itself as a baseline.
        precision_bl, recall_bl, area_bl, _ = evaluation.pr(reference, post)
        f1, f1_precision, f1_recall, f1_threshold = evaluation.f1(
            precision, recall, threshold)
        tpr, fpr, roc_area = evaluation.roc(reference, prediction)

        # Calculate stats for sequences binned by the posterior
        limits = np.linspace(0, 1, 11).tolist()
        utils.print_color_msg('\n\nEffect of Input Posterior on Performance')
        for i in range(len(limits) - 1):
            ref, pred, p = evaluation.bin_results(reference, prediction, post, measure=post, \
                                                  lower_limit=limits[i], upper_limit=limits[i+1])
            if ref.size:
                nce_post = evaluation.nce(ref, pred)
                nce_post_bl = evaluation.nce(ref, p)
                precision_post, recall_post, area_post, threshold_post = evaluation.pr(
                    ref, pred)
                precision_post_bl, recall_post_bl, area_post_bl, threshold_post_bl = evaluation.pr(
                    ref, p)
                f1_post, _, _, _ = evaluation.f1(precision_post, recall_post,
                                                 threshold_post)
                f1_post_bl, _, _, _ = evaluation.f1(precision_post_bl,
                                                    recall_post_bl,
                                                    threshold_post_bl)
                _, _, roc_area_post = evaluation.roc(ref, pred)
                print('%.1f. - %.1f. %d    Results (model/bl)     NCE: %.4f. , %.4f.    AUC(PR): %.4f. , %.4f.    F-1:  %.4f. , %.4f.    AUC(ROC): %.4f.'\
                      %(limits[i], limits[i+1], int(ref.size), nce_post, nce_post_bl, area_post, area_post_bl, f1_post, f1_post_bl, roc_area_post))
            else:
                print('%.1f. - %.1f. Empty' % (limits[i], limits[i + 1]))

        # Calculate stats for sequences binned by sequence length
        limits = [0, 2, 3, 6, 10, 20, 40]
        utils.print_color_msg('\n\nEffect of Sequence Length on Performance')
        for i in range(len(limits) - 1):
            ref, pred, p = evaluation.bin_results(reference, prediction, post, measure=seq_length, \
                                                  lower_limit=limits[i], upper_limit=limits[i+1])
            if ref.size:
                nce_len = evaluation.nce(ref, pred)
                nce_len_bl = evaluation.nce(ref, p)
                precision_len, recall_len, area_len, threshold_len = evaluation.pr(
                    ref, pred)
                precision_len_bl, recall_len_bl, area_len_bl, threshold_len_bl = evaluation.pr(
                    ref, p)
                f1_len, _, _, _ = evaluation.f1(precision_len, recall_len,
                                                threshold_len)
                f1_len_bl, _, _, _ = evaluation.f1(precision_len_bl,
                                                   recall_len_bl,
                                                   threshold_len_bl)
                _, _, roc_area_len = evaluation.roc(ref, pred)
                print('%d - %d  %d   Results (model/bl)    NCE: %.4f. , %.4f.    AUC: %.4f. , %.4f.    F-1:  %.4f. , %.4f.    AUC(ROC): %.4f.'\
                      %(limits[i], limits[i+1], int(ref.size), nce_len, nce_len_bl, area_len, area_len_bl, f1_len, f1_len_bl, roc_area_len))
            else:
                print('%d - %d Empty' % (limits[i], limits[i + 1]))

        # Calculate calibration stats
        limits = np.linspace(0, 1, 11).tolist()
        print('\n\nCalibration Stats')
        ece = 0
        for i in range(len(limits) - 1):
            ref, pred, p = evaluation.bin_results(reference, prediction, post, measure=prediction, \
                                                  lower_limit=limits[i], upper_limit=limits[i+1])
            if ref.size:
                accuracy_bin = np.mean(ref)
                confidence_bin = np.mean(pred)
                posterior_bin = np.mean(p)
                # Expected calibration error: |accuracy - confidence| of the
                # bin, weighted by the fraction of samples that fall in it.
                ece += abs(accuracy_bin -
                           confidence_bin) * len(ref) / len(reference)
                print(
                    '%.1f. - %.1f. %d    Reference: %.4f. ,    Prediction: %.4f. ,    Posterior: %.4f.'
                    % (limits[i], limits[i + 1], int(ref.size), accuracy_bin,
                       confidence_bin, posterior_bin))
            else:
                print('%.1f. - %.1f. Empty' % (limits[i], limits[i + 1]))

        # Print Test Stats
        print('\n\nTest Stats')
        # The last value is the ECE accumulated above (the NCE used to be
        # printed a second time in its place).
        print(
            "".ljust(7) + "\nNCE: %.4f. \nAUC(PR): %.4f. \nF-1: %.4f. p: %.4f. r: %.4f. t: %.4f. \nAUC(ROC): %.4f. \nECE: %.4f. " \
            %(nce, area, f1, f1_precision, f1_recall, f1_threshold, roc_area, ece))

        trainer.logger['test'].write('NCE: %f\nAUC(PR): %f\n' % (nce, area))
        evaluation.plot_pr([precision, precision_bl], [recall, recall_bl],
                           [area, area_bl], ['BiLatticeRNN', 'posterior'],
                           opt.resume)
        np.savez(os.path.join(opt.resume, 'result.npz'),
                 prediction=prediction,
                 reference=reference,
                 posteriors=post)
        sys.exit()

    utils.print_color_msg("==> Training:")
    for epoch in range(start_epoch, opt.nEpochs + 1):
        print("".ljust(4) + "=> Epoch %i" % epoch)
        best_model = False
        _ = trainer.train(train_loader, epoch, val_loss)

        # In debug mode neither validation nor checkpointing takes place.
        if not opt.debug:
            val_loss = trainer.val(val_loader, epoch)
            if val_loss < best_loss:
                best_model = True
                print("".ljust(4) + "** Best model: " +
                      utils.color_msg('%.4f' % val_loss))
                best_loss = val_loss
            checkpoints.save(epoch, trainer.model, loss_fn,
                             trainer.optim_state, best_model, val_loss, opt)

    if not opt.debug:
        utils.print_color_msg("==> Testing:")
        _, prediction, reference, _, _ = trainer.test(test_loader, opt.nEpochs)
        prediction = F.sigmoid(torch.Tensor(prediction)).numpy()
        nce = evaluation.nce(reference, prediction)
        precision, recall, area, _ = evaluation.pr(reference, prediction)
        utils.print_color_msg("".ljust(7) + "NCE: %.4f. AUC(PR): %.4f" %
                              (nce, area))
        trainer.logger['test'].write('NCE: %f\nAUC(PR): %f\n' % (nce, area))
        evaluation.plot_pr([precision], [recall], [area], ['BiLatticeRNN'],
                           opt.resume)

        # Flush write out and reset pointer
        for open_file in trainer.logger.values():
            open_file.flush()
            open_file.seek(0)
        plot.plot(opt.resume, opt.onebest)
Example #7
0
def main():
    """Train and score ``GNN`` on graph data with repeated 5-fold cross-validation.

    Parses the command line, builds the graph dataset with
    ``construct_graph_dataset`` (and pickles it under ``<data_dir>/graph``),
    then trains a fresh ``GNN`` for every split produced by
    ``CrossValidationSplitter``.  After every epoch the model is scored on
    the train and test split; the metrics of the best checkpoint accepted so
    far become the fold's result.  The metrics averaged over all
    ``5 * n_manytimes`` folds are printed at the end.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default='data/sampling',
                        help='determine the base dir of the dataset document')
    parser.add_argument("--sample_n",
                        default=2000,
                        type=int,
                        help='starting image index of preprocessing')
    parser.add_argument("--evidence_n",
                        default=500,
                        type=int,
                        help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n",
                        default=3,
                        type=int,
                        help='how many resampled replications')
    parser.add_argument("--image_split",
                        action='store_true',
                        help='if use image_split')
    parser.add_argument("--batch_size",
                        default=200,
                        type=int,
                        help="batch size")
    parser.add_argument("--stage_two",
                        action='store_true',
                        help='if only use stage two patients')
    parser.add_argument("--threshold",
                        default=25,
                        type=float,
                        help='threshold')
    parser.add_argument("--changhai",
                        action='store_true',
                        help='if use additional data')
    parser.add_argument("--TH", action='store_true')
    args = parser.parse_args()

    gpu = "cuda:0"
    n_epoch = 80
    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []
    model_count = 0
    n_manytimes = 8

    # caching
    # The cache-loading branch is switched off on purpose; the dataset is
    # always rebuilt and re-pickled.  The original cache condition is kept
    # below for whoever wants to re-enable it.
    if False:
        # if os.path.exists(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl')) and os.path.exists(os.path.join(args.data_dir, 'graph', 'graph_df.pkl')):
        print("loading cached graph data")
        with open(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl'),
                  'rb') as file:
            dataset = pickle.load(file)
        with open(os.path.join(args.data_dir, 'graph', 'graph_df.pkl'),
                  'rb') as file:
            df = pickle.load(file)
    else:
        if not os.path.exists(os.path.join(args.data_dir, 'graph')):
            os.mkdir(os.path.join(args.data_dir, 'graph'))
        dataset, df = construct_graph_dataset(args, gpu)
        with open(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl'),
                  'wb') as file:
            pickle.dump(dataset, file)
        with open(os.path.join(args.data_dir, 'graph', 'graph_df.pkl'),
                  'wb') as file:
            pickle.dump(df, file)

    splitter = CrossValidationSplitter(dataset,
                                       df,
                                       n=5,
                                       n_manytimes=n_manytimes)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.4))
    fold_num = 0
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))

    for train_dataset, test_dataset, train_df, test_df in splitter:
        print("starting fold %d-%d" % (fold_num // 5, fold_num % 5))
        train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
        test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
        # Best-checkpoint state of the current fold.
        minimum_loss = None
        auc_fold = None
        acc_fold = None
        early_stop_count = 0
        model = GNN(32).to(gpu)
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=0.0004,
                                        weight_decay=0.001)

        for epoch in range(n_epoch):
            model.train()
            for data in train_loader:  # Iterate in batches over the training dataset.
                y_pred = model(data.x, data.edge_index,
                               data.batch.to(gpu)).view(
                                   -1)  # Perform a single forward pass.
                loss = criterion(y_pred, data.y)  # Compute the loss.
                loss.backward()  # Derive gradients.
                optimizer.step()  # Update parameters based on gradients.
                optimizer.zero_grad()  # Clear gradients.

            # Score the model on both splits after every epoch.
            model.eval()
            y_pred_train, y_train = concat_result(train_loader, model, gpu)
            y_pred_test, y_test = concat_result(test_loader, model, gpu)
            loss_train, loss_test = criterion(y_pred_train,
                                              y_train), criterion(
                                                  y_pred_test, y_test)
            acc_train, acc_test = accuracy(y_pred_train,
                                           y_train), accuracy(
                                               y_pred_test, y_test)
            auc_train, auc_test = auc(y_pred_train,
                                      y_train), auc(y_pred_test, y_test)
            c_index_train, c_index_test = c_index(
                y_pred_train, train_df), c_index(y_pred_test, test_df)
            f1_train, f1_test = f1(y_pred_train, y_train,
                                   negative=True), f1(y_pred_test,
                                                      y_test,
                                                      negative=True)
            if epoch % 5 == 0:
                print(
                    f'Epoch:{epoch:03d} Loss:{loss_train:.3f}/{loss_test:.3f} ACC:{acc_train:.3f}/{acc_test:.3f} AUC:{auc_train:.3f}/{auc_test:.3f} CI:{c_index_train:.3f}/{c_index_test:.3f} f1(neg):{f1_train:.3f}/{f1_test:.3f}'
                )

            # early stop
            if minimum_loss is None or minimum_loss * 0.997 > loss_test:
                # A model with zero training f1(neg) is never accepted as a
                # checkpoint; skip the rest of this epoch's bookkeeping.
                if f1_train == 0:
                    continue
                minimum_loss = loss_test
                auc_fold = auc_test
                acc_fold = acc_test
                c_index_fold = c_index_test
                f1_fold = f1_test
                early_stop_count = 0
                if acc_fold > 0.75 and auc_fold > 0.75:
                    model.save(args.data_dir +
                               "/model/graph_%d" % model_count)
            elif auc_fold + acc_fold + c_index_fold < auc_test + acc_test + c_index_test:
                # Loss did not improve enough, but the summed test metrics
                # did.  The right-hand side used to add ``c_index_fold``,
                # which cancelled the c-index out of the comparison.
                minimum_loss = loss_test
                auc_fold = auc_test
                acc_fold = acc_test
                c_index_fold = c_index_test
                f1_fold = f1_test
                early_stop_count = 0
                if acc_fold > 0.75 and auc_fold > 0.75:
                    model.save(args.data_dir +
                               "/model/graph_%d" % model_count)
            else:
                early_stop_count += 1
            if early_stop_count > 3 and epoch > 25:
                if args.stage_two:
                    if auc_fold > 0.55 and acc_fold > 0.55:
                        print('early stop at epoch %d' % epoch)
                        if acc_fold > 0.75 and auc_fold > 0.75:
                            # Restore the saved best checkpoint and move on
                            # to the next model slot.
                            # NOTE(review): model_count only advances on
                            # this path, so without --stage_two every save
                            # overwrites graph_0 -- confirm that is intended.
                            model.load(args.data_dir +
                                       "/model/graph_%d" % model_count)
                            model_count += 1
                        break
                else:
                    print('early stop at epoch %d' % epoch)
                    break

        # NOTE(review): if no checkpoint was ever accepted in this fold,
        # acc_fold/auc_fold are still None here and the print below fails.
        acc_folds.append(acc_fold)
        auc_folds.append(auc_fold)
        f1_folds.append(f1_fold)
        c_index_folds.append(c_index_fold)
        fold_num += 1
        print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
              (acc_fold, auc_fold, c_index_fold, f1_fold))

    total_count = 5 * n_manytimes
    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / total_count, sum(auc_folds) / total_count,
           sum(c_index_folds) / total_count, sum(f1_folds) / total_count))
def convModel(tweets, stances, tweets_test, stances_test):
    """Fit the model built by ``createModelC`` on tweets and print validation scores.

    A tokenizer is fitted on the text of ``tweets`` and ``tweets_test``,
    ``tweets``/``stances`` are split 80/20 into a training and a validation
    part, the texts are converted to padded index sequences and the stances
    to 3-class one-hot vectors.  The embedding layer is initialised from a
    pre-trained word-vector file and kept frozen.

    Parameters
    ----------
    tweets : list of str
        Tweet texts used for training/validation (each is ``.split()`` here).
    stances :
        Labels aligned with ``tweets`` (presumably integers 0-2 -- TODO confirm).
    tweets_test : list of str
        Tweet texts of the separate test group; only used to fit the
        tokenizer and to size the padding in this part of the function.
    stances_test :
        Labels aligned with ``tweets_test``.
    """
    #General Parameters
    embeding_dim = 200
    batch_size = 64
    num_epochs = 20

    print('Fitting tokenizer')
    tokenizer = Tokenizer()
    # The inputs are raw strings, so the vocabulary must be built with
    # fit_on_texts; fit_on_sequences expects integer sequences and leaves
    # word_index empty.  The undefined name ``tweets2`` is replaced by the
    # ``tweets_test`` parameter.
    all_tweets = tweets + tweets_test
    tokenizer.fit_on_texts(all_tweets)
    max_length = max([len(s.split()) for s in all_tweets])
    print('max_length', max_length)

    # Index 0 is reserved for padding, hence the +1.
    vocab_size = len(tokenizer.word_index) + 1

    #Train and test split
    print('Train and test split')
    x_train, x_test, y_train, y_test = train_test_split(tweets, stances, test_size=0.2)
    print('x_train: ', len(x_train), 'x_test', len(x_test))

    #Training data
    trainTokens = tokenizer.texts_to_sequences(x_train)
    Xtrain = pad_sequences(trainTokens, maxlen=max_length, padding='post')
    XtestTokens = tokenizer.texts_to_sequences(x_test)
    Xtest = pad_sequences(XtestTokens, maxlen=max_length, padding='post')

    #Convert stances to categorical output
    y_test = np_utils.to_categorical(y_test, num_classes=3)
    y_train = np_utils.to_categorical(y_train, num_classes=3)
    # Not used in the part of the function visible here.
    y_testGroup = np_utils.to_categorical(stances_test, num_classes=3)
    print('y_test: ', len(y_test), 'y_train: ', len(y_train), 'y_testGroup: ', len(stances_test))

    print('Loading embeddings..')
    #load word2vec and create embedding layer
    # NOTE(review): machine-specific absolute path.
    wv_from_bin = KeyedVectors.load_word2vec_format(datapath('E:/glove/glove.twitter.27B.200dGINSIM.txt'),binary=False)
    embedding_vectors = get_weight_matrix2(wv_from_bin, tokenizer.word_index.items())
    embedding_layer = Embedding(vocab_size, embeding_dim, weights=[embedding_vectors], input_length=max_length, trainable=False)

    #Create the model
    print('Create and compile the model..')
    model = createModelC(max_length, embedding_layer)
    model.compile(loss="categorical_hinge", optimizer="adam", metrics=[f1])
    model.summary(85)

    print('Fitting the model..')
    history = model.fit(Xtrain, y_train, batch_size=batch_size, epochs=num_epochs,
                        validation_data=(Xtest, y_test), verbose=2)
    print('History', history.history)

    # evaluate
    print('Predicting (training)..')
    ypred = model.predict(Xtest)
    # NOTE(review): element [0] of evaluate() is the loss, not an accuracy
    # (the only compiled metric is f1), and both figures are computed on the
    # 20% validation split despite the "TRAIN" label.
    print('Accuracy (TRAIN): %f' % (model.evaluate(Xtest,y_test)[0]*100))
    print('FScore (TRAIN): %f' % (f1(y_test, ypred)*100))

    print('Predicting (testing)..')
Example #9
0
 def get_f1(self, beta=1):
     """Score the current precision/recall numerators and denominators with ``evaluation.f1``."""
     score = evaluation.f1(
         self.p_num,
         self.p_den,
         self.r_num,
         self.r_den,
         beta=beta,
     )
     return score