Example #1
# Assumes a feature matrix `x` and a project-local `metric` module in scope.
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import normalized_mutual_info_score


def gao(labels):
    n_classes = max(labels) + 1
    best_nmi = 0.0
    best_k = None
    best_purity = None
    best_f1 = None
    for i in range(n_classes - 1, n_classes + 3):
        kmeansy = KMeans(n_clusters=i, random_state=0).fit(x).labels_
        nmi = normalized_mutual_info_score(labels, kmeansy)
        purity = metric.purity_score(kmeansy, labels)
        f1 = metric.f1_score(kmeansy, labels)
        if nmi > best_nmi:
            best_nmi = nmi
            best_k = i
            best_purity = purity
            best_f1 = f1
    print("best kmeans (nmi, k, purity, f1): ", best_nmi, best_k, best_purity,
          best_f1)

    s_labels = labels
    # best dbscan
    for e in [
            0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3,
            1.4, 1.5
    ]:
        dbscany = DBSCAN(eps=e).fit_predict(x)
        nmi = normalized_mutual_info_score(s_labels, dbscany)
        purity = metric.purity_score(dbscany, s_labels)
        f1 = metric.f1_score(dbscany, s_labels)
        #print(e, nmi)
        print("nmi, k, purity, p, r, f1: ", nmi, len(set(dbscany)), purity, f1)
Example #2
from gensim.models import KeyedVectors


# `metric` and `__read_class_labels` are project-local helpers assumed in scope.
def metric_f1(model_file=None, functionclasses_file=None, average=None):
    assert average, 'an averaging mode (e.g. micro/macro) must be given'

    keyed_vectors = KeyedVectors.load_word2vec_format(model_file, binary=False)
    classes = __read_class_labels(functionclasses_file)
    score = metric.f1_score(classes, average, 0.3, keyed_vectors)
    print(score)
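
A hypothetical invocation (file names are placeholders; judging from Example #5 below, the hard-coded 0.3 is the training fraction handed to `metric.f1_score`):

# Hypothetical call; both paths are placeholders.
metric_f1(model_file='embeddings.vec',
          functionclasses_file='function_classes.txt',
          average='micro')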
Example #3
def dev(data_loader, vocab, model, device, mode='dev'):
    """test model performance on dev-set"""
    model.eval()
    true_tags = []
    pred_tags = []
    sent_data = []
    dev_losses = 0
    for idx, batch_samples in enumerate(data_loader):
        sentences, labels, masks, lens = batch_samples
        sent_data.extend([[
            vocab.id2word.get(wid.item()) for i, wid in enumerate(indices)
            if mask[i] > 0
        ] for (mask, indices) in zip(masks, sentences)])
        sentences = sentences.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        y_pred = model.forward(sentences)
        labels_pred = model.crf.decode(y_pred, mask=masks)
        targets = [
            itag[:ilen] for itag, ilen in zip(labels.cpu().numpy(), lens)
        ]
        true_tags.extend([[vocab.id2label.get(idx) for idx in indices]
                          for indices in targets])
        pred_tags.extend([[vocab.id2label.get(idx) for idx in indices]
                          for indices in labels_pred])
        # compute the dev loss via the CRF forward pass
        _, dev_loss = model.forward_with_crf(sentences, masks, labels)
        dev_losses += dev_loss
    assert len(pred_tags) == len(true_tags)
    if mode == 'test':
        assert len(sent_data) == len(true_tags)

    # logging loss, f1 and report
    metrics = {}
    if mode == 'dev':
        f1 = f1_score(true_tags, pred_tags, mode)
        metrics['f1'] = f1
    else:
        bad_case(true_tags, pred_tags, sent_data)
        f1_labels, f1 = f1_score(true_tags, pred_tags, mode)
        metrics['f1_labels'] = f1_labels
        metrics['f1'] = f1
    metrics['loss'] = float(dev_losses) / len(data_loader)
    return metrics
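
The `f1_score(true_tags, pred_tags, mode)` used here is a project-local helper. For comparison, entity-level F1 over the same nested tag-list format can be computed with the seqeval package:

from seqeval.metrics import f1_score as seq_f1

true_tags = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
pred_tags = [['B-PER', 'I-PER', 'O'], ['O', 'O']]
print(seq_f1(true_tags, pred_tags))  # span-level F1 across all sentences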
Example #4
# `rouge`, `sequence_loss`, `point2result`, `to_device`, and the module-level
# `data` object are project-local helpers assumed in scope.
def validate(step, model, data_loader, criterion, device):
    f1_sum, prec_sum, rec_sum = 0, 0, 0
    rouge1_sum, rouge2_sum, rougeL_sum = 0, 0, 0
    count = 0
    loss = 0
    batch_count = 0
    for _, batch in enumerate(data_loader):
        model.eval()
        batch = to_device(batch, device=device)
        batch_size = len(batch['id'])

        (preds, logits), _ = model(batch['article']['sents_unk'],
                                   batch['article']['lens'])

        preds = preds.cpu().numpy()
        results = point2result(preds, batch['article']['origin'])
        golds = batch['abstract']['origin']

        # validation loss
        targets = batch['target']['position'].long()[:, :4]
        loss += sequence_loss(logits, targets, criterion, pad_idx=-1).item()
        batch_count += 1

        targets = batch['target']['position'].long().cpu().numpy()
        for i in range(batch_size):
            # point level evaluation
            pred = preds[i]
            target = targets[i]
            f1, prec, rec = f1_score(pred, target)
            f1_sum += f1
            prec_sum += prec
            rec_sum += rec

            # summary level evaluation
            hyp = results[i]  # renamed from `eval` to avoid shadowing the built-in
            ref = golds[i]
            rouge1_sum += rouge.rouge_n(hyp, ref, n=1)['f']
            rouge2_sum += rouge.rouge_n(hyp, ref, n=2)['f']
            rougeL_sum += rouge.rouge_l_summary_level(hyp, ref)['f']
            count += 1
    f1_avg = f1_sum / count
    prec_avg = prec_sum / count
    rec_avg = rec_sum / count
    print('validation loss: ' + str(loss / batch_count))
    print('step %d/%d: F1 %.4f Precision %.4f Recall %.4f' %
          (step + 1, len(data.train_loader),
           f1_avg, prec_avg, rec_avg))
    print(' ROUGE-1 ' + str(rouge1_sum / count) +
          ' ROUGE-2 ' + str(rouge2_sum / count) +
          ' ROUGE-L ' + str(rougeL_sum / count))
    return f1_avg
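
Here `f1_score(pred, target)` returns an (f1, precision, recall) triple over predicted and gold extract positions. A plausible sketch treating both as index sets, dropping the -1 padding that `sequence_loss` above also ignores (the repository's actual helper may differ):

def f1_score(pred, target):
    # Positions appearing in both the prediction and the gold target
    # count as true positives; -1 entries are padding.
    pred_set = {int(p) for p in pred if p >= 0}
    target_set = {int(t) for t in target if t >= 0}
    tp = len(pred_set & target_set)
    prec = tp / len(pred_set) if pred_set else 0.0
    rec = tp / len(target_set) if target_set else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    return f1, prec, rec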
Example #5
import numpy as np
from gensim.models import KeyedVectors


# `metric`, `visualization`, and `__read_class_labels` are project-local helpers.
def logit(model_file=None, classes_file=None, average=None, zoomout=None, output_file=None):
    keyed_vectors = KeyedVectors.load_word2vec_format(model_file, binary=False)
    classes = __read_class_labels(classes_file)

    x_values = np.arange(0.1, 1, 0.1)
    y_values = []
    for x in x_values:
        score = metric.f1_score(classes, average, x, keyed_vectors)
        y_values.append(score)

    kwargs = dict(x=x_values, y=y_values, xlabel='Training fraction',
                  ylabel='%s-F1 score' % average, output_file=output_file)
    if zoomout:
        # pin both axes to [0, 1] for the zoomed-out view
        kwargs.update(xlim=(0, 1), ylim=(0, 1))
    visualization.series(**kwargs)
Example #6
def dev(data_loader, vocab, model, device, mode='dev'):
    """test model performance on dev-set"""
    model.eval()
    true_tags = []
    pred_tags = []
    sent_data = []
    dev_losses = 0
    for idx, batch_samples in enumerate(tqdm(data_loader)):
        uni_words, labels, masks, lens = batch_samples
        sent_data.extend([[
            vocab.id2word.get(wid.item()) for i, wid in enumerate(indices)
            if mask[i] > 0
        ] for (mask, indices) in zip(masks, uni_words)])
        uni_words = uni_words.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        y_pred = model.forward(uni_words, training=False)
        labels_pred = model.crf.decode(y_pred, mask=masks)
        targets = [
            itag[:ilen] for itag, ilen in zip(labels.cpu().numpy(), lens)
        ]
        true_tags.extend([[vocab.id2label.get(idx) for idx in indices]
                          for indices in targets])
        pred_tags.extend([[vocab.id2label.get(idx) for idx in indices]
                          for indices in labels_pred])
        # compute the dev loss via the CRF forward pass
        _, dev_loss = model.forward_with_crf(uni_words, masks, labels)
        dev_losses += dev_loss
    assert len(pred_tags) == len(true_tags)
    assert len(sent_data) == len(true_tags)

    # logging loss, f1 and report
    metrics = {}
    f1, p, r = f1_score(true_tags, pred_tags)
    metrics['f1'] = f1
    metrics['p'] = p
    metrics['r'] = r
    metrics['loss'] = float(dev_losses) / len(data_loader)
    if mode != 'dev':
        bad_case(sent_data, pred_tags, true_tags)
        output_write(sent_data, pred_tags)
    return metrics
Example #7
# Assumes `torch`, `os`, `logging` imports plus project-local helpers
# (read_pickle, get_dataloader, NERBert, NERModel, BertAdam, cal_score,
#  f1_score, save_model, set_seed) in the enclosing module.
def train_eval(args, train_data_path, valid_data_path):

    index = read_pickle(args.index_path)
    word2index, tag2index = index['word2id'], index['tag2id']
    args.num_labels = len(tag2index)
    args.vocab_size = len(word2index)+1
    set_seed(args.seed_num)
    train_dataloader, train_samples = get_dataloader(train_data_path, args.train_batch_size, True)
    valid_dataloader, _ = get_dataloader(valid_data_path, args.valid_batch_size, False)

    if args.model == 'bert':
        bert_config = BertConfig(args.bert_config_path)
        model = NERBert(bert_config, args)
        model.load_state_dict(torch.load(args.bert_model_path), strict=False)
        # model = NERBert.from_pretrained('bert_chinese',
        #                                 # cache_dir='/home/dutir/yuetianchi/.pytorch_pretrained_bert',
        #                                 num_labels=args.num_labels)
    else:
        if args.embedding:
            word_embedding_matrix = read_pickle(args.embedding_data_path)
            model = NERModel(args, word_embedding_matrix)
        else:
            model = NERModel(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.model == 'bert':
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if 'bert' not in n], 'lr': 5e-5, 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and ('bert' in n)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and ('bert' in n)],
             'weight_decay': 0.0}
        ]
        warmup_proportion = 0.1
        num_train_optimization_steps = int(
            train_samples / args.train_batch_size / args.gradient_accumulation_steps) * args.epochs

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)
    else:
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=current_learning_rate
        )

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly initializing %s model...' % args.model)
        init_step = 0

    global_step = init_step
    best_score = 0.0

    logging.info('Start Training...')
    logging.info('init_step = %d' % global_step)
    for epoch_id in range(int(args.epochs)):

        tr_loss = 0
        model.train()
        for step, train_batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in train_batch)
            _, loss = model(batch[0], batch[1])
            if n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % 500 == 0:
                print(loss.item())

            if args.do_valid and global_step % args.valid_step == 1:
                true_res = []
                pred_res = []
                len_res = []
                model.eval()
                for valid_step, valid_batch in enumerate(valid_dataloader):
                    valid_batch = tuple(t.to(device) for t in valid_batch)

                    with torch.no_grad():
                        logit = model(valid_batch[0])
                    if args.model == 'bert':
                        # the first token is '[CLS]'
                        len_res.extend(torch.sum(valid_batch[0].gt(0), dim=-1).detach().cpu().numpy()-1)
                        true_res.extend(valid_batch[1].detach().cpu().numpy()[:,1:])
                        pred_res.extend(logit.detach().cpu().numpy()[:,1:])
                    else:
                        len_res.extend(torch.sum(valid_batch[0].gt(0),dim=-1).detach().cpu().numpy())
                        true_res.extend(valid_batch[1].detach().cpu().numpy())
                        pred_res.extend(logit.detach().cpu().numpy())
                acc, score = cal_score(true_res, pred_res, len_res, tag2index)
                # overwrite cal_score's fscore with the dedicated f1_score helper
                score = f1_score(true_res, pred_res, len_res, tag2index)
                logging.info('Evaluation: epoch {}, acc: {}, fscore: {}'.format(epoch_id, acc, score))
                if score >= best_score:
                    best_score = score
                    if args.model == 'bert':
                        model_to_save = model.module if hasattr(model,
                                                                'module') else model  # Only save the model it-self
                        output_dir = '{}_{}'.format('bert', str(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # save weights and config even when the directory already exists
                        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
                        torch.save(model_to_save.state_dict(), output_model_file)
                        output_config_file = os.path.join(output_dir, CONFIG_NAME)
                        with open(output_config_file, 'w') as f:
                            f.write(model_to_save.config.to_json_string())
                    else:
                        save_variable_list = {
                            'step': global_step,
                            'current_learning_rate': args.learning_rate,
                            'warm_up_steps': step
                        }
                        save_model(model, optimizer, save_variable_list, args)
                model.train()
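
`cal_score` and this `f1_score` are project-local; both receive padded label matrices plus the true sequence lengths. A minimal sketch of the truncate-then-score step they presumably share, here reduced to token-level micro F1 with scikit-learn (the real helpers are likely entity-level):

import numpy as np
from sklearn.metrics import f1_score as token_f1

def truncated_micro_f1(true_res, pred_res, len_res):
    # Drop padding positions beyond each sequence's real length,
    # then score the flattened token streams.
    y_true = np.concatenate([t[:l] for t, l in zip(true_res, len_res)])
    y_pred = np.concatenate([p[:l] for p, l in zip(pred_res, len_res)])
    return token_f1(y_true, y_pred, average='micro')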
Example #8
# Assumes `numpy as np`, `tensorflow as tf` (1.x), `scipy.io as sio`, and the
# project-local MTV model plus post_proC/thrC/err_rate helpers in scope.
def main():
    args = parser.parse_args()
    np.random.seed(1)
    tf.set_random_seed(1)
    # ignore tensorflow warning
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    
    view_shape, views, label = process_data(args)
    num_class = np.unique(label).shape[0] 
    batch_size = label.shape[0] 
    # class_single = batch_size / num_class # 10

    reg1 = 1.0
    reg2 = 1.0 
    alpha = max(0.4 - (num_class-1)/10 * 0.1, 0.1)
    lr = args.lr
    acc_= []
    
    tf.reset_default_graph()

    if args.test:
        label_10_subjs = label - label.min() + 1
        label_10_subjs = np.squeeze(label_10_subjs) 
        Coef = sio.loadmat('./result/rgbd_coef.mat')['coef']
        print('load mat ..')
        y_x, L = post_proC(Coef, label_10_subjs.max(), 3, 1)   
        missrate_x = err_rate(label_10_subjs, y_x)                
        acc_x = 1 - missrate_x
        nmi = normalized_mutual_info_score(label_10_subjs, y_x)
        f_measure = f1_score(label_10_subjs, y_x)
        ri = rand_index_score(label_10_subjs, y_x)
        ar = adjusted_rand_score(label_10_subjs, y_x)
        print("nmi: %.4f" % nmi, \
            "accuracy: %.4f" % acc_x, \
            "F-measure: %.4f" % f_measure, \
            "RI: %.4f" % ri, \
            "AR: %.4f" % ar)   
        exit()

    if not args.ft:
        # pretrain stage
        mtv = MTV(view_shape=view_shape, batch_size=batch_size, ft=False, reg_constant1=reg1, reg_constant2=reg2)
        mtv.restore()        
        epoch = 0 
        min_loss = 9970
        while epoch < args.pretrain:
            loss = mtv.reconstruct(views[0], views[1], views[2], lr)
            print("epoch: %.1d" % epoch, "loss: %.8f" % (loss/float(batch_size)))
            if loss/float(batch_size) < min_loss:
                print('save model.')
                mtv.save_model()
                min_loss = loss/float(batch_size)                          
            epoch += 1
    else:
        # self-expressive stage
        mtv = MTV(view_shape=view_shape, batch_size=batch_size, ft=True, reg_constant1=reg1, reg_constant2=reg2)
        mtv.restore()
        Coef = None
        label_10_subjs = label - label.min() + 1
        label_10_subjs = np.squeeze(label_10_subjs) 

        best_acc, best_epoch = 0, 0
        
        epoch = 0 
        while epoch < args.epochs:
            loss, Coef, Coef_1, Coef_2 = mtv.finetune(views[0], views[1], lr)
            print("epoch: %.1d" % epoch, "loss: %.8f" % (loss))
            epoch += 1

        Coef = thrC(Coef, alpha)                                  
        sio.savemat('./result/rgbd_coef.mat', dict([('coef', Coef)]))
        y_x, L = post_proC(Coef, label_10_subjs.max(), 3, 1, 0)    
        missrate_x = err_rate(label_10_subjs, y_x)                
        acc_x = 1 - missrate_x
        nmi = normalized_mutual_info_score(label_10_subjs, y_x)
        f_measure = f1_score(label_10_subjs, y_x)
        ri = rand_index_score(label_10_subjs, y_x)
        ar = adjusted_rand_score(label_10_subjs, y_x)
        print("epoch: %d" % epoch, \
            "nmi: %.4f" % nmi, \
            "accuracy: %.4f" % acc_x, \
            "F-measure: %.4f" % f_measure, \
            "RI: %.4f" % ri, \
            "AR: %.4f" % ar)