def valid_model(model_name, ckpt_path):

    # model = create_model(model_name.split('-')[-2], num_classes=3, checkpoint_path=ckpt_path)
    model = load_checkpoint(ckpt_path)
    model.cuda().eval()

    img_size = int(model_name.split('-')[-1])
    interpolation = "bicubic"
    batch_size = 64

    dataset = Dataset(root=os.path.join(BASE, "valid"),
                      transform=get_test_transform(img_size))
    # loader = create_loader(
    #     dataset,
    #     input_size=img_size,
    #     batch_size=batch_size,
    #     use_prefetcher=False,
    #     interpolation=interpolation,
    #     num_workers=4)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=8,
        drop_last=False,
    )
    print('..... Finished loading model! ......')
    class_2_index = {0: 'normal', 1: 'phone', 2: 'smoke'}

    with open("./txts/v-info-new.json", 'r', encoding="utf-8") as f:
        shape_dict = json.load(f)

    ## The feature dimension has to be adjusted for the specific model; I no longer remember which one I used here.
    labels = []
    total_pred_idx = []
    dets_info = {}

    with torch.no_grad():
        for batch_idx, (input, target) in enumerate(tqdm(loader)):
            output = model(input.cuda())

            # highest softmax probability and its class index per sample
            prob, idx = torch.max(torch.softmax(output, -1), -1)
            pred_idx = idx.cpu().numpy()
            total_pred_idx.extend(pred_idx)

            filenames = loader.dataset.filenames()
            for j in range(len(pred_idx)):
                filename = filenames[batch_idx * batch_size + j]
                name = filename.split('/')[-1].split('.')[0]

                # [predicted class, confidence, shape info from the ground-truth json]
                dets_info[name] = [class_2_index[pred_idx[j]], float(prob[j]), shape_dict[name][1], shape_dict[name][2]]

            labels.extend(target.cpu().numpy())

    with open("%s/v.json" % (feature_path), "w", encoding="utf-8") as f:
        json.dump(dets_info, f)
    prec = accuracy_score(labels, total_pred_idx)
    print("%.4f" % prec)
Example #2
def train():
    # 1. Load the datasets
    print("start loading data_set")
    train_dataset: Dataset = Dataset.load("../data/essay_data/train.pickle")
    dev_dataset: Dataset = Dataset.load("../data/essay_data/dev.pickle")
    test_dataset: Dataset = Dataset.load("../data/essay_data/test.pickle")
    print("end loading data_set")

    # 2. Compute features
    essay_set_num = len(train_dataset.data)
    for set_id in range(1, essay_set_num + 1):
        train_data = train_dataset[str(set_id)]
        dev_data = dev_dataset[str(set_id)]
        test_data = test_dataset[str(set_id)]
        train_every_set(train_data, dev_data, test_data, set_id)
Example #3
def save_feature_batch(model_name, ckpt_path, feature_path, label_path, data_type="valid"):
    model = create_model(model_name.split('-')[-2], num_classes=3, checkpoint_path=ckpt_path)
    model.cuda().eval()
    print('..... Finished loading model! ......')
    img_size = int(model_name.split('-')[-1])
    interpolation = "bicubic"
    batch_size = 128

    dataset = Dataset(os.path.join(BASE, data_type))
    loader = create_loader(
        dataset,
        input_size=img_size,
        batch_size=batch_size,
        use_prefetcher=False,
        interpolation=interpolation,
        num_workers=8)

    features = []
    labels = []
    with torch.no_grad():
        for batch_idx, (input, target) in enumerate(tqdm(loader)):
            # forward_features returns the pre-pooling feature map (N, C, H, W);
            # global-average-pool and flatten it to an (N, C) feature vector
            out = model.forward_features(input.cuda())
            out2 = nn.AdaptiveAvgPool2d(1)(out)
            feature = out2.view(out.size(0), -1)

            features.append(feature.cpu().numpy())
            labels.extend(target.cpu().numpy())
    features = np.vstack(features)

    with open(feature_path, 'wb') as f:
        pickle.dump(features, f)
    with open(label_path, 'wb') as f:
        pickle.dump(labels, f)
    print('CNN features obtained and saved.')
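
# How the pickled features are consumed downstream is not shown here; a minimal
# sketch, assuming a scikit-learn classifier (all names illustrative):
def fit_feature_classifier(feature_path, label_path):
    from sklearn.svm import SVC
    with open(feature_path, 'rb') as f:
        features = pickle.load(f)
    with open(label_path, 'rb') as f:
        labels = pickle.load(f)
    # probability=True enables predict_proba, which classifier_pred below relies on
    return SVC(probability=True).fit(features, labels)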
Example #4
def classifier_pred(classifier, shape_path, feature, id_path, model_name, data_type="valid"):

    class_2_index = {0: 'normal', 1: 'phone', 2: 'smoke'}
    dets_info = {}

    with open(feature, 'rb') as f:
        features = pickle.load(f)
    with open(id_path, 'rb') as f:
        ids = pickle.load(f)
    predict = classifier.predict(features)

    # predicted_test_scores = classifier.decision_function(features)
    # probs = softmax(predicted_test_scores)
    # prob_list = [prob[int(predict[i])] for i, prob in enumerate(probs)]

    # probability assigned to the predicted class for each sample
    probs = classifier.predict_proba(features)
    prob_list = [round(prob[int(predict[i])], 4) for i, prob in enumerate(probs)]

    prediction = predict.tolist()
    total_pred_idx = [int(pred) for pred in prediction]
    total_true_idx = [int(label) for label in ids]

    with open("./txts/%s.json" % shape_path, 'r', encoding="utf-8") as f:
        shape_dict = json.load(f)

    dataset = Dataset(os.path.join(BASE, data_type))
    filenames = dataset.filenames()

    for i, filename in enumerate(filenames):
        name = filename.split('/')[-1].split('.')[0]
        dets_info[name] = [class_2_index[int(prediction[i])], prob_list[i], shape_dict[name][1], shape_dict[name][2]]

    with open("%s/%s.json" % (feature_path, shape_path.split('-')[0]), "w", encoding="utf-8") as f:
        json.dump(dets_info, f)
    accuracy = round(accuracy_score(total_true_idx, total_pred_idx), 4)

    test_map, ap_list = eval_map(detFolder="%s/v.json" % feature_path, gtFolder="txts/v-info-new.json", return_each_ap=True)
    print("Accuracy: %s, map: %.4f" % (accuracy, test_map))

    with open("weights/%s-valid.json" % model_name, 'w', encoding="utf-8") as f:
        prob_dict = {}
        prob_dict["prob"] = probs
        prob_dict["model_weight"] = test_map
        prob_dict["label_weight"] = ap_list

        json.dump(prob_dict, f, cls=MyEncoder)

    return accuracy, round(test_map, 4)
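
# `MyEncoder` is referenced throughout these snippets but never shown; a minimal
# sketch, assuming its job is to make numpy types JSON-serializable:
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        # convert numpy scalars/arrays to native Python types
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)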
Example #5
    def get_train_feature(self, train_data_set: Dataset):
        set_id = 1
        data, score_list = train_data_set.get_data_list(set_id)
        wv_similarity = word_vector_similarity_train(data, score_list)
        print(wv_similarity)

        pos_bigram = pos_bigram_train(data)
        print(pos_bigram)

        # TODO: concatenate the feature matrices (see the sketch below)
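        # A minimal sketch of that concatenation, assuming each *_train helper
        # returns an (n_samples, n_features_i) numpy array (illustrative only):
        #     import numpy as np
        #     combined = np.hstack([wv_similarity, pos_bigram])
        #     return combined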
        return
Example #6
def test():
    train_dataset: Dataset = Dataset.load("../../data/train.pickle")
    train_data = train_dataset.data['1']
    essay_data, token_data, scores = Dataset.get_data_list(train_data,
                                                           acquire_score=False)
    result1, result2 = word_length(essay_data)
    result3, result4, result5 = word_bigram_train(token_data)

    result6, result7, result8 = pos_bigram_train(token_data)
    for result in (result1, result2, result3, result4, result6, result7):
        print(result)
        print(result.shape)
Example #7
def classifier_test(model_path, feature, data_type="test"):
    class_2_index = {0: 'normal', 1: 'calling', 2: 'smoking'}

    with open(feature, 'rb') as f:
        features = pickle.load(f)
    classifier = joblib.load(model_path)
    predict = classifier.predict(features)

    probs = classifier.predict_proba(features)
    prob_list = [round(prob[int(predict[i])], 4) for i, prob in enumerate(probs)]
    prediction = predict.tolist()

    result_list = []
    clas_name = model_path.split('/')[-1].split('-')[0]
    dataset = Dataset(os.path.join(BASE, data_type))
    filenames = dataset.filenames()
    print(clas_name)
    with open('./infer/result-%s.json' % clas_name, 'w', encoding="utf-8") as out_file:
        for i in range(len(filenames)):
            filename = filenames[i].split('/')[-1].strip()
            name = class_2_index[int(prediction[i])]
            result_data = {"image_name": str(filename), "category": name, "score": prob_list[i]}
            result_list.append(result_data)
        json.dump(result_list, out_file, cls=MyEncoder, indent=4)
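
# For reference, the resulting result-<name>.json is a list of per-image records
# shaped like this (values illustrative):
#     [{"image_name": "0001.jpg", "category": "smoking", "score": 0.9876}, ...]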
Example #8
def validate(args):
    # might as well try to validate something
    args.prefetcher = not args.no_prefetcher

    # create model
    model = create_model(
        args.model,
        num_classes=args.num_classes,
        in_chans=3,
        global_pool=args.gp)

    if args.checkpoint:
        load_checkpoint(model, args.checkpoint)

    param_count = sum([m.numel() for m in model.parameters()])
    _logger.info('Model %s created, param count: %d' % (args.model, param_count))

    data_config = resolve_data_config(vars(args), model=model)
    # model, test_time_pool = apply_test_time_pool(model, data_config, args)

    if torch.cuda.is_available():
        model.cuda()

    criterion = nn.CrossEntropyLoss().cuda()

    dataset = Dataset(args.data)

    crop_pct = data_config['crop_pct']
    loader = create_loader(
        dataset,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        crop_pct=crop_pct)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    f1_m = AverageMeter()

    end = time.time()
    total_pred_idx = []
    total_truth_idx = []
    mistake_image = []
    mistake_image_dict = {'calling': [], 'normal': [], 'smoking': [], 'smoking_calling': []}
    # class_2_index = {0: 'normal', 1: 'phone', 2: 'smoke'}
    class_2_index = {0: 'calling', 1: 'normal', 2: 'smoking', 3: 'smoking_calling'}
    with open("./txts/%s.json" % json_name, 'r', encoding="utf-8") as f:
        shape_dict = json.load(f)
    dets_info = {}

    model.eval()
    with torch.no_grad():
        # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
        input = torch.randn((args.batch_size,) + data_config['input_size'])
        if torch.cuda.is_available():
            input = input.cuda()

        model(input)
        end = time.time()
        for batch_idx, (input, target) in enumerate(loader):
            if args.no_prefetcher and torch.cuda.is_available():
                target = target.cuda()
                input = input.cuda()

            # compute output
            # t0 = time.time()
            output = model(input)
            # print("time0: %.8f s" % ((time.time() - t0)))
            # t1 = time.time()
            # out = output.detach().cpu()
            # print("time1: %.8f s" % ((time.time() - t1) / 64))
            # print("time2: %.8f s" % ((time.time() - t0) / 64))
            # t2 = time.time()
            # out = out.cuda().cpu()
            # print("time3: %.8f s" % ((time.time() - t2) / 64))
            # get prediction index and ground-truth index
            prob, idx = torch.max(F.softmax(output, -1), -1)

            target_idx = target.cpu().numpy()
            predict_idx = idx.cpu().numpy()

            class_dict = loader.dataset.class_to_idx
            index_2_class = {v: k for k, v in class_dict.items()}
            filenames = loader.dataset.filenames()
            for j in range(len(target_idx)):
                total_truth_idx.append(target_idx[j])
                total_pred_idx.append(predict_idx[j])

                target_class = index_2_class[int(target_idx[j])]
                pred_class = index_2_class[int(predict_idx[j])]

                filename = filenames[batch_idx * args.batch_size + j]
                name = filename.split('/')[-1].split('.')[0]

                dets_info[name] = [pred_class, float(prob[j]), shape_dict[name][1], shape_dict[name][2]]

                if target_idx[j] != predict_idx[j]:
                    mistake_image.append(
                        [filename, target_class, pred_class,
                         np.round(prob[j].cpu().numpy(), 4)])

                    mistake_image_dict[class_2_index[predict_idx[j]]].append(filename)

            loss = criterion(output, target)

            # measure accuracy and record loss (the "top-5" meter actually uses
            # k=3 here, since there are only a handful of classes)
            prec1, prec5 = accuracy(output.data, target, topk=(1, 3))

            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % args.log_freq == 0:
                _logger.info(
                    'Test: [{0:>4d}/{1}]  '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s)  '
                    'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f})  '
                    'Acc@1: {top1.val:>7.2f} ({top1.avg:>7.2f})  '
                    'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format(
                        batch_idx, len(loader), batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                        loss=losses, top1=top1, top5=top5))

    with open("%s/%s.json" % (output_path, json_name.split('-')[0]), "w", encoding="utf-8") as f:
        json.dump(dets_info, f)

    top1a, top5a = top1.avg, top5.avg
    results = OrderedDict(
        top1=round(top1a, 4), top1_err=round(100 - top1a, 4),
        top5=round(top5a, 4), top5_err=round(100 - top5a, 4),
        param_count=round(param_count / 1e6, 2),
        img_size=data_config['input_size'][-1],
        crop_pct=crop_pct,
        interpolation=data_config['interpolation'],
        mistake_image_dict=mistake_image_dict,
        pred_idx=total_pred_idx, truth_idx=total_truth_idx)

    _logger.info(' * Acc@1 {:.2f} ({:.2f}) Acc@5 {:.2f} ({:.2f})'.format(
       results['top1'], results['top1_err'], results['top5'], results['top5_err']))

    test_map, each_ap = eval_map(detFolder="%s/%s.json" % (output_path, json_name.split('-')[0]),
                                 gtFolder="txts/%s.json" % json_name, return_each_ap=True)
    _logger.info('Valid mAP: {}, each ap: {}'.format(round(test_map, 4), each_ap))

    return results
Example #9
import sys
sys.path.append("../..")
from src.feature.iku import spell_error, Mean_sentence_depth
from src.data import Dataset

test_dataset = Dataset.load("../../data/test.pickle")

# print(test_dataset.data)

for data in test_dataset.data['1']:
    Mean_sentence_depth(data)
Example #10
def main():
    setup_default_logging()
    args, args_text = _parse_args()

    args.prefetcher = not args.no_prefetcher
    torch.manual_seed(args.seed)

    model = create_model(args.model,
                         pretrained=True,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         drop_path_rate=args.drop_path,
                         drop_block_rate=args.drop_block,
                         checkpoint_path=args.initial_checkpoint)

    if args.local_rank == 0:
        _logger.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel()
                                       for m in model.parameters()])))

    data_config = resolve_data_config(vars(args),
                                      model=model,
                                      verbose=args.local_rank == 0)

    if args.num_gpu > 1:
        model = nn.DataParallel(model,
                                device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)

    loss_scaler = None
    # optionally resume from a checkpoint
    resume_epoch = None
    if args.resume:
        resume_epoch = resume_checkpoint(
            model,
            args.resume,
            optimizer=None if args.no_resume_opt else optimizer,
            loss_scaler=None if args.no_resume_opt else loss_scaler,
            log_info=args.local_rank == 0)

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        _logger.info('Scheduled epochs: {}'.format(num_epochs))

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        _logger.error(
            'Training folder does not exist at: {}'.format(train_dir))
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_args = dict(mixup_alpha=args.mixup,
                          cutmix_alpha=args.cutmix,
                          cutmix_minmax=args.cutmix_minmax,
                          prob=args.mixup_prob,
                          switch_prob=args.mixup_switch_prob,
                          elementwise=args.mixup_elem,
                          label_smoothing=args.smoothing,
                          num_classes=args.num_classes)
        if args.prefetcher:
            collate_fn = FastCollateMixup(**mixup_args)
        else:
            mixup_fn = Mixup(**mixup_args)

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_workers=args.workers,
        collate_fn=collate_fn,
    )

    eval_dir = os.path.join(args.data, 'valid')
    if not os.path.isdir(eval_dir):
        eval_dir = os.path.join(args.data, 'validation')
        if not os.path.isdir(eval_dir):
            _logger.error(
                'Validation folder does not exist at: {}'.format(eval_dir))
            exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        num_workers=args.workers,
        crop_pct=data_config['crop_pct'],
    )

    if mixup_active:
        # smoothing is handled with mixup target transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=args.smoothing).cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
    # plain CE is always defined, since train_epoch receives it alongside train_loss_fn below
    train_loss_ce = nn.CrossEntropyLoss().cuda()
    validate_loss_fn = nn.CrossEntropyLoss().cuda()

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    plateau_num = 0
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"), args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, exp_name)
        decreasing = eval_metric == 'loss'
        saver = CheckpointSaver(model=model,
                                optimizer=optimizer,
                                args=args,
                                amp_scaler=loss_scaler,
                                checkpoint_dir=output_dir,
                                recovery_dir=output_dir,
                                decreasing=decreasing,
                                max_history=2)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    # load the ground-truth shape info on every rank, not only rank 0,
    # since validate() below needs it
    with open("./txts/%s.json" % json_name, 'r', encoding="utf-8") as f:
        shape_dict = json.load(f)
    try:
        for epoch in range(start_epoch, num_epochs):

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        [train_loss_fn, train_loss_ce],
                                        args,
                                        lr_scheduler=lr_scheduler,
                                        output_dir=output_dir,
                                        mixup_fn=mixup_fn)

            eval_metrics, dets_info = validate(model,
                                               loader_eval,
                                               validate_loss_fn,
                                               args,
                                               shape_dict=shape_dict)

            with open("%s/v.json" % output_dir, "w", encoding="utf-8") as f:
                json.dump(dets_info, f)
            test_map = round(
                eval_map(detFolder="%s/v.json" % output_dir,
                         gtFolder="txts/%s.json" % json_name), 4)
            eval_metrics["map"] = test_map
            _logger.info('Valid mAP: {}'.format(test_map))

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                saver.save_prefix = "%.2f-%s" % (eval_metrics["top1"], test_map)
                best_metric, best_epoch = saver.save_checkpoint(
                    epoch, metric=save_metric)

            if best_metric is not None and eval_metrics[eval_metric] >= best_metric:
                plateau_num = 0
            else:
                plateau_num += 1

            # stop if the eval metric has not improved for 30 epochs
            if plateau_num == 30:
                break

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        _logger.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
Example #11
import sys

sys.path.append("../..")
from src.feature.iku import spell_error, Mean_sentence_depth_level, semantic_vector_similarity, essay_length
from src.data import Dataset
from gensim import corpora, models
from gensim.similarities import MatrixSimilarity

# test_dataset= Dataset.load("../../data/test.pickle")
train_dataset = Dataset.load("../../data/train.pickle")
Mean_sentence_depth_level(train_dataset.data['1'])
# train_dataset = Dataset()
# train_dataset.load_from_raw_file("../../data/train.tsv", ['essay_set', 'essay_id', 'essay', 'domain1_score'])
# Dataset.save(train_dataset, '../../data/train.pickle')
# dev_dataset = Dataset()
# dev_dataset.load_from_raw_file("../../data/dev.tsv", ['essay_set', 'essay_id', 'essay', 'domain1_score'])
# Dataset.save(dev_dataset, '../../data/dev.pickle')
# test_dataset = Dataset()
# test_dataset.load_from_raw_file("../../data/test.tsv", ['essay_set', 'essay_id', 'essay'])
# Dataset.save(test_dataset, '../../data/test.pickle')

# print(test_dataset.data)

# spell_error(train_dataset.data['3'])

# semantic_vector_similarity(train_dataset.data['3'], train_dataset.data['3'])

# essay_length(train_dataset.data['1'])
# for data in test_dataset.data['1']:
#    print(type)
#    # Mean_sentence_depth(data)
Example #12
def main():
    setup_default_logging()
    args = parser.parse_args()
    # might as well try to do something useful...
    args.pretrained = args.pretrained or not args.checkpoint

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # use the first .pth checkpoint found in the given directory
    args.checkpoint = glob.glob(args.checkpoint + '/*.pth')[0]

    # create model
    model = create_model(args.model,
                         num_classes=args.num_classes,
                         in_chans=3,
                         pretrained=args.pretrained)
    load_checkpoint(model, args.checkpoint)

    logging.info('Model %s created, param count: %d' %
                 (args.model, sum([m.numel() for m in model.parameters()])))

    args.img_size = int(args.checkpoint.split('/')[-2].split('-')[-1])
    config = resolve_data_config(vars(args), model=model)
    # model, test_time_pool = apply_test_time_pool(model, config, args)

    if torch.cuda.is_available():
        model = model.cuda()

    loader = create_loader(Dataset(args.data),
                           input_size=config['input_size'],
                           batch_size=args.batch_size,
                           use_prefetcher=False,
                           interpolation=config['interpolation'],
                           mean=config['mean'],
                           std=config['std'],
                           num_workers=args.workers,
                           crop_pct=config['crop_pct'])

    model.eval()

    batch_time = AverageMeter()
    end = time.time()
    topk_ids = []
    scores = []
    total_pred_idx = []
    total_truth_idx = []
    with torch.no_grad():
        for batch_idx, (input, target) in enumerate(loader):
            if torch.cuda.is_available():
                input = input.cuda()
            output = model(input)

            prob = torch.max(F.softmax(output, -1), -1)[0]
            idx = torch.max(F.softmax(output, -1), -1)[1]

            total_pred_idx.extend(idx.cpu().numpy())
            total_truth_idx.extend(target.cpu().numpy())

            scores.extend(prob.cpu().numpy())
            topk_ids.extend(idx.cpu().numpy())

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % args.log_freq == 0:
                logging.info(
                    'Predict: [{0}/{1}] Time {batch_time.val:.3f} ({batch_time.avg:.3f})'
                    .format(batch_idx, len(loader), batch_time=batch_time))

    # result_file_path = os.path.join('./results', model_name)
    # if not os.path.exists(result_file_path):
    #     os.makedirs(result_file_path)

    # res_cf = open('%s/results-all.csv' % result_file_path, mode='w')
    # for i in range(len(total_pred_idx)):
    #     res_cf.write('{0},'.format(str(total_pred_idx[i])))
    # res_cf.write('\n')
    # for i in range(len(total_truth_idx)):
    #     res_cf.write('{0},'.format(str(total_truth_idx[i])))

    # dst_root = './infer/%s' % args.checkpoint.split('/')[-2]
    # if not os.path.exists(dst_root):
    #     os.makedirs(dst_root)
    # else:
    #     shutil.rmtree(dst_root)

    result_list = []
    # class_2_index = {0: 'normal', 1: 'calling', 2: 'smoking'}
    class_2_index = {
        0: 'calling',
        1: 'normal',
        2: 'smoking',
        3: 'smoking_calling'
    }

    with open(os.path.join(args.output_dir, 'result.json'),
              'w',
              encoding="utf-8") as out_file:
        filenames = loader.dataset.filenames()
        for i in range(len(scores)):
            filename = filenames[i].split('/')[-1]
            name = class_2_index[topk_ids[i]]
            result_data = {
                "image_name": str(filename),
                "category": name,
                "score": scores[i]
            }
            result_list.append(result_data)

            # if scores[i] > 0.95:
            # dst_path = os.path.join(dst_root, name)
            # if not os.path.exists(dst_path):
            #     os.makedirs(dst_path)
            # shutil.copy(filenames[i], os.path.join(dst_path, filename))

        json.dump(result_list, out_file, cls=MyEncoder, indent=4)
Example #13
def data(args):
    bpath = os.path.join('./data', args.dataset)

    with open(os.path.join(bpath, 'entity2id.txt'), 'r') as f:
        e_ix = dict(map(lambda x: x.split()[::-1], f.read().split('\n')[1:-1]))

    tp_ix, tp_rix = dict(), dict()
    tp_rx = re.compile(r'^/(\w+)/.*$')
    for i, e in e_ix.items():
        tp = re.findall(tp_rx, e)[0]
        if tp not in tp_ix:
            tp_ix[tp] = list()
        tp_ix[tp].append(int(i))
        tp_rix[int(i)] = tp

    with open(os.path.join(bpath, 'entity2id.txt'), 'r') as f:
        e_ix_ln = int(f.readline().strip())
    with open(os.path.join(bpath, 'relation2id.txt'), 'r') as f:
        r_ix_ln = int(f.readline().strip())

    tr_ds = Dataset(args, os.path.join(bpath, 'train2id.txt'), e_ix_ln, tp_ix,
                    tp_rix, 1)
    vd_ds = Dataset(args, os.path.join(bpath, 'valid2id.txt'), e_ix_ln, tp_ix,
                    tp_rix, 2)
    ts_ds = Dataset(args, os.path.join(bpath, 'test2id.txt'), e_ix_ln, tp_ix,
                    tp_rix, 3)

    if args.model.startswith('DE'):
        t_ix = FakeTimeIndex()
    else:
        al_t = np.concatenate([tr_ds, vd_ds, ts_ds], axis=1)[0, :, 3:].flatten()
        t_ix = {e: i for i, e in enumerate(np.unique(al_t))}
    t_ix_ln = len(t_ix)

    tr_ds.transform(t_ix, qs_bs={})
    vd_ds.transform(t_ix, qs_bs=tr_ds._qs)
    ts_ds.transform(t_ix, qs=False)

    tr_smp = DistributedSampler(tr_ds,
                                num_replicas=_size(args),
                                rank=_rank(args))
    vd_smp = DistributedSampler(vd_ds,
                                num_replicas=_size(args),
                                rank=_rank(args),
                                shuffle=False)
    ts_smp = DistributedSampler(ts_ds,
                                num_replicas=_size(args),
                                rank=_rank(args),
                                shuffle=False)

    tr_dl = DataLoader(tr_ds,
                       batch_size=args.batch_size,
                       sampler=tr_smp,
                       num_workers=args.workers,
                       pin_memory=not args.tpu,
                       drop_last=args.tpu)
    vd_dl = DataLoader(vd_ds,
                       batch_size=args.test_batch_size,
                       sampler=vd_smp,
                       num_workers=args.workers,
                       pin_memory=not args.tpu,
                       drop_last=args.tpu)
    ts_dl = DataLoader(ts_ds,
                       batch_size=args.test_batch_size,
                       sampler=ts_smp,
                       num_workers=args.workers,
                       pin_memory=not args.tpu,
                       drop_last=args.tpu)

    return tr_dl, vd_dl, ts_dl, e_ix_ln, r_ix_ln, t_ix_ln, tp_ix, tp_rix
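
# `FakeTimeIndex` is not defined in this example; a minimal sketch, assuming it
# acts as an identity mapping so DE-* models can consume raw timestamps directly
# (purely illustrative; the real class may differ):
class FakeTimeIndex:
    def __getitem__(self, key):
        # identity lookup: every timestamp maps to itself
        return key

    def __len__(self):
        # no explicit time vocabulary to enumerate
        return 0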
Example #14
def train(contain_test=False, use_save=False, model_name='SVR'):
    """ 训练模型 """
    # 1. 加载数据集
    print("start loading data_set")
    train_dataset: Dataset = Dataset.load(TRAIN_DADA_PATH)
    dev_dataset: Dataset = Dataset.load(DEV_DATA_PATH)
    test_dataset: Dataset = Dataset.load(TEST_DATA_PATH)
    print("end loading data_set")

    # 2. Compute features
    essay_set_num = len(train_dataset.data)
    print(essay_set_num)
    mean_qwk = 0
    all_test_sample = []
    qwk_score_list = []
    use_dev = ''

    for set_id in range(1, essay_set_num + 1):
        train_data = train_dataset.data[str(set_id)]
        dev_data = dev_dataset.data[str(set_id)]
        test_data = test_dataset.data[str(set_id)]

        train_feature_dict = train_dataset.load_feature(set_id, 'train')
        feature_class = Feature.get_instance(train_feature_dict)

        new_train_data = copy.deepcopy(train_data)
        new_train_data.extend(dev_data)
        train_data = new_train_data

        train_sentences_list, train_tokens_list, train_scores = Dataset.get_data_list(
            train_data, acquire_score=True)

        print(
            "start compute the feature for essay set  %s, train_set_len = %s" %
            (set_id, len(train_sentences_list)))
        st = time.time()

        # TODO: fill in whatever features are needed
        reset_list = []
        train_feature, train_feature_dict = feature_class.get_saved_feature_all(
            train_feature_dict, train_sentences_list, train_tokens_list,
            train_data, train_scores, 'train', feature_list)
        train_dataset.save_feature(set_id, train_feature_dict, 'train')

        et = time.time()
        print("end compute the feature for essay set, ", set_id, "time = ",
              et - st)

        # 3. Build the model and train it
        use_dev = 'No'  # set manually
        clf = model(model_name, train_feature, train_scores, set_id)

        # 4. Test
        dev_sentences_list, dev_tokens_list, dev_scores = Dataset.get_data_list(
            dev_data, acquire_score=True)

        dev_feature_dict = train_dataset.load_feature(set_id, 'dev')
        dev_feature, dev_feature_dict = feature_class.get_saved_feature_all(
            dev_feature_dict, dev_sentences_list, dev_tokens_list, dev_data,
            train_scores, 'dev', reset_list)
        train_dataset.save_feature(set_id, dev_feature_dict, 'dev')

        print('dev ends')
        predicted = clf.predict(dev_feature)
        qwk = kappa(dev_scores, predicted, weights='quadratic')
        print(set_id, qwk)
        qwk_score_list.append(qwk)
        mean_qwk += qwk

        test_sentences_list, test_tokens_list = Dataset.get_data_list(
            test_data, acquire_score=False)

        test_feature_dict = train_dataset.load_feature(set_id, 'test')
        test_feature, test_feature_dict = feature_class.get_saved_feature_all(
            test_feature_dict, test_sentences_list, test_tokens_list,
            test_data, train_scores, 'test', reset_list)
        train_dataset.save_feature(set_id, test_feature_dict, 'test')

        test_predicted = clf.predict(test_feature)

        for idx, sample in enumerate(test_data):
            # sample['domain1_score'] = int(test_predicted[idx])
            sample['domain1_score'] = int(np.round(float(test_predicted[idx])))
        all_test_sample.extend(test_data)

    save_to_tsv(all_test_sample, '../MG1933004.tsv')
    mean_qwk = mean_qwk / essay_set_num
    print(mean_qwk)
    save_info_to_file(feature_list, use_dev, qwk_score_list, mean_qwk)

    # Save the features; the only option is to pickle the whole dataset object
    train_dataset.save(train_dataset, TRAIN_DADA_PATH)
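
# The `kappa` call above resembles skll.metrics.kappa; an equivalent
# quadratic-weighted-kappa sketch using scikit-learn (assuming integer scores):
#     from sklearn.metrics import cohen_kappa_score
#     qwk = cohen_kappa_score(dev_scores,
#                             np.round(predicted).astype(int),
#                             weights='quadratic')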
Example #15
    def test_20_newsgroups(self):
        ds = Dataset.load('20_newsgroups')
        assert len(ds.data) == 18846
        assert len(ds.target) == 18846