import tensorflow as _tf
import dataset.csv as csv
import os as _os
from utils import logutil as _logutil
from . import augmentation as _augmentation

_tde = _tf.data.experimental
_logging = _logutil.get_logger()
_FLAGS = _tf.app.flags.FLAGS


def load(machine=None):
    csv.inspect()
    if machine is not None:
        machine.dataset_loader = Loader()
    else:
        return Loader()


class Loader:
    def __init__(self):
        self._streams = {}
        self._generate_stream()

    def _generate_stream(self):
        # Build one input stream per *_csv flag (e.g. train_csv, eval_csv).
        for key in [key for key in _FLAGS if str(key).endswith('csv')]:
            csv_file = _FLAGS[key]._value
            csv_path = _os.path.join(_FLAGS.dataset_dir, _FLAGS.type, csv_file)
            key = str(key).split('_')[0]
            # The original example is truncated here; storing the resolved
            # path is the minimal completion (a fuller pipeline is sketched below).
            self._streams[key] = csv_path
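The example breaks off before the stream is actually built, but the otherwise unused `_tde = _tf.data.experimental` alias suggests the loop ended in a tf.data CSV pipeline. A minimal sketch of such a loop body, assuming `tf.data.experimental.make_csv_dataset` and a hypothetical `batch_size` flag:

            # Hypothetical completion: one batched tf.data stream per flag
            # prefix ('train', 'eval', ...), built from the resolved CSV path.
            self._streams[key] = _tde.make_csv_dataset(
                csv_path,
                batch_size=_FLAGS.batch_size,  # assumed flag name
                num_epochs=1,
            )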
Example #2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import unittest
from unittest import TestCase

from exporter import *
from exporter.book import BookExport
from utils.logutil import get_logger

log = get_logger(__name__)


class TestBookExport(TestCase):
    def setUp(self):
        self.exporter = BookExport("einverne")

    def test_get_books(self):
        books = self.exporter.get_books(COLLECT)
        for book in books:
            log.debug(book)
            self.assertIsNotNone(book, "book object fetch failed")
            self.assertNotEqual(book.title, '', 'book title fetch failed')
            break

    def test_get_read(self):
        read_books = self.exporter.get_read()
        for b in read_books:
            log.debug(b)
            self.assertIsNotNone(b, 'book object fetch failed')
            self.assertNotEqual(b.title, '', 'book title fetch failed')
            break
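Every example on this page obtains its logger through a project-local `get_logger` helper rather than calling `logging.getLogger` directly. A minimal sketch of what such a `logutil` helper commonly looks like (signature, format string, and level are assumptions, not the original implementation):

import logging

def get_logger(name=None, log_file=None):
    # Return a configured logger; skip handler setup on repeated calls.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = (logging.FileHandler(log_file) if log_file
                   else logging.StreamHandler())
        handler.setFormatter(logging.Formatter(
            "%(asctime)s %(name)s %(levelname)s: %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger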
Example #3
# Imports needed by this example (standard library, torch, torchvision).
# Config, losses, ChannelDistillResNet1834, get_logger, train and validate
# are project-local names whose import paths are not shown in the original.
import os
import shutil
import time

import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder


def main():
    if not torch.cuda.is_available():
        raise Exception("need gpu to train network!")

    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    cudnn.benchmark = True
    cudnn.enabled = True

    logger = get_logger(__name__, Config.log)

    Config.gpus = torch.cuda.device_count()
    logger.info("use {} gpus".format(Config.gpus))
    config = {
        key: value
        for key, value in Config.__dict__.items() if not key.startswith("__")
    }
    logger.info(f"args: {config}")

    start_time = time.time()

    # dataset and dataloader
    logger.info("start loading data")

    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    train_dataset = ImageFolder(Config.train_dataset_path, train_transform)
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    val_dataset = ImageFolder(Config.val_dataset_path, val_transform)
    val_loader = DataLoader(
        val_dataset,
        batch_size=Config.batch_size,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    logger.info("finish loading data")

    # network
    net = ChannelDistillResNet1834(Config.num_classes, Config.dataset_type)
    net = nn.DataParallel(net).cuda()

    # loss and optimizer
    criterion = []
    for loss_item in Config.loss_list:
        loss_name = loss_item["loss_name"]
        loss_type = loss_item["loss_type"]
        if "kd" in loss_type:
            criterion.append(losses.__dict__[loss_name](loss_item["T"]).cuda())
        else:
            criterion.append(losses.__dict__[loss_name]().cuda())

    optimizer = SGD(net.parameters(),
                    lr=Config.lr,
                    momentum=0.9,
                    weight_decay=1e-4)
    scheduler = MultiStepLR(optimizer, milestones=[30, 60, 90], gamma=0.1)

    # only evaluate
    if Config.evaluate:
        # load best model
        if not os.path.isfile(Config.evaluate):
            raise Exception(
                f"{Config.evaluate} is not a file, please check it again")
        logger.info("start evaluating")
        logger.info(f"start resuming model from {Config.evaluate}")
        checkpoint = torch.load(Config.evaluate,
                                map_location=torch.device("cpu"))
        net.load_state_dict(checkpoint["model_state_dict"])
        prec1, prec5 = validate(val_loader, net)
        logger.info(
            f"epoch {checkpoint['epoch']:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )
        return

    start_epoch = 1
    # resume training
    if os.path.exists(Config.resume):
        logger.info(f"start resuming model from {Config.resume}")
        checkpoint = torch.load(Config.resume,
                                map_location=torch.device("cpu"))
        start_epoch += checkpoint["epoch"]
        net.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        logger.info(
            f"finish resuming model from {Config.resume}, epoch {checkpoint['epoch']}, "
            f"loss: {checkpoint['loss']:.3f}, lr: {checkpoint['lr']:.6f}, "
            f"top1_acc: {checkpoint['acc']}%")

    if not os.path.exists(Config.checkpoints):
        os.makedirs(Config.checkpoints)

    logger.info("start training")
    best_acc = 0.
    for epoch in range(start_epoch, Config.epochs + 1):
        prec1, prec5, loss = train(train_loader, net, criterion, optimizer,
                                   scheduler, epoch, logger)
        logger.info(
            f"train: epoch {epoch:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )

        prec1, prec5 = validate(val_loader, net)
        logger.info(
            f"val: epoch {epoch:0>3d}, top1 acc: {prec1:.2f}%, top5 acc: {prec5:.2f}%"
        )

        # remember best prec@1 and save checkpoint
        torch.save(
            {
                "epoch": epoch,
                "acc": prec1,
                "loss": loss,
                "lr": scheduler.get_lr()[0],
                "model_state_dict": net.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
            }, os.path.join(Config.checkpoints, "latest.pth"))
        if prec1 > best_acc:
            shutil.copyfile(os.path.join(Config.checkpoints, "latest.pth"),
                            os.path.join(Config.checkpoints, "best.pth"))
            best_acc = prec1

    training_time = (time.time() - start_time) / 3600
    logger.info(
        f"finish training, best acc: {best_acc:.2f}%, total training time: {training_time:.2f} hours"
    )
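`train` and `validate` above are project helpers whose bodies are not shown. For reference, a minimal sketch of a `validate` that returns top-1/top-5 accuracy in percent, matching how the results are logged; it assumes the network returns plain logits, which is a simplification of the distillation models used here:

@torch.no_grad()
def validate(val_loader, net):
    # Compute top-1/top-5 accuracy (%) over the validation set.
    net.eval()
    top1 = top5 = total = 0
    for images, targets in val_loader:
        images, targets = images.cuda(), targets.cuda()
        _, pred = net(images).topk(5, dim=1)     # (N, 5) predicted classes
        correct = pred.eq(targets.unsqueeze(1))  # (N, 5) hit mask
        top1 += correct[:, :1].sum().item()
        top5 += correct.sum().item()
        total += targets.size(0)
    return 100.0 * top1 / total, 100.0 * top5 / total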
Example #4
# Imports needed by this example. setup_seed, Config, losses,
# ChannelDistillWRN1628, DiscriminatorStudentTeacher, get_logger,
# train_baseline, train and validate are project-local names whose
# import paths are not shown in the original.
import os
import shutil
import time

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import CIFAR100


def main():
    if not torch.cuda.is_available():
        raise Exception("need gpu to train network!")

    setup_seed(2020)

    logger = get_logger(__name__, Config.log)

    Config.gpus = torch.cuda.device_count()
    logger.info("use {} gpus".format(Config.gpus))
    config = {  # reading __dict__ on the class itself yields all class attributes
        key: value
        for key, value in Config.__dict__.items() if not key.startswith("__")
    }
    logger.info(f"args: {config}")

    start_time = time.time()

    # dataset and dataloader
    logger.info("start loading data")

    train_transform = transforms.Compose([
        transforms.Pad(4, padding_mode='reflect'),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32),
        transforms.ToTensor(),
        transforms.Normalize(
            np.array([125.3, 123.0, 113.9]) / 255.0,
            np.array([63.0, 62.1, 66.7]) / 255.0),
    ])
    train_dataset = CIFAR100(
        Config.train_dataset_path,
        train=True,
        transform=train_transform,
        download=True,
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    val_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            np.array([125.3, 123.0, 113.9]) / 255.0,
            np.array([63.0, 62.1, 66.7]) / 255.0),
    ])
    val_dataset = CIFAR100(
        Config.val_dataset_path,
        train=False,
        transform=val_transform,
        download=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=Config.batch_size,
        num_workers=Config.num_workers,
        pin_memory=True,
    )
    logger.info("finish loading data")

    if Config.baseline:
        net = ChannelDistillWRN1628(
            Config.num_classes)  # returns (student, pretrained teacher) networks
        net = nn.DataParallel(net).cuda(
        )  # e.g. ChannelDistillResNet50152( (student): ResNet() (teacher): ResNet())

        optimizer_s = torch.optim.SGD(net.module.student.parameters(),
                                      lr=Config.lr_logit,
                                      momentum=0.9,
                                      weight_decay=1e-4)
        optimizer_t = torch.optim.SGD(net.module.teacher.parameters(),
                                      lr=Config.lr_logit,
                                      momentum=0.9,
                                      weight_decay=1e-4)

        scheduler_s = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_s, milestones=[150, 225], gamma=0.1)
        scheduler_t = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_t, milestones=[150, 225], gamma=0.1)

        optimizer = [optimizer_s, optimizer_t]
        scheduler = [scheduler_s, scheduler_t]

        # loss and optimizer
        criterion = losses.__dict__["CELoss"]().cuda()

        start_epoch = 1
        # resume training
        if os.path.exists(Config.resume):
            pass

        if not os.path.exists(Config.checkpoints):
            os.makedirs(Config.checkpoints)

        logger.info('start training')
        best_stu_acc = 0.
        best_tea_acc = 0.
        for epoch in range(start_epoch, Config.epochs + 1):
            logger.info(f"train:\n")
            prec1_s, prec1_t, prec5_s, prec5_t, loss_s, loss_t = train_baseline(
                train_loader, net, criterion, optimizer, scheduler, epoch,
                logger)
            logger.info(
                f"Student ---> train: epoch {epoch:0>3d}, top1 acc: {prec1_s:.2f}%, top5 acc: {prec5_s:.2f}%\n"
            )
            logger.info(
                f"Teacher ---> train: epoch {epoch:0>3d}, top1 acc: {prec1_t:.2f}%, top5 acc: {prec5_t:.2f}%\n"
            )

            logger.info(f"val:\n")
            prec1_s, prec5_s, prec1_t, prec5_t = validate(val_loader, net)
            logger.info(
                f"Student ---> val: epoch {epoch:0>3d}, top1 acc: {prec1_s:.2f}%, top5 acc: {prec5_s:.2f}%\n"
            )
            logger.info(
                f"Teacher ---> val: epoch {epoch:0>3d}, top1 acc: {prec1_t:.2f}%, top5 acc: {prec5_t:.2f}%\n"
            )

            # remember best prec@1 and save checkpoint
            torch.save(
                {
                    "epoch": epoch,
                    "acc": prec1_s,
                    "loss": loss_s,
                    "lr": scheduler[0].get_lr()[0],
                    "model_state_dict": net.state_dict(),
                    "optimizer_state_dict": optimizer[0].state_dict(),
                    "scheduler_state_dict": scheduler[0].state_dict(),
                }, os.path.join(Config.checkpoints, "stu_base_latest.pth"))
            if prec1_s > best_stu_acc:
                shutil.copyfile(
                    os.path.join(Config.checkpoints, "stu_base_latest.pth"),
                    os.path.join(Config.checkpoints, "stu_base_best.pth"))
                best_stu_acc = prec1_s

            torch.save(
                {
                    "epoch": epoch,
                    "acc": prec1_t,
                    "loss": loss_t,
                    "lr": scheduler[1].get_lr()[0],
                    "model_state_dict": net.state_dict(),
                    "optimizer_state_dict": optimizer[1].state_dict(),
                    "scheduler_state_dict": scheduler[1].state_dict(),
                }, os.path.join(Config.checkpoints, "tea_base_latest.pth"))
            if prec1_t > best_tea_acc:
                shutil.copyfile(
                    os.path.join(Config.checkpoints, "tea_base_latest.pth"),
                    os.path.join(Config.checkpoints, "tea_base_best.pth"))
                best_tea_acc = prec1_t

        training_time = (time.time() - start_time) / 3600
        logger.info(f"finish training\n")
        logger.info(
            f"Stu -> best acc: {best_stu_acc:.2f}%, Tea -> best acc: {best_tea_acc:.2f}%, total training time: {training_time:.2f} hours"
        )
    else:
        # network
        net = ChannelDistillWRN1628(
            Config.num_classes)  # returns (student, pretrained teacher) networks
        # net = ChannelDistillResNet50152(Config.num_classes, Config.dataset_type)  # same (student, teacher) pairing
        net = nn.DataParallel(net).cuda(
        )  # e.g. ChannelDistillResNet50152( (student): ResNet() (teacher): ResNet())

        discriminator = DiscriminatorStudentTeacher(
            128, Config.model_type).cuda()  # the WRN's final feature map has 128 channels
        # discriminator = DiscriminatorStudentTeacher(2048, Config.model_type).cuda()

        # loss and optimizer
        criterion = [
            losses.__dict__["CELoss"]().cuda(),
            losses.__dict__["KDLoss"](Config.T).cuda(),
            torch.nn.MSELoss().cuda()
        ]

        # optimizers for the student and teacher feature extractors
        optimizer_logit = [
            torch.optim.SGD(net.module.student.parameters(),
                            lr=Config.lr_logit,
                            momentum=0.9,
                            weight_decay=1e-4),
            torch.optim.SGD(net.module.teacher.parameters(),
                            lr=Config.lr_logit,
                            momentum=0.9,
                            weight_decay=1e-4)
        ]  # g1, g2
        scheduler_logit = [
            torch.optim.lr_scheduler.MultiStepLR(optimizer_logit[0],
                                                 milestones=[150, 225],
                                                 gamma=0.1),
            torch.optim.lr_scheduler.MultiStepLR(optimizer_logit[1],
                                                 milestones=[150, 225],
                                                 gamma=0.1)
        ]

        # optimizers for the student/teacher feature extractors and their discriminators D1 and D2
        optimizer_g1_fmap = torch.optim.Adam(net.module.student.parameters(),
                                             lr=Config.lr_fmap,
                                             weight_decay=1e-1)
        optimizer_d1_fmap = torch.optim.Adam(
            discriminator.discri_s.parameters(),
            lr=Config.lr_fmap,
            weight_decay=1e-1)
        scheduler_g1_fmap = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_g1_fmap, milestones=[75, 150], gamma=0.1)
        scheduler_d1_fmap = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_d1_fmap, milestones=[75, 150], gamma=0.1)

        optimizer_s_fmap = [optimizer_g1_fmap, optimizer_d1_fmap]  # g1, d1
        scheduler_s_fmap = [scheduler_g1_fmap, scheduler_d1_fmap]

        optimizer_g2_fmap = torch.optim.Adam(net.module.teacher.parameters(),
                                             lr=Config.lr_fmap,
                                             weight_decay=1e-1)
        optimizer_d2_fmap = torch.optim.Adam(
            discriminator.discri_t.parameters(),
            lr=Config.lr_fmap,
            weight_decay=1e-1)
        scheduler_g2_fmap = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_g2_fmap, milestones=[75, 150], gamma=0.1)
        scheduler_d2_fmap = torch.optim.lr_scheduler.MultiStepLR(
            optimizer_d2_fmap, milestones=[75, 150], gamma=0.1)

        optimizer_t_fmap = [optimizer_g2_fmap, optimizer_d2_fmap]  # g2, d2
        scheduler_t_fmap = [scheduler_g2_fmap, scheduler_d2_fmap]

        # only evaluate
        if Config.evaluate:
            pass

        start_epoch = 1
        # resume training
        if os.path.exists(Config.resume):
            pass

        if not os.path.exists(Config.checkpoints):
            os.makedirs(Config.checkpoints)

        logger.info('start training')
        best_stu_acc = 0.
        best_tea_acc = 0.
        for epoch in range(start_epoch, Config.epochs + 1):
            logger.info(f"train:\n")
            prec1_s, prec1_t, prec5_s, prec5_t, loss_s, loss_t = train(
                train_loader, net, discriminator, criterion, optimizer_logit,
                scheduler_logit, optimizer_s_fmap, scheduler_s_fmap,
                optimizer_t_fmap, scheduler_t_fmap, epoch, logger)
            logger.info(
                f"Student ---> train: epoch {epoch:0>3d}, top1 acc: {prec1_s:.2f}%, top5 acc: {prec5_s:.2f}%\n"
            )
            logger.info(
                f"Teacher ---> train: epoch {epoch:0>3d}, top1 acc: {prec1_t:.2f}%, top5 acc: {prec5_t:.2f}%\n"
            )

            logger.info(f"val:\n")
            prec1_s, prec5_s, prec1_t, prec5_t = validate(val_loader, net)
            logger.info(
                f"Student ---> val: epoch {epoch:0>3d}, top1 acc: {prec1_s:.2f}%, top5 acc: {prec5_s:.2f}%\n"
            )
            logger.info(
                f"Teacher ---> val: epoch {epoch:0>3d}, top1 acc: {prec1_t:.2f}%, top5 acc: {prec5_t:.2f}%\n"
            )

            # remember best prec@1 and save checkpoint
            torch.save(
                {
                    "epoch": epoch,
                    "acc": prec1_s,
                    "loss": loss_s,
                    "lr_logit": scheduler_logit[0].get_lr()[0],
                    "lr_g": scheduler_s_fmap[0].get_lr()[0],
                    "lr_d": scheduler_s_fmap[1].get_lr()[0],
                    "model_state_dict": net.state_dict(),
                    "optimizer_logit_state_dict":
                    optimizer_logit[0].state_dict(),
                    "optimizer_fmap_g_state_dict":
                    optimizer_s_fmap[0].state_dict(),
                    "optimizer_fmap_d_state_dict":
                    optimizer_s_fmap[1].state_dict(),
                    "scheduler_logit_state_dict":
                    scheduler_logit[0].state_dict(),
                    "scheduler_g_state_dict": scheduler_s_fmap[0].state_dict(),
                    "scheduler_d_state_dict": scheduler_s_fmap[1].state_dict(),
                }, os.path.join(Config.checkpoints, "stu_latest.pth"))
            if prec1_s > best_stu_acc:
                shutil.copyfile(
                    os.path.join(Config.checkpoints, "stu_latest.pth"),
                    os.path.join(Config.checkpoints, "stu_best.pth"))
                best_stu_acc = prec1_s

            torch.save(
                {
                    "epoch": epoch,
                    "acc": prec1_t,
                    "loss": loss_t,
                    "lr_logit": scheduler_logit[1].get_lr()[0],
                    "lr_g": scheduler_t_fmap[0].get_lr()[0],
                    "lr_d": scheduler_t_fmap[1].get_lr()[0],
                    "model_state_dict": net.state_dict(),
                    "optimizer_logit_state_dict":
                    optimizer_logit[1].state_dict(),
                    "optimizer_fmap_g_state_dict":
                    optimizer_t_fmap[0].state_dict(),
                    "optimizer_fmap_d_state_dict":
                    optimizer_t_fmap[1].state_dict(),
                    "scheduler_logit_state_dict":
                    scheduler_logit[1].state_dict(),
                    "scheduler_g_state_dict": scheduler_t_fmap[0].state_dict(),
                    "scheduler_d_state_dict": scheduler_t_fmap[1].state_dict(),
                }, os.path.join(Config.checkpoints, "tea_latest.pth"))
            if prec1_t > best_tea_acc:
                shutil.copyfile(
                    os.path.join(Config.checkpoints, "tea_latest.pth"),
                    os.path.join(Config.checkpoints, "tea_best.pth"))
                best_tea_acc = prec1_t

        training_time = (time.time() - start_time) / 3600
        logger.info(f"finish training\n")
        logger.info(
            f"Stu -> best acc: {best_stu_acc:.2f}%, Tea -> best acc: {best_tea_acc:.2f}%, total training time: {training_time:.2f} hours"
        )
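This example replaces Example #3's manual seeding block with a single `setup_seed(2020)` call. A minimal sketch of such a helper, assuming it follows the usual PyTorch recipe and mirrors the cudnn settings from Example #3 (not the project's verified implementation):

import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn

def setup_seed(seed):
    # Seed every RNG the training loop touches.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn.benchmark = True
    cudnn.enabled = True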
Example #5
#!/usr/bin/env python
# coding=utf-8
import time
import re
from utils import base62
from utils import pageutil
from utils import logutil

logger = logutil.get_logger()


# Parse a user's follow page
def follow_page_parse(user_data_dict, num):
    follow_url_list = []
    # Build the follow-page URL
    download_url = 'http://weibo.cn/' + user_data_dict['user_id'] + '/follow?page=' + str(num)
    logger.info("Processing follow URL:" + download_url)
    soup = pageutil.get_soup_from_page(download_url)
    follow_block = soup.find_all('td', attrs={'valign': 'top'})
    # Each followed user spans two cells; walk the cells pairwise
    # (integer division keeps this correct under Python 3)
    for i in range(0, len(follow_block) // 2):
        follow = follow_block[i * 2 + 1].find_all('a')[0]
        follow_url_list.append(follow.get('href'))
    return follow_url_list
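

# `pageutil.get_soup_from_page` is a project helper whose body is not shown
# here. A minimal self-contained sketch, assuming it wraps requests and
# BeautifulSoup (hypothetical, not the original implementation):
import requests
from bs4 import BeautifulSoup

def get_soup_from_page_sketch(url):
    # Fetch the page and parse the HTML into a BeautifulSoup tree.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')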


# Parse a user's fans page
def fans_page_parse(user_data_dict, num):
    fans_url_list = []
    # Build the fans-page URL
    download_url = 'http://weibo.cn/' + user_data_dict['user_id'] + '/fans?page=' + str(num)