Example #1
def test_assemblyvalidation():
    bamasm = get_testfile('cm-500pgun-asm-b2mv31-bam')
    contigfa = get_testfile('cm-500pgun-asm-b2mv31-fa')
    bamref = get_testfile('cm-500pgun-ref-bam')
    refstatsfile = get_testfile('cm-500pgun-ref-stats')
    refphylfile = get_testfile('cm-ref-phyl')
    nucmercoords = get_testfile('cm-500pgun-val-nucmer')

    val = AssemblyValidation(bamref, bamasm, refphylfile, refstatsfile, contigfa, nucmercoords)
    assert(len(val.contigs) == int(get_shell_output("grep -c '^>' " + contigfa)[0]))

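    # Create the masm output directory and write the contig-purity, assembly-stats and genome-coverage reports into it.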
    make_dir(get_outdir() + "masm")
    val.write_contig_purity(get_outdir() + "masm" + "/contig-purity.tsv")
    val.write_general_stats(get_outdir() + "masm" + "/asm-stats.tsv")
    val.write_genome_contig_cov(get_outdir() + "masm" + "/genome-contig-coverage.tsv")
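
Every example on this page calls a get_outdir helper, and its exact behavior varies from project to project: in Example #1 it takes no arguments and returns a preconfigured base path, while the training and prediction scripts below pass several path components and expect the resulting directory to exist afterwards. The following is only a minimal, hypothetical sketch of such a helper (not taken from any of the projects shown), assuming it joins its arguments and creates the directory if it is missing:

import os


def get_outdir(*paths):
    """Join the given path components and make sure the directory exists."""
    outdir = os.path.join(*paths) if paths else '.'
    # exist_ok avoids failing when the directory is already present
    os.makedirs(outdir, exist_ok=True)
    return outdir

Real implementations may add behavior on top of this, such as timestamping or de-duplicating an existing run directory.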
Example #2
    def read(self):
        print_ = get_print(self.customWidget)
        for try_ in range(8):
            self.customWidget.print_('get_session')
            try:
                session = get_session()
                html = downloader.read_html(self.url, session=session)
                soup = Soup(html)
                get_title_artist(soup)
                break
            except Exception as e:
                print(e)

        else:
            raise

        title, self.artist = get_title_artist(soup)
        self.__title = title
        title_dir = clean_title((u'[{}] {}').format(self.artist, title))
        ex = soup.find('div', id='novel_ex')
        self.novel_ex = ex.text.strip() if ex else None
        texts = []
        subtitles = soup.findAll('dd', class_='subtitle')
        if subtitles:
            for subtitle in subtitles:
                update = subtitle.parent.find('dt', class_='long_update')
                update2 = None
                if update:
                    for span in update.findAll('span'):
                        update2 = span.attrs['title']
                        span.decompose()

                    update = update.text.strip()
                if update2:
                    update += (u'  ({})').format(update2)
                a = subtitle.find('a')
                subtitle = a.text.strip()
                href = urljoin(self.url, a.attrs['href'])
                if not re.search(('ncode.syosetu.com/{}/[0-9]+').format(self.id_), href):
                    print_((u'skip: {}').format(href))
                    continue
                text = Text(subtitle, update, href, session, False)
                texts.append(text)

        else:
            self.single = True
            text = Text(title_dir, None, self.url, session, True)
            texts.append(text)
        self.print_((u'single: {}').format(self.single))
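        # Files already present in the syosetu output directory are reused; everything else is queued by URL.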
        outdir = get_outdir('syosetu')
        for text in texts:
            if self.single:
                file = os.path.join(outdir, text.filename)
            else:
                file = os.path.join(outdir, title_dir, text.filename)
            if os.path.isfile(file):
                self.urls.append(file)
            else:
                self.urls.append(text.url)

        self.title = title_dir
Example #3
def main():
    args = parser.parse_args()

    num_classes = len(get_labels())
    test_time_pool = 0  #5 if 'dpn' in args.model else 0

    model = model_factory.create_model(args.model,
                                       in_chs=1,
                                       num_classes=num_classes,
                                       global_pool=args.gp,
                                       test_time_pool=test_time_pool)
    #model.reset_classifier(num_classes=num_classes)

    if args.num_gpu > 1:
        model = torch.nn.DataParallel(model,
                                      device_ids=list(range(
                                          args.num_gpu))).cuda()
    else:
        model.cuda()

    if not os.path.exists(args.checkpoint):
        print("=> no checkpoint found at '{}'".format(args.checkpoint))
        exit(1)
    print("=> loading checkpoint '{}'".format(args.checkpoint))
    checkpoint = torch.load(args.checkpoint)
    if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.checkpoint, checkpoint['epoch']))
    else:
        model.load_state_dict(checkpoint)

    csplit = os.path.normpath(args.checkpoint).split(sep=os.path.sep)
    if len(csplit) > 1:
        exp_name = csplit[-2] + '-' + csplit[-1].split('.')[0]
    else:
        exp_name = ''

    if args.output:
        output_base = args.output
    else:
        output_base = './output'

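    # Prediction CSVs (results.csv, submission.csv) are written under <output_base>/predictions/<exp_name>.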
    output_dir = get_outdir(output_base, 'predictions', exp_name)

    dataset = CommandsDataset(root=args.data,
                              mode='test',
                              format='spectrogram')

    loader = data.DataLoader(dataset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             shuffle=False,
                             num_workers=args.workers)

    model.eval()
    batch_time_m = AverageMeter()
    data_time_m = AverageMeter()
    try:
        # open CSV for writing predictions
        cf = open(os.path.join(output_dir, 'results.csv'), mode='w')
        res_writer = csv.writer(cf)
        res_writer.writerow(['fname'] + dataset.id_to_label)

        # open CSV for writing submission
        sf = open(os.path.join(output_dir, 'submission.csv'), mode='w')
        sub_writer = csv.writer(sf)
        sub_writer.writerow(['fname', 'label', 'prob'])

        end = time.time()
        batch_sample_idx = 0
        for batch_idx, (input, target) in enumerate(loader):
            data_time_m.update(time.time() - end)
            input_var = autograd.Variable(input.cuda(), volatile=True)
            output = model(input_var)

            # augmentation reduction
            #reduce_factor = loader.dataset.get_aug_factor()
            #if reduce_factor > 1:
            #    output.data = output.data.unfold(0, reduce_factor, reduce_factor).mean(dim=2).squeeze(dim=2)
            #    index = index[0:index.size(0):reduce_factor]

            # move data to CPU and collect
            output_logprob = F.log_softmax(output, dim=1).data.cpu().numpy()
            output = F.softmax(output, dim=1)
            output_prob, output_idx = output.max(1)
            output_prob = output_prob.data.cpu().numpy()
            output_idx = output_idx.data.cpu().numpy()
            for i in range(output_logprob.shape[0]):
                index = batch_sample_idx + i
                pred_label = dataset.id_to_label[output_idx[i]]
                pred_prob = output_prob[i]
                filename = dataset.filename(index)
                res_writer.writerow([filename] + list(output_logprob[i]))
                sub_writer.writerow([filename] + [pred_label, pred_prob])

            batch_sample_idx += input_var.size(0)
            batch_time_m.update(time.time() - end)
            if batch_idx % args.print_freq == 0:
                print('Inference: [{}/{} ({:.0f}%)]  '
                      'Time: {batch_time.val:.3f}s, {rate:.3f}/s  '
                      '({batch_time.avg:.3f}s, {rate_avg:.3f}/s)  '
                      'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
                          batch_sample_idx,
                          len(loader.sampler),
                          100. * batch_idx / len(loader),
                          batch_time=batch_time_m,
                          rate=input_var.size(0) / batch_time_m.val,
                          rate_avg=input_var.size(0) / batch_time_m.avg,
                          data_time=data_time_m))

            end = time.time()
            # end iterating through dataset

    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(str(e))
Example #4
    def read(self):
        type = self.pixiv_type
        cw = self.customWidget
        print_ = cw.print_
        ui_setting = self.ui_setting

        if type == 'following':
            raise NotImplementedError('following')

        self._format = [None, 'gif', 'webp',
                        'png'][ui_setting.ugoira_convert.currentIndex()]
        self._format_name = compatstr(ui_setting.pixivFormat.currentText())
        types = [t.lower() for t in query_url(self.url).get('type', [])]
        if types:
            s = (u', ').join(sorted(types))
            types = set(types)
        else:
            s = 'all'
            types = None
        print_((u'Type: {}').format(s))
        print_((u'info: {}').format(self.info))
        api = self.api
        query = self.id.replace('_bmk', '').replace('_illust', '').replace(
            'pixiv_', '').replace('search_', '')
        if type != 'search':
            query = int(query)
        print('pixiv_query:', query)
        try:
            if type in ('user', 'bookmark', 'search'):
                max_pid = get_max_range(cw, 2000)
                if ui_setting.groupBox_tag.isChecked():
                    tags = [
                        compatstr(ui_setting.tagList.item(i).text())
                        for i in range(ui_setting.tagList.count())
                    ]
                else:
                    tags = []
                if type == 'search':
                    query = query.replace('+', ' ')
                    name = query
                else:
                    id = self.id.replace('_bmk', '').replace('pixiv_',
                                                             '').replace(
                                                                 'search_', '')
                    print('name', id)
                    name = get_name(id, self.api, cw=cw)
                    cw.artist = name
                title = u'{} ({})'.format(name, self.id)
                print_(title)
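                # Images are saved under <outdir>/pixiv/<cleaned title>.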
                dir = os.path.join(get_outdir('pixiv'), clean_title(title))
                imgs = get_imgs(query,
                                type=type,
                                api=api,
                                n=max_pid,
                                tags=tags,
                                types=types,
                                format=self._format,
                                format_name=self._format_name,
                                dir=dir,
                                cw=cw,
                                title=title,
                                info=self.info)
            elif type == 'illust':
                for try_ in range(N_TRY):
                    try:
                        detail = api.illust_detail(query, req_auth=True)
                        error = detail.get('error')
                        if error:
                            raise PixivError(error)
                        break
                    except PixivError as e:
                        api = e.api
                        print_(e)
                        if try_ < N_TRY - 1:
                            print_('retry...')
                        sleep(SLEEP)
                else:
                    raise

                illust = detail.illust
                name = illust.title
                title = (u'{} ({})').format(name, self.id)
                dir = os.path.join(get_outdir('pixiv'), clean_title(title))
                imgs = get_imgs_from_illust(illust,
                                            api=api,
                                            format=self._format,
                                            dir=dir,
                                            cw=cw,
                                            format_name=self._format_name)
        except PixivError as e:
            msg = (u'PixivError: {}').format(e.message)
            return self.Invalid(msg)

        self.imgs = imgs
        for img in imgs:
            self.urls.append(img.url)
            self.filenames[img.url] = img.filename

        self.title = clean_title(title)  # 1390
Example #5
def main():
    args = parser.parse_args()

    if args.output:
        output_base = args.output
    else:
        output_base = './output'
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"), args.model, args.gp,
        'f' + str(args.fold)
    ])
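    # Checkpoints and summary.csv are written under <output_base>/train/<timestamped exp_name>.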
    output_dir = get_outdir(output_base, 'train', exp_name)

    train_input_root = os.path.join(args.data)
    batch_size = args.p * args.k
    num_epochs = args.epochs
    wav_size = (16000, )
    num_classes = 128  # triplet embedding size

    torch.manual_seed(args.seed)

    model = model_factory.create_model(args.model,
                                       in_chs=1,
                                       pretrained=args.pretrained,
                                       num_classes=num_classes,
                                       drop_rate=args.drop,
                                       global_pool=args.gp,
                                       embedding_net=True,
                                       embedding_norm=2.,
                                       embedding_act_fn=torch.sigmoid,
                                       checkpoint_path=args.initial_checkpoint)

    dataset_train = dataset.CommandsDataset(
        root=train_input_root,
        mode='train',
        fold=args.fold,
        wav_size=wav_size,
        format='spectrogram',
        train_unknown=False,
    )

    loader_train = data.DataLoader(dataset_train,
                                   batch_size=batch_size,
                                   pin_memory=True,
                                   sampler=dataset.PKSampler(dataset_train,
                                                             p=args.p,
                                                             k=args.k),
                                   num_workers=args.workers)

    dataset_eval = dataset.CommandsDataset(
        root=train_input_root,
        mode='validate',
        fold=args.fold,
        wav_size=wav_size,
        format='spectrogram',
        train_unknown=False,
    )

    loader_eval = data.DataLoader(dataset_eval,
                                  batch_size=batch_size,
                                  pin_memory=True,
                                  sampler=dataset.PKSampler(dataset_eval,
                                                            p=args.p,
                                                            k=args.k),
                                  num_workers=args.workers)

    train_loss_fn = validate_loss_fn = TripletLoss(margin=0.5, sample=True)
    train_loss_fn = train_loss_fn.cuda()
    validate_loss_fn = validate_loss_fn.cuda()

    opt_params = list(model.parameters())
    if args.opt.lower() == 'sgd':
        optimizer = optim.SGD(opt_params,
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay,
                              nesterov=True)
    elif args.opt.lower() == 'adam':
        optimizer = optim.Adam(opt_params,
                               lr=args.lr,
                               weight_decay=args.weight_decay,
                               eps=args.opt_eps)
    elif args.opt.lower() == 'nadam':
        optimizer = nadam.Nadam(opt_params,
                                lr=args.lr,
                                weight_decay=args.weight_decay,
                                eps=args.opt_eps)
    elif args.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(opt_params,
                                   lr=args.lr,
                                   weight_decay=args.weight_decay,
                                   eps=args.opt_eps)
    elif args.opt.lower() == 'rmsprop':
        optimizer = optim.RMSprop(opt_params,
                                  lr=args.lr,
                                  alpha=0.9,
                                  eps=args.opt_eps,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    else:
        assert False, "Invalid optimizer"
    del opt_params

    if not args.decay_epochs:
        print('No decay epoch set, using plateau scheduler.')
        lr_scheduler = ReduceLROnPlateau(optimizer, patience=10)
    else:
        lr_scheduler = None

    # optionally resume from a checkpoint
    start_epoch = 0 if args.start_epoch is None else args.start_epoch
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
                if 'args' in checkpoint:
                    print(checkpoint['args'])
                new_state_dict = OrderedDict()
                for k, v in checkpoint['state_dict'].items():
                    if k.startswith('module'):
                        name = k[7:]  # remove `module.`
                    else:
                        name = k
                    new_state_dict[name] = v
                model.load_state_dict(new_state_dict)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                if 'loss' in checkpoint:
                    train_loss_fn.load_state_dict(checkpoint['loss'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
                start_epoch = checkpoint[
                    'epoch'] if args.start_epoch is None else args.start_epoch
            else:
                model.load_state_dict(checkpoint)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            exit(1)

    saver = CheckpointSaver(checkpoint_dir=output_dir)

    if args.num_gpu > 1:
        model = torch.nn.DataParallel(model,
                                      device_ids=list(range(
                                          args.num_gpu))).cuda()
    else:
        model.cuda()

    best_loss = None
    try:
        for epoch in range(start_epoch, num_epochs):
            if args.decay_epochs:
                adjust_learning_rate(optimizer,
                                     epoch,
                                     initial_lr=args.lr,
                                     decay_rate=args.decay_rate,
                                     decay_epochs=args.decay_epochs)

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        train_loss_fn,
                                        args,
                                        saver=saver,
                                        output_dir=output_dir)

            # save a recovery in case validation blows up
            saver.save_recovery(
                {
                    'epoch': epoch + 1,
                    'arch': args.model,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'loss': train_loss_fn.state_dict(),
                    'args': args,
                    'gp': args.gp,
                },
                epoch=epoch + 1,
                batch_idx=0)

            step = epoch * len(loader_train)
            eval_metrics = validate(step,
                                    model,
                                    loader_eval,
                                    validate_loss_fn,
                                    args,
                                    output_dir=output_dir)

            if lr_scheduler is not None:
                lr_scheduler.step(eval_metrics['eval_loss'])

            rowd = OrderedDict(epoch=epoch)
            rowd.update(train_metrics)
            rowd.update(eval_metrics)
            with open(os.path.join(output_dir, 'summary.csv'), mode='a') as cf:
                dw = csv.DictWriter(cf, fieldnames=rowd.keys())
                if best_loss is None:  # first iteration (epoch == 1 can't be used)
                    dw.writeheader()
                dw.writerow(rowd)

            # save proper checkpoint with eval metric
            best_loss = saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.model,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'args': args,
                    'gp': args.gp,
                },
                epoch=epoch + 1,
                metric=eval_metrics['eval_loss'])

    except KeyboardInterrupt:
        pass
    if best_loss is not None:
        print('*** Best loss: {0} (epoch {1})'.format(best_loss[1],
                                                      best_loss[0]))
Example #6
def get_imgs(username,
             session,
             title,
             types,
             n=0,
             format='[%y-%m-%d] id_ppage',
             cw=None):
    print_ = get_print(cw)

    # Range
    n = max(n, get_max_range(cw))

    # 2303
    ids = set()
    names = dict()
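    # Scan the existing twitter output directory so already-downloaded tweet ids and filenames can be reused.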
    dir_ = os.path.join(get_outdir('twitter'), title)
    if os.path.isdir(dir_) and cw:
        for name in cw.names_old:
            name = os.path.basename(name)
            id_ = re.find('([0-9]+)_p', name)
            if id_ is None:
                continue
            if get_ext(name).lower() == '.mp4':
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            id_ = int(id_)
            ids.add(id_)
            if id_ in names:
                names[id_].append(name)
            else:
                names[id_] = [name]
    max_id = max(ids) if ids else 0

    # 2303
    imgs_old = []
    for id_ in sorted(ids, reverse=True):
        for p, file in enumerate(
                sorted(os.path.join(dir_, name) for name in names[id_])):
            img = Image(file, '', id_, 0, p, format, cw, False)
            img.url = LazyUrl_twitter(None, lambda _, file=file: file, img)  # bind the current file (avoid late binding in the loop)
            img.filename = os.path.basename(file)
            imgs_old.append(img)

    imgs_new = []
    enough = False
    for tweet in TwitterAPI(session, cw).timeline_media(username):
        id_ = int(tweet['id_str'])
        if id_ < max_id:
            print_('enough')
            enough = True
            break

        imgs_ = get_imgs_from_tweet(tweet, session, types, format, cw)

        if id_ in ids:
            print_('duplicate: {}'.format(id_))
            continue
        ids.add(id_)

        imgs_new += imgs_

        if len(imgs_old) + len(imgs_new) >= n:
            break

        msg = '{}  {} - {}'.format(tr_('읽는 중...'), title, len(imgs_new))
        if cw:
            if not cw.alive:
                break
            cw.setTitle(msg)
        else:
            print(msg)

    if not enough and not imgs_new:
        raise Exception('no imgs')

    imgs = sorted(imgs_old + imgs_new, key=lambda img: img.id, reverse=True)

    if len(imgs) < n:
        imgs = get_imgs_more(username,
                             session,
                             title,
                             types,
                             n,
                             format,
                             cw,
                             imgs=imgs)

    return imgs
Example #7
def main():
    config = DefaultConfigs()
    train_input_root = os.path.join(config.data)
    train_labels_file = 'labels.csv'

    if not os.path.exists(config.output):
        os.makedirs(config.output)
    output_base = config.output

    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"), config.model,
        str(config.img_size), 'f' + str(config.fold)
    ])
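    # When resuming, reuse an existing train output directory matching this model/img_size/fold; otherwise create a new timestamped one.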
    mask_exp_name = '-'.join(
        [config.model,
         str(config.img_size), 'f' + str(config.fold)])
    mask_exp_name = glob.glob(
        os.path.join(output_base, 'train', '*' + mask_exp_name))
    if config.resume and mask_exp_name:
        output_dir = mask_exp_name[0]  # glob.glob() returns a list
    else:
        output_dir = get_outdir(output_base, 'train', exp_name)

    batch_size = config.batch_size
    test_batch_size = config.test_batch_size
    num_epochs = config.epochs
    img_type = config.image_type
    img_size = (config.img_size, config.img_size)
    num_classes = get_tags_size(config.labels)

    torch.manual_seed(config.seed)

    dataset_train = HumanDataset(
        train_input_root,
        train_labels_file,
        train=True,
        multi_label=config.multi_label,
        img_type=img_type,
        img_size=img_size,
        fold=config.fold,
    )

    #sampler = WeightedRandomOverSampler(dataset_train.get_sample_weights())

    loader_train = data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        shuffle=True,
        #sampler=sampler,
        num_workers=config.num_processes)

    dataset_eval = HumanDataset(
        train_input_root,
        train_labels_file,
        train=False,
        multi_label=config.multi_label,
        img_type=img_type,
        img_size=img_size,
        test_aug=config.tta,
        fold=config.fold,
    )

    loader_eval = data.DataLoader(dataset_eval,
                                  batch_size=test_batch_size,
                                  shuffle=False,
                                  num_workers=config.num_processes)

    #    model = model_factory.create_model(
    #        config.model,
    #        pretrained=True,
    #        num_classes=num_classes,
    #        drop_rate=config.drop,
    #        global_pool=config.gp)

    model = get_net(config.model, num_classes, config.drop, config.channels)

    if not config.no_cuda:
        if config.num_gpu > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=list(range(
                                              config.num_gpu))).cuda()
        else:
            model.cuda()

    if config.opt.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=config.lr,
                              momentum=config.momentum,
                              weight_decay=config.weight_decay)
    elif config.opt.lower() == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=config.lr,
                               weight_decay=config.weight_decay)
    elif config.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=config.lr,
                                   weight_decay=config.weight_decay)
    elif config.opt.lower() == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=config.lr,
                                  alpha=0.9,
                                  momentum=config.momentum,
                                  weight_decay=config.weight_decay)
    elif config.opt.lower() == 'yellowfin':
        optimizer = YFOptimizer(model.parameters(),
                                lr=config.lr,
                                weight_decay=config.weight_decay,
                                clip_thresh=2)
    else:
        assert False, "Invalid optimizer"

    if not config.decay_epochs:
        lr_scheduler = ReduceLROnPlateau(optimizer, patience=8)
    else:
        lr_scheduler = None

    if config.class_weights:
        class_weights = torch.from_numpy(
            dataset_train.get_class_weights()).float()
        class_weights_norm = class_weights / class_weights.sum()
        if not config.no_cuda:
            class_weights = class_weights.cuda()
            class_weights_norm = class_weights_norm.cuda()
    else:
        class_weights = None
        class_weights_norm = None

    if config.loss.lower() == 'nll':
        #assert not args.multi_label and 'Cannot use crossentropy with multi-label target.'
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    elif config.loss.lower() == 'mlsm':
        assert config.multi_label
        loss_fn = torch.nn.MultiLabelSoftMarginLoss(weight=class_weights)
    else:
        assert False, "Invalid loss function"

    if not config.no_cuda:
        loss_fn = loss_fn.cuda()

    # optionally resume from a checkpoint
    start_epoch = 1
    if config.resume:
        if os.path.isfile(config.resume):
            print("=> loading checkpoint '{}'".format(config.resume))
            checkpoint = torch.load(config.resume)
            config.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                config.resume, checkpoint['epoch']))
            start_epoch = checkpoint['epoch']
        else:
            print("=> no checkpoint found at '{}'".format(config.resume))
            exit(-1)

    use_tensorboard = not config.no_tb and CrayonClient is not None
    if use_tensorboard:
        hostname = '127.0.0.1'
        port = 8889
        host_port = config.tbh.split(':')[:2]
        if len(host_port) == 1:
            hostname = host_port[0]
        elif len(host_port) >= 2:
            hostname, port = host_port[:2]
        try:
            cc = CrayonClient(hostname=hostname, port=port)
            try:
                cc.remove_experiment(exp_name)
            except ValueError:
                pass
            exp = cc.create_experiment(exp_name)
        except Exception as e:
            exp = None
            print(
                "Error (%s) connecting to Tensorboard/Crayon server. Giving up..."
                % str(e))
    else:
        exp = None

    # Optional fine-tune of only the final classifier weights for specified number of epochs (or part of)
    if not config.resume and config.ft_epochs > 0.:
        if config.opt.lower() == 'adam':
            finetune_optimizer = optim.Adam(model.get_fc().parameters(),
                                            lr=config.ft_lr,
                                            weight_decay=config.weight_decay)
        else:
            finetune_optimizer = optim.SGD(model.get_fc().parameters(),
                                           lr=config.ft_lr,
                                           momentum=config.momentum,
                                           weight_decay=config.weight_decay)

        finetune_epochs_int = int(np.ceil(config.ft_epochs))
        finetune_final_batches = int(
            np.ceil((1 - (finetune_epochs_int - config.ft_epochs)) *
                    len(loader_train)))
        print(finetune_epochs_int, finetune_final_batches)
        for fepoch in range(1, finetune_epochs_int + 1):
            if fepoch == finetune_epochs_int and finetune_final_batches:
                batch_limit = finetune_final_batches
            else:
                batch_limit = 0
            train_epoch(fepoch,
                        model,
                        loader_train,
                        finetune_optimizer,
                        loss_fn,
                        config,
                        class_weights_norm,
                        output_dir,
                        batch_limit=batch_limit)
            step = fepoch * len(loader_train)
            score, _ = validate(step, model, loader_eval, loss_fn, config, 0.3,
                                output_dir)

    score_metric = 'f2'
    best_loss = None
    best_f2 = None
    threshold = 0.2
    try:
        for epoch in range(start_epoch, num_epochs + 1):
            if config.decay_epochs:
                adjust_learning_rate(optimizer,
                                     epoch,
                                     initial_lr=config.lr,
                                     decay_epochs=config.decay_epochs)

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        loss_fn,
                                        config,
                                        class_weights_norm,
                                        output_dir,
                                        exp=exp)

            step = epoch * len(loader_train)
            eval_metrics, latest_threshold = validate(step,
                                                      model,
                                                      loader_eval,
                                                      loss_fn,
                                                      config,
                                                      threshold,
                                                      output_dir,
                                                      exp=exp)

            if lr_scheduler is not None:
                lr_scheduler.step(eval_metrics['eval_loss'])

            rowd = OrderedDict(epoch=epoch)
            rowd.update(train_metrics)
            rowd.update(eval_metrics)
            with open(os.path.join(output_dir, 'summary.csv'), mode='a') as cf:
                dw = csv.DictWriter(cf, fieldnames=rowd.keys())
                if best_loss is None:  # first iteration (epoch == 1 can't be used)
                    dw.writeheader()
                dw.writerow(rowd)

            best = False
            if best_loss is None or eval_metrics['eval_loss'] < best_loss[1]:
                best_loss = (epoch, eval_metrics['eval_loss'])
                if score_metric == 'loss':
                    best = True
            if best_f2 is None or eval_metrics['eval_f2'] > best_f2[1]:
                best_f2 = (epoch, eval_metrics['eval_f2'])
                if score_metric == 'f2':
                    best = True

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': config.model,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'threshold': latest_threshold,
                    'config': config
                },
                is_best=best,
                filename=os.path.join(config.checkpoint_path,
                                      'checkpoint-%d.pth.tar' % epoch),
                output_dir=output_dir)

    except KeyboardInterrupt:
        pass
    if best_loss is not None:
        print('*** Best loss: {0} (epoch {1})'.format(best_loss[1], best_loss[0]))
    if best_f2 is not None:
        print('*** Best f2: {0} (epoch {1})'.format(best_f2[1], best_f2[0]))
Example #8
def get_imgs(username, title, cw=None):
    urls = [
        'https://m.facebook.com/{}/photos'.format(username),
        'https://m.facebook.com/profile.php?id={}&sk=photos'.format(
            username),  # no custom URL
    ]

    for url in urls:
        print('get_imgs url:', url)
        try:
            html = read_html(url)
        except Exception:
            continue
        soup = Soup(html)
        if soup.find('a', id='signup-button'):
            raise errors.LoginRequired()

        photo = soup.find('div', class_='_5v64')
        if photo is not None:
            break
    else:
        raise Exception('No photo div')

    cursor = photo.a.attrs['href'].split('/photos/')[1].split('/')[1]
    print('first cursor:', cursor)

    href = re.find(r'(/photos/pandora/\?album_token=.+?)"', html)
    href = urljoin(url, href)
    href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)

    cursors = set([cursor])

    imgs = []

    dups = {}
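    # Index files already downloaded into the facebook output directory by numeric id so they can be skipped later.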
    dir = os.path.join(get_outdir('facebook'), title)
    try:
        filenames = os.listdir(dir)
    except Exception:
        filenames = []
    for filename in filenames:
        name, ext = os.path.splitext(filename)
        if name.isdigit():
            dups[int(name)] = os.path.join(dir, filename)

    pages = set()

    while True:
        print(href)
        html = read_html(href)
        data_raw = html.replace('for (;;);', '')
        data = json.loads(data_raw)
        actions = data['payload']['actions']
        for action in actions:
            if action['target'] == 'm_more_photos':
                break
        else:
            print('No more photos')
            break
        html = action['html']
        soup = Soup(html)
        photos = soup.findAll('div', class_='_5v64')
        for photo in photos:
            for a in photo.findAll('a'):
                page = a.attrs['href']
                page = urljoin(href, page)

                # remove duplicate pages
                if page in pages:
                    continue
                pages.add(page)

                img = Image(page)
                id = img.id
                if id in dups and getsize(dups[id]) > 0:
                    print('skip', id)
                    imgs.append(dups[id])
                else:
                    imgs.append(img)

        s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
        if cw is not None:
            cw.setTitle(s)
            if not cw.alive:
                return []
        else:
            print(s)

        cursor = re.find(PATTERN_CURSOR, data_raw)
        #print(cursor)
        if cursor is None:
            print('no cursor')
            break
        if cursor in cursors:
            print('same cursor')
            break
        cursors.add(cursor)

        href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)

    return imgs
Example #9
def main():
    args = parser.parse_args()

    train_input_root = os.path.join(args.data)
    train_labels_file = './data/labels.csv'
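    # Evaluation outputs are written under ./output/eval/<timestamp>.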
    output_dir = get_outdir('./output', 'eval', datetime.now().strftime("%Y%m%d-%H%M%S"))

    batch_size = args.batch_size
    num_epochs = 1000
    if args.tif:
        img_type = '.tif'
    else:
        img_type = '.jpg'
    img_size = (args.img_size, args.img_size)
    num_classes = get_tags_size(args.labels)
    debug_model = False

    torch.manual_seed(args.seed)

    if args.train:
        dataset_train = AmazonDataset(
            train_input_root,
            train_labels_file,
            train=False,
            train_fold=True,
            tags_type=args.labels,
            multi_label=args.multi_label,
            img_type=img_type,
            img_size=img_size,
            fold=args.fold,
        )

        loader_train = data.DataLoader(
            dataset_train,
            batch_size=batch_size,
            shuffle=False,
            num_workers=args.num_processes
        )

    dataset_eval = AmazonDataset(
        train_input_root,
        train_labels_file,
        train=False,
        tags_type=args.labels,
        multi_label=args.multi_label,
        img_type=img_type,
        img_size=img_size,
        test_aug=args.tta,
        fold=args.fold,
    )

    loader_eval = data.DataLoader(
        dataset_eval,
        batch_size=batch_size,
        shuffle=False,
        num_workers=args.num_processes
    )

    model = create_model(args.model, pretrained=args.pretrained, num_classes=num_classes, global_pool=args.gp)

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
        else:
            model.cuda()

    if False:
        class_weights = torch.from_numpy(dataset_train.get_class_weights()).float()
        class_weights_norm = class_weights / class_weights.sum()
        if not args.no_cuda:
            class_weights = class_weights.cuda()
            class_weights_norm = class_weights_norm.cuda()
    else:
        class_weights = None
        class_weights_norm = None

    if args.loss.lower() == 'nll':
        #assert not args.multi_label and 'Cannot use crossentropy with multi-label target.'
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    elif args.loss.lower() == 'mlsm':
        assert args.multi_label
        loss_fn = torch.nn.MultiLabelSoftMarginLoss(weight=class_weights)
    else:
        assert False, "Invalid loss function"

    if not args.no_cuda:
        loss_fn = loss_fn.cuda()

    # load a checkpoint
    if args.restore_checkpoint is not None:
        assert os.path.isfile(args.restore_checkpoint), '%s not found' % args.restore_checkpoint
        checkpoint = torch.load(args.restore_checkpoint)
        print('Restoring model with %s architecture...' % checkpoint['arch'])
        sparse_checkpoint = True if 'sparse' in checkpoint and checkpoint['sparse'] else False
        if sparse_checkpoint:
            print("Loading sparse model")
            dense_sparse_dense.sparsify(model, sparsity=0.)  # ensure sparsity_masks exist in model definition
        model.load_state_dict(checkpoint['state_dict'])
        if 'threshold' in checkpoint:
            threshold = checkpoint['threshold']
            threshold = torch.FloatTensor(threshold)
            print('Using thresholds:', threshold)
            if not args.no_cuda:
                threshold = threshold.cuda()
        else:
            threshold = 0.5
        if 'gp' in checkpoint and checkpoint['gp'] != args.gp:
            print("Warning: Model created with global pooling (%s) different from checkpoint (%s)"
                  % (args.gp, checkpoint['gp']))
        print('Model restored from file: %s' % args.restore_checkpoint)
    else:
        assert False, "No checkpoint specified"

    if args.train:
        print('Validating training data...')
        validate(
            model, loader_train, loss_fn, args, threshold, prefix='train', output_dir=output_dir)

    print('Validating validation data...')
    validate(
        model, loader_eval, loss_fn, args, threshold, prefix='eval', output_dir=output_dir)
Example #10
def main():
    args = parser.parse_args()

    batch_size = args.batch_size
    img_size = (args.img_size, args.img_size)
    num_classes = 17
    if args.tif:
        img_type = '.tif'
    else:
        img_type = '.jpg'

    dataset = AmazonDataset(
        args.data,
        train=False,
        multi_label=args.multi_label,
        tags_type='all',
        img_type=img_type,
        img_size=img_size,
        test_aug=args.tta,
    )

    tags = get_tags()
    output_col = ['image_name'] + tags
    submission_col = ['image_name', 'tags']

    loader = data.DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=args.num_processes)

    model = create_model(args.model,
                         pretrained=False,
                         num_classes=num_classes,
                         global_pool=args.gp)

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=list(range(
                                              args.num_gpu))).cuda()
        else:
            model.cuda()

    if args.restore_checkpoint is not None:
        assert os.path.isfile(
            args.restore_checkpoint), '%s not found' % args.restore_checkpoint
        checkpoint = torch.load(args.restore_checkpoint)
        print('Restoring model with %s architecture...' % checkpoint['arch'])
        sparse_checkpoint = True if 'sparse' in checkpoint and checkpoint[
            'sparse'] else False
        if sparse_checkpoint:
            print("Loading sparse model")
            dense_sparse_dense.sparsify(
                model,
                sparsity=0.)  # ensure sparsity_masks exist in model definition
        model.load_state_dict(checkpoint['state_dict'])
        if 'args' in checkpoint:
            train_args = checkpoint['args']
        if 'threshold' in checkpoint:
            threshold = checkpoint['threshold']
            threshold = torch.FloatTensor(threshold)
            print('Using thresholds:', threshold)
            if not args.no_cuda:
                threshold = threshold.cuda()
        else:
            threshold = 0.5
        if 'gp' in checkpoint and checkpoint['gp'] != args.gp:
            print(
                "Warning: Model created with global pooling (%s) different from checkpoint (%s)"
                % (args.gp, checkpoint['gp']))
        csplit = os.path.normpath(
            args.restore_checkpoint).split(sep=os.path.sep)
        if len(csplit) > 1:
            exp_name = csplit[-2] + '-' + csplit[-1].split('.')[0]
        else:
            exp_name = ''
        print('Model restored from file: %s' % args.restore_checkpoint)
    else:
        assert False, "No checkpoint specified"

    if args.output:
        output_base = args.output
    else:
        output_base = './output'
    if not exp_name:
        exp_name = '-'.join([
            args.model,
            str(train_args.img_size), 'f' + str(train_args.fold),
            'tif' if args.tif else 'jpg'
        ])
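    # Raw, thresholded and submission CSVs are written under <output_base>/predictions/<exp_name>.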
    output_dir = get_outdir(output_base, 'predictions', exp_name)

    model.eval()

    batch_time_m = AverageMeter()
    data_time_m = AverageMeter()
    results_raw = []
    results_thr = []
    results_sub = []
    try:
        end = time.time()
        for batch_idx, (input, target, index) in enumerate(loader):
            data_time_m.update(time.time() - end)
            if not args.no_cuda:
                input = input.cuda()
            input_var = autograd.Variable(input, volatile=True)
            output = model(input_var)

            # augmentation reduction
            reduce_factor = loader.dataset.get_aug_factor()
            if reduce_factor > 1:
                output.data = output.data.unfold(
                    0, reduce_factor, reduce_factor).mean(dim=2).squeeze(dim=2)
                index = index[0:index.size(0):reduce_factor]

            # output non-linearity and thresholding
            output = torch.sigmoid(output)
            if isinstance(threshold, torch.FloatTensor) or isinstance(
                    threshold, torch.cuda.FloatTensor):
                threshold_m = torch.unsqueeze(threshold,
                                              0).expand_as(output.data)
                output_thr = (output.data > threshold_m).byte()
            else:
                output_thr = (output.data > threshold).byte()

            # move data to CPU and collect
            output = output.cpu().data.numpy()
            output_thr = output_thr.cpu().numpy()
            index = index.cpu().numpy().flatten()
            for i, o, ot in zip(index, output, output_thr):
                #print(dataset.inputs[i], o, ot)
                image_name = os.path.splitext(
                    os.path.basename(dataset.inputs[i]))[0]
                results_raw.append([image_name] + list(o))
                results_thr.append([image_name] + list(ot))
                results_sub.append([image_name] + [vector_to_tags(ot, tags)])
                # end iterating through batch

            batch_time_m.update(time.time() - end)
            if batch_idx % args.log_interval == 0:
                print('Inference: [{}/{} ({:.0f}%)]  '
                      'Time: {batch_time.val:.3f}s, {rate:.3f}/s  '
                      '({batch_time.avg:.3f}s, {rate_avg:.3f}/s)  '
                      'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
                          batch_idx * len(input),
                          len(loader.sampler),
                          100. * batch_idx / len(loader),
                          batch_time=batch_time_m,
                          rate=input_var.size(0) / batch_time_m.val,
                          rate_avg=input_var.size(0) / batch_time_m.avg,
                          data_time=data_time_m))

            end = time.time()
            #end iterating through dataset
    except KeyboardInterrupt:
        pass
    results_raw_df = pd.DataFrame(results_raw, columns=output_col)
    results_raw_df.to_csv(os.path.join(output_dir, 'results_raw.csv'),
                          index=False)
    results_thr_df = pd.DataFrame(results_thr, columns=output_col)
    results_thr_df.to_csv(os.path.join(output_dir, 'results_thr.csv'),
                          index=False)
    results_sub_df = pd.DataFrame(results_sub, columns=submission_col)
    results_sub_df.to_csv(os.path.join(output_dir, 'submission.csv'),
                          index=False)
Example #11
def main():
    args = parser.parse_args()

    train_input_root = os.path.join(args.data, 'inputs')
    train_target_root = os.path.join(args.data, 'targets')
    train_process_file = os.path.join(args.data, 'processed.csv')
    train_counts_file = './data/correct_train.csv'
    train_coords_file = './data/correct_coordinates.csv'
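    # Checkpoints are saved under ./output/train/<timestamp>.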
    output_dir = get_outdir('./output', 'train',
                            datetime.now().strftime("%Y%m%d-%H%M%S"))

    batch_size = args.batch_size
    num_epochs = 1000
    patch_size = (args.patch_size, args.patch_size)
    num_outputs = 5
    target_type = 'countception' if args.model in ['countception', 'cc'] else 'density'
    debug_model = False
    use_logits = args.use_logits
    num_logits = 12 if use_logits else 0

    torch.manual_seed(args.seed)

    dataset = SealionDataset(
        train_input_root,
        train_target_root,
        train_counts_file,
        train_coords_file,
        train_process_file,
        train=True,
        patch_size=patch_size,
        target_type=target_type,
        generate_target=True,
        per_image_norm=True,
        num_logits=num_logits,
    )

    sampler = RandomPatchSampler(dataset, oversample=32, repeat=16)

    loader = data.DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             num_workers=args.num_processes,
                             sampler=sampler)

    if args.model == 'cnet':
        model = ModelCnet(outplanes=num_outputs,
                          target_size=patch_size,
                          debug=debug_model)
    elif args.model in ['countception', 'cc']:
        model = ModelCountception(outplanes=num_outputs,
                                  use_logits=use_logits,
                                  logits_per_output=num_logits,
                                  debug=debug_model)
    else:
        assert False, "Invalid model"

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=list(range(
                                              args.num_gpu))).cuda()
        else:
            model.cuda()

    if args.opt.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=args.lr,
                                   weight_decay=args.weight_decay)
    else:
        assert False, "Invalid optimizer"

    if args.loss.lower() == 'l1':
        loss_fn = torch.nn.L1Loss()
    elif args.loss.lower() == 'smoothl1':
        loss_fn = torch.nn.SmoothL1Loss()
    elif args.loss.lower() == 'mse':
        loss_fn = torch.nn.MSELoss()
    elif args.loss.lower() in ['crossentropy', 'nll']:
        loss_fn = torch.nn.CrossEntropyLoss()
        assert use_logits, "Cross entropy is only a valid loss if logits are being used"
    else:
        assert False, "Invalid loss function"

    # optionally resume from a checkpoint
    start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            start_epoch = checkpoint['epoch']
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    for epoch in range(start_epoch, num_epochs + 1):
        adjust_learning_rate(optimizer,
                             epoch,
                             initial_lr=args.lr,
                             decay_epochs=3)
        train_epoch(epoch,
                    model,
                    loader,
                    optimizer,
                    loss_fn,
                    args,
                    output_dir,
                    use_logits=use_logits)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': model.name(),
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            is_best=False,
            filename='checkpoint-%d.pth.tar' % epoch,
            output_dir=output_dir)