def test_assemblyvalidation():
    bamasm = get_testfile('cm-500pgun-asm-b2mv31-bam')
    contigfa = get_testfile('cm-500pgun-asm-b2mv31-fa')
    bamref = get_testfile('cm-500pgun-ref-bam')
    refstatsfile = get_testfile('cm-500pgun-ref-stats')
    refphylfile = get_testfile('cm-ref-phyl')
    nucmercoords = get_testfile('cm-500pgun-val-nucmer')
    val = AssemblyValidation(bamref, bamasm, refphylfile, refstatsfile,
                             contigfa, nucmercoords)
    assert(len(val.contigs) == int(get_shell_output("grep -c '^>' " + contigfa)[0]))
    make_dir(get_outdir() + "masm")
    val.write_contig_purity(get_outdir() + "masm" + "/contig-purity.tsv")
    val.write_general_stats(get_outdir() + "masm" + "/asm-stats.tsv")
    val.write_genome_contig_cov(get_outdir() + "masm" + "/genome-contig-coverage.tsv")
def read(self):
    print_ = get_print(self.customWidget)
    for try_ in range(8):
        self.customWidget.print_('get_session')
        try:
            session = get_session()
            html = downloader.read_html(self.url, session=session)
            soup = Soup(html)
            get_title_artist(soup)
            break
        except Exception as e:
            print(e)
    else:
        raise
    title, self.artist = get_title_artist(soup)
    self.__title = title
    title_dir = clean_title(u'[{}] {}'.format(self.artist, title))
    ex = soup.find('div', id='novel_ex')
    self.novel_ex = ex.text.strip() if ex else None
    texts = []
    subtitles = soup.findAll('dd', class_='subtitle')
    if subtitles:
        for subtitle in subtitles:
            update = subtitle.parent.find('dt', class_='long_update')
            update2 = None
            if update:
                for span in update.findAll('span'):
                    update2 = span.attrs['title']
                    span.decompose()
                update = update.text.strip()
            if update2:
                update += u' ({})'.format(update2)
            a = subtitle.find('a')
            subtitle = a.text.strip()
            href = urljoin(self.url, a.attrs['href'])
            if not re.search('ncode.syosetu.com/{}/[0-9]+'.format(self.id_), href):
                print_(u'skip: {}'.format(href))
                continue
            text = Text(subtitle, update, href, session, False)
            texts.append(text)
    else:
        self.single = True
        text = Text(title_dir, None, self.url, session, True)
        texts.append(text)
    self.print_(u'single: {}'.format(self.single))
    outdir = get_outdir('syosetu')
    for text in texts:
        if self.single:
            file = os.path.join(outdir, text.filename)
        else:
            file = os.path.join(outdir, title_dir, text.filename)
        if os.path.isfile(file):
            self.urls.append(file)
        else:
            self.urls.append(text.url)
    self.title = title_dir
def main():
    args = parser.parse_args()

    num_classes = len(get_labels())
    test_time_pool = 0  # 5 if 'dpn' in args.model else 0
    model = model_factory.create_model(
        args.model,
        in_chs=1,
        num_classes=num_classes,
        global_pool=args.gp,
        test_time_pool=test_time_pool)
    #model.reset_classifier(num_classes=num_classes)

    if args.num_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    if not os.path.exists(args.checkpoint):
        print("=> no checkpoint found at '{}'".format(args.checkpoint))
        exit(1)
    print("=> loading checkpoint '{}'".format(args.checkpoint))
    checkpoint = torch.load(args.checkpoint)
    if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(args.checkpoint, checkpoint['epoch']))
    else:
        model.load_state_dict(checkpoint)

    csplit = os.path.normpath(args.checkpoint).split(sep=os.path.sep)
    if len(csplit) > 1:
        exp_name = csplit[-2] + '-' + csplit[-1].split('.')[0]
    else:
        exp_name = ''

    if args.output:
        output_base = args.output
    else:
        output_base = './output'
    output_dir = get_outdir(output_base, 'predictions', exp_name)

    dataset = CommandsDataset(root=args.data, mode='test', format='spectrogram')

    loader = data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        pin_memory=True,
        shuffle=False,
        num_workers=args.workers)

    model.eval()

    batch_time_m = AverageMeter()
    data_time_m = AverageMeter()
    try:
        # open CSV for writing predictions
        cf = open(os.path.join(output_dir, 'results.csv'), mode='w')
        res_writer = csv.writer(cf)
        res_writer.writerow(['fname'] + dataset.id_to_label)

        # open CSV for writing submission
        cf = open(os.path.join(output_dir, 'submission.csv'), mode='w')
        sub_writer = csv.writer(cf)
        sub_writer.writerow(['fname', 'label', 'prob'])

        end = time.time()
        batch_sample_idx = 0
        for batch_idx, (input, target) in enumerate(loader):
            data_time_m.update(time.time() - end)
            input_var = autograd.Variable(input.cuda(), volatile=True)
            output = model(input_var)

            # augmentation reduction
            #reduce_factor = loader.dataset.get_aug_factor()
            #if reduce_factor > 1:
            #    output.data = output.data.unfold(0, reduce_factor, reduce_factor).mean(dim=2).squeeze(dim=2)
            #    index = index[0:index.size(0):reduce_factor]

            # move data to CPU and collect
            output_logprob = F.log_softmax(output, dim=1).data.cpu().numpy()
            output = F.softmax(output, dim=1)
            output_prob, output_idx = output.max(1)
            output_prob = output_prob.data.cpu().numpy()
            output_idx = output_idx.data.cpu().numpy()
            for i in range(output_logprob.shape[0]):
                index = batch_sample_idx + i
                pred_label = dataset.id_to_label[output_idx[i]]
                pred_prob = output_prob[i]
                filename = dataset.filename(index)
                res_writer.writerow([filename] + list(output_logprob[i]))
                sub_writer.writerow([filename] + [pred_label, pred_prob])
            batch_sample_idx += input_var.size(0)

            batch_time_m.update(time.time() - end)
            if batch_idx % args.print_freq == 0:
                print('Inference: [{}/{} ({:.0f}%)] '
                      'Time: {batch_time.val:.3f}s, {rate:.3f}/s '
                      '({batch_time.avg:.3f}s, {rate_avg:.3f}/s) '
                      'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
                          batch_sample_idx, len(loader.sampler),
                          100. * batch_idx / len(loader),
                          batch_time=batch_time_m,
                          rate=input_var.size(0) / batch_time_m.val,
                          rate_avg=input_var.size(0) / batch_time_m.avg,
                          data_time=data_time_m))
            end = time.time()
        # end iterating through dataset
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(str(e))
def read(self):
    type = self.pixiv_type
    cw = self.customWidget
    print_ = cw.print_
    ui_setting = self.ui_setting

    if type == 'following':
        raise NotImplementedError('following')

    self._format = [None, 'gif', 'webp', 'png'][ui_setting.ugoira_convert.currentIndex()]
    self._format_name = compatstr(ui_setting.pixivFormat.currentText())
    types = [t.lower() for t in query_url(self.url).get('type', [])]
    if types:
        s = u', '.join(sorted(types))
        types = set(types)
    else:
        s = 'all'
        types = None
    print_(u'Type: {}'.format(s))
    print_(u'info: {}'.format(self.info))
    api = self.api
    query = self.id.replace('_bmk', '').replace('_illust', '').replace('pixiv_', '').replace('search_', '')
    if type != 'search':
        query = int(query)
    print('pixiv_query:', query)
    try:
        if type in ('user', 'bookmark', 'search'):
            max_pid = get_max_range(cw, 2000)
            if ui_setting.groupBox_tag.isChecked():
                tags = [
                    compatstr(ui_setting.tagList.item(i).text())
                    for i in range(ui_setting.tagList.count())
                ]
            else:
                tags = []
            if type == 'search':
                query = query.replace('+', ' ')
                name = query
            else:
                id = self.id.replace('_bmk', '').replace('pixiv_', '').replace('search_', '')
                print('name', id)
                name = get_name(id, self.api, cw=cw)
                cw.artist = name
            title = u'{} ({})'.format(name, self.id)
            print_(title)
            dir = os.path.join(get_outdir('pixiv'), clean_title(title))
            imgs = get_imgs(
                query, type=type, api=api, n=max_pid, tags=tags, types=types,
                format=self._format, format_name=self._format_name, dir=dir,
                cw=cw, title=title, info=self.info)
        elif type == 'illust':
            for try_ in range(N_TRY):
                try:
                    detail = api.illust_detail(query, req_auth=True)
                    error = detail.get('error')
                    if error:
                        raise PixivError(error)
                    break
                except PixivError as e:
                    api = e.api
                    print_(e)
                    if try_ < N_TRY - 1:
                        print_('retry...')
                        sleep(SLEEP)
                    else:
                        raise
            illust = detail.illust
            name = illust.title
            title = u'{} ({})'.format(name, self.id)
            dir = os.path.join(get_outdir('pixiv'), clean_title(title))
            imgs = get_imgs_from_illust(
                illust, api=api, format=self._format, dir=dir,
                cw=cw, format_name=self._format_name)
    except PixivError as e:
        msg = u'PixivError: {}'.format(e.message)
        return self.Invalid(msg)
    self.imgs = imgs
    for img in imgs:
        self.urls.append(img.url)
        self.filenames[img.url] = img.filename
    self.title = clean_title(title)  # 1390
def main():
    args = parser.parse_args()

    if args.output:
        output_base = args.output
    else:
        output_base = './output'
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"),
        args.model,
        args.gp,
        'f' + str(args.fold)
    ])
    output_dir = get_outdir(output_base, 'train', exp_name)

    train_input_root = os.path.join(args.data)
    batch_size = args.p * args.k
    num_epochs = args.epochs
    wav_size = (16000,)
    num_classes = 128  # triplet embedding size

    torch.manual_seed(args.seed)

    model = model_factory.create_model(
        args.model,
        in_chs=1,
        pretrained=args.pretrained,
        num_classes=num_classes,
        drop_rate=args.drop,
        global_pool=args.gp,
        embedding_net=True,
        embedding_norm=2.,
        embedding_act_fn=torch.sigmoid,
        checkpoint_path=args.initial_checkpoint)

    dataset_train = dataset.CommandsDataset(
        root=train_input_root,
        mode='train',
        fold=args.fold,
        wav_size=wav_size,
        format='spectrogram',
        train_unknown=False,
    )

    loader_train = data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        pin_memory=True,
        sampler=dataset.PKSampler(dataset_train, p=args.p, k=args.k),
        num_workers=args.workers)

    dataset_eval = dataset.CommandsDataset(
        root=train_input_root,
        mode='validate',
        fold=args.fold,
        wav_size=wav_size,
        format='spectrogram',
        train_unknown=False,
    )

    loader_eval = data.DataLoader(
        dataset_eval,
        batch_size=batch_size,
        pin_memory=True,
        sampler=dataset.PKSampler(dataset_eval, p=args.p, k=args.k),
        num_workers=args.workers)

    train_loss_fn = validate_loss_fn = TripletLoss(margin=0.5, sample=True)
    train_loss_fn = train_loss_fn.cuda()
    validate_loss_fn = validate_loss_fn.cuda()

    opt_params = list(model.parameters())
    if args.opt.lower() == 'sgd':
        optimizer = optim.SGD(
            opt_params, lr=args.lr, momentum=args.momentum,
            weight_decay=args.weight_decay, nesterov=True)
    elif args.opt.lower() == 'adam':
        optimizer = optim.Adam(
            opt_params, lr=args.lr, weight_decay=args.weight_decay, eps=args.opt_eps)
    elif args.opt.lower() == 'nadam':
        optimizer = nadam.Nadam(
            opt_params, lr=args.lr, weight_decay=args.weight_decay, eps=args.opt_eps)
    elif args.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(
            opt_params, lr=args.lr, weight_decay=args.weight_decay, eps=args.opt_eps)
    elif args.opt.lower() == 'rmsprop':
        optimizer = optim.RMSprop(
            opt_params, lr=args.lr, alpha=0.9, eps=args.opt_eps,
            momentum=args.momentum, weight_decay=args.weight_decay)
    else:
        assert False, "Invalid optimizer"
    del opt_params

    if not args.decay_epochs:
        print('No decay epoch set, using plateau scheduler.')
        lr_scheduler = ReduceLROnPlateau(optimizer, patience=10)
    else:
        lr_scheduler = None

    # optionally resume from a checkpoint
    start_epoch = 0 if args.start_epoch is None else args.start_epoch
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
                if 'args' in checkpoint:
                    print(checkpoint['args'])
                new_state_dict = OrderedDict()
                for k, v in checkpoint['state_dict'].items():
                    if k.startswith('module'):
                        name = k[7:]  # remove `module.`
                    else:
                        name = k
                    new_state_dict[name] = v
                model.load_state_dict(new_state_dict)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                if 'loss' in checkpoint:
                    train_loss_fn.load_state_dict(checkpoint['loss'])
                print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
                start_epoch = checkpoint['epoch'] if args.start_epoch is None else args.start_epoch
            else:
                model.load_state_dict(checkpoint)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            exit(1)

    saver = CheckpointSaver(checkpoint_dir=output_dir)

    if args.num_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    best_loss = None
    try:
        for epoch in range(start_epoch, num_epochs):
            if args.decay_epochs:
                adjust_learning_rate(
                    optimizer, epoch, initial_lr=args.lr,
                    decay_rate=args.decay_rate, decay_epochs=args.decay_epochs)

            train_metrics = train_epoch(
                epoch, model, loader_train, optimizer, train_loss_fn, args,
                saver=saver, output_dir=output_dir)

            # save a recovery in case validation blows up
            saver.save_recovery({
                'epoch': epoch + 1,
                'arch': args.model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'loss': train_loss_fn.state_dict(),
                'args': args,
                'gp': args.gp,
                },
                epoch=epoch + 1,
                batch_idx=0)

            step = epoch * len(loader_train)
            eval_metrics = validate(
                step, model, loader_eval, validate_loss_fn, args, output_dir=output_dir)

            if lr_scheduler is not None:
                lr_scheduler.step(eval_metrics['eval_loss'])

            rowd = OrderedDict(epoch=epoch)
            rowd.update(train_metrics)
            rowd.update(eval_metrics)
            with open(os.path.join(output_dir, 'summary.csv'), mode='a') as cf:
                dw = csv.DictWriter(cf, fieldnames=rowd.keys())
                if best_loss is None:  # first iteration (epoch == 1 can't be used)
                    dw.writeheader()
                dw.writerow(rowd)

            # save proper checkpoint with eval metric
            best_loss = saver.save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'args': args,
                'gp': args.gp,
                },
                epoch=epoch + 1,
                metric=eval_metrics['eval_loss'])

    except KeyboardInterrupt:
        pass
    if best_loss is not None:
        print('*** Best loss: {0} (epoch {1})'.format(best_loss[1], best_loss[0]))
def get_imgs(username, session, title, types, n=0, format='[%y-%m-%d] id_ppage', cw=None):
    print_ = get_print(cw)

    # Range
    n = max(n, get_max_range(cw))

    # 2303
    ids = set()
    names = dict()
    dir_ = os.path.join(get_outdir('twitter'), title)
    if os.path.isdir(dir_) and cw:
        for name in cw.names_old:
            name = os.path.basename(name)
            id_ = re.find('([0-9]+)_p', name)
            if id_ is None:
                continue
            if get_ext(name).lower() == '.mp4':
                type_ = 'video'
            else:
                type_ = 'img'
            if type_ not in types:
                continue
            id_ = int(id_)
            ids.add(id_)
            if id_ in names:
                names[id_].append(name)
            else:
                names[id_] = [name]
    max_id = max(ids) if ids else 0

    # 2303
    imgs_old = []
    for id_ in sorted(ids, reverse=True):
        for p, file in enumerate(sorted(os.path.join(dir_, name) for name in names[id_])):
            img = Image(file, '', id_, 0, p, format, cw, False)
            img.url = LazyUrl_twitter(None, lambda _: file, img)
            img.filename = os.path.basename(file)
            imgs_old.append(img)

    imgs_new = []
    enough = False
    for tweet in TwitterAPI(session, cw).timeline_media(username):
        id_ = int(tweet['id_str'])
        if id_ < max_id:
            print_('enough')
            enough = True
            break

        imgs_ = get_imgs_from_tweet(tweet, session, types, format, cw)

        if id_ in ids:
            print_('duplicate: {}'.format(id_))
            continue
        ids.add(id_)

        imgs_new += imgs_

        if len(imgs_old) + len(imgs_new) >= n:
            break

        msg = '{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs_new))  # '읽는 중...' = 'Reading...'
        if cw:
            if not cw.alive:
                break
            cw.setTitle(msg)
        else:
            print(msg)

    if not enough and not imgs_new:
        raise Exception('no imgs')

    imgs = sorted(imgs_old + imgs_new, key=lambda img: img.id, reverse=True)

    if len(imgs) < n:
        imgs = get_imgs_more(username, session, title, types, n, format, cw, imgs=imgs)

    return imgs
def main():
    config = DefaultConfigs()

    train_input_root = os.path.join(config.data)
    train_labels_file = 'labels.csv'

    if config.output:
        if not os.path.exists(config.output):
            os.makedirs(config.output)
        output_base = config.output
    else:
        if not os.path.exists(config.output):
            os.makedirs(config.output)
        output_base = config.output

    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"),
        config.model,
        str(config.img_size),
        'f' + str(config.fold)
    ])
    mask_exp_name = '-'.join([config.model, str(config.img_size), 'f' + str(config.fold)])
    mask_exp_name = glob.glob(os.path.join(output_base, 'train', '*' + mask_exp_name))
    if config.resume and mask_exp_name:
        output_dir = mask_exp_name
    else:
        output_dir = get_outdir(output_base, 'train', exp_name)

    batch_size = config.batch_size
    test_batch_size = config.test_batch_size
    num_epochs = config.epochs
    img_type = config.image_type
    img_size = (config.img_size, config.img_size)
    num_classes = get_tags_size(config.labels)

    torch.manual_seed(config.seed)

    dataset_train = HumanDataset(
        train_input_root,
        train_labels_file,
        train=True,
        multi_label=config.multi_label,
        img_type=img_type,
        img_size=img_size,
        fold=config.fold,
    )

    #sampler = WeightedRandomOverSampler(dataset_train.get_sample_weights())

    loader_train = data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        shuffle=True,
        #sampler=sampler,
        num_workers=config.num_processes)

    dataset_eval = HumanDataset(
        train_input_root,
        train_labels_file,
        train=False,
        multi_label=config.multi_label,
        img_type=img_type,
        img_size=img_size,
        test_aug=config.tta,
        fold=config.fold,
    )

    loader_eval = data.DataLoader(
        dataset_eval,
        batch_size=test_batch_size,
        shuffle=False,
        num_workers=config.num_processes)

    # model = model_factory.create_model(
    #     config.model,
    #     pretrained=True,
    #     num_classes=num_classes,
    #     drop_rate=config.drop,
    #     global_pool=config.gp)
    model = get_net(config.model, num_classes, config.drop, config.channels)

    if not config.no_cuda:
        if config.num_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=list(range(config.num_gpu))).cuda()
        else:
            model.cuda()

    if config.opt.lower() == 'sgd':
        optimizer = optim.SGD(
            model.parameters(), lr=config.lr,
            momentum=config.momentum, weight_decay=config.weight_decay)
    elif config.opt.lower() == 'adam':
        optimizer = optim.Adam(
            model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
    elif config.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(
            model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
    elif config.opt.lower() == 'rmsprop':
        optimizer = optim.RMSprop(
            model.parameters(), lr=config.lr, alpha=0.9,
            momentum=config.momentum, weight_decay=config.weight_decay)
    elif config.opt.lower() == 'yellowfin':
        optimizer = YFOptimizer(
            model.parameters(), lr=config.lr,
            weight_decay=config.weight_decay, clip_thresh=2)
    else:
        assert False, "Invalid optimizer"

    if not config.decay_epochs:
        lr_scheduler = ReduceLROnPlateau(optimizer, patience=8)
    else:
        lr_scheduler = None

    if config.class_weights:
        class_weights = torch.from_numpy(dataset_train.get_class_weights()).float()
        class_weights_norm = class_weights / class_weights.sum()
        if not config.no_cuda:
            class_weights = class_weights.cuda()
            class_weights_norm = class_weights_norm.cuda()
    else:
        class_weights = None
        class_weights_norm = None

    if config.loss.lower() == 'nll':
        #assert not args.multi_label and 'Cannot use crossentropy with multi-label target.'
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    elif config.loss.lower() == 'mlsm':
        assert config.multi_label
        loss_fn = torch.nn.MultiLabelSoftMarginLoss(weight=class_weights)
    else:
        assert False, "Invalid loss function"

    if not config.no_cuda:
        loss_fn = loss_fn.cuda()

    # optionally resume from a checkpoint
    start_epoch = 1
    if config.resume:
        if os.path.isfile(config.resume):
            print("=> loading checkpoint '{}'".format(config.resume))
            checkpoint = torch.load(config.resume)
            config.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(config.resume, checkpoint['epoch']))
            start_epoch = checkpoint['epoch']
        else:
            print("=> no checkpoint found at '{}'".format(config.resume))
            exit(-1)

    use_tensorboard = not config.no_tb and CrayonClient is not None
    if use_tensorboard:
        hostname = '127.0.0.1'
        port = 8889
        host_port = config.tbh.split(':')[:2]
        if len(host_port) == 1:
            hostname = host_port[0]
        elif len(host_port) >= 2:
            hostname, port = host_port[:2]
        try:
            cc = CrayonClient(hostname=hostname, port=port)
            try:
                cc.remove_experiment(exp_name)
            except ValueError:
                pass
            exp = cc.create_experiment(exp_name)
        except Exception as e:
            exp = None
            print("Error (%s) connecting to Tensorboard/Crayon server. Giving up..." % str(e))
    else:
        exp = None

    # Optional fine-tune of only the final classifier weights for specified number of epochs (or part of)
    if not config.resume and config.ft_epochs > 0.:
        if config.opt.lower() == 'adam':
            finetune_optimizer = optim.Adam(
                model.get_fc().parameters(), lr=config.ft_lr, weight_decay=config.weight_decay)
        else:
            finetune_optimizer = optim.SGD(
                model.get_fc().parameters(), lr=config.ft_lr,
                momentum=config.momentum, weight_decay=config.weight_decay)

        finetune_epochs_int = int(np.ceil(config.ft_epochs))
        finetune_final_batches = int(np.ceil(
            (1 - (finetune_epochs_int - config.ft_epochs)) * len(loader_train)))
        print(finetune_epochs_int, finetune_final_batches)
        for fepoch in range(1, finetune_epochs_int + 1):
            if fepoch == finetune_epochs_int and finetune_final_batches:
                batch_limit = finetune_final_batches
            else:
                batch_limit = 0
            train_epoch(
                fepoch, model, loader_train, finetune_optimizer, loss_fn, config,
                class_weights_norm, output_dir, batch_limit=batch_limit)
            step = fepoch * len(loader_train)
            score, _ = validate(step, model, loader_eval, loss_fn, config, 0.3, output_dir)

    score_metric = 'f2'
    best_loss = None
    best_f2 = None
    threshold = 0.2
    try:
        for epoch in range(start_epoch, num_epochs + 1):
            if config.decay_epochs:
                adjust_learning_rate(
                    optimizer, epoch, initial_lr=config.lr, decay_epochs=config.decay_epochs)

            train_metrics = train_epoch(
                epoch, model, loader_train, optimizer, loss_fn, config,
                class_weights_norm, output_dir, exp=exp)

            step = epoch * len(loader_train)
            eval_metrics, latest_threshold = validate(
                step, model, loader_eval, loss_fn, config, threshold, output_dir, exp=exp)

            if lr_scheduler is not None:
                lr_scheduler.step(eval_metrics['eval_loss'])

            rowd = OrderedDict(epoch=epoch)
            rowd.update(train_metrics)
            rowd.update(eval_metrics)
            with open(os.path.join(output_dir, 'summary.csv'), mode='a') as cf:
                dw = csv.DictWriter(cf, fieldnames=rowd.keys())
                if best_loss is None:  # first iteration (epoch == 1 can't be used)
                    dw.writeheader()
                dw.writerow(rowd)

            best = False
            if best_loss is None or eval_metrics['eval_loss'] < best_loss[1]:
                best_loss = (epoch, eval_metrics['eval_loss'])
                if score_metric == 'loss':
                    best = True
            if best_f2 is None or eval_metrics['eval_f2'] > best_f2[1]:
                best_f2 = (epoch, eval_metrics['eval_f2'])
                if score_metric == 'f2':
                    best = True

            save_checkpoint({
                'epoch': epoch + 1,
                'arch': config.model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'threshold': latest_threshold,
                'config': config
                },
                is_best=best,
                filename=os.path.join(config.checkpoint_path, 'checkpoint-%d.pth.tar' % epoch),
                output_dir=output_dir)

    except KeyboardInterrupt:
        pass
    print('*** Best loss: {0} (epoch {1})'.format(best_loss[1], best_loss[0]))
    print('*** Best f2: {0} (epoch {1})'.format(best_f2[1], best_f2[0]))
def get_imgs(username, title, cw=None):
    urls = [
        'https://m.facebook.com/{}/photos'.format(username),
        'https://m.facebook.com/profile.php?id={}&sk=photos'.format(username),  # no custom URL
    ]
    for url in urls:
        print('get_imgs url:', url)
        try:
            html = read_html(url)
        except:
            continue
        soup = Soup(html)
        if soup.find('a', id='signup-button'):
            raise errors.LoginRequired()
        photo = soup.find('div', class_='_5v64')
        if photo is not None:
            break
    else:
        raise Exception('No photo div')

    cursor = photo.a.attrs['href'].split('/photos/')[1].split('/')[1]
    print('first cursor:', cursor)

    href = re.find(r'(/photos/pandora/\?album_token=.+?)"', html)
    href = urljoin(url, href)
    href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)
    cursors = set([cursor])

    imgs = []
    dups = {}
    dir = os.path.join(get_outdir('facebook'), title)
    try:
        filenames = os.listdir(dir)
    except:
        filenames = []
    for filename in filenames:
        name, ext = os.path.splitext(filename)
        if name.isdigit():
            dups[int(name)] = os.path.join(dir, filename)

    pages = set()
    while True:
        print(href)
        html = read_html(href)
        data_raw = html.replace('for (;;);', '')
        data = json.loads(data_raw)
        actions = data['payload']['actions']
        for action in actions:
            if action['target'] == 'm_more_photos':
                break
        else:
            print('No more photos')
            break
        html = action['html']
        soup = Soup(html)
        photos = soup.findAll('div', class_='_5v64')
        for photo in photos:
            for a in photo.findAll('a'):
                page = a.attrs['href']
                page = urljoin(href, page)

                # remove duplicate pages
                if page in pages:
                    continue
                pages.add(page)

                img = Image(page)
                id = img.id
                if id in dups and getsize(dups[id]) > 0:
                    print('skip', id)
                    imgs.append(dups[id])
                else:
                    imgs.append(img)

                s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
                if cw is not None:
                    cw.setTitle(s)
                    if not cw.alive:
                        return []
                else:
                    print(s)

        cursor = re.find(PATTERN_CURSOR, data_raw)
        #print(cursor)
        if cursor is None:
            print('no cursor')
            break
        if cursor in cursors:
            print('same cursor')
            break
        cursors.add(cursor)
        href = re.sub('&cursor=[0-9]+', '&cursor={}'.format(cursor), href)

    return imgs
def main():
    args = parser.parse_args()

    train_input_root = os.path.join(args.data)
    train_labels_file = './data/labels.csv'
    output_dir = get_outdir('./output', 'eval', datetime.now().strftime("%Y%m%d-%H%M%S"))

    batch_size = args.batch_size
    num_epochs = 1000
    if args.tif:
        img_type = '.tif'
    else:
        img_type = '.jpg'
    img_size = (args.img_size, args.img_size)
    num_classes = get_tags_size(args.labels)
    debug_model = False

    torch.manual_seed(args.seed)

    if args.train:
        dataset_train = AmazonDataset(
            train_input_root,
            train_labels_file,
            train=False,
            train_fold=True,
            tags_type=args.labels,
            multi_label=args.multi_label,
            img_type=img_type,
            img_size=img_size,
            fold=args.fold,
        )

        loader_train = data.DataLoader(
            dataset_train,
            batch_size=batch_size,
            shuffle=False,
            num_workers=args.num_processes
        )

    dataset_eval = AmazonDataset(
        train_input_root,
        train_labels_file,
        train=False,
        tags_type=args.labels,
        multi_label=args.multi_label,
        img_type=img_type,
        img_size=img_size,
        test_aug=args.tta,
        fold=args.fold,
    )

    loader_eval = data.DataLoader(
        dataset_eval,
        batch_size=batch_size,
        shuffle=False,
        num_workers=args.num_processes
    )

    model = create_model(
        args.model,
        pretrained=args.pretrained,
        num_classes=num_classes,
        global_pool=args.gp)

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
        else:
            model.cuda()

    if False:
        class_weights = torch.from_numpy(dataset_train.get_class_weights()).float()
        class_weights_norm = class_weights / class_weights.sum()
        if not args.no_cuda:
            class_weights = class_weights.cuda()
            class_weights_norm = class_weights_norm.cuda()
    else:
        class_weights = None
        class_weights_norm = None

    if args.loss.lower() == 'nll':
        #assert not args.multi_label and 'Cannot use crossentropy with multi-label target.'
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    elif args.loss.lower() == 'mlsm':
        assert args.multi_label
        loss_fn = torch.nn.MultiLabelSoftMarginLoss(weight=class_weights)
    else:
        assert False, "Invalid loss function"

    if not args.no_cuda:
        loss_fn = loss_fn.cuda()

    # load a checkpoint
    if args.restore_checkpoint is not None:
        assert os.path.isfile(args.restore_checkpoint), '%s not found' % args.restore_checkpoint
        checkpoint = torch.load(args.restore_checkpoint)
        print('Restoring model with %s architecture...' % checkpoint['arch'])
        sparse_checkpoint = True if 'sparse' in checkpoint and checkpoint['sparse'] else False
        if sparse_checkpoint:
            print("Loading sparse model")
            # ensure sparsity_masks exist in model definition
            dense_sparse_dense.sparsify(model, sparsity=0.)
        model.load_state_dict(checkpoint['state_dict'])
        if 'threshold' in checkpoint:
            threshold = checkpoint['threshold']
            threshold = torch.FloatTensor(threshold)
            print('Using thresholds:', threshold)
            if not args.no_cuda:
                threshold = threshold.cuda()
        else:
            threshold = 0.5
        if 'gp' in checkpoint and checkpoint['gp'] != args.gp:
            print("Warning: Model created with global pooling (%s) different from checkpoint (%s)"
                  % (args.gp, checkpoint['gp']))
        print('Model restored from file: %s' % args.restore_checkpoint)
    else:
        assert False, "No checkpoint specified"

    if args.train:
        print('Validating training data...')
        validate(model, loader_train, loss_fn, args, threshold, prefix='train', output_dir=output_dir)

    print('Validating validation data...')
    validate(model, loader_eval, loss_fn, args, threshold, prefix='eval', output_dir=output_dir)
def main():
    args = parser.parse_args()

    batch_size = args.batch_size
    img_size = (args.img_size, args.img_size)
    num_classes = 17
    if args.tif:
        img_type = '.tif'
    else:
        img_type = '.jpg'

    dataset = AmazonDataset(
        args.data,
        train=False,
        multi_label=args.multi_label,
        tags_type='all',
        img_type=img_type,
        img_size=img_size,
        test_aug=args.tta,
    )

    tags = get_tags()
    output_col = ['image_name'] + tags
    submission_col = ['image_name', 'tags']

    loader = data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=args.num_processes)

    model = create_model(args.model, pretrained=False, num_classes=num_classes, global_pool=args.gp)

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
        else:
            model.cuda()

    if args.restore_checkpoint is not None:
        assert os.path.isfile(args.restore_checkpoint), '%s not found' % args.restore_checkpoint
        checkpoint = torch.load(args.restore_checkpoint)
        print('Restoring model with %s architecture...' % checkpoint['arch'])
        sparse_checkpoint = True if 'sparse' in checkpoint and checkpoint['sparse'] else False
        if sparse_checkpoint:
            print("Loading sparse model")
            # ensure sparsity_masks exist in model definition
            dense_sparse_dense.sparsify(model, sparsity=0.)
        model.load_state_dict(checkpoint['state_dict'])
        if 'args' in checkpoint:
            train_args = checkpoint['args']
        if 'threshold' in checkpoint:
            threshold = checkpoint['threshold']
            threshold = torch.FloatTensor(threshold)
            print('Using thresholds:', threshold)
            if not args.no_cuda:
                threshold = threshold.cuda()
        else:
            threshold = 0.5
        if 'gp' in checkpoint and checkpoint['gp'] != args.gp:
            print("Warning: Model created with global pooling (%s) different from checkpoint (%s)"
                  % (args.gp, checkpoint['gp']))
        csplit = os.path.normpath(args.restore_checkpoint).split(sep=os.path.sep)
        if len(csplit) > 1:
            exp_name = csplit[-2] + '-' + csplit[-1].split('.')[0]
        else:
            exp_name = ''
        print('Model restored from file: %s' % args.restore_checkpoint)
    else:
        assert False, "No checkpoint specified"

    if args.output:
        output_base = args.output
    else:
        output_base = './output'
    if not exp_name:
        exp_name = '-'.join([
            args.model,
            str(train_args.img_size),
            'f' + str(train_args.fold),
            'tif' if args.tif else 'jpg'
        ])
    output_dir = get_outdir(output_base, 'predictions', exp_name)

    model.eval()

    batch_time_m = AverageMeter()
    data_time_m = AverageMeter()
    results_raw = []
    results_thr = []
    results_sub = []
    try:
        end = time.time()
        for batch_idx, (input, target, index) in enumerate(loader):
            data_time_m.update(time.time() - end)
            if not args.no_cuda:
                input = input.cuda()
            input_var = autograd.Variable(input, volatile=True)
            output = model(input_var)

            # augmentation reduction
            reduce_factor = loader.dataset.get_aug_factor()
            if reduce_factor > 1:
                output.data = output.data.unfold(0, reduce_factor, reduce_factor).mean(dim=2).squeeze(dim=2)
                index = index[0:index.size(0):reduce_factor]

            # output non-linearity and thresholding
            output = torch.sigmoid(output)
            if isinstance(threshold, torch.FloatTensor) or isinstance(threshold, torch.cuda.FloatTensor):
                threshold_m = torch.unsqueeze(threshold, 0).expand_as(output.data)
                output_thr = (output.data > threshold_m).byte()
            else:
                output_thr = (output.data > threshold).byte()

            # move data to CPU and collect
            output = output.cpu().data.numpy()
            output_thr = output_thr.cpu().numpy()
            index = index.cpu().numpy().flatten()
            for i, o, ot in zip(index, output, output_thr):
                #print(dataset.inputs[i], o, ot)
                image_name = os.path.splitext(os.path.basename(dataset.inputs[i]))[0]
                results_raw.append([image_name] + list(o))
                results_thr.append([image_name] + list(ot))
                results_sub.append([image_name] + [vector_to_tags(ot, tags)])
            # end iterating through batch

            batch_time_m.update(time.time() - end)
            if batch_idx % args.log_interval == 0:
                print('Inference: [{}/{} ({:.0f}%)] '
                      'Time: {batch_time.val:.3f}s, {rate:.3f}/s '
                      '({batch_time.avg:.3f}s, {rate_avg:.3f}/s) '
                      'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
                          batch_idx * len(input), len(loader.sampler),
                          100. * batch_idx / len(loader),
                          batch_time=batch_time_m,
                          rate=input_var.size(0) / batch_time_m.val,
                          rate_avg=input_var.size(0) / batch_time_m.avg,
                          data_time=data_time_m))
            end = time.time()
        # end iterating through dataset
    except KeyboardInterrupt:
        pass

    results_raw_df = pd.DataFrame(results_raw, columns=output_col)
    results_raw_df.to_csv(os.path.join(output_dir, 'results_raw.csv'), index=False)
    results_thr_df = pd.DataFrame(results_thr, columns=output_col)
    results_thr_df.to_csv(os.path.join(output_dir, 'results_thr.csv'), index=False)
    results_sub_df = pd.DataFrame(results_sub, columns=submission_col)
    results_sub_df.to_csv(os.path.join(output_dir, 'submission.csv'), index=False)
def main():
    args = parser.parse_args()

    train_input_root = os.path.join(args.data, 'inputs')
    train_target_root = os.path.join(args.data, 'targets')
    train_process_file = os.path.join(args.data, 'processed.csv')
    train_counts_file = './data/correct_train.csv'
    train_coords_file = './data/correct_coordinates.csv'
    output_dir = get_outdir('./output', 'train', datetime.now().strftime("%Y%m%d-%H%M%S"))

    batch_size = args.batch_size
    num_epochs = 1000
    patch_size = (args.patch_size, args.patch_size)
    num_outputs = 5
    target_type = 'countception' if args.model in ['countception', 'cc'] else 'density'
    debug_model = False
    use_logits = args.use_logits
    num_logits = 12 if use_logits else 0

    torch.manual_seed(args.seed)

    dataset = SealionDataset(
        train_input_root,
        train_target_root,
        train_counts_file,
        train_coords_file,
        train_process_file,
        train=True,
        patch_size=patch_size,
        target_type=target_type,
        generate_target=True,
        per_image_norm=True,
        num_logits=num_logits,
    )
    sampler = RandomPatchSampler(dataset, oversample=32, repeat=16)
    loader = data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True,
        num_workers=args.num_processes, sampler=sampler)

    if args.model == 'cnet':
        model = ModelCnet(outplanes=num_outputs, target_size=patch_size, debug=debug_model)
    elif args.model in ['countception', 'cc']:
        model = ModelCountception(
            outplanes=num_outputs, use_logits=use_logits,
            logits_per_output=num_logits, debug=debug_model)
    else:
        assert False, "Invalid model"

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
        else:
            model.cuda()

    if args.opt.lower() == 'sgd':
        optimizer = optim.SGD(
            model.parameters(), lr=args.lr,
            momentum=args.momentum, weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adam':
        optimizer = optim.Adam(
            model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(
            model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    else:
        assert False, "Invalid optimizer"

    if args.loss.lower() == 'l1':
        loss_fn = torch.nn.L1Loss()
    elif args.loss.lower() == 'smoothl1':
        loss_fn = torch.nn.SmoothL1Loss()
    elif args.loss.lower() == 'mse':
        loss_fn = torch.nn.MSELoss()
    elif args.loss.lower() in ['crossentropy', 'nll']:
        loss_fn = torch.nn.CrossEntropyLoss()
        assert use_logits, "Cross entropy is only a valid loss if logits are being used"
    else:
        assert False, "Invalid loss function"

    # optionally resume from a checkpoint
    start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
            start_epoch = checkpoint['epoch']
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    for epoch in range(start_epoch, num_epochs + 1):
        adjust_learning_rate(optimizer, epoch, initial_lr=args.lr, decay_epochs=3)
        train_epoch(epoch, model, loader, optimizer, loss_fn, args, output_dir, use_logits=use_logits)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': model.name(),
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            },
            is_best=False,
            filename='checkpoint-%d.pth.tar' % epoch,
            output_dir=output_dir)