# NOTE: assumes "from time import time" at module level.
def crawl(self, key):
    # key is currently unused
    seed = None
    score = ''
    if self.img.origin:
        seed = self.img.origin.url()
        score = '({:.2f})'.format(self.img.origin._score)
    msg = []
    imgs = []
    pool = []
    dur = time()
    # crawl 3 blogs
    for i in range(3):
        msg += [index.get_crawler().message()]
        self.message('\n'.join(
            ['Wait for crawler...', '',
             'Seed URL: {} {}'.format(seed, score), '']
            + msg +
            ['', '{}/3'.format(i),  # i blogs crawled so far
             '{} images (+{})'.format(len(pool), len(imgs)),
             'time: {:.1f}'.format(time() - dur)]))
        imgs = index.crawl(seed, num=1)
        pool.extend(imgs)
    self.pool.extend(pool)
    # done. prompt status. redraw?
    # redraw GUI -> show new imgs
    self.display()
    # prompt status
    dur = time() - dur
    msg += [index.get_crawler().message()]
    crl = index.get_crawler()
    self.message('\n'.join(
        ['Done.',
         'Crawler returned {} new images in {:.1f} seconds'.format(len(pool), dur),
         '({:.2f} img/sec avg.).'.format(len(pool) / dur),
         '']
        + msg +
        ['', 'Crawler status:',
         '  {} visited,'.format(len(crl.visited)),
         '  {} in queue.'.format(len(crl.frontier))]),
        confirm=True)
    self.redraw = True
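# --- Hedged sketch, not part of the original code ---
# crawl() above assumes index.get_crawler() returns a crawler object
# exposing .message(), .visited, and .frontier, and that
# index.crawl(seed, num=k) returns the list of newly fetched images.
# The names and shapes below are assumptions for illustration only:
class CrawlerStatusSketch(object):
    def __init__(self):
        self.visited = set()    # URLs already crawled
        self.frontier = []      # URLs still queued
    def message(self):
        # one-line status string, like the ones joined into the dialog text above
        return '{} visited, {} in queue'.format(len(self.visited), len(self.frontier))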
if url in ["-r", "-rnd"]: seed = tumblr.any().url() elif tumblr.proper_url(url) or url.count(".") < 1: seed = url # if not seed: # seed = sorted(index.blogs(), key=lambda t:len(t.proper_imgs))[-1] # seed = sorted(index.blogs(), key=lambda t:t.score)[-1] # if len(index.blogs())>0: # seed = choice(index.blogs()).url() proceed = True imgs = [] n = 9 while proceed: imgs_new = index.crawl(seed, num=n) imgs.extend(imgs_new) # for i,p in enumerate(imgs_new): # for q in imgs_new[i+1:]: # if p.origin == q.origin: # sim = p.similarity(q) # if sim>.45: # picture.connect(p,q,sim) print "images so far:", len(imgs) seed = None proceed = raw_input("continue downloading? ").lower() in ["y", "yes"] n = max(n - 1, 3) index.save() index.inout.save_log("save")