Example 1
	def crawl(self, key):
		"""Crawl three blogs seeded from the current image's origin and
		extend ``self.pool`` with the images the crawler returns.

		``key`` is accepted for interface compatibility but is not used
		here. Progress is reported through ``self.message`` during the
		crawl, and a final summary (with ``confirm=True``) afterwards.
		"""
		seed = None
		score = ''
		origin = self.img.origin
		if origin:
			seed = origin.url()
			score = '({:.2f})'.format(origin._score)
		messages = []
		batch = []
		collected = []
		started = time()
		# crawl 3 blogs, showing an updated progress message each round
		for step in range(3):
			messages.append(index.get_crawler().message())
			header = ['Wait for crawler...', '',
				'Seed URL: {} {}'.format(seed, score), '']
			footer = ['', '{}/3'.format(step),
				'{} images (+{})'.format(len(collected), len(batch)),
				'time: {:.1f}'.format(time() - started)]
			self.message('\n'.join(header + messages + footer))
			batch = index.crawl(seed, num=1)
			collected.extend(batch)
		self.pool.extend(collected)
		# redraw the GUI so the freshly crawled images become visible
		self.display()
		# final status prompt (confirm=True waits for the user)
		elapsed = time() - started
		messages.append(index.get_crawler().message())
		crawler = index.get_crawler()
		summary = ['Done.',
			'Crawler returned {} new images in {:.1f} seconds'.format(
				len(collected), elapsed),
			'({:.2f} img/sec avg.).'.format(len(collected) / elapsed),
			'']
		status = ['', 'Crawler status:',
			' {} visited,'.format(len(crawler.visited)),
			' {} in queue.'.format(len(crawler.frontier))]
		self.message('\n'.join(summary + messages + status), confirm=True)
		self.redraw = True
Example 2
    # Resolve the crawl seed from the user-supplied ``url`` argument:
    # "-r"/"-rnd" asks for a random blog; otherwise the value is accepted
    # when it passes tumblr.proper_url or contains no dot (presumably a
    # bare blog name) — TODO confirm against tumblr module.
    # NOTE(review): this block is indented; its enclosing definition is
    # above the visible region.
    if url in ["-r", "-rnd"]:
        seed = tumblr.any().url()
    elif tumblr.proper_url(url) or url.count(".") < 1:
        seed = url
# Disabled fallback strategies for choosing a seed when none was given:
# if not seed:
# seed = sorted(index.blogs(), key=lambda t:len(t.proper_imgs))[-1]
# seed = sorted(index.blogs(), key=lambda t:t.score)[-1]
# if len(index.blogs())>0:
# seed = choice(index.blogs()).url()

proceed = True
imgs = []

n = 9
while proceed:
    imgs_new = index.crawl(seed, num=n)
    imgs.extend(imgs_new)
    # for i,p in enumerate(imgs_new):
    # for q in imgs_new[i+1:]:
    # if p.origin == q.origin:
    # sim = p.similarity(q)
    # if sim>.45:
    # picture.connect(p,q,sim)
    print "images so far:", len(imgs)
    seed = None
    proceed = raw_input("continue downloading? ").lower() in ["y", "yes"]
    n = max(n - 1, 3)

index.save()
index.inout.save_log("save")