Esempio n. 1
0
def _image_site(url):
    """Return the third '/'-separated field of *url*, or None if absent.

    For a URL shaped like ``scheme://host/path`` that field is the host.
    URLs with fewer than three fields (for example a relative path) have no
    such field; None is returned instead of raising IndexError.
    """
    parts = url.split('/')
    if len(parts) > 2:
        return parts[2]
    return None


def _download_image(img, save_dir, file_name):
    """Try one download of ``img.url`` into *save_dir*.

    On success ``img.request_state`` is set to ``REQUEST_STATE.SUCC`` and
    ``img.save_path`` is overwritten with *save_dir*. Returns True on
    success and False otherwise; *img* is left untouched on failure.
    """
    if Downloader.download(img.url, save_dir, file_name):
        img.request_state = REQUEST_STATE.SUCC
        img.save_path = save_dir
        return True
    return False


def procMain(pid, states):
    """Run the image-downloader worker loop until KeyboardInterrupt.

    Each cycle fetches a batch with ``fetchImages()``, downloads every image
    (retrying failures), and hands the batch to ``updateImages()``. When no
    images are returned the loop sleeps for 3 seconds and polls again.

    pid    -- key under which this worker publishes its state.
    states -- container indexed by *pid*; it receives STATE_IDLE,
              STATE_CONNECTING and STATE_BUSY as the worker progresses
              (presumably shared with a supervising process -- confirm).
    """
    states[pid] = STATE_IDLE

    # init black site list
    bSite = BlackSiteList(BLACK_SITE_FILE)
    try:
        while True:
            states[pid] = STATE_CONNECTING
            images = fetchImages()
            if not images:
                # sleep for a while to wait for the database update
                time.sleep(3)
                continue

            states[pid] = STATE_BUSY
            for img in images:
                # Cap the stored path at 255 characters (presumably a
                # file-name length limit -- confirm).
                if len(img.save_path) > 255:
                    img.save_path = img.save_path[:255]
                # '/' inside the stored path is swapped for '\\' before it
                # is appended to the base directory.
                save_dir = getDir() + '/' + img.save_path.replace('/', '\\')
                # An empty name is passed to the downloader as None.
                file_name = img.name if img.name != "" else None

                Log.d("{procMain} downloading image (%s) ", img.url)

                if _download_image(img, save_dir, file_name):
                    continue

                img.request_state = REQUEST_STATE.FAIL

                # Failed downloads are exactly where malformed URLs show up,
                # so the site lookup must not raise: with no host field the
                # full retry budget is used and the black list is left alone.
                site = _image_site(img.url)
                if site is None:
                    fail_count = 0
                else:
                    fail_count = bSite.getFaileSiteCount(site)

                # retry download; earlier failures recorded for this site
                # shrink the retry budget, never below zero
                retry_times = max(0, MAX_REQUEST_RETRY_TIME - fail_count)
                while retry_times > 0:
                    Log.i(
                        "{procMain} retry download image(%s) times(%d)",
                        img.url, retry_times)
                    retry_times = retry_times - 1
                    if _download_image(img, save_dir, file_name):
                        break

                if img.request_state != REQUEST_STATE.SUCC and site is not None:
                    # Record the failure against the site and save the list.
                    bSite.addFaileSite(site)
                    bSite.save()

            updateImages(images)
    except KeyboardInterrupt:
        Log.i("{procMain} downloader process exit for a KeyboardInterrupt")
Esempio n. 2
0
def _crawl_website(spider, web, wbs, images):
	"""Fetch ``web.url`` once with *spider* and collect what it found.

	On success ``web.request_state`` becomes ``REQUEST_STATE.SUCC``,
	``web.title`` is set from the spider, one ``DBWebsite`` per entry of
	``spider.hrefs`` is added to *wbs* and one ``DBImage`` per entry of
	``spider.imgs`` is added to *images*. Returns True on success; on
	failure nothing is modified and False is returned.
	"""
	if not spider.fetchForUrl(web.url):
		return False

	web.request_state = REQUEST_STATE.SUCC
	for url in spider.hrefs:
		wbs.add(DBWebsite(url=url, from_url=web.id, priority=calcPriority(url, web.url)))
	for img in spider.imgs:
		images.add(DBImage(url=img, from_website=web.id, save_path=spider.title))
	web.title = spider.title
	return True


def procMain(pid, states):
	"""Run the spider worker loop until KeyboardInterrupt.

	Each cycle fetches a batch with ``fetchWebsite()``, crawls every entry
	(retrying failures up to MAX_REQUEST_RETRY_TIME times), then calls
	``updateWebsiteStates``, ``uploadWesites`` and ``uploadImages`` with the
	results. When the batch is empty the loop sleeps for 3 seconds.

	pid    -- key under which this worker publishes its state.
	states -- container indexed by *pid*; it receives STATE_IDLE,
	          STATE_CONNECTING and STATE_BUSY as the worker progresses
	          (presumably shared with a supervising process -- confirm).
	"""
	states[pid] = STATE_IDLE

	try:
		while True:
			states[pid] = STATE_CONNECTING
			Log.d("{procMain} fetching unvisited websites ...")
			websites = fetchWebsite()
			# Count defensively: a None batch is handled by the guard below
			# and must not raise TypeError inside the log call.
			Log.d("{procMain} fetched websites(%d)", len(websites) if websites else 0)
			if not websites:
				sleep(3) # sleep for a while to wait for the database update
				continue

			states[pid] = STATE_BUSY
			wbs = set()
			images = set()

			for web in websites:
				# One Spider per website; the same instance is reused for retries.
				spider = Spider()
				if _crawl_website(spider, web, wbs, images):
					continue

				web.request_state = REQUEST_STATE.FAIL
				# The logged counter runs from MAX_REQUEST_RETRY_TIME down to 1.
				for retry_times in range(MAX_REQUEST_RETRY_TIME, 0, -1):
					Log.i("{procMain} retry fetch url(%s) id(%d) times(%d)", web.url, web.id, retry_times)
					if _crawl_website(spider, web, wbs, images):
						break
				if web.request_state != REQUEST_STATE.SUCC:
					Log.e("{procMain} fetch url(%s) id(%d) failed!", web.url, web.id)

			updateWebsiteStates(websites)
			uploadWesites(wbs)
			uploadImages(images)
	except KeyboardInterrupt:
		Log.i("{procMain} spider process exit for a KeyboardInterrupt")