Example #1
	def fetchForUrl(self, url):
		ret = False
		self.__feedingUrl = url
		try:
			Log.d("{Spider.fetchForUrl} requesting url(%s)", url)
			rs = requests.Session()
			resp = rs.get(url, timeout = CONN_TIME_OUT, proxies = PROXY_CONFIG)
			if resp.status_code == 200:
				resp.encoding = self.__getHtmlEncode(resp.text)
				#print '[Debug]Using encoding{%s} for HTML {%s}' % (resp.encoding, url)
				#print resp.text

				self.feed(resp.text)
				ret = True
			else:
				Log.e("{Spider.fetchForUrl} address(%s) can't be reached!", url)

		except requests.exceptions.ConnectionError as err:
			Log.e("{Spider.fetchForUrl} connect to address(%s) failed, exception<%s>", url, str(err),)
		except requests.exceptions.ReadTimeout as ex:
			Log.e("{Spider.fetchForUrl} connect to address(%s) time out", url)
		finally:
			self.__feedingUrl = None

		return ret
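
The private helper __getHtmlEncode is referenced above but never shown in these examples. A minimal sketch, assuming it scans the markup for a declared charset and falls back to UTF-8 (the project's actual detection logic may differ):

import re

def getHtmlEncode(text, default='utf-8'):
    # hypothetical stand-in for Spider.__getHtmlEncode: look for a charset
    # declaration in the HTML and fall back to a default encoding
    match = re.search(r'charset=["\']?([\w-]+)', text, re.IGNORECASE)
    return match.group(1) if match else default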
Example #2
def procMain(pid, states):
    states[pid] = STATE_IDLE

    # init black site list
    bSite = BlackSiteList(BLACK_SITE_FILE)
    try:
        while True:
            states[pid] = STATE_CONNECTING
            images = fetchImages()
            if images:
                states[pid] = STATE_BUSY
                for img in images:
                    if len(img.save_path) > 255:
                        img.save_path = img.save_path[:255]
                    save_dir = getDir() + '/' + img.save_path.replace(
                        '/', '\\')
                    file_name = None
                    if img.name != "":
                        file_name = img.name

                    Log.d("{procMain} downloading image (%s) ", img.url)

                    if Downloader.download(img.url, save_dir, file_name):
                        img.request_state = REQUEST_STATE.SUCC
                        img.save_path = save_dir
                    else:
                        site = img.url.split('/')[2]  # host part of the URL
                        fail_count = bSite.getFaileSiteCount(site)

                        img.request_state = REQUEST_STATE.FAIL
                        # retry download
                        retry_times = max(MAX_REQUEST_RETRY_TIME - fail_count, 0)

                        while retry_times > 0:
                            Log.i(
                                "{procMain} retry download image(%s) times(%d)",
                                img.url, retry_times)
                            retry_times = retry_times - 1
                            if Downloader.download(img.url, save_dir,
                                                   file_name):
                                img.request_state = REQUEST_STATE.SUCC
                                img.save_path = save_dir
                                break
                        if img.request_state != REQUEST_STATE.SUCC:
                            bSite.addFaileSite(site)
                            bSite.save()
                            #Log.e("{procMain} download image(%s) failed!", img.url)

                updateImages(images)
            else:
                time.sleep(3)  # sleep for a while to wait for the database update
    except KeyboardInterrupt:
        Log.i("{procMain} downloader process exit for a KeyboardInterrupt")
Example #3
def worker(pid, states):
    from time import sleep
    try:
        states[pid] = STATE_IDLE
        sleep(3)
        states[pid] = STATE_BUSY
        sleep(30)
        states[pid] = STATE_TERMINATE
    except KeyboardInterrupt:
        Log.d("worker end!")
Example #4
def uploadImages(images):
	Log.d("{uploadImages} uploading images(%d) ...", len(images))
	addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
	try:
		client = get_client(addr)
		client.send(CONFIG.UPLOAD_IMAGE)
		client.send(images)
		if client.recv() == CONFIG.ACTION_FAILED:
			Log.e("{uploadImages} upload fetched images failed!")
		else:
			Log.d("{uploadImages} upload images done!")
		client.close()
	except EOFError:
		Log.e("{uploadImages} server has been closed")
Example #5
def updateWebsiteStates(websites):
	Log.d("{updateWebsiteStates} updating websites ...")
	addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
	try:
		client = get_client(addr)
		if websites:
			client.send(CONFIG.UPDATE_WESITE_STATE)
			client.send(websites)
			if client.recv() == CONFIG.ACTION_FAILED:
				Log.e("{updateWebsiteStates} tell server to update website state failed!")
			else:
				Log.d("{updateWebsiteStates} updating websites done!")
		client.close()
	except EOFError:
		Log.e("{updateWebsiteStates} server has been closed")
Example #6
def updateImages(images):
    Log.d("{updateImages} updating images ...")
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    try:
        client = get_client(addr, means='Socket')
        client.send(CONFIG.UPDATE_IMAGE_STATE)
        client.send(images)
        if client.recv() == CONFIG.ACTION_FAILED:
            Log.e("{updateImages} tell server to update images state failed!")
        client.close()
    except EOFError:
        Log.e("{updateImages} server has been closed")
    else:
        Log.d("{updateImages} update images done")
Example #7
def procMain(pid, states):
	states[pid] = STATE_IDLE

	try:
		while True:
			states[pid] = STATE_CONNECTING
			Log.d("{procMain} fetching unvisited websites ...")
			websites = fetchWebsite()
			Log.d("{procMain} fetched websites(%d)", len(websites))
			if websites:
				states[pid] = STATE_BUSY
				wbs = set()
				images = set()

				for web in websites:
					spider = Spider()
					if spider.fetchForUrl(web.url):
						web.request_state = REQUEST_STATE.SUCC
						for url in spider.hrefs:
							wbs.add(DBWebsite(url = url, from_url = web.id, priority = calcPriority(url, web.url)))
						for img in spider.imgs:
							images.add(DBImage(url = img, from_website = web.id, save_path = spider.title))
						web.title = spider.title
					else:
						web.request_state = REQUEST_STATE.FAIL
						retry_times = MAX_REQUEST_RETRY_TIME

						while retry_times > 0:
							Log.i("{procMain} retry fetch url(%s) id(%d) times(%d)", web.url, web.id, retry_times)
							retry_times = retry_times - 1
							if spider.fetchForUrl(web.url):
								web.request_state = REQUEST_STATE.SUCC
								for url in spider.hrefs:
									wbs.add(DBWebsite(url = url, from_url = web.id, priority = calcPriority(url, web.url)))
								for img in spider.imgs:
									images.add(DBImage(url = img, from_website = web.id, save_path = spider.title))
								web.title = spider.title
								break
						if web.request_state != REQUEST_STATE.SUCC:
							Log.e("{procMain} fetch url(%s) id(%d) failed!", web.url, web.id)
				
				updateWebsiteStates(websites)
				uploadWesites(wbs)
				uploadImages(images)
			else:
				sleep(3) # sleep for a while to wait for the database update
	except KeyboardInterrupt:
		Log.i("{procMain} spider process exit for a KeyboardInterrupt")
Example #8
	def run(self):
		while not self.isTerminate:
			try:				
				task = self.__threadpool.tasks.get(block = True, timeout = Worker.THREAD_IDEL_TIME)
			except Exception:
				# no task arrived within the idle timeout, so this worker exits
				break

			task.value = task.callback(*task.param)
			task.done = True
			if self.__threadpool.returns is not None:
				self.__threadpool.returns.put(task)
			#print "[Debug]Task done!"

		self.isTerminate = True
		self.__thread = None
		self.__threadpool.removeThread(self)
		Log.d("{Worker.run} work thread terminated!")
Example #9
	def handle_starttag(self, tag, attrs):
		if tag == 'img':
			attrs = dict(attrs)
			if 'src' in attrs:
				url = self.__adjustUrl(attrs['src'])
				if self.__isValidateImage(url):
					Log.d("image: %s", url)
					self.imgs.add(url)

		if tag == 'input':
			attrs = dict(attrs)
			if attrs.get('type') == 'image' and 'src' in attrs:
				url = self.__adjustUrl(attrs['src'])
				if self.__isValidateImage(url):
					Log.d("image: %s", url)
					self.imgs.add(url)

		if tag == 'a':
			attrs = dict(attrs)
			if 'href' in attrs:
				url = self.__adjustUrl(attrs['href'])
				if self.__isValidateUrl(url):
					Log.d("link: %s", url)
					self.hrefs.add(url)

		if self.title is None:
			if not self.findTitle:
				if tag == 'div':
					attrs = dict(attrs)
					if attrs.get('class') == 't t2':
						self.findTitle = True
			elif tag == 'h4':
				self.canGetTitle = True
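
This handler follows the callback pattern of Python's html.parser.HTMLParser. A small self-contained illustration of the same idea, independent of the Spider class (which is not fully shown in these examples):

from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    # illustrative mini-parser using the same handle_starttag pattern
    def __init__(self):
        HTMLParser.__init__(self)
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'a' and 'href' in attrs:
            self.hrefs.add(attrs['href'])

parser = LinkCollector()
parser.feed('<a href="http://example.com/page">a link</a>')
print(parser.hrefs)  # {'http://example.com/page'}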
Example #10
	def removeThread(self, thread):
		with self.lockObj:
			self.threads.remove(thread)
		Log.d("{Worker.removeThread} pool thread count(%d)", len(self.threads))
Example #11
	def addThread(self, thread):
		with self.lockObj:
			self.threads.append(thread)
		Log.d("{Worker.addThread} pool thread count(%d)", len(self.threads))
Example #12
	def handle_data(self, data):
		if self.title is None and self.findTitle and self.canGetTitle:
			self.title = data
			Log.d("title: %s", self.title)
Example #13
if __name__ == '__main__':
	Log.setup('spider')
	Log.d('setting up spider......')

	#procMain(1, {})

	num = 1
	if len(os.sys.argv) > 1:
		num = int(os.sys.argv[1])
		if num < 1:
			num = 1

	pm = ProcessManager(procMain, maxWorker = num)
	pm.run()

Example #14


if __name__ == '__main__':
    Log.setup("download")
    Log.d('setting up downloader......')
    """
	procMain(1, {})
	"""
    num = 1
    if len(os.sys.argv) > 1:
        num = int(os.sys.argv[1])
        if num < 1:
            num = 1

    pm = ProcessManager(procMain, maxWorker=num)
    pm.run()