Example #1
	def __adjustUrl(self, src_url):
		if not self.__feedingUrl:
			Log.e("{Spider.__adjustUrl} self.__feedingUrl is None")
			return src_url

		if src_url == '#':
			return self.__feedingUrl

		if src_url.startswith('//'):
			src_url = "http:" + src_url
			return src_url

		if src_url.startswith('http://') or src_url.startswith('https://'):
			return src_url

		if src_url.startswith('javascript:'):
			return src_url

		elems = src_url.split('/')
		feeding = self.__feedingUrl.split('/')

		# drop '.' segments; removing items while iterating over the same list skips entries
		elems = [e for e in elems if e != '.']

		while feeding:
			feeding.pop()
			if not elems or elems[0] != '..':
				break
			elems.pop(0)

		url = feeding + elems
		return '/'.join(url)
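
For reference, the standard library already implements this relative-URL resolution. A minimal sketch, assuming only that '#' and 'javascript:' links should be passed through the same way as above:

try:
	from urllib.parse import urljoin  # Python 3
except ImportError:
	from urlparse import urljoin  # Python 2

def adjust_url(feeding_url, src_url):
	# '#' points back at the current page; 'javascript:' is not a fetchable URL.
	if src_url == '#':
		return feeding_url
	if src_url.startswith('javascript:'):
		return src_url
	# urljoin resolves '//host/x', './a', '../a' and absolute URLs against the base.
	return urljoin(feeding_url, src_url)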
Example #2
def uploadImages(images):
	Log.d("{uploadImages} uploading images(%d) ...", len(images))
	addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
	try:
		client = get_client(addr)
		client.send(CONFIG.UPLOAD_IMAGE)
		client.send(images)
		if client.recv() == CONFIG.ACTION_FAILED:
			Log.e("{uploadImages} uploading fetched images failed!")
		else:
			Log.d("{uploadImages} upload images done!")
		client.close()
	except EOFError:
		Log.e("{uploadImages} server has been closed")
Example #3
def updateWebsiteStates(websites):
	Log.d("{updateWebsiteStates} updating websites ...")
	addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
	try:
		client = get_client(addr)
		if websites:
			client.send(CONFIG.UPDATE_WESITE_STATE)
			client.send(websites)
			if client.recv() == CONFIG.ACTION_FAILED:
				Log.e("{updateWebsiteStates} server failed to update website states!")
			else:
				Log.d("{updateWebsiteStates} updating websites done!")
		client.close()
	except EOFError:
		Log.e("{updateWebsiteStates} server has been closed")
Example #4
    def download(src, save_dir=DIR_PATH, file_name=None):
        ext = ""
        reg = IMGREG.search(src)
        if reg:
            ext = '.' + reg.group(1)
        if file_name is None:
            sha = hashlib.sha1()
            sha.update(src.encode('utf-8'))  # hashlib needs bytes on Python 3
            file_name = sha.hexdigest()

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        path = os.path.join(save_dir, file_name + ext)

        ret = False
        try:
            response = requests.get(src,
                                    stream=True,
                                    verify=False,
                                    timeout=CONNECT_TIME_OUT,
                                    proxies=PROXY_CONFIG)
            try:
                with open(path, 'wb') as fp:
                    try:
                        if response.raw.status == 200:
                            shutil.copyfileobj(response.raw, fp)
                            ret = True
                        else:
                            Log.e(
                                "{Downloader.download} HTTP response status(%d) while downloading (%s)",
                                response.raw.status, src)
                    except requests.packages.urllib3.exceptions.ReadTimeoutError as e:
                        Log.e(
                            "{Downloader.download} download file (%s) failed! Exception(%s)",
                            src, str(e))
            except IOError:
                Log.e("{Downloader.download} can't write file to path (%s)",
                      path)
        except requests.exceptions.ConnectionError as err:
            Log.e("{Downloader.download} connect error: Exception<%s>",
                  str(err))
        except Exception as ex:
            Log.e("{Downloader.download} download (%s): Raise exception<%s>",
                  src, str(ex))

        if not ret and os.path.exists(path):
            os.remove(path)  # delete the partially written file on failure
        return ret
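
Hypothetical usage, assuming the method hangs off the Downloader class its log tags name and that IMGREG matches a trailing image extension; the URL and directory are stand-ins:

ok = Downloader.download("http://example.com/images/photo.jpg",
                         save_dir="/tmp/spider_images")
if not ok:
    Log.e("image download failed; any partial file was removed")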
Example #5
def updateImages(images):
    Log.d("{updateImages} updating images ...")
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    try:
        client = get_client(addr, means='Socket')
        client.send(CONFIG.UPDATE_IMAGE_STATE)
        client.send(images)
        if client.recv() == CONFIG.ACTION_FAILED:
            Log.e("{updateImages} tell server to update images state failed!")
        client.close()
    except EOFError:
        Log.e("{updateImages} server has been closed")

    Log.d("{updateImages} update images done")
Example #6
def procMain(pid, states):
	states[pid] = STATE_IDLE

	try:
		while True:
			states[pid] = STATE_CONNECTING
			Log.d("{procMain} fetching unvisited websites ...")
			websites = fetchWebsite()
			Log.d("{procMain} fetched websites(%d)", len(websites))
			if websites:
				states[pid] = STATE_BUSY
				wbs = set()
				images = set()

				for web in websites:
					spider = Spider()
					if spider.fetchForUrl(web.url):
						web.request_state = REQUEST_STATE.SUCC
						for url in spider.hrefs:
							wbs.add(DBWebsite(url = url, from_url = web.id, priority = calcPriority(url, web.url)))
						for img in spider.imgs:
							images.add(DBImage(url = img, from_website = web.id, save_path = spider.title))
						web.title = spider.title
					else:
						web.request_state = REQUEST_STATE.FAIL
						retry_times = MAX_REQUEST_RETRY_TIME

						while retry_times > 0:
							Log.i("{procMain} retry fetch url(%s) id(%d) times(%d)", web.url, web.id, retry_times)
							retry_times -= 1
							if spider.fetchForUrl(web.url):
								web.request_state = REQUEST_STATE.SUCC
								for url in spider.hrefs:
									wbs.add(DBWebsite(url = url, from_url = web.id, priority = calcPriority(url, web.url)))
								for img in spider.imgs:
									images.add(DBImage(url = img, from_website = web.id, save_path = spider.title))
								web.title = spider.title
								break
						if web.request_state != REQUEST_STATE.SUCC:
							Log.e("{procMain} fetch url(%s) id(%d) failed!", web.url, web.id)
				
				updateWebsiteStates(websites)
				uploadWesites(wbs)
				uploadImages(images)
			else:
				sleep(3) # sleep for a while to wait for the database update
	except KeyboardInterrupt:
		Log.i("{procMain} spider process exit for a KeyboardInterrupt")
Example #7
	def fetchForUrl(self, url):
		ret = False
		self.__feedingUrl = url
		try:
			Log.d("{Spider.fetchForUrl} requesting url(%s)", url)
			rs = requests.session()
			resp = rs.get(url, timeout = CONN_TIME_OUT, proxies = PROXY_CONFIG)
			if resp.status_code == 200:
				resp.encoding = self.__getHtmlEncode(resp.text)
				#print '[Debug]Using encoding{%s} for HTML {%s}' % (resp.encoding, url)
				#print resp.text

				self.feed(resp.text)
				ret = True
			else:
				Log.e("{Spider.fetchForUrl} address(%s) can't be reached!", url)

		except requests.exceptions.ConnectionError as err:
			Log.e("{Spider.fetchForUrl} connect to address(%s) failed, exception<%s>", url, str(err))
		except requests.exceptions.ReadTimeout:
			Log.e("{Spider.fetchForUrl} connect to address(%s) timed out", url)
		finally:
			self.__feedingUrl = None

		return ret
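
__getHtmlEncode is not shown in these examples. A minimal sketch of what such a helper commonly does, namely reading the charset the page declares about itself; this is an assumption, not the project's implementation:

import re

CHARSET_RE = re.compile(r'charset=["\']?([\w-]+)', re.IGNORECASE)

def get_html_encode(html, default = 'utf-8'):
	# Fall back to UTF-8 when the page declares no charset.
	match = CHARSET_RE.search(html)
	return match.group(1) if match else default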
Example #8
def fetchWebsite():
	addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
	websites = []
	try:
		client = get_client(addr, means = CONFIG.COMMUNICATE_MEANS)
		client.send(CONFIG.FETCH_WEBSITE)
		result = client.recv()
		if result == CONFIG.ACTION_FAILED:
			Log.e("{Spider.fetchWebsite} get validate website failed")
		else:
			websites = client.recv()
		client.close()
	except EOFError:
		Log.e("{Spider.fetchWebsite} server has been closed")
	except Exception as e:
		Log.e("{Spider.fetchWebsite} raise exceptions<%s>", str(e))

	return websites
Example #9
def fetchImages():
    addr = (CONFIG.DB_HOST, CONFIG.RPC_PORT)
    images = None
    try:
        client = get_client(addr, means=CONFIG.COMMUNICATE_MEANS)
        client.send(CONFIG.FETCH_IMAGE)
        result = client.recv()
        if result == CONFIG.ACTION_FAILED:
            Log.e("{fetchImages} get download images failed")
        else:
            images = client.recv()

        client.close()
    except EOFError:
        Log.e("{fetchImages} server side has been closed")
    except Exception as e:
        Log.e("{fetchImages} raise exceptions: %s", str(e))

    return images
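
The client side of fetchWebsite and fetchImages implies a two-message reply: a status word first, the payload second. A hypothetical server-side counterpart, where ACTION_OK is an assumed success constant and handle_fetch a made-up name:

def handle_fetch(conn, payload):
    # Mirror the client: status first, payload only on success.
    if payload is None:
        conn.send(CONFIG.ACTION_FAILED)
    else:
        conn.send(CONFIG.ACTION_OK)  # ACTION_OK is assumed, not shown above
        conn.send(payload)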