Beispiel #1
0
	def get_html(self, url) -> Union[BeautifulSoup, None]:
		"""Download ``url`` and return the body parsed with the lxml parser.

		Before every attempt the ratio ``_proxies_error_count /
		(_get_count + _proxies_error_count)`` is evaluated. Once it exceeds
		0.5, proxy use is switched on, ``_get_count`` is reset and
		``change_proxy`` is called once.

		Connection errors, timeouts and proxy errors are retried without an
		upper bound. Any other ``requests.RequestException`` is retried (after
		``change_proxy``) while ``_error_count`` does not exceed the
		``error_count`` config value (default 3).

		:param url: address to fetch.
		:return: the parsed document, or ``None`` when the response status is
			not OK, the retry budget for non-transport errors is used up, or
			the fetch was interrupted (``stop`` is called in that case).
		"""
		transport_errors = (
			requests.exceptions.ConnectTimeout,
			requests.exceptions.ReadTimeout,
			requests.exceptions.ProxyError,
			requests.exceptions.ConnectionError,
		)

		# Retry with a loop instead of recursion: transport errors have no
		# retry cap, so a long outage would otherwise end in RecursionError.
		while True:
			failures = self._proxies_error_count
			errpoint = 0 if failures == 0 else failures / (self._get_count + failures)

			if errpoint > 0.5:
				# Too many transport failures: make sure proxies are enabled
				# and rotate exactly once per attempt.
				self.proxiesFlag = True
				self._get_count = 0
				self.change_proxy()

			try:
				# Pick per-request values without overwriting the configured
				# headers/proxies, so toggling a flag back on still works.
				headers = self.headers if self.headersFlag else None
				proxies = self.proxies if self.proxiesFlag else None
				logger.info('parse html {} start, proxies {} ...'.format(url, proxies))

				with self._session.get(url, headers=headers,
										proxies=proxies,
										timeout=15) as res:
					if not res.ok:
						return None

					self._res_Object = res
					self._res_headers = res.headers
					soup = BeautifulSoup(res.content, "lxml")

					# A successful fetch clears both failure counters.
					self._error_count = 0
					self._proxies_error_count = 0
					self._get_count += 1
					return soup

			except requests.RequestException as e:
				if isinstance(e, transport_errors):
					self._proxies_error_count += 1
					logger.info('proxies error, info {}, {} times...'.format(type(e), self._proxies_error_count))
					continue

				self._error_count += 1
				if self._error_count > get_config('error_count', 3):
					return None

				self.change_proxy()
				logger.info('wait for re-parsing html  , info {}, {} times...'.format(type(e), self._error_count))

			except (KeyboardInterrupt, SystemExit, InterruptedError):
				self.stop()
				return None
Beispiel #2
0
def create_connection() -> MySQLDatabase:
    """
    Return the module-wide MySQL database object, building it on first use.

    The connection settings are read from the ``db_dbname``, ``db_user``,
    ``db_password``, ``db_host`` and ``db_port`` config entries.

    :rtype: MySQLDatabase
    """
    global _db
    if not _db:
        # Nothing cached yet: read the settings and create the handle.
        dbname = get_config("db_dbname")
        options = {
            "user": get_config("db_user"),
            "password": get_config("db_password"),
            "host": get_config("db_host"),
            "port": get_config("db_port"),
        }
        _db = MySQLDatabase(dbname, **options)
    return _db
Beispiel #3
0
def create_connection_sqlite() -> SqliteDatabase:
    """
    Build a SQLite database object for the configured qiushibaike file.

    The location comes from the ``db_path_qiushibaike`` config entry
    (default ``./qiushibaike.db``) and is passed through ``get_abspath``.

    :rtype: SqliteDatabase
    :raises SqliteDbFilenotFoundError: if the resolved file does not exist.
    """
    configured = get_config('db_path_qiushibaike', './qiushibaike.db')
    db_file = get_abspath(configured)
    if os.path.exists(db_file):
        return SqliteDatabase(db_file)
    raise SqliteDbFilenotFoundError("Sqlit db file not exists")
Beispiel #4
0
	def create_connection(self) -> SqliteDatabase:
		"""
		Return this object's cached SQLite database, creating it on first use.

		The file location comes from the ``db_path`` config entry
		(default ``./scylla.db``) and is passed through ``get_abspath``.

		:rtype: SqliteDatabase
		:raises SqliteDbFilenotFoundError: if the resolved file does not exist.
		"""
		if not self._proxies_db:
			db_file = get_abspath(get_config('db_path', './scylla.db'))
			if not os.path.exists(db_file):
				raise SqliteDbFilenotFoundError("Sqlit db file not exists")
			self._proxies_db = SqliteDatabase(db_file)
		return self._proxies_db
Beispiel #5
0
 def __init__(self, arg=""):
     """Initialise the base class, then set up the download queue and worker.

     :param arg: value forwarded unchanged to the parent initialiser.
     """
     super(Fzdm, self).__init__(arg)
     # Queue capacity comes from the "maxsizequeue" config entry (default 10).
     queue_capacity = get_config("maxsizequeue", default=10)
     self._download_queue = queue.Queue(queue_capacity)
     self._worker = Worker()
    def get(self, entity):
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
            "cookie":
            "picHost=www-mipengine-org.mipcdn.com/i/p3.manhuapan.com; \
		               _ga=GA1.1.726297105.1589200721; \
		               _ga_1FZE0C2L80=GS1.1.1589465193.9.1.1589465605.0",
            "upgrade-insecure-requests": "1",
        }
        if isinstance(entity, ImgEntity):
            # 图片类型
            logger.debug('download {} start ...'.format(entity.wangluodizhi))
            res = None
            try:
                with requests.get(entity.wangluodizhi, headers = headers,\
                                                    timeout = (self.__connecttime, self.__readtime)) as res:
                    entity.kuozhanming = self.get_img_kuozhanming(res)
                    imgpath = self.get_img_path(
                        hashlib.md5(res.content).hexdigest(), entity)
                    entity.bendidizhi = imgpath

                    with open(imgpath, "wb") as f:
                        f.write(res.content)

                    m = ManhuaSub.update({
                        "bendidizhi":
                        imgpath,
                        "length":
                        res.headers["Content-Length"],
                        "lastModified":
                        res.headers["Last-Modified"]
                    }).where(ManhuaSub.wangluodizhi == entity.wangluodizhi)
                    m.execute()
                    m = m.get(ManhuaSub.wangluodizhi == entity.wangluodizhi)

                    logger.debug('download {} end, img info [{}].'.format(
                        m.biaoti, entity))
                    time.sleep(self.sleeptime)

                    self.__retrycount = 1
                    self.__readtime = 5
                    self.__connecttime = 5

                    return entity

            except requests.exceptions.ConnectionError as e:
                # todo
                logger.debug(
                    'download {} read connect error, wait for re-download {} times'
                    .format(entity.wangluodizhi, self.__retrycount))

                if self.__retrycount < get_config("retrytimes"):
                    self.__retrycount += 1
                    time.sleep(get_config("resleeptime"))
                    self.get(entity)

            except requests.exceptions.ReadTimeout as e:
                # todo
                logger.debug(
                    'download {} read timeout, wait for re-download {} times'.
                    format(entity.wangluodizhi, self.__retrycount))

                if self.__retrycount < get_config("retrytimes"):
                    self.__retrycount += 1
                    self.__readtime += 3
                    time.sleep(get_config("resleeptime"))
                    self.get(entity)

            except requests.exceptions.ConnectTimeout as e:
                # todo
                logger.debug(
                    'download {} connect timeout, wait for re-download {} time'
                    .format(entity.wangluodizhi, self.__retrycount))
                if self.__retrycount < get_config("retrytimes"):
                    self.__retrycount += 1
                    self.__connecttime += 3
                    time.sleep(get_config("resleeptime"))
                    self.get(entity)

            except Exception as e:
                logger.debug(
                    'download {} error, error info {}, stop download'.format(
                        entity.wangluodizhi, e))
            finally:
                res = None