def readTaskList():
    MyUrl_SourceList = []
    ftp = open(TASK_FILENAME, 'r')
    for line in ftp.readlines():
        line = line.strip("\n")
        if not UrlUtil.isLegalUrl(line):
            break
        URL_inf = URLinformation(line, 0, 0.0, 0)  # format
        URL_inf.Flag = 0
        URL_inf.DeepNum = 1
        URL_inf.domain = UrlUtil.getdomain(line)
        MyUrl_SourceList.append(URL_inf)
    ftp.close()
    return MyUrl_SourceList
def readSourceListByParams(begin, end):
    '''
    Build the seed URL list from the URL configuration file
    :return: list of URLinformation objects
    '''
    if USE_BXBLS is True:
        MyUrl_SourceList = []
        ftp = open(SOURCEURL_FILENAME, 'r')
        # Line format, e.g.:
        # http://www.ccgp.gov.cn/cggg/zygg/index,0,0,0,0,24
        # http://www.ccgp.gov.cn/cggg/dfgg/index,0,0,0,0,24
        for line in ftp.readlines():
            myUrllist = line.split(',')
            for i in range(int(begin), int(end) + 1):  # store every page of each list
                if i == 0:
                    url = myUrllist[0] + ".htm"
                else:
                    url = myUrllist[0] + "_" + str(i) + ".htm"
                URL_inf = URLinformation(url, int(myUrllist[1]), 0.0, float(myUrllist[2]))  # format
                URL_inf.Flag = 0
                URL_inf.DeepNum = 1
                URL_inf.domain = UrlUtil.getdomain(url)
                MyUrl_SourceList.append(URL_inf)
    else:
        MyUrl_SourceList = []
        ftp = open(SOURCEURL_FILENAME, 'r')
        # Line format, e.g.:
        # http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=,&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2013%3A04%3A09&end_time=2014%3A04%3A08&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=,0,0,0,1,9068
        for line in ftp.readlines():
            myUrllist = line.split(',')
            for i in range(int(myUrllist[5]), int(myUrllist[6])):  # store every page of each list
                url = myUrllist[0] + str(i) + myUrllist[1]
                URL_inf = URLinformation(url, int(myUrllist[2]), 0.0, float(myUrllist[4]))  # format
                URL_inf.Flag = 0
                URL_inf.DeepNum = 1
                URL_inf.domain = UrlUtil.getdomain(url)
                MyUrl_SourceList.append(URL_inf)
    ftp.close()
    return MyUrl_SourceList
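# A standalone sketch (not project code) of the comma-separated line format that
# readSourceListByParams() consumes when USE_BXBLS is False; the field meanings are
# inferred from the indices used above: [0] URL prefix ending in "page_index=",
# [1] URL suffix, [2] an int parameter, [4] a float parameter, [5]/[6] first and last page.
sample_line = ("http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=,"
               "&bidSort=0&bidType=1&dbselect=bidx&kw=,0,0,0,1,5")
fields = sample_line.split(',')
for page in range(int(fields[5]), int(fields[6])):
    print(fields[0] + str(page) + fields[1])  # one search-result page URL per line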
def __init__(self, processName, crontab):
    self.processName = processName
    self.crontab = crontab
    # Initialise ipProxy and heads
    self.ipProxy = self.getIpPoolMethod()
    self.headersEngine = HeadersEngine()
    self.heads = self.headersEngine.getHeaders()
    # Database model and controller
    self.URL_inf = URLinformation()
    self.__Sendcollection = "httpsearchccgpgovcn"
    self.mogodbControl = None
def readSourceListRealTime():
    """
    Build the parent URLs for real-time crawling
    bidType field: bid type
    page_index field: page number
    start_time=2018%3A06%3A06 field: start time, 2018-06-06
    end_time=2018%3A06%3A06 field: end time, 2018-06-06
    """
    # http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=,
    # &end_time=,
    # &timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=
    # Take today's date:
    # nowTime = datetime.datetime.now().strftime('%Y-%m-%d').split('-')
    # strNowTime = nowTime[0]+'%3A'+nowTime[1]+'%3A'+nowTime[2]
    # Per Lao Luo: crawl a full week of data instead
    strNowTime = crawlerStartTime
    strEndTime = crawlerEndTime
    MyUrl_SourceList = []
    ftp = open(SOURCEURL_FILENAME, 'r')
    for line in ftp.readlines():
        myUrllist = line.split(',')
        # url = myUrllist[0]+strNowTime+myUrllist[1]+strNowTime+myUrllist[2]
        url = myUrllist[0] + strNowTime + myUrllist[1] + strEndTime + myUrllist[2]
        URL_inf = URLinformation(url.strip('\n'), int(0), 0.0, float(0))  # format
        URL_inf.Flag = 0
        URL_inf.DeepNum = 1
        URL_inf.domain = UrlUtil.getdomain(url)
        MyUrl_SourceList.append(URL_inf)
    ftp.close()
    return MyUrl_SourceList
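# A driver sketch (hypothetical, not part of the source) for the seed builders above:
# each URLinformation seed is serialised with class2dict() and produced to Kafka through
# the kafkaUrlinformation wrapper defined later in this section. The function name
# push_seed_urls is only illustrative.
import json

def push_seed_urls():
    kafka_operator = kafkaUrlinformation()      # producer/consumer wrapper, defined below
    for url_inf in readSourceListRealTime():    # one URLinformation per seed URL
        kafka_operator.producerUrl(json.dumps(url_inf.class2dict()))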
def getPageNumFromHome(self, dowloadData):
    """
    Build the paged result URLs from the landing page
    """
    if dowloadData['soup'] is None:
        return []
    else:
        # Log.i(dowloadData['content'].decode('utf-8'))
        selector = etree.HTML(dowloadData['content'].decode('utf-8'))
        try:
            page = (int(selector.xpath(
                '//div[@class="vT_z"]/div[1]/div/p[1]/span[2]/text()')[0]) // 20) + 3
        except:
            return []
        if page is None:
            return []
        parentURL_infor = []
        # Use a random number to decide whether to walk the pages in reverse order
        num = random.randint(3, 7)
        # Holds the rewritten url
        tempUrl = ''
        if (num % 2) == 0:
            for i in range(1, page):
                # TODO the string substitution is fragile (see the sketch after this method)
                # 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                # x = 'page_index=' + str(i)
                # dowloadData['Urlname'] = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                # TODO rebuilding the URL by concatenation was also problematic:
                # dowloadData['Urlname'] = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=' + str(i) \
                #     + '&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=' \
                #     + crawlerStartTime + '&end_time=' + crawlerEndTime + '&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                x = 'page_index=' + str(i)
                tempUrl = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                Log.i("parseUrl<<" + tempUrl)
                urlChildInfo = URLinformation(Urlname=tempUrl,
                                              title=dowloadData['title'],
                                              DeepNum=dowloadData['DeepNum'],
                                              domain=dowloadData['domain'],
                                              fatherUrl=dowloadData['fatherUrl'])
                parentURL_infor.append(urlChildInfo)
            else:
                if parentURL_infor is not None:
                    page = 0
            return parentURL_infor
        else:
            for i in range(page - 1, 0, -1):
                # TODO same fragile page_index substitution as above
                x = 'page_index=' + str(i)
                tempUrl = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                Log.i("parseUrl<<" + tempUrl)
                urlChildInfo = URLinformation(Urlname=tempUrl,
                                              title=dowloadData['title'],
                                              DeepNum=dowloadData['DeepNum'],
                                              domain=dowloadData['domain'],
                                              fatherUrl=dowloadData['fatherUrl'])
                parentURL_infor.append(urlChildInfo)
            else:
                if parentURL_infor is not None:
                    page = 0
            return parentURL_infor
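# A small standalone sketch (not project code) of why the r'page_index=(.)' substitution
# flagged in the TODOs above is fragile: the single-character group only swaps one digit,
# so any multi-digit page index is corrupted, while a \d+ pattern replaces the whole number.
import re

url = "http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=12&bidSort=0"
print(re.sub(r'page_index=(.)', 'page_index=3', url))  # ...page_index=32&bidSort=0  (corrupted)
print(re.sub(r'page_index=\d+', 'page_index=3', url))  # ...page_index=3&bidSort=0   (intended)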
def getChildrenLink(self, pageIndex):
    """
    Extract child links
    :return:
    """
    pattern = r'htt(p|ps):\/\/(\w+\.)+\w+/(\w+/)*'
    pattern = re.compile(pattern)
    # print("domain" + str(self.Url_inf.Urlname))
    Keyvalue = pattern.search(pageIndex['Urlname'])
    # Keyvalue <_sre.SRE_Match object; span=(0, 26), match='http://search.ccgp.gov.cn/'>
    # print("Keyvalue " + str(Keyvalue))
    # print(self.Url_inf.Urlname)
    if Keyvalue != None:
        Keyvalue = Keyvalue.group()
    else:
        Keyvalue = domain = urlparse(pageIndex['Urlname']).scheme + "://" + urlparse(pageIndex['Urlname']).netloc
    domain = Keyvalue
    URL_infor = []
    URL_infor2 = []
    Links = []
    link2 = ''
    title = ''
    currentTime = ''
    total_title = ''
    # if self.Url_inf.soup == None:
    #     return []
    if USE_BXBLS is True:
        # Two separate flows were considered here:
        # if self.Url_inf.Urlname.find("zygg"):
        #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
        # elif self.Url_inf.Urlname.find("dfgg"):
        #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
        # else:
        #     ul_content = self.Url_inf.soup
        if pageIndex['soup'] is None:
            return []
        else:
            urlInfoList = pageIndex['soup'].select(".vT-srch-result-list-bid")
            if urlInfoList is None:
                return []
            if urlInfoList:
                ul_content = urlInfoList[0]
            else:
                return []
        for li in ul_content.select("li"):
            link = li.select("a")[0]
            # emProvince = li.select("span")[2].get_text()
            spanProvince = li.select("span")[0]
            emProvince = spanProvince.select("a")[0].get_text()
            currentTime = time.time()
            try:
                href2 = link['href']
                total_title = link['title']
            except KeyError:
                pageIndex['soup'].select("a").remove(link)
            # else:
            if (href2.startswith("/")):  # startswith() returns True when the string begins with the given prefix
                # link2 = urljoin(self.Url_inf.Urlname, href2)
                # print(str(link2))
                # link2 = self.Url_inf.Urlname + href2
                title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
            elif (href2.startswith("../../..")):
                title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                # link2 = href2.replace('../../..', domain)
            elif href2.startswith(".."):
                title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                # link2 = href2.replace('..', domain)
            elif href2.startswith("./"):
                title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                # link2 = href2.replace('./', domain + '/')
            elif 'http' in href2 and 'gov' in href2:
                title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                # link2 = href2
            link2 = urljoin(pageIndex['Urlname'], href2)
            # print("link2 is :" + str(link2))
            # Fix truncated titles
            if title.find("...") > -1:
                title = total_title
            title = title.strip('\r')
            myLinkUrl = URLinformation(Urlname=link2, title=title,
                                       DeepNum=pageIndex['DeepNum'] - 1,
                                       domain=pageIndex['domain'],
                                       fatherUrl=pageIndex['Urlname'],
                                       province=emProvince,
                                       LastTime=currentTime)
            URL_infor.append(myLinkUrl)
    else:
        for link in pageIndex['soup'].select("a"):
            # print(str(self.Url_inf.soup))
            # Typical anchors on the page, e.g.:
            # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm" style="line-height:18px" target="_blank">(announcement title)</a>
            # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144053.htm" style="line-height:18px" target="_blank">(announcement title)</a>
            # print("children url is : " + str(link))
            try:
                href2 = link['href']  # the anchor's href; observed shapes:
                # http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144362.htm
                # javascript:void(0)
                # #
            except KeyError:
                pageIndex['soup'].select("a").remove(link)
            else:  # runs only when the try body succeeded
                if (href2.startswith("/")):  # startswith() returns True when the string begins with the given prefix
                    # link2 = urljoin(self.Url_inf.Urlname, href2)
                    # print(str(link2))
                    # link2 = self.Url_inf.Urlname + href2
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                elif (href2.startswith("../../..")):
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    # link2 = href2.replace('../../..', domain)
                elif href2.startswith(".."):
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    # link2 = href2.replace('..', domain)
                elif href2.startswith("./"):
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    # link2 = href2.replace('./', domain + '/')
                elif 'http' in href2 and 'gov' in href2:
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    # link2 = href2
                link2 = urljoin(pageIndex['Urlname'], href2)
                # print("link2 is :" + str(link2))
                myLinkUrl = URLinformation(Urlname=link2, title=title,
                                           DeepNum=pageIndex['DeepNum'] - 1,
                                           domain=pageIndex['domain'],
                                           fatherUrl=pageIndex['Urlname'])
                URL_infor.append(myLinkUrl)
    if USE_BXBLS is True:
        Links = list(set(URL_infor))
    else:
        # TODO this has raised: AttributeError: 'NoneType' object has no attribute 'select'
        for http in pageIndex['soup'].select('option'):  # contents of these <option> tags are not yet understood
            try:
                http2 = http['value']
                # print("option" + str(http))
            except KeyError:
                pageIndex['soup'].select("option").remove(http)
            else:
                if "gov" in http2 and 'http' in http2:
                    myLinkUrl2 = URLinformation(Urlname=http2, title=http.text,
                                                DeepNum=pageIndex['DeepNum'] - 1,
                                                domain=pageIndex['domain'],
                                                fatherUrl=pageIndex['Urlname'])
                    URL_infor2.append(myLinkUrl2)
        Links = list(set(URL_infor + URL_infor2))
    # Sample log output:
    # [2018-05-15 18:13:47.492] [INFO] [31469] [getChildrenLink(),ParseCCGPModule.py:129]: This url have 56 children urls1
    Log.i("This url have " + str(len(Links)) + " children urls" + str(pageIndex['DeepNum']))
    return Links
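# A standalone sketch (not project code) of how urljoin() resolves the href shapes
# special-cased in getChildrenLink(): root-relative, "../"-relative, "./"-relative and
# absolute links all collapse into the single urljoin() call the method ends up using.
from urllib.parse import urljoin

base = "http://www.ccgp.gov.cn/cggg/dfgg/index_1.htm"
for href in ("/cggg/zygg/t1.htm", "../../cggg/t2.htm", "./t3.htm",
             "http://search.ccgp.gov.cn/t4.htm"):
    print(urljoin(base, href))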
class localKafkaUrlinformation():
    def __init__(self):
        self.producer = self.__setproducer()
        self.consumer = self.__setconsumer()
        self.URL_inf = URLinformation()

    # def __del__(self):
    #     self.producer.close()
    #     self.consumer.close()

    def __setproducer(self):
        """
        Return the producer for the parent-link topic
        :return:
        """
        conf = localKafka_setting
        producer = KafkaProducer(bootstrap_servers=conf['bootstrap_servers'])
        return producer

    def __setconsumer(self):
        """
        Return the consumer for the parent-link topic
        :return:
        """
        conf = localKafka_setting
        try:
            consumer = KafkaConsumer(bootstrap_servers=conf['bootstrap_servers'],
                                     group_id=conf['consumer_id'])
        except KafkaError as e:
            Log.e(e + 'kafkaConsumer failed')
        return consumer

    # @AsycThread.async
    def producerUrl(self, strurl):
        """
        Produce a parent link
        :param strurl:
        """
        try:
            conf = localKafka_setting
            future = self.producer.send(conf['topic_name'], bytes(strurl, 'ASCII'))
            self.producer.flush()
            future.get()
        except KafkaError as e:
            # TODO handle kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs
            # https://stackoverflow.com/questions/48261501/kafka-errors-kafkatimeouterror-kafkatimeouterror-failed-to-update-metadata-aft
            self.producer.close()
            if self.producer is None:
                self.producer = self.__setproducer()
            Log.e(e + 'send message failed')
            pass

    def consumerurl(self):
        """
        Consume parent links
        """
        conf = localKafka_setting
        self.consumer.subscribe((conf['topic_name']))
        # TODO if this blocks, the consumer timed out; the underlying SDK reconnects on its own,
        # and this callback loop runs forever.
        for message in self.consumer:
            jsondata = str(message.value, "utf-8")
            Log.i(jsondata)
            # try:
            #     dictdata = json.loads(jsondata)
            # except Exception as e:
            #     Log.e(e + jsondata)
            #     continue

    # @AsycThread.async
    def producterUUID(self, strurl):
        """
        Produce the uuid on the ccgp topic
        :param strurl:
        """
        try:
            conf = localKafka_setting
            # TODO raises kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs.
            future = self.producer.send(conf['topic_name_ccgp'], bytes(strurl, 'ASCII'))
            self.producer.flush()
            future.get()
        except KafkaError as e:
            self.producer.close()
            if self.producer is None:
                self.producer = self.__setproducer()
            Log.e(e + 'send message failed')
            pass

    def setURL_inf(self, dictdata):
        """
        Populate the url data model
        :param dictdata:
        """
        self.URL_inf.dict2class(dictdata)

    def getURL_inf(self):
        """
        Expose the url data model
        :return:
        """
        return self.URL_inf
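# A minimal usage sketch (not part of the source), assuming localKafka_setting points at a
# reachable broker: produce one parent-link JSON string, then block consuming the topic.
# consumerurl() never returns, so in the project it would run in its own process or thread.
if __name__ == "__main__":
    local_kafka = localKafkaUrlinformation()
    local_kafka.producerUrl('{"Urlname": "http://www.ccgp.gov.cn/", "DeepNum": 1}')
    local_kafka.consumerurl()  # logs every message consumed from localKafka_setting['topic_name']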
class Downloader(multiprocessing.Process):
    def __init__(self, processName, pipeDictData):
        multiprocessing.Process.__init__(self)
        self.processName = processName
        self.pipeDictData = pipeDictData  # task url message queue (pipe)
        # Database model and controller
        self.__Sendcollection = "Send_collection"
        self.URL_inf = URLinformation()
        # self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"], Dbdata["default_collection"])
        # Mongo is opened lazily inside the worker process, otherwise the driver keeps warning about fork safety
        self.mogodbControl = None

    def ayncDownloadTask(self, ipProxy, DictData):
        """
        Run the crawl task asynchronously
        :param ipProxy:
        :param DictData:
        :return:
        """
        # Log.i(DictData)
        global mutex_lock
        mutex_lock.acquire()  # enter critical section (mutex)
        if self.URL_inf is None:
            self.URL_inf = URLinformation()
        # Map the source dict onto the entity class
        self.URL_inf.dict2class(DictData)
        # # De-dup logic (commented out):
        # if self.__checkURL(self.URL_inf.Urlname):
        #     return
        # else:
        #     item = {"_id": self.__getMD5(self.URL_inf.Urlname)}
        #     self.mogodbControl.insert(item, self.__Sendcollection)
        # TODO keeps Wang Yang's previous code logic
        self.URL_inf = self.__getSoupAndDeepnumOrDown(ipProxy)
        if self.URL_inf is None or self.URL_inf.Urlname is None or self.URL_inf.content is None:
            mutex_lock.release()  # leave critical section (mutex)
            return
        # De-dup logic (commented out), uid = urlName + content, e.g.
        #   hashlib.md5((urllib.parse.unquote(self.URL_inf.Urlname).join(str(self.URL_inf.content))).encode("utf-8")).hexdigest()
        # (an earlier variant that decoded content raised AttributeError: 'NoneType' object has no attribute 'decode')
        # if self.__checkURL(checkUrlUID):
        #     mutex_lock.release()
        #     return
        # else:
        #     item = {"_id": self.__getMD5(checkUrlUID)}
        #     self.mogodbControl.insert(item, self.__Sendcollection)
        # Send the child links
        self._sendChildUrl(self.URL_inf, mutex_lock)
        mutex_lock.release()  # leave critical section (mutex)

    @AsycThread.async
    def _sendChildUrl(self, URL_inf, mutex_lock):
        # Save the data and push the extracted child links back onto the corresponding topic
        KafkaOperator = kafkaUrlinformation()
        # TODO manage per-site logic with a dedicated class
        parseCCGPModule = ParserCCGPModule(URL_inf, KafkaOperator)
        ccgpChildrenLink = parseCCGPModule.getLinks()
        if ccgpChildrenLink is None:
            mutex_lock.release()  # leave critical section (mutex)
            return
        for link in ccgpChildrenLink:
            # Per Yu Hao: do not send parent links downstream
            if link.DeepNum >= 0:
                Log.i("produce<<" + json.dumps(link.class2dict()))
                KafkaOperator.producerUrl(json.dumps(link.class2dict()))

    def __downlowdFile(self, url, req):
        # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
        """
        File download logic, kept from Wang Yang's code; not yet debugged
        :param url:
        :param req:
        """
        reqheaders = req.headers
        revealfn = url.split('/')[-1]
        if "." in revealfn[-6:]:
            fileName = revealfn
        else:
            if ('Content-Disposition' in reqheaders.keys()):
                fileName = reqheaders['Content-Disposition'].split('filename=')[1]
                fileName = fileName.replace('"', '').replace("'", "")
            else:
                r = urllib.request.urlopen(url)
                if r.url != url:
                    fileName = basename(urlsplit(r.url)[2])
        self.URL_inf.FileName = fileName
        _FileName = None
        if (self.URL_inf.FilePath):
            _FileName = str(self.URL_inf.FilePath) + fileName
        else:
            _FileName = fileName
        with open(_FileName, "wb") as donefile:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    donefile.write(chunk)
        Log.i("File:" + _FileName + "downLoaded")

    def __getSoupAndDeepnumOrDown(self, ipProxy, headers=[]):
        """
        Fetch the page and do a light parse for child links
        :param proxiesmmm:
        :param headers:
        """
        html = None   # full page content
        ctifety = 0   # flag: child links can be parsed
        Flag = 1      # flag: fetch not finished yet
        count = 0     # retry counter for empty pages / errors
        # Initialise request headers
        headers = HEADERS
        headersEngine = HeadersEngine()
        # Big download loop for page content and files; heavy, slow I/O
        while (Flag):
            url = self.URL_inf.Urlname  # local alias
            try:
                # After any failure, switch to a fresh proxy IP
                if count > 0:
                    ipProxy = self.getIpPoolMethod()
                protocol = 'https' if 'https' in ipProxy else 'http'
                proxiesmmm = {protocol: ipProxy}
                # req = requests.get(url, headers=headers, proxies=proxiesmmm, timeout=2)  # ,proxies=proxiesmmm,stream=True
                # req = requests.get(url, headers=headers, proxies=proxiesmmm)
                # Work around HTTP timeouts (https://www.zhihu.com/question/52595659) and refuse the default 301/302 redirects
                req = requests.get(url, headers=headers, allow_redirects=False, proxies=proxiesmmm, timeout=3)
                if req.status_code != 200:
                    return None
                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    self.__downlowdFile(url=url, req=req)
                    self.URL_inf.Download = 1
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    self.URL_inf.Download = 0
                    ctifety = 1
                    Flag = 0  # done, leave the loop
                else:
                    return None
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                count += 1
                if count > 3:
                    return None
                pass
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count += 1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                headers = headersEngine.getHeaders()
                # Close spare connections; requests raised "Max retries exceeded with url" here
                s = requests.session()
                s.keep_alive = False
                if count > 3:
                    return None
                pass
            except Exception as e:
                Flag = 1
                count += 1
                # TODO handle/ignore javascript:void(0) URLs: https://www.zhihu.com/question/20626694?from=profile_question_card
                # TODO handle invalid header errors: Invalid return character or leading space in header: Accept-Language
                # TODO handle httpconnectionpool max retries: Failed to establish a new connection
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                headers = headersEngine.getHeaders()
                # Handling of "Max retries exceeded with url" errors
                s = requests.session()
                s.keep_alive = False
                if count > 3:
                    return None
                pass
        if ctifety:
            self.URL_inf.content = html
            soup = BeautifulSoup(html, 'html.parser')  # quick parse with BeautifulSoup
        else:
            soup = None
        self.URL_inf.soup = soup
        # Log.i(self.URL_inf.content.decode('utf-8'))
        return self.URL_inf  # fetch and light parse finished

    def __getMD5(self, url):
        """
        If content is None, hash only the url
        :return:
        """
        return hashlib.md5(url.encode("utf-8")).hexdigest()  # md5 hash

    def __checkURL(self, urlName):
        """
        De-dup check
        :param urlName:
        :return:
        """
        item = {"_id": urlName}
        value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
        if value == None:
            return False  # not found
        else:
            return True   # found

    def run_downloader(self, pipeDictData):
        """
        Download driver loop
        :param pipeDictData: data source (pipe)
        """
        # One proxy IP is reused for the same site
        ipProxy = None
        while True:
            # Log.i('run_downloader in {0}'.format(time.ctime()))
            # Fetch a task
            DictData = pipeDictData.recv()
            # Create the database connection lazily, once data arrives
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"], Dbdata["default_collection"])
            # Likewise only fetch a proxy IP when data arrives
            if DictData is not None:
                # Get a free proxy IP
                if ipProxy is None:
                    ipProxy = self.getIpPoolMethod()
                # Run the download asynchronously
                self.ayncDownloadTask(ipProxy, DictData)
            # else:
            #     # Sleep n seconds (read from the config file)
            #     items = ConfigUtil.getItems('consumerScheduler')
            #     interval_min = items['interval_min']
            #     interval_max = items['interval_max']
            #     seconds = random.randint(int(interval_min), int(interval_max))
            #     Log.i('run_downloader sleep ' + str(seconds) + ' seconds')
            #     time.sleep(seconds)
            #     continue

    def getIpPoolMethod(self):
        """
        Get a free proxy IP
        :return: one free proxy
        """
        ipProxy = None
        if ipProxy is None:
            if USE_PROXY is True:
                # Make sure at least one IP is available
                proxyIpPool = getIpProxyPool()
                if proxyIpPool is not None:
                    ipProxy = proxyIpPool
                if ipProxy is None:
                    ipProxy = PROXY_NONE_URL
            else:
                ipProxy = PROXY_NONE_URL
        return ipProxy
        # Commented-out alternative: also query getIpProxyPoolFromeRemote() and fall back
        # to the remote pool when the local pool returns nothing.

    def run(self):
        '''
        Process entry point: spawns the downloader loop and reads tasks forever
        :return:
        '''
        Log.i('Downloader.run() in {0}'.format(time.ctime()))
        p_list = list()
        downloaderRun = Process(target=self.run_downloader, args=(self.pipeDictData,))
        p_list.append(downloaderRun)
        for p in p_list:
            p.daemon = True
            p.start()
        for p in p_list:
            p.join()
class kafkaUrlinformation():
    def __init__(self):
        self.producer = self.__setproducer()
        self.consumer = self.__setconsumer()
        self.URL_inf = URLinformation()

    # def __del__(self):
    #     self.producer.close()
    #     self.consumer.close()

    def __setproducer(self):
        """
        Return the producer for the parent-link topic
        :return:
        """
        conf = kafka_setting
        context = ssl.create_default_context()
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = ssl.CERT_REQUIRED
        # context.check_hostname = True
        context.load_verify_locations(CACERT_FILENAME)
        producer = KafkaProducer(bootstrap_servers=conf['bootstrap_servers'],
                                 sasl_mechanism="PLAIN",
                                 ssl_context=context,
                                 security_protocol='SASL_SSL',
                                 api_version=(0, 10),
                                 retries=5,
                                 sasl_plain_username=conf['sasl_plain_username'],
                                 sasl_plain_password=conf['sasl_plain_password'])
        return producer

    def __setconsumer(self):
        """
        Return the consumer for the parent-link topic
        :return:
        """
        conf = kafka_setting
        context = ssl.create_default_context()
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = ssl.CERT_REQUIRED
        # context.check_hostname = True
        context.load_verify_locations(CACERT_FILENAME)
        try:
            consumer = KafkaConsumer(bootstrap_servers=conf['bootstrap_servers'],
                                     group_id=conf['consumer_id'],
                                     sasl_mechanism="PLAIN",
                                     ssl_context=context,
                                     security_protocol='SASL_SSL',
                                     api_version=(0, 10),
                                     sasl_plain_username=conf['sasl_plain_username'],
                                     sasl_plain_password=conf['sasl_plain_password'])
        except KafkaError as e:
            Log.e(e + 'kafkaConsumer failed')
        return consumer

    # @AsycThread.async
    def producerUrl(self, strurl):
        """
        Produce a parent link
        :param strurl:
        """
        try:
            conf = kafka_setting
            future = self.producer.send(conf['topic_name'], bytes(strurl, 'ASCII'))
            self.producer.flush()
            future.get()
        except KafkaError as e:
            # TODO handle kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs
            # https://stackoverflow.com/questions/48261501/kafka-errors-kafkatimeouterror-kafkatimeouterror-failed-to-update-metadata-aft
            self.producer.close()
            if self.producer is None:
                self.producer = self.__setproducer()
            Log.e(e + 'send message failed')
            pass

    def consumerurl(self, pipeDictData):
        """
        Consume parent links
        :param pipeDictData:
        """
        conf = kafka_setting
        self.consumer.subscribe((conf['topic_name']))
        # TODO if this blocks, the consumer timed out; the underlying SDK reconnects on its own,
        # and this callback loop runs forever.
        for message in self.consumer:
            jsondata = str(message.value, "utf-8")
            # Log.i(jsondata)
            try:
                dictdata = json.loads(jsondata)
            except Exception as e:
                Log.e(e + jsondata)
                continue
            # self.setURL_inf(dictdata)
            # Send the source dict on, driving the downloader
            pipeDictData.send(dictdata)
            # queueDictData.put(dictdata)

    @AsycThread.async
    def producterUUID(self, strurl):
        """
        Produce the uuid on the ccgp topic
        :param strurl:
        """
        try:
            conf = kafka_setting
            # TODO raises kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs.
            future = self.producer.send(conf['topic_name_ccgp'], bytes(strurl, 'ASCII'))
            self.producer.flush()
            future.get()
        except KafkaError as e:
            self.producer.close()
            if self.producer is None:
                self.producer = self.__setproducer()
            Log.e(e + 'send message failed')
            pass

    def setURL_inf(self, dictdata):
        """
        Populate the url data model
        :param dictdata:
        """
        self.URL_inf.dict2class(dictdata)

    def getURL_inf(self):
        """
        Expose the url data model
        :return:
        """
        return self.URL_inf
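# A wiring sketch (hypothetical driver, not part of the source) for how the pieces above
# appear intended to fit together: kafkaUrlinformation.consumerurl() feeds each consumed
# URL dict into one end of a multiprocessing.Pipe while a Downloader process drains the
# other end. Assumes the kafka_setting / proxy globals used above are configured.
if __name__ == "__main__":
    recv_end, send_end = multiprocessing.Pipe(duplex=False)  # recv_end goes to the Downloader
    downloader = Downloader("downloader-0", recv_end)
    downloader.start()
    kafkaUrlinformation().consumerurl(send_end)  # blocks forever, pushing dicts into the pipe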
class ComsumerChildUrl(multiprocessing.Process):
    def __init__(self, processName, pipeDictData):
        multiprocessing.Process.__init__(self)
        self.processName = processName
        # Initialise ipProxy and heads
        self.ipProxy = self.getIpPoolMethod()
        self.headersEngine = HeadersEngine()
        self.heads = self.headersEngine.getHeaders()
        # Database model and controller
        self.URL_inf = URLinformation()
        self.__Sendcollection = "httpsearchccgpgovcn"
        self.mogodbControl = None
        self.KafkaOperator = None
        self.pipeDictData = pipeDictData  # task url message queue (pipe)

    def downLoadHtml(self):
        """
        Fetch the page and extract child links
        """
        if self.ipProxy is None:
            self.ipProxy = self.getIpPoolMethod()
        if self.heads is None:
            self.heads = self.headersEngine.getHeaders()
        # Example of the dict driving this method:
        # {'DeepNum': 1, 'fatherUrl': None, 'Download': False, 'province': None, 'domain': 'http://search.ccgp.gov.cn',
        #  'FileName': None, 'Keyword': None, 'title': None, 'LastTime': 0.0, 'Flag': 0, 'soup': None, 'State': 0,
        #  'content': None,
        #  'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2018%3A06%3A07&end_time=2018%3A06%3A07&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=',
        #  'SleepTime': 0.0, 'FilePath': None}
        html = None   # full page content
        ctifety = 0   # flag: child links can be parsed
        Flag = 1      # flag: fetch not finished yet
        count = 0     # retry counter for empty pages / errors
        while (Flag):
            try:
                if count > 1:
                    self.ipProxy = self.getIpPoolMethod()
                protocol = 'https' if 'https' in self.ipProxy else 'http'
                proxiesmmm = {protocol: self.ipProxy}
                req = requests.get(self.URL_inf.Urlname, headers=self.heads, allow_redirects=False,
                                   proxies=proxiesmmm, timeout=3)
                # Detect the anti-crawler verification page ('安全验证' = security verification) and rotate the proxy
                soup_validate = BeautifulSoup(req.text, 'lxml')
                if soup_validate.find(name='title').string == '安全验证':
                    self.ipProxy = self.getIpPoolMethod()
                    continue
                if req.status_code != 200:
                    self.ipProxy = self.getIpPoolMethod()
                    continue
                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    data = self.__downlowdFile(data=self.URL_inf, req=req)
                    data['Download'] = 1
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    self.URL_inf.Download = 0
                    ctifety = 1
                    Flag = 0  # done, leave the loop
                else:
                    continue
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                count += 1
                if html is None:
                    Flag = 1
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count += 1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Close spare connections; requests raised "Max retries exceeded with url" here
                requests.adapters.DEFAULT_RETRIES = 5
                s = requests.session()
                s.keep_alive = False
                count += 1
                if html is None:
                    Flag = 1
                pass
            except Exception as e:
                Flag = 1
                count += 1
                # TODO handle/ignore javascript:void(0) URLs: https://www.zhihu.com/question/20626694?from=profile_question_card
                # TODO handle invalid header errors: Invalid return character or leading space in header: Accept-Language
                # TODO handle httpconnectionpool max retries: Failed to establish a new connection
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Handling of "Max retries exceeded with url" errors
                s = requests.session()
                s.keep_alive = False
                count += 1
                if html is None:
                    Flag = 1
                pass
        if ctifety:
            self.URL_inf.content = html
            soup = BeautifulSoup(html, 'html.parser')  # quick parse with BeautifulSoup
        else:
            soup = None
        self.URL_inf.soup = soup
        Log.i(self.URL_inf.content.decode('utf-8'))
        return self.URL_inf

    def __downlowdFile(self, data, req):
        # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
        """
        File download logic, kept from Wang Yang's code; not yet debugged
        :param data:
        :param req:
        """
        reqheaders = req.headers
        revealfn = data['Urlname'].split('/')[-1]
        if "." in revealfn[-6:]:
            fileName = revealfn
        else:
            if ('Content-Disposition' in reqheaders.keys()):
                fileName = reqheaders['Content-Disposition'].split('filename=')[1]
                fileName = fileName.replace('"', '').replace("'", "")
            else:
                r = urllib.request.urlopen(data['Urlname'])
                if r.url != data['Urlname']:
                    fileName = basename(urlsplit(r.url)[2])
        data['FileName'] = fileName
        _FileName = None
        if (data['FilePath']):
            _FileName = str(data['FilePath']) + fileName
        else:
            _FileName = fileName
        with open(_FileName, "wb") as donefile:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    donefile.write(chunk)
        Log.i("File:" + _FileName + "downLoaded")
        return data

    def get_md5(self, url, content):
        # If content is None, only the url is encoded
        """
        Generate an MD5 digest of the url joined with the content
        :return:
        """
        return hashlib.md5((urllib.parse.unquote(url).join(
            str(content)).encode("utf-8"))).hexdigest()  # md5 hash

    def __checkURL(self, urlName):
        """
        De-dup check
        :param urlName:
        :return:
        """
        item = {"_id": urlName}
        value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
        if value == None:
            return False  # not found
        else:
            return True   # found

    def getIpPoolMethod(self):
        """
        Get a free proxy IP
        :return: one free proxy
        """
        ipProxy = None
        if ipProxy is None:
            if USE_PROXY is True:
                # Make sure at least one IP is available
                proxyIpPool = getIpProxyPool()
                if proxyIpPool is not None:
                    ipProxy = proxyIpPool
                if ipProxy is None:
                    ipProxy = PROXY_NONE_URL
            else:
                ipProxy = PROXY_NONE_URL
        return ipProxy

    # @AsycThread.async
    def savedata(self, data):
        """
        Save data to MongoDB
        """
        # De-dup key
        uuid = self.get_md5(data.Urlname, data.title)
        urlInfo = {
            "uuid": uuid,
            "url": data.Urlname,
            "title": data.title,
            "time": datetime.now().timestamp(),
            "content": data.content,
            "fatherUrl": data.fatherUrl,
            "province": data.province,
            "LastTime": data.LastTime
        }
        string = data.domain.replace('.', '').replace('/', '').replace(':', '')
        # De-dup / delete / replace
        if data.province is not None and data.content is not None:
            item = {"uuid": uuid}
            value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
            # TODO the insert into the DB still has issues
            if value is None:
                # item = {"_id": uuid}
                self.mogodbControl.insert(urlInfo, self.__Sendcollection)
                # self.mogodbControl.ensure_index(item, self.__Sendcollection)
                self.KafkaOperator.producterUUID(json.dumps({
                    "uuid": uuid,
                    'collection': string
                }))

    def run(self):
        '''
        Process entry point: reads tasks in a loop, sleeping a random configured interval between iterations
        :return:
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Listen for data
            DictData = self.pipeDictData.recv()
            if DictData is None:
                continue
            # Map the source dict onto the entity class
            self.URL_inf.dict2class(DictData)
            # Check Mongo
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(DbdataCCGPDFZB["host"],
                                                      DbdataCCGPDFZB["port"],
                                                      DbdataCCGPDFZB["db_name"],
                                                      DbdataCCGPDFZB["default_collection"])
            # Check Kafka
            if self.KafkaOperator is None:
                self.KafkaOperator = localKafkaUrlinformation()
            # De-dup
            uuid = self.get_md5(self.URL_inf.Urlname, self.URL_inf.title)
            item = {"uuid": uuid}
            value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
            # TODO the insert into the DB still has issues
            if value is not None:
                continue
            # Fetch the page content
            self.URL_inf = self.downLoadHtml()
            if self.URL_inf is None:
                continue
            # Save the data (async)
            self.savedata(self.URL_inf)
            # Sleep n seconds (read from the config file)
            items = ConfigUtil.getItems('consumerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
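# A launch sketch (hypothetical, not part of the source) for ComsumerChildUrl: child-URL
# dicts arrive over a Pipe and the process de-duplicates against MongoDB, downloads the page
# and stores it. Assumes MongoDB, the local Kafka broker and the proxy/config globals used
# above are reachable; the sample dict below only illustrates the expected keys.
if __name__ == "__main__":
    child_recv, child_send = multiprocessing.Pipe(duplex=False)
    consumer_proc = ComsumerChildUrl("child-consumer-0", child_recv)
    consumer_proc.start()
    child_send.send({"Urlname": "http://www.ccgp.gov.cn/cggg/zygg/index.htm",
                     "DeepNum": 0, "title": None, "domain": "http://www.ccgp.gov.cn",
                     "fatherUrl": None, "province": None, "content": None, "LastTime": 0.0})
    consumer_proc.join()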