Example #1
    def __downlowdFile(self, url, req):
        # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
        """
        下载文件的代码逻辑.沿用王洋逻辑,还没调试
        :param url:
        :param req:
        """
        reqheaders = req.headers
        revealfn = url.split('/')[-1]

        if "." in revealfn[-6:]:
            fileName = revealfn
        else:
            if ('Content-Disposition' in reqheaders.keys()):
                fileName = reqheaders['Content-Disposition'].split('filename=')[1]
                fileName = fileName.replace('"', '').replace("'", "")
            else:
                r = urllib.request.urlopen(url)
                if r.url != url:
                    fileName = basename(urlsplit(r.url)[2])
            self.URL_inf.FileName = fileName

        _FileName = None
        if (self.URL_inf.FilePath):
            _FileName = str(self.URL_inf.FilePath) + fileName
        else:
            _FileName = fileName

        with open(_FileName, "wb") as donefile:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    donefile.write(chunk)

        Log.i("File:"+_FileName+"downLoaded")
Example #2
 def run(self):
     '''
     Dispatch the parsed results.
     :return: None
     '''
     Log.i('Pipeline.run()')
     if self.task['results'] is not None and len(self.task['results']) > 0:
         # Enqueue the next round of tasks
         if self.task['next_tasks'] is not None:
             for next_task in self.task['next_tasks']:
                 self.taskUtil.insert_one(next_task)
         # Persist this round's parsed results
         # Use reflection to call pipeline_<parser>(); fall back to pipeline_default() if it is missing
         if hasattr(self, 'pipeline_' + self.task['parser']):
             func = getattr(self, 'pipeline_' + self.task['parser'])
             func(self.task['table'])
         else:
             self.pipeline_default(self.task['table'])
         # Store the complete task back into Mongo and mark this task as done
         self.task['state'] = 'done'
         self.taskUtil.replace_one(self.task['_id'], self.task)
     else:
         # No results were parsed, which means something failed; leave the task for the next run
         pass
     Log.i('this task is finished')
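
The pipeline_<parser>() lookup is plain getattr() reflection. A self-contained sketch of the pattern (class and method names below are made up for illustration); passing a default to getattr() also removes the need for the hasattr() check:

class Dispatcher:
    def handle(self, parser_name, table):
        # Look up pipeline_<parser_name>() if it exists, otherwise use the default
        handler = getattr(self, 'pipeline_' + parser_name, self.pipeline_default)
        return handler(table)

    def pipeline_news(self, table):
        return 'news rows -> %s' % table

    def pipeline_default(self, table):
        return 'default rows -> %s' % table

print(Dispatcher().handle('news', 't_news'))     # news rows -> t_news
print(Dispatcher().handle('unknown', 't_misc'))  # default rows -> t_misc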
Example #3
 def find_one_and_replace(self, collection_name, filter_dict, replace_dict, upsert=False, auto_uptime=True):
     """
     查找并更新表记录,默认返回false,保证原子性
     :param collection_name: str 集合名
     :param filter_dict: dict 过滤条件,如{'campaignId':{'$in':[1,2,3]}}
     :param update_dict: dict 更新的字段,如{'$set':{status_key:0,'campaign.status':1},{'$unset':'campaign.name':'test_camp'}}
     :param insert: bool 如果需要更新的记录不存在是否插入
     :param multi: bool 是否更新所有符合条件的记录, False则只更新一条,True则更新所有
     :return: Document 更新成功后的文档
     """
     result = None
     try:
         if auto_uptime:
             timestamp = time.time()
             uptimestamp = int(round(timestamp * 1000))
             uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
             replace_dict['uptime'] = uptime
             replace_dict['uptimestamp'] = uptimestamp
         collection = self.database.get_collection(collection_name)
         document = collection.find_one_and_replace(
             filter_dict, replace_dict, upsert=upsert, return_document=ReturnDocument.AFTER)
         result = document
         if result is None:
             Log.i("[INFO] find and update nothing!")
         else:
             Log.d("[INFO] find and update success!")
     except Exception as e:
         Log.e('find and update failed: %s' % e)
     finally:
         return result
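
A minimal usage sketch of pymongo's find_one_and_replace(), which the wrapper above delegates to; it assumes a MongoDB instance on localhost, and the database, collection, and field names are placeholders:

import pymongo
from pymongo import ReturnDocument

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
coll = client.get_database('crawler').get_collection('tasks')

# Replace the whole matching document and get the post-replacement version back
doc = coll.find_one_and_replace(
    {'uuid': 'abc123'},                    # filter
    {'uuid': 'abc123', 'state': 'done'},   # full replacement document
    upsert=True,                           # insert if nothing matches
    return_document=ReturnDocument.AFTER,  # return the document after replacement
)
print(doc)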
 def __init__(self, host, port, db_name, default_collection):
     Log.i('Init MongoDB')
     self.client = pymongo.MongoClient(
         host=host, port=port, connect=False
     )  # MongoClient is thread-safe and designed to be shared across threads
     self.db = self.client.get_database(db_name)
     self.collection = self.db.get_collection(default_collection)
Example #5
 def run(self):
     '''
     Loop forever: fetch ready tasks and hand each one to a Downloader,
     sleeping a configurable interval between iterations.
     :return:
     '''
     # Create a process pool
     pool = Pool()
     while True:
         # Fetch one ready task and mark it as doing
         task = self.taskUtil.get_ready()
         if task is not None and len(task) > 0:
             Log.i('-----------------------------')
             # Run the Downloader in the process pool
             pool.apply_async(self.run_downloader, args=(task, ))
         # Sleep for n seconds (interval range read from the config file)
         items = ConfigUtil.getItems('scheduler')
         interval_min = items['interval_min']
         interval_max = items['interval_max']
         seconds = random.randint(int(interval_min), int(interval_max))
         Log.i('Start sleep ' + str(seconds) + ' seconds')
         time.sleep(seconds)
     # Unreachable while the loop above runs forever; kept for a future exit path
     pool.close()
     pool.join()
     Log.i('All subprocesses done.')
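
A minimal sketch of the Pool.apply_async() polling pattern used above, with a bounded loop so that close()/join() are actually reached (the worker and task source are stand-ins):

import random
import time
from multiprocessing import Pool

def run_downloader(task):
    # Stand-in worker: pretend to download the task
    time.sleep(0.1)
    return 'done: %s' % task

if __name__ == '__main__':
    pool = Pool(processes=4)
    results = []
    for i in range(10):                        # bounded loop instead of while True
        task = 'task-%d' % i                   # stand-in for taskUtil.get_ready()
        results.append(pool.apply_async(run_downloader, args=(task,)))
        time.sleep(random.uniform(0.05, 0.2))  # the original sleeps a configured interval
    pool.close()                               # reachable here because the loop ends
    pool.join()
    for r in results:
        print(r.get())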
def run():
    if len(sys.argv) < 4:
        Log.i("missing required params: isCrontab begin end")
        os._exit(0)
    else:
        isCrontab = sys.argv[1]
        begin = sys.argv[2]
        end = sys.argv[3]

    # Whether started by crontab: '1' means yes, anything else means no
    if isCrontab == '1':
        crontab = 1
    else:
        crontab = 0

    p_list = list()
    producerProcess = ProducerUrl("producer", crontab, begin, end)
    p_list.append(producerProcess)

    # start = TimeUtil.getDefaultTimeIt()

    for p in p_list:
        p.daemon = True
        p.start()
    for p in p_list:
        p.join()

    # end = TimeUtil.getDefaultTimeIt()
    # Log.i('ProducerUrlParentPid run for %.2fm' %(end - start))
    if crontab==1:
        os._exit(0)
Example #7
 def consumerurl(self):
     """
     消费父链接
     :param queueDictData:
     """
     conf = localKafka_setting
     self.consumer.subscribe([conf['topic_name']])  # subscribe() expects a list of topics
     # TODO This loop blocks forever: on consumer timeouts the underlying SDK reconnects automatically and keeps listening
     for message in self.consumer:
         jsondata = str(message.value, "utf-8")
         Log.i(jsondata)
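
A self-contained consumer sketch with kafka-python, showing the subscribe-then-iterate loop used above; the broker address, group id, and topic name are placeholders:

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    bootstrap_servers='127.0.0.1:9092',
    group_id='crawler-consumers',
    auto_offset_reset='earliest',
)
consumer.subscribe(['parent_urls'])  # subscribe() takes a list of topic names

# Iterating the consumer blocks and yields messages as they arrive
for message in consumer:
    payload = message.value.decode('utf-8')
    print(message.topic, message.partition, message.offset, payload)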
Example #8
    def _sendChildUrl(self,URL_inf, mutex_lock):
        # Save the data and extract child links, then produce them back to the corresponding topic
        KafkaOperator = kafkaUrlinformation()
        # TODO manage per-site logic with dedicated classes
        parseCCGPModule = ParserCCGPModule(URL_inf, KafkaOperator)
        ccgpChildrenLink = parseCCGPModule.getLinks()

        if ccgpChildrenLink is None:
            mutex_lock.release()  # end of the critical section: release the mutex
            return

        for link in ccgpChildrenLink:
            # Yu Hao asked us not to send parent links downstream
            if link.DeepNum >= 0:
                Log.i("produce<<"+json.dumps(link.class2dict()))
                KafkaOperator.producerUrl(json.dumps(link.class2dict()))
    def run(self):

        try:
            pass
        except Exception as e:
            Log.i("AsyncThreadScanner run exception<<" + str(e))

        # Remove this thread from the thread list
        AsyncThreadScanner.lck.acquire()
        AsyncThreadScanner.tList.remove(self)

        # If the live thread count drops to maxThreads - 1, a thread is waiting to run: set the event to wake it, then clear it
        if len(AsyncThreadScanner.tList) == AsyncThreadScanner.maxThreads - 1:
            AsyncThreadScanner.event.set()
            AsyncThreadScanner.event.clear()

        AsyncThreadScanner.lck.release()
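
The class attributes referenced above (lck, tList, maxThreads, event) suggest a fixed-size pool of threads guarded by a Lock and woken through an Event. A minimal sketch of that pattern with placeholder names; the original class is not reproduced here:

import threading
import time

class BoundedThread(threading.Thread):
    lck = threading.Lock()
    event = threading.Event()
    tList = []
    maxThreads = 3

    def start(self):
        # Wait for a free slot before actually starting the thread
        while True:
            with BoundedThread.lck:
                if len(BoundedThread.tList) < BoundedThread.maxThreads:
                    BoundedThread.tList.append(self)
                    break
            BoundedThread.event.wait(timeout=1)
        super().start()

    def run(self):
        try:
            time.sleep(0.2)  # the real work would go here
        finally:
            with BoundedThread.lck:
                BoundedThread.tList.remove(self)
                # Wake one waiter when a slot frees up
                if len(BoundedThread.tList) == BoundedThread.maxThreads - 1:
                    BoundedThread.event.set()
                    BoundedThread.event.clear()

threads = [BoundedThread() for _ in range(6)]
for t in threads:
    t.start()
for t in threads:
    t.join()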
Example #10
    def run(self):
        '''
        Process entry point: spawn run_downloader in a child process and wait for it.
        :return:
        '''
        Log.i('Downloader.run() in {0}'.format(time.ctime()))

        p_list = list()

        downloaderRun = Process(target=self.run_downloader, args=(self.pipeDictData,))
        p_list.append(downloaderRun)

        for p in p_list:
            p.daemon = True
            p.start()
        for p in p_list:
            p.join()
Example #11
 def run(self):
     '''
     Thread entry point: dispatch the task to the right downloader.
     :return:
     '''
     Log.i('Downloader.run()')
     # Use reflection to call download_<parser>(); fall back to download_default() if it is missing
     if hasattr(self, 'download_' + self.task['parser']):
         func = getattr(self, 'download_' + self.task['parser'])
         func()
     else:
         self.download_default()
     # Start the parser
     parserModule = Setting.PARSER_MODULE
     ParserX = importlib.import_module(parserModule)
     parser = ParserX.Parser(self.task)
     parser.run()
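
Setting.PARSER_MODULE is a dotted module path loaded at runtime via importlib. A standalone sketch of that mechanism; the module path below is a placeholder and the Parser class name follows this project's convention:

import importlib

def load_parser(module_path, task):
    # Import a module by its dotted path, e.g. 'crawler.parsers.ccgp' (placeholder)
    module = importlib.import_module(module_path)
    # Convention in this project: every parser module exposes a Parser class
    parser_cls = getattr(module, 'Parser')
    return parser_cls(task)

# parser = load_parser('crawler.parsers.ccgp', {'parser': 'ccgp'})
# parser.run()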
Example #12
 def run(self):
     '''
     Dispatch the task to the right parse method.
     :return:
     '''
     Log.i('Parser.run()')
     # Use reflection to call parse_<parser>(); fall back to parse_default() if it is missing
     if hasattr(self, 'parse_' + self.task['parser']):
         func = getattr(self, 'parse_' + self.task['parser'])
         func()
     else:
         self.parse_default()
     # Start the pipeline
     pipelineModule = Setting.PIPELINE_MODULE
     PipelineX = importlib.import_module(pipelineModule)
     pipeline = PipelineX.Pipeline(self.task)
     pipeline.run()
def update_proxy():
    """
    获取并校验代理ip地址
    :return:
    """
    if USE_PROXY:
        i = 0
        while True:
            try:
                get_proxy()
                notify_ip_address()
                return True
            except Exception:
                i += 1
                Log.e("代理获取失败,尝试重试,重试次数%s" % (i, ))
    else:
        Log.i('notify address')
        notify_ip_address()
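
update_proxy() above retries forever when the proxy service is down. A hedged variant with a bounded retry count and a simple backoff; it reuses the get_proxy() and notify_ip_address() helpers defined in this file, and max_retries/backoff_seconds are added knobs, not part of the original:

import logging
import time

def update_proxy_bounded(max_retries=5, backoff_seconds=2):
    """Like update_proxy(), but gives up after max_retries attempts."""
    for attempt in range(1, max_retries + 1):
        try:
            get_proxy()            # same helpers as defined in this file
            notify_ip_address()
            return True
        except Exception as exc:
            logging.error('proxy refresh failed (attempt %d/%d): %s',
                          attempt, max_retries, exc)
            time.sleep(backoff_seconds * attempt)  # simple linear backoff
    return False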
    def run(self):
        '''
        Producer process entry point: loop reading tasks from the pipe
        (the sleep interval comes from the config file).
        :return:
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Wait for data from the pipe
            DictData = self.pipeDictData.recv()
            if DictData is None:
                continue
            # Convert the raw dict into the entity class
            self.URL_inf.dict2class(DictData)
            # Make sure the Mongo connection exists
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(
                    DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                    DbdataCCGPDFZB["db_name"],
                    DbdataCCGPDFZB["default_collection"])
            # Make sure the Kafka connection exists
            if self.KafkaOperator is None:
                self.KafkaOperator = localKafkaUrlinformation()
            # Deduplicate
            uuid = self.get_md5(self.URL_inf.Urlname, self.URL_inf.title)
            item = {"uuid": uuid}
            value = self.mogodbControl.findone(
                item, self.__Sendcollection)  # returns the matching document, if any
            # TODO inserting into the database still has issues
            if value is not None:
                continue
            # Fetch the landing page content
            self.URL_inf = self.downLoadHtml()
            if self.URL_inf is None:
                continue
            # Save the data asynchronously
            self.savedata(self.URL_inf)

            # Sleep for n seconds (interval range read from the config file)
            items = ConfigUtil.getItems('consumerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
def get_proxy():
    """
    获取代理ip,并更新控制器PROXIES
    :return: 可用的ip代理
    """
    global PROXIES  # update the module-level proxy holder (per the docstring) instead of shadowing it with a local
    if USE_PROXY is False:
        return None

    try:
        Log.i('fetching proxy...')
        resp = requests.get(PROXY_URL, timeout=TIMEOUT)
        ip_address = resp.text
        proxies = {'http': ip_address, 'https': ip_address}
        # Log.i(proxies)
        PROXIES = proxies
        return PROXIES
    except Exception as e:
        Log.e('failed to fetch proxy info; check that the proxy service is running')
        return None
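
For reference, this is how a proxies dict like the one built above is consumed by requests; note that requests expects each value to be a full proxy URL including the scheme. The target URL and proxy address here are placeholders:

import requests

proxies = {
    'http': 'http://203.0.113.10:8080',   # placeholder proxy address
    'https': 'http://203.0.113.10:8080',
}

# requests routes the call through the proxy matching the URL scheme
resp = requests.get('http://search.ccgp.gov.cn/', proxies=proxies, timeout=10)
print(resp.status_code)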
 def savedata(self):
     """
     保存数据到mongodb
     """
     uuid = self.get_md5()
     item = {
         "uuid": uuid,
         "url": self.Url_inf.Urlname,
         "title": self.Url_inf.title,
         "time": datetime.now().timestamp(),
         "lastTime": self.Url_inf.LastTime,
         "content": self.Url_inf.content,
         "fatherUrl": self.Url_inf.fatherUrl,
         "province": self.Url_inf.province
     }
     string = self.Url_inf.domain.replace('.',
                                          '').replace('/',
                                                      '').replace(':', '')
      # Deduplicate / delete / replace
     if USE_SOURCEURL_TYPE is True:
         if self.Url_inf.province is not None and self.Url_inf.content is not None:
              value = self.myDb.findone({"uuid": uuid})  # returns the document if found
             if value is None:
                 Log.i(self.Url_inf.content.decode('utf-8'))
                 self.myDb.insert(item, string)
                 self.myDb.ensure_index("uuid", string)
                 self.KafkaOperator.producterUUID(
                     json.dumps({
                         "uuid": uuid,
                         'collection': string
                     }))
     else:
         self.myDb.insert(item, string)
         self.myDb.ensure_index("uuid", string)
         self.KafkaOperator.producterUUID(
             json.dumps({
                 "uuid": uuid,
                 'collection': string
             }))
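
get_md5() is not shown in this listing; judging by its call sites it hashes the URL and title into a stable dedup key. A sketch of what such a helper might look like (an assumption, not the original implementation):

import hashlib

def get_md5(url, title):
    # Stable dedup key: hash URL and title together
    raw = (url or '') + '|' + (title or '')
    return hashlib.md5(raw.encode('utf-8')).hexdigest()

print(get_md5('http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm',
              '南方科技大学等离子体技术基础仪器采购项目招标公告'))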
Example #17
 def run(self):
     '''
     Process entry point for fetching free IP proxies; loops forever pushing proxies onto the queue.
     :return:
     '''
     Log.i('proxyIpPool.run() in {0}'.format(time.ctime()))
     while True:
         # Call the local and remote free-proxy APIs and push the results onto the IP queue
         proxyIpPool = getIpProxyPool()
         # Unified to use the local source for now
         proxyIpPoolFromeRemote = getIpProxyPool()
         # proxyIpPoolFromeRemote = getIpProxyPoolFromeRemote()
         if proxyIpPool is not None:
             self.queueDictData.put(proxyIpPool)
         if proxyIpPoolFromeRemote is not None:
             self.queueDictData.put(proxyIpPoolFromeRemote)
         # Sleep for n seconds (interval range read from the config file)
         items = ConfigUtil.getItems('proxyIpScheduler')
         interval_min = items['interval_min']
         interval_max = items['interval_max']
         seconds = random.randint(int(interval_min), int(interval_max))
         Log.i('proxyIpPool sleep ' + str(seconds) + ' seconds')
         time.sleep(seconds)
    def run(self):
        '''
        Producer process entry point: loop producing URLs
        (intended to run roughly once per day via crontab when enabled).
        :return:
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Produce URLs
            if USE_SOURCEURL_TYPE is True:
                if USE_ASYNCTASK_TYPE is True:
                    urlInformationList = ConfigUtil.readSourceListByParams(self.begin, self.end)
                else:
                    urlInformationList = ConfigUtil.readSourceList()
            else:
                urlInformationList = ConfigUtil.readTaskList()

            if urlInformationList is None:
                continue

            for urlInfor in urlInformationList:
                data = urlInfor.class2dict()
                diststrjson = json.dumps(data)
                Log.i(diststrjson)
                KafkaOperator = kafkaUrlinformation()
                KafkaOperator.producerUrl(diststrjson)

            # Runs once per day, so no sleep is needed; driven by a crontab job
            if self.crontab == 1:
                os._exit(0)
            else:
                # Sleep for n seconds (interval range read from the config file)
                items = ConfigUtil.getItems('producerScheduler')
                interval_min = items['interval_min']
                interval_max = items['interval_max']
                seconds = random.randint(int(interval_min), int(interval_max))
                Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
                time.sleep(seconds)
Example #19
    def simpleRun(self):
        '''
        Producer process entry point: loop crawling the source URLs and producing child links
        (intended to run roughly once per day via crontab when enabled).
        :return:
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Resource checks
            # KafkaOperator = kafkaUrlinformation()
            KafkaOperator = localKafkaUrlinformation()
            # if self.mogodbControl is None:
            #     self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"],
            #                                       Dbdata["default_collection"])
            # Parse the data sources
            # if USE_SOURCEURL_TYPE is True:
            #     if USE_ASYNCTASK_TYPE is True:
            #         urlInformationList = ConfigUtil.readSourceListRealTime()
            #     else:
            #         urlInformationList = ConfigUtil.readSourceList()
            # else:
            #     urlInformationList = ConfigUtil.readTaskList()

            urlInformationList = ConfigUtil.readSourceListRealTime()

            # Crawl and parse child URLs
            if urlInformationList is None:
                continue

            for urlInfor in urlInformationList:
                data = urlInfor.class2dict()

                # Fetch the landing page content
                dowloadData = self.downLoadHtml(data)
                if dowloadData is None:
                    continue
                # Parse and extract the pagination URLs
                pageData = self.getPageNumFromHome(dowloadData)
                if pageData is None:
                    continue
                for pageIndex in pageData:
                    # Fetch each pagination page's content
                    dowloadPageData = self.downLoadHtml(pageIndex.class2dict())
                    if dowloadPageData is None:
                        continue
                    # Extract child links
                    # self.URL_inf.dict2class(pageIndex)
                    ccgpChildrenLink = self.getChildrenLink(dowloadPageData)
                    if ccgpChildrenLink is None:
                        continue
                    # Send child links to Kafka
                    for link in ccgpChildrenLink:
                        # Make sure the Mongo connection exists
                        if self.mogodbControl is None:
                            self.mogodbControl = Mongodb_Operator(
                                DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                                DbdataCCGPDFZB["db_name"],
                                DbdataCCGPDFZB["default_collection"])
                        # Deduplicate so we do not resend to Kafka and waste resources
                        if link.title is None:  # skip entries without a title
                            continue
                        uuid = self.get_md5(link.Urlname, link.title)
                        item = {"uuid": uuid}
                        value = self.mogodbControl.findone(
                            item, self.__Sendcollection)  # returns the matching document, if any
                        # TODO inserting into the database still has issues
                        if value is not None:  # already in the database, skip
                            continue
                        # Yu Hao asked us not to send parent links downstream
                        if link.DeepNum >= 0:
                            producerData = json.dumps(link.class2dict())
                            Log.i("produce<<" + producerData)
                            KafkaOperator.producerUrl(producerData)

            # Runs once per day, so no sleep is needed; driven by a crontab job
            if self.crontab == 1:
                os._exit(0)
            else:
                # Sleep for n seconds (interval range read from the config file)
                items = ConfigUtil.getItems('producerScheduler')
                interval_min = items['interval_min']
                interval_max = items['interval_max']
                seconds = random.randint(int(interval_min), int(interval_max))
                Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
                time.sleep(seconds)
Example #20
    def getPageNumFromHome(self, dowloadData):
        """
        获取分页的页码URL
        """
        if dowloadData['soup'] is None:
            return []
        else:
            # Log.i(dowloadData['content'].decode('utf-8'))
            selector = etree.HTML(dowloadData['content'].decode('utf-8'))

            try:
                page = (int(
                    selector.xpath(
                        '//div[@class="vT_z"]/div[1]/div/p[1]/span[2]/text()')
                    [0]) // 20) + 3
            except Exception:
                # The page-count element is missing or not a number
                return []

            if page is None:
                return []
            parentURL_infor = []
            # Use a random number to decide whether to emit the pages in reverse order
            num = random.randint(3, 7)
            # holds the rewritten URL
            tempUrl = ''
            if (num % 2) == 0:
                for i in range(1, page):
                    # TODO the single-character regex replacement is fragile for multi-digit page numbers (see the sketch after this function)
                    #'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                    # x = 'page_index=' + str(i)
                    # dowloadData['Urlname'] = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                    # TODO building the URL by string concatenation here is problematic
                    # dowloadData['Urlname'] = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=' + str(i) \
                    #                          + '&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time='\
                    #                          +crawlerStartTime+'&end_time='+crawlerEndTime+'&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                    x = 'page_index=' + str(i)
                    tempUrl = re.sub(r'page_index=(.)', x,
                                     dowloadData['Urlname'])
                    Log.i("parseUrl<<" + tempUrl)
                    urlChildInfo = URLinformation(
                        Urlname=tempUrl,
                        title=dowloadData['title'],
                        DeepNum=dowloadData['DeepNum'],
                        domain=dowloadData['domain'],
                        fatherUrl=dowloadData['fatherUrl'])
                    parentURL_infor.append(urlChildInfo)
                else:
                    # The for-else branch runs after the loop completes; the list is never None
                    return parentURL_infor
            else:
                for i in range(page - 1, 0, -1):
                    # TODO the single-character regex replacement is fragile for multi-digit page numbers
                    #'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                    # x = 'page_index=' + str(i)
                    # dowloadData['Urlname'] = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                    # dowloadData['Urlname'] = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=' + str(i) \
                    #                          + '&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time='\
                    #                          +crawlerStartTime+'&end_time='+crawlerEndTime+'&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='

                    x = 'page_index=' + str(i)
                    tempUrl = re.sub(r'page_index=(.)', x,
                                     dowloadData['Urlname'])
                    Log.i("parseUrl<<" + tempUrl)
                    urlChildInfo = URLinformation(
                        Urlname=tempUrl,
                        title=dowloadData['title'],
                        DeepNum=dowloadData['DeepNum'],
                        domain=dowloadData['domain'],
                        fatherUrl=dowloadData['fatherUrl'])
                    parentURL_infor.append(urlChildInfo)
                else:
                    # The for-else branch runs after the loop completes; the list is never None
                    return parentURL_infor
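
The TODOs above point out that re.sub(r'page_index=(.)', ...) only replaces a single character after the '=', so multi-digit page numbers break. A sketch of a more robust rewrite using urllib.parse; the helper name is new:

from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def set_page_index(url, page_index):
    # Rewrite (or add) the page_index query parameter without touching the rest of the URL
    parts = urlparse(url)
    query = parse_qs(parts.query, keep_blank_values=True)
    query['page_index'] = [str(page_index)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

base = ('http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1'
        '&bidSort=0&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6')
print(set_page_index(base, 12))  # page_index becomes 12 even though it has two digits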
Example #21
    def getChildrenLink(self, pageIndex):
        """
        获取子链接
        :return:
        """
        pattern = r'htt(p|ps):\/\/(\w+\.)+\w+/(\w+/)*'
        pattern = re.compile(pattern)
        # print("domain" + str(self.Url_inf.Urlname))
        Keyvalue = pattern.search(pageIndex['Urlname'])
        # Keyvalue  <_sre.SRE_Match object; span=(0, 26), match='http://search.ccgp.gov.cn/'>
        # print("Keyvalue  " + str(Keyvalue))
        # print(self.Url_inf.Urlname)
        if Keyvalue is not None:
            Keyvalue = Keyvalue.group()
        else:
            Keyvalue = domain = urlparse(
                pageIndex['Urlname']).scheme + "://" + urlparse(
                    pageIndex['Urlname']).netloc

        domain = Keyvalue
        URL_infor = []
        URL_infor2 = []
        Links = []
        link2 = ''
        title = ''
        currentTime = ''
        total_title = ''

        # if self.Url_inf.soup == None:
        #     return []
        if USE_BXBLS is True:
            # Split into two business cases
            # if self.Url_inf.Urlname.find("zygg"):
            #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
            # elif self.Url_inf.Urlname.find("dfgg"):
            #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
            # else:
            #     ul_content = self.Url_inf.soup
            if pageIndex['soup'] is None:
                return []
            else:
                urlInfoList = pageIndex['soup'].select(
                    ".vT-srch-result-list-bid")

            if urlInfoList is None:
                return []

            if urlInfoList:
                ul_content = urlInfoList[0]
            else:
                return []

            for li in ul_content.select("li"):
                link = li.select("a")[0]

                # emProvince = li.select("span")[2].get_text()
                spanProvince = li.select("span")[0]
                emProvince = spanProvince.select("a")[0].get_text()
                currentTime = time.time()

                try:
                    href2 = link['href']
                    total_title = link['title']
                except KeyError:
                    # <a> without an href or title attribute: skip it
                    continue
                # Every branch cleans the anchor text; the old per-branch link2
                # concatenations (kept as comments) are superseded by urljoin() below.
                if href2.startswith("/"):
                    # link2 = urljoin(self.Url_inf.Urlname, href2)
                    # link2 = self.Url_inf.Urlname + href2
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                elif href2.startswith("../../.."):
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    # link2 = href2.replace('../../..', domain)
                elif href2.startswith(".."):
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    # link2 = href2.replace('..', domain)
                elif href2.startswith("./"):
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    # link2 = href2.replace('./', domain + '/')
                elif 'http' in href2 and 'gov' in href2:
                    title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    # link2 = href2

                link2 = urljoin(pageIndex['Urlname'], href2)
                # print("link2 is :" + str(link2))
                # Fix truncated titles ("...") by falling back to the full title attribute
                if title.find("...") > -1:
                    title = total_title

                title = title.strip('\r')
                myLinkUrl = URLinformation(Urlname=link2,
                                           title=title,
                                           DeepNum=pageIndex['DeepNum'] - 1,
                                           domain=pageIndex['domain'],
                                           fatherUrl=pageIndex['Urlname'],
                                           province=emProvince,
                                           LastTime=currentTime)
                URL_infor.append(myLinkUrl)

        else:
            for link in pageIndex['soup'].select("a"):
                # print(str(self.Url_inf.soup))
                # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm" style="line-height:18px" target="_blank">
                #                                         南方科技大学等离子体技术基础仪器采购项目招标公告
                #                                     </a>
                # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144053.htm" style="line-height:18px" target="_blank">
                #                                         2013年国家良种补贴牦牛、绵羊、奶牛冻精、肉牛冻精采购项目公开招标公告
                # print("children url is : "+ str(link))
                try:
                    href2 = link['href']  # pull the href target; see the sample anchors above
                    # print("href2:   " + str(href2))
                    # The extracted value falls into three kinds:
                    # http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144362.htm
                    # javascript:void(0)
                    # #
                except KeyError:
                    pageIndex['soup'].select("a").remove(link)

                else:  # the try block succeeded, so href2 is valid
                    if href2.startswith("/"):
                        # link2 = urljoin(self.Url_inf.Urlname, href2)
                        # link2 = self.Url_inf.Urlname + href2
                        title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                    elif href2.startswith("../../.."):
                        title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                        # link2 = href2.replace('../../..', domain)
                    elif href2.startswith(".."):
                        title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                        # link2 = href2.replace('..', domain)
                    elif href2.startswith("./"):
                        title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                        # link2 = href2.replace('./', domain + '/')
                    elif 'http' in href2 and 'gov' in href2:
                        title = link.text.replace('\n', '').replace('\t', '').replace(' ', '')
                        # link2 = href2

                    link2 = urljoin(pageIndex['Urlname'], href2)
                    # print("link2 is :" + str(link2))
                    myLinkUrl = URLinformation(Urlname=link2,
                                               title=title,
                                               DeepNum=pageIndex['DeepNum'] -
                                               1,
                                               domain=pageIndex['domain'],
                                               fatherUrl=pageIndex['Urlname'])
                    URL_infor.append(myLinkUrl)

        if USE_BXBLS is True:
            Links = list(set(URL_infor))
        else:
            # TODO an AttributeError ("'NoneType' object has no attribute 'select'") has been seen here when soup is None
            for http in pageIndex['soup'].select('option'):  # contents currently unknown
                try:
                    http2 = http['value']
                    # print("option" + str(http))
                except KeyError:
                    pageIndex['soup'].select("option").remove(http)
                else:
                    if "gov" in http2 and 'http' in http2:
                        myLinkUrl2 = URLinformation(
                            Urlname=http2,
                            title=http.text,
                            DeepNum=pageIndex['DeepNum'] - 1,
                            domain=pageIndex['domain'],
                            fatherUrl=pageIndex['Urlname'])
                        URL_infor2.append(myLinkUrl2)

            Links = list(set(URL_infor + URL_infor2))

        #TODO [2018-05-15 18:13:47.492] [INFO] [31469] [getChildrenLink(),ParseCCGPModule.py:129]: This url have 56  children urls1
        Log.i("This url have " + str(len(Links)) + "  children urls" +
              str(pageIndex['DeepNum']))
        return Links
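
The commented-out per-prefix concatenations above ('/', './', '../../..') are all handled by urllib.parse.urljoin, which is what the live code ends up calling; for example (the relative paths are illustrative):

from urllib.parse import urljoin

page = 'http://www.ccgp.gov.cn/cggg/dfgg/gkzb/index.htm'

print(urljoin(page, '/cggg/zygg/t123.htm'))         # -> http://www.ccgp.gov.cn/cggg/zygg/t123.htm
print(urljoin(page, './t20131008_3148218.htm'))     # -> http://www.ccgp.gov.cn/cggg/dfgg/gkzb/t20131008_3148218.htm
print(urljoin(page, '../../../zcfg/t456.htm'))      # -> http://www.ccgp.gov.cn/zcfg/t456.htm
print(urljoin(page, 'http://search.ccgp.gov.cn/'))  # absolute URLs pass through unchanged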
Example #22
 def parse_default(self):
     Log.i('run default parser ')
     pass
    def downLoadHtml(self):
        """
        爬取并提取子链接
        :param urlInfor:
        """
        if self.ipProxy is None:
            self.ipProxy = self.getIpPoolMethod()
        if self.heads is None:
            self.heads = self.headersEngine.getHeaders()

        # {'DeepNum': 1, 'fatherUrl': None, 'Download': False, 'province': None, 'domain': 'http://search.ccgp.gov.cn',
        #  'FileName': None, 'Keyword': None, 'title': None, 'LastTime': 0.0, 'Flag': 0, 'soup': None, 'State': 0,
        #  'content': None,
        #  'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2018%3A06%3A07&end_time=2018%3A06%3A07&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=',
        #  'SleepTime': 0.0, 'FilePath': None}

        html = None   # raw page content
        ctifety = 0   # flag: content is ready for child-link parsing
        Flag = 1      # loop flag: 1 = keep retrying, 0 = done
        count = 0     # consecutive-failure counter
        while Flag:
            try:
                if count > 1:
                    self.ipProxy = self.getIpPoolMethod()

                protocol = 'https' if 'https' in self.ipProxy else 'http'
                proxiesmmm = {protocol: self.ipProxy}

                req = requests.get(self.URL_inf.Urlname,
                                   headers=self.heads,
                                   allow_redirects=False,
                                   proxies=proxiesmmm,
                                   timeout=3)
                # Detect the site's "安全验证" (security check) anti-crawler page and rotate the proxy
                soup_validate = BeautifulSoup(req.text, 'lxml')
                if soup_validate.find(name='title').string == '安全验证':
                    self.ipProxy = self.getIpPoolMethod()
                    continue
                if req.status_code != 200:
                    self.ipProxy = self.getIpPoolMethod()
                    continue

                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    # Binary response: stream it to disk and mark it as downloaded
                    self.__downlowdFile(url=self.URL_inf.Urlname, req=req)
                    self.URL_inf.Download = 1
                    Flag = 0
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    self.URL_inf.Download = 0
                    ctifety = 1
                    Flag = 0  # done, leave the retry loop
                else:
                    continue
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                count += 1
                if html is None:
                    Flag = 1
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count += 1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Close idle connections to avoid "Max retries exceeded with url" errors
                requests.adapters.DEFAULT_RETRIES = 5
                s = requests.session()
                s.keep_alive = False
            except Exception as e:
                Flag = 1
                count += 1
                # TODO handle javascript:void(0) links gracefully (see https://www.zhihu.com/question/20626694?from=profile_question_card)
                # TODO handle "Invalid return character or leading space in header: Accept-Language"
                # TODO handle "HTTPConnectionPool ... Max retries exceeded / Failed to establish a new connection"
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Mitigate "Max retries exceeded with url" errors by not keeping connections alive
                s = requests.session()
                s.keep_alive = False

        if ctifety:
            self.URL_inf.content = html
            soup = BeautifulSoup(html, 'html.parser')  # quick parse with BeautifulSoup
        else:
            soup = None

        self.URL_inf.soup = soup
        if self.URL_inf.content is not None:
            Log.i(self.URL_inf.content.decode('utf-8'))
        return self.URL_inf