Example #1
 def __scheduleProjectCorpInfo__(self):
     try:
         conn = RedisClient()
         spider = ProjectCorpInfo()
         print('Start fetching projects')
         key = 'TempProjectCorpInfoID'
         list_id = list(set(conn.all(rediskey=key)))
         spider.run(list_id, main_url=[PROJECTCORPINFO], key=key)
     except Exception as e:
         print('Error crawling project list:', e)
Example #2
 def __scheduleProjectFinishInfo__(self, cycle=SPIDER_CYCLE):
     """
     Periodically crawl the companies' project completion records
     :param cycle:
     :return:
     """
     try:
         conn = RedisClient()
         print('Start fetching project completion records')
         spider = ProjectFinishListSpider()
         key = 'TempProFinishListID'
         list_id = list(set(conn.all(rediskey=key)))
         spider.run(list_id, main_url=[FINISHMANAGE], key=key)
     except Exception as e:
         print('Error crawling project list:', e)
Example #3
 def __scheduleBuildLicenceInfo__(self, cycle=SPIDER_CYCLE):
     """
     Periodically crawl the companies' construction-permit records
     :param cycle:
     :return:
     """
     try:
         conn = RedisClient()
         print('Start fetching construction-permit records')
         spider = BuildLicenceListSpider()
         key = 'TempBuildLicenceListID'
         list_id = list(set(conn.all(rediskey=key)))
         spider.run(list_id, main_url=[LICENCEMANAGE], key=key)
     except Exception as e:
         print('Error crawling project list:', e)
Example #4
 def __scheduleContractRecordInfo__(self, cycle=SPIDER_CYCLE):
     """
     Periodically crawl the companies' contract registration records
     :param cycle:
     :return:
     """
     try:
         conn = RedisClient()
         print('Start fetching contract registration records')
         spider = ContractRecordListSpider()
         key = 'TempContractListID'
         list_id = list(set(conn.all(rediskey=key)))
         spider.run(list_id, main_url=[CONTRACTRECORD], key=key)
     except Exception as e:
         print('Error crawling project list:', e)
Example #5
 def __scheduleTenderInfo__(self, cycle=SPIDER_CYCLE):
     """
     Periodically crawl project tender records
     :param cycle:
     :return:
     """
     try:
         conn = RedisClient()
         print('Start fetching tender records')
         spider = TenderListSpider()
         key = 'TempTenderListID'
         list_id = list(set(conn.all(rediskey=key)))
         spider.run(list_id, main_url=[TENDERAPI], key=key)
     except Exception as e:
         print('Error crawling project list:', e)
Example #6
 def __scheduleProjectInfo__(self, cycle=SPIDER_CYCLE):
     """
     Periodically crawl the companies' project details
     :param cycle:
     :return:
     """
     try:
         conn = RedisClient()
         spider = ProjectInfoSpider()
         print('Start fetching project details')
         print('Project IDs fetched so far:', conn.count(rediskey='ProjectInfoID'))
         list_id = list(set(conn.all(rediskey='TempProjectListID')) - set(conn.all(rediskey='ProjectInfoID')))
         spider.run(list_id)
     except Exception as e:
         print('Error crawling project list:', e)
Example #7
 def __scheduleCompInfo__(self, cycle=SPIDER_CYCLE):
     """
     Periodically crawl company details
     :param cycle:
     :return:
     """
     try:
         conn = RedisClient()
         spider = CompanyInfoSpider()
         print('Start fetching company details')
         print('Company records fetched so far:', conn.count(rediskey='CompInfoID'))
         list_id = list(set(conn.all(rediskey='TempCompInfoID')) - set(conn.all(rediskey='CompInfoID')))
         # new_list = self.__divList__(list_id=list_id)
         spider.run(list_id)
     except Exception as e:
         print('Error crawling company info:', e)
Example #8
 def __scheduleBuildLicencePerson__(self, cycle=SPIDER_CYCLE):
     """
     Periodically crawl construction-permit personnel records
     :param cycle:
     :return:
     """
     try:
         conn = RedisClient()
         spider = BuildLicencePersonSpider()
         print('Start fetching construction-permit personnel records')
         print('Permit-personnel IDs fetched so far:', conn.count(rediskey='BuildLicencePersonID'))
         list_id = list(
             set(conn.all(rediskey='BuildLicenceInfoID')) - set(conn.all(rediskey='BuildLicencePersonID')))
         spider.run(list_id)
     except Exception as e:
         print('Error crawling project list:', e)
Example #9
 def __scheduleWithinProjectList__(self, cycle=SPIDER_CYCLE, main_url=None, TempList=None):
     """
     Fetch the per-project lists of a given type
     :param cycle:
     :param main_url:
     :param TempList:
     :return:
     """
     try:
         conn = RedisClient()
         key = 'Temp' + TempList
         list_id = list(set(conn.all(rediskey=key)))
         spider = SaveWithinProjectSpider()
         spider.run(list_id, main_url, TempList)
     except Exception as e:
         print('Error crawling project list:', e)
Example #10
 def __schedulePersonInfo__(self, cycle=SPIDER_CYCLE):
     """
     Periodically crawl company personnel records
     :param cycle:
     :return:
     """
     try:
         conn = RedisClient()
         spider = PersonSpider()
         print('Start fetching company personnel records')
         print('Personnel records fetched so far:', conn.count(rediskey='PersonInfoID'))
         list_id = list(set(conn.all(rediskey='CompInfoID')) - set(conn.all(rediskey='PersonInfoID')))
         # new_list = self.__divList__(list_id=list_id)
         spider.run(list_id)
         time.sleep(cycle)
     except Exception as e:
         print('Error crawling staff info:', e)
Example #11
 def __init__(self):
     self._redis = RedisClient()
     self._mysql = MySQLClient()
     self._HEADERS = HEADERS
     self.ip = None
     self.port = None
Example #12
class SpiderMain(object):
    def __init__(self):
        self._redis = RedisClient()
        self._mysql = MySQLClient()
        self._HEADERS = HEADERS
        self.ip = None
        self.port = None

    async def get_one_page(self, url):
        try:
            if self.ip is None:
                ip, port = self.__getProxy__()
                self.ip = ip
                self.port = port
            real_proxy = 'http://' + str(self.ip) + ":" + str(self.port)
            async with asyncio.Semaphore(MAX_ID):
                async with aiohttp.ClientSession(
                        connector=aiohttp.TCPConnector(ssl=False)) as session:
                    async with session.get(url,
                                           proxy=real_proxy,
                                           headers=self._HEADERS,
                                           timeout=15) as r:
                        if r.status in (200, 408):
                            return await r.text()
                        else:
                            return await self.get_one_page(url)
        except Exception as e:
            print('Request error: ' + str(e))
            # Rotate to a fresh proxy, then return the retried request
            ip, port = self.__getProxy__()
            self.ip = ip
            self.port = port
            return await self.get_one_page(url)

    # Concurrent crawling
    async def main(self, urls, comp_id=None):
        try:
            # Build the task list
            tasks = [self.get_one_page(url) for url in urls]
            # Run all tasks concurrently and collect every result
            results = await asyncio.gather(*tasks)
            if len(results) > 0:
                # If the anti-scraping marker shows up, rotate the proxy,
                # refresh the access token and retry
                if '4bd02be856577e3e61e83b86f51afca55280b5ee9ca16beb9b2a65406045c9497c089d5e8ff97c63000f62b011a6' \
                   '4f4019b64d9a050272bd5914634d030aab69' in results or results[0] is False:
                    ip, port = self.__getProxy__()
                    self.ip = ip
                    self.port = port
                    print("Dynamic proxy IP: " + str(ip) + ", port: " + str(port))
                    my_proxy = 'http://' + str(ip) + ":" + str(port)
                    access_token = getToken(my_proxy)
                    while access_token is None:
                        access_token = getToken()
                    self._HEADERS = {
                        'Accept':
                        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'accessToken': access_token
                    }
                    print(access_token)
                    # Retry with the new proxy and token instead of saving the failed batch
                    await self.main(urls, comp_id)
                    return
                # Save the data
                await self.__saveJsonData__(data=results, comp_id=comp_id)
        except Exception as e:
            print(e)

    def __getMaxPage__(self, url):
        try:
            if self.ip is None:
                ip, port = self.__getProxy__()
                self.ip = ip
                self.port = port
            proxyMeta = "http://%(host)s:%(port)s" % {
                "host": self.ip,
                "port": self.port,
            }
            proxies = {
                "http": proxyMeta,
            }
            response = requests.get(url,
                                    proxies=proxies,
                                    headers=self._HEADERS,
                                    verify=False,
                                    timeout=10)
            if '4bd02be856577e3e61e83b86f51afca55280b5ee9ca16beb9b2a65406045c9497c089d5e8ff97c63000f62b011a6' \
               '4f4019b64d9a050272bd5914634d030aab69' in response.text:
                access_token = getToken()
                while access_token is None:
                    access_token = getToken()
                self._HEADERS = {
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'accessToken': access_token
                }
                return self.__getMaxPage__(url)
            # Decrypt the response, then patch the string back into parseable JSON
            res = decrypts(response.text)
            res = str(res).replace(
                "'", "").split('success')[0] + 'success":true}' + "]"
            data_json = json.loads(res)
            if data_json[0]['code'] == 401:
                time.sleep(60)
                return self.__getMaxPage__(url)
            elif data_json[0]['code'] == 200:
                return data_json
            else:
                return self.__getMaxPage__(url)
        except Exception as e:
            print(e)
            ip, port = self.__getProxy__()
            self.ip = ip
            self.port = port
            return self.__getMaxPage__(url)

    def __getID__(self, rediskey=None):
        return self._redis.batch(rediskey=rediskey)

    def __findOneID__(self, idx=None, rediskey=None):
        return self._redis.exists(idx=idx, rediskey=rediskey)

    def __saveOneID__(self, idx=None, rediskey=None, score=None):
        if score is not None:
            self._redis.add(idx=idx, rediskey=rediskey, score=score)
        else:
            self._redis.add(idx=idx, rediskey=rediskey)

    def __deleteID__(self, idx=None, rediskey=None):
        return self._redis.deletes(idx=idx, rediskey=rediskey)

    def __saveListID__(self, list_id, rediskey=None):
        for idx in list_id:
            self._redis.add(idx=idx, rediskey=rediskey)

    def __saveOneData__(self, table_name, data):
        print(data)
        return self._mysql.__insertData__(table_name=table_name, data=data)

    def __closeMysql__(self):
        try:
            self._mysql.__closeDB__()
        except Exception as e:
            print('Close Mysql failed!', e)

    def __asyncSpider__(self, list_id=None, comp_id=None):
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.main(list_id, comp_id))

    """
    获取代理ip
    """

    def __getProxy__(self):
        url = 'http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pack=42599&port=1&ts=1&lb=1&pb=4&regions='
        response = requests.get(url=url)
        json_str = json.loads(response.text)
        ip = json_str["data"][0]["ip"]
        port = json_str["data"][0]["port"]
        return (ip, port)

    def __getYunProxy__(self):
        url = 'http://gec.ip3366.net/api/?key=20191204153949621&getnum=1&anonymoustype=3&filter=1&area=1&order=2&formats=2'
        response = requests.get(url=url)
        json_str = json.loads(response.text)
        ip = json_str[0]["Ip"]
        port = json_str[0]["Port"]
        return (ip, port)

    def run(self, data_list):
        for data in data_list:
            self.__asyncSpider__(list_id=data)
        self.__closeMysql__()
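Here run() expects data_list to be a list of URL batches: each element is itself an iterable of URLs that main() fans out with asyncio.gather. A minimal driver sketch (the example.com URLs are placeholders; the real batches are built from the Redis ID sets shown in the scheduler examples):

    # Placeholder driver; example.com URLs stand in for the real API endpoints.
    if __name__ == '__main__':
        url_batches = [
            ['http://example.com/api/page/1', 'http://example.com/api/page/2'],
            ['http://example.com/api/page/3'],
        ]
        SpiderMain().run(url_batches)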
Example #13
 def __init__(self):
     self._redis = RedisClient()
     self._mysql = MySQLClient()
     self._HEADERS = HEADERS
Example #14
class SpiderMain(object):
    def __init__(self):
        self._redis = RedisClient()
        self._mysql = MySQLClient()
        self._HEADERS = HEADERS

    async def get_one_page(self, url):
        try:
            async with ClientSession() as session:
                async with session.get(url, headers=self._HEADERS) as r:
                    # res = decrypts(r.text)
                    return await r.text()
        except Exception as e:
            print('Request error: ' + str(e))
            return {}

    # Concurrent crawling
    async def main(self, urls):
        # Build the task list
        tasks = [self.get_one_page(url) for url in urls]
        # Run all tasks concurrently and collect every result
        results = await asyncio.gather(*tasks)
        # If the anti-scraping marker shows up, refresh the access token and retry
        if '4bd02be856577e3e61e83b86f51afca55280b5ee9ca16beb9b2a65406045c9497c089d5e8ff97c63000f62b011a6' \
           '4f4019b64d9a050272bd5914634d030aab69' in results:
            accessToken = getToken()
            self._HEADERS = {
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'accessToken': accessToken
            }
            print(accessToken)
            # Retry with the refreshed token instead of saving the failed batch
            await self.main(urls)
            return
        # Save the data
        self.__saveJsonData__(data=results)
        # for i in results:
        #     print(decrypts(i))

    def __getMaxPage__(self):
        pass

    def __getID__(self, rediskey=None):
        return self._redis.batch(rediskey=rediskey)

    def __findOneID__(self, idx=None, rediskey=None):
        return self._redis.exists(idx=idx, rediskey=rediskey)

    def __saveOneID__(self, idx=None, rediskey=None, score=None):
        if score is not None:
            self._redis.add(idx=idx, rediskey=rediskey, score=score)
        else:
            self._redis.add(idx=idx, rediskey=rediskey)

    def __saveone__(self, idx=None, rediskey=None):
        self._redis.add_one(idx=idx, rediskey=rediskey)

    def __deleteID__(self, idx=None, rediskey=None):
        return self._redis.deletes(idx=idx, rediskey=rediskey)

    def __saveListID__(self, list_id, rediskey=None):
        for idx in list_id:
            self._redis.add(idx=idx, rediskey=rediskey)

    def __saveOneData__(self, table_name, data):
        print(data)
        return self._mysql.__insertData__(table_name=table_name, data=data)

    def __closeMysql__(self):
        try:
            self._mysql.__closeDB__()
        except Exception as e:
            print('Close Mysql failed!', e)

    def run(self, data_list):
        datas = [data_list[x:x + 5] for x in range(0, len(data_list), 5)]
        for data in datas:
            self.__spiderInfo__(data=data)
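This run() splits data_list into batches of five before handing each batch to __spiderInfo__ (defined elsewhere in the class). The slicing idiom, demonstrated standalone:

    # The batching idiom from run(), shown on a small list:
    data_list = list(range(12))
    datas = [data_list[x:x + 5] for x in range(0, len(data_list), 5)]
    print(datas)  # [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11]]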
Example #15
    def __scheduleCompListInfo__(self):
        """
        Iterate over companies not yet flagged as crawled and run the full crawl for each
        """
        try:
            conn = RedisClient()
            mysql = MySQLClient()
            sql = 'select * from companyName where flag is null order by id limit %d, %d' % (NUM, PRE)
            list_name = mysql.getAll(sql)
            for list_id in list_name:
                if conn.exists(idx=list_id[1], rediskey='CompName'):
                    print(list_id[1], 'company info already crawled')
                else:
                    conn.delete_key(rediskey='CompName')
                    conn.delete_key(rediskey='TempCompInfoID')
                    conn.delete_key(rediskey='CompInfoID')
                    conn.delete_key(rediskey='QualificationInfoID')
                    conn.delete_key(rediskey='ProjectID')
                    conn.delete_key(rediskey='TempProjectListID')
                    conn.delete_key(rediskey='ProjectInfoID')
                    conn.delete_key(rediskey='TenderInfoID')
                    conn.delete_key(rediskey='TenderListID')
                    conn.delete_key(rediskey='ContractListID')
                    conn.delete_key(rediskey='ContractInfoID')
                    conn.delete_key(rediskey='BuildLicenceInfoID')
                    conn.delete_key(rediskey='BuildLicenceListID')
                    conn.delete_key(rediskey='BuildLicencePersonID')
                    conn.delete_key(rediskey='ProFinishListID')
                    conn.delete_key(rediskey='ProjectFinishInfoID')
                    conn.delete_key(rediskey='ProjectCorpInfoID')
                    conn.delete_key(rediskey='TempTenderListID')
                    conn.delete_key(rediskey='TempContractListID')
                    conn.delete_key(rediskey='TempProFinishListID')
                    conn.delete_key(rediskey='TempProCensorListID')
                    conn.delete_key(rediskey='TempBuildLicenceListID')
                    conn.delete_key(rediskey='TempProjectCorpInfoID')

                    nowTime_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current time
                    five_time = datetime.datetime.now().strftime('%Y-%m-%d') + " 05:00:00"  # 5:00 a.m. today
                    six_time = datetime.datetime.now().strftime('%Y-%m-%d') + " 06:00:00"  # 6:00 a.m. today
                    if nowTime_str < five_time or nowTime_str > six_time:
                        spider = CompanyListSpider()
                        spider.runs(list_id[1])
                        # time.sleep(cycle)
                        self.__scheduleProjectList__()
                        # self.__scheduleQualificationInfo__()
                        self.__scheduleTenderInfo__()
                        self.__scheduleContractRecordInfo__()
                        self.__scheduleProjectCorpInfo__()
                        self.__scheduleBuildLicenceInfo__()
                        # self.__scheduleBuildLicencePerson__()
                        self.__scheduleProjectFinishInfo__()
                        update_sql = 'update companyName set flag = 1 where companyName = "%s" ' % list_id[1]
                        end_time = datetime.datetime.now().strftime('%Y-%m-%d') + " 07:00:00"  # 7:00 a.m. today
                        if nowTime_str > end_time or nowTime_str < six_time:
                            mysql.__updateData__(update_sql)

        except Exception as e:
            print(e)
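The time-window guards above compare plain '%Y-%m-%d %H:%M:%S' strings, which is safe because zero-padded timestamps sort lexicographically in chronological order. A quick standalone check:

    import datetime

    # Zero-padded timestamp strings compare chronologically.
    now = datetime.datetime(2019, 12, 4, 5, 30).strftime('%Y-%m-%d %H:%M:%S')
    print('2019-12-04 05:00:00' < now < '2019-12-04 06:00:00')  # True: 05:30 is inside the pause window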