Example #1
    def process_item(self, item, collection_name, use_id=True):
        collection = self.db[collection_name]
        msg = 'insert data into collection: [%s]' % collection_name
        logger.info(msg)
        if use_id:
            # upsert keyed by _id; replace_one(..., upsert=True) is the
            # modern PyMongo equivalent of the deprecated update(..., True)
            collection.replace_one({'_id': item['_id']}, dict(item), upsert=True)
        else:
            collection.insert_one(dict(item))
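For context, a minimal standalone sketch of the same keyed upsert, assuming a locally reachable MongoDB; the database, collection, and item values here are hypothetical:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['proxy_pool']  # placeholder database name
item = {'_id': '1.2.3.4:8080_http://example.com', 'ip': '1.2.3.4:8080'}

# keyed write: re-running overwrites the same document instead of duplicating it
db['useful_proxy'].replace_one({'_id': item['_id']}, item, upsert=True)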
Example #2
def execute_spider():
    work_spider = WorkSpider()

    work_spider.run()

    # block until both queues are fully processed
    work_queue.join()
    save_queue.join()

    # done
    logger.info('All jobs finished, please check!')
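The two join() calls only return once every queued task has been marked done by a worker. A minimal sketch of that producer/worker pattern with the standard library (the worker body and task values are placeholders):

import queue
import threading

work_queue = queue.Queue()

def worker():
    while True:
        task = work_queue.get()
        try:
            pass  # real task processing would go here
        finally:
            work_queue.task_done()  # join() only unblocks once every task is marked done

threading.Thread(target=worker, daemon=True).start()

for task in range(10):
    work_queue.put(task)

work_queue.join()  # blocks until all ten tasks have been processed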
Example #3
    def _connect(self):
        try:
            self.client = pymongo.MongoClient(
                self.host, self.port,
                serverSelectionTimeoutMS=database_connect_time_out,
                connectTimeoutMS=database_connect_time_out)
            # force a round trip so a bad host/port fails here, not later
            self.client.server_info()
            msg = 'host: {}  port: {}  database_name: {}   MongoDB connection succeeded'.format(
                self.host, self.port, self.database)
            logger.info(msg)

            self.db = self.client[self.database]
        except ServerSelectionTimeoutError as e:
            msg = ('host: {}  port: {}  database_name: {}   MongoDB connection failed; '
                   'likely a bad config file or a timeout ({} ms)').format(
                       self.host, self.port, self.database, database_connect_time_out)
            raise ConnectionFailure(msg) from e
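The same fail-fast connectivity check can be tried in isolation; a minimal sketch assuming a local mongod and placeholder timeout values:

import pymongo
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError

try:
    client = pymongo.MongoClient('localhost', 27017,
                                 serverSelectionTimeoutMS=3000,
                                 connectTimeoutMS=3000)
    client.server_info()  # raises if no server answers within the timeout
except ServerSelectionTimeoutError as e:
    raise ConnectionFailure('MongoDB unreachable within 3000 ms') from e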
Example #4
    def get_all_IP(self, collection_name):
        collection = self.db[collection_name]
        # a second .sort() call replaces the first on a PyMongo cursor,
        # so both keys must go into one compound sort
        data = collection.find().sort([("response_time", pymongo.ASCENDING),
                                       ("insert_time", pymongo.DESCENDING)])
        ips = []
        for i in data:
            ips.append(i.get('ip'))

        if len(ips) == 0:
            logger.info("no IPs in the database yet")
            self.update_ip_pool()

        self.ipList = ips
        return ips
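For reference, the compound sort on its own, against a hypothetical proxy_pool/useful_proxy collection:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection = client['proxy_pool']['useful_proxy']

# fastest proxies first; ties broken by the most recent insert
cursor = collection.find().sort([("response_time", pymongo.ASCENDING),
                                 ("insert_time", pymongo.DESCENDING)])
ips = [doc.get('ip') for doc in cursor]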
Example #5
    def update_ip_pool(self):
        logger.info("updating the proxy pool: re-checking stored IPs and scraping new ones")
        start_time = time.time()
        check()
        execute_spider()
        logger.info("reloading IPs from the database into memory")
        self.update_ipList()
        end_time = time.time()  # taken after the in-memory refresh so the timing covers it
        logger.info("proxy pool update finished, took {} seconds".format(end_time - start_time))
    def save_ip(self, response):
        website_name = response.get('meta').get('website_name')
        response_time = response.get('content')
        target_url = response.get('meta').get('target_url')
        _ip = response.get('url')

        msg = '[{ip}] can visit the target url [{target_url}], source is [{source}]'.format(ip=_ip,
                                                                                            target_url=target_url,
                                                                                            source=website_name)
        logger.info(msg)
        # MongoDB collection name

        insert_data = {}

        insert_data['_id'] = _ip + '_' + target_url  # composite key: one document per (ip, target_url) pair
        insert_data['ip'] = _ip
        insert_data['source'] = website_name
        insert_data['response_time'] = response_time
        insert_data['target_url'] = target_url

        insert_data['insert_time'] = time.strftime('%Y-%m-%d %H:%M:%S')

        # save the record to the database
        self.pipeline.process_item(insert_data, self.collection_name)
Example #7
    def request(self, _args, dont_filter):
        url = _args.get('url')
        # fall back to the instance-wide defaults for any per-request
        # option that is missing (or falsy) in _args
        sleep_time = _args.get('sleep_time') or self.sleep_time
        time_out = _args.get('time_out') or self.time_out
        retry_times = _args.get('retry_times') or self.retry_times
        use_proxy = _args.get('use_proxy') or self.use_proxy
        _ip = _args.get('ip') or self.ip
        ua_type = _args.get('ua_type') or self.ua_type
        diy_header = _args.get('diy_header') or self.diy_header
        method = _args.get('method') or self.method
        post_data = _args.get('submit_data') or self.submit_data

        if not dont_filter:
            check_result = self.check(url)
            if not check_result:
                return 'HAS CRAWLED', url
            else:
                msg = 'new url'
                logger.info(msg)

        if not url.startswith('http'):
            raise ValueError('url must start with http:// or https://')
        if diy_header:
            header = diy_header
        else:
            host = parse.urlparse(url).netloc
            header = {
                'User-Agent': random.choice(PC_USER_AGENTS),
                'Host': host,
            }

            if ua_type == 'mobile':
                header = {
                    'User-Agent': random.choice(MOBILE_USER_AGENTS),
                    'Host': host
                }

        times = 0
        setting_time = retry_times
        con = None
        while retry_times > 0:
            times += 1
            self.log.info('request %s, times: %d' % (url, times))
            try:
                if use_proxy:
                    ip = _ip
                    if ip:
                        proxy = {
                            'http': 'http://%s' % ip,
                            'https': 'http://%s' % ip
                        }
                        if method == 'get':
                            con = request_session.get(url,
                                                      headers=header,
                                                      proxies=proxy,
                                                      timeout=time_out,
                                                      params=post_data,
                                                      verify=False)
                        elif method == 'post':
                            if post_data and isinstance(post_data, dict):
                                con = request_session.post(url,
                                                           headers=header,
                                                           proxies=proxy,
                                                           timeout=time_out,
                                                           data=post_data,
                                                           verify=False)
                            else:
                                self.log.error(
                                    'when method is post, post_data must be provided as a dict'
                                )

                        if con.status_code not in self.status_code:
                            self.log.error('status code is %s' %
                                           con.status_code)
                            raise ValueError(
                                'status code not in the allowed codes from config.py; check your log'
                            )
                        time.sleep(sleep_time)
                    else:
                        msg = 'ip can not be none while use_proxy is True'
                        self.log.error(msg)
                        os._exit(0)

                else:
                    if method == 'get':
                        con = request_session.get(url,
                                                  headers=header,
                                                  timeout=time_out,
                                                  params=post_data,
                                                  verify=False)
                    elif method == 'post':
                        if post_data and isinstance(post_data, dict):
                            con = request_session.post(url,
                                                       headers=header,
                                                       timeout=time_out,
                                                       data=post_data,
                                                       verify=False)
                        else:
                            self.log.error(
                                'when method is post, post_data must be provided as a dict'
                            )
                            os._exit(0)

                    if con.status_code not in self.status_code:
                        self.log.error('status code is %s' % con.status_code)
                        raise ValueError(
                            'status code not in the allowed codes from config.py; check your log'
                        )
                    time.sleep(sleep_time)

            except Exception as e:
                self.log.error(e)
                retry_times -= 1
                self.log.warning(
                    'retrying request: [%s], times: %s    retry_times: %s' %
                    (url, times, retry_times))
                if times == setting_time:
                    self.log.error(
                        'giving up on request [%s]: %s attempts reached the configured limit'
                        % (url, times))
                    return None, None
                    return None, None
            else:
                self.log.info('[%s] requested successfully' % url)

                if con:
                    if not dont_filter:
                        url = self.md5_url(url)
                        sbf.add(url)

                    return con.content, con.url
                else:
                    self.log.error('content is None, url is %s' % url)
                    return None, None
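Stripped of the proxy and filtering branches, the retry loop above reduces to: attempt, catch, decrement, give up after the configured tries. A minimal sketch of that shape (the URL and limits are placeholders):

import time
import requests

def fetch_with_retries(url, retries=3, timeout=10, sleep_time=1):
    while retries > 0:
        try:
            con = requests.get(url, timeout=timeout)
            if con.status_code != 200:
                raise ValueError('status code is %s' % con.status_code)
            time.sleep(sleep_time)  # throttle, as the method above does
        except Exception:
            retries -= 1  # any failure burns one attempt
        else:
            return con.content, con.url
    return None, None  # all attempts failed

content, final_url = fetch_with_retries('http://example.com')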



Example #8
if __name__ == '__main__':
    # test code
    spidermain = SpiderMain()
    spidermain.run()

    # blocking
    work_queue.join()
    save_queue.join()

    # finished crawling origin IPs
    logger.info('available proxies have been saved in your database, please check!')
Example #9
    def _request_with_proxy(self, url, use_proxy):

        headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": random.choice(PC_USER_AGENTS)
        }

        # record the time we entered the while loop
        start_time = time.time()
        while True:

            # compare the current time against the start time; past the
            # configured limit, raise requests.exceptions.ProxyError
            end_time = time.time()
            if int(end_time - start_time) > proxy_timeout:
                logger.info(
                    "request-with-proxy ran too long; likely every IP in the pool is dead or the target site banned them. current ip: {}  sleeping for {} seconds"
                    .format(self.current_ip, proxy_timeout))
                time.sleep(proxy_timeout)
                self.update_ip_pool()
                msg = "proxy pool refreshed after the pause; restart the crawl (check the history logs for the cause). current ip: {}".format(
                    self.current_ip)
                raise requests.exceptions.ProxyError(msg)
            proxy = {'http': self.current_ip, 'https': self.current_ip}

            if use_proxy:
                try:
                    response = requests.get(url,
                                            proxies=proxy,
                                            timeout=request_timeout,
                                            headers=headers)
                    code = response.status_code
                    msg = "doing http request successfully current proxy ip is {} status_code :{}".format(
                        self.current_ip, code)
                    logger.info(msg)

                    if code == 404:
                        msg = "404 Client Error: Not Found for url: {}".format(
                            url)
                        logger.info(msg)
                        return response

                    response.raise_for_status()
                    if code == 200 and custom_filter_str != '' and custom_filter_str in response.text:
                        raise Exception('custom filter string matched; treating the response as bad')

                    return response
                except requests.HTTPError as e:
                    logger.info(e)
                    self.current_ip = self.getRandomOne()
                    msg = "randomly picked a new ip from ipList: {}".format(
                        self.current_ip)
                    logger.info(msg)
                except Exception as e:
                    msg = "ip {} is unusable".format(self.current_ip)
                    logger.info(msg)
                    self.current_ip = self.getRandomOne()
                    msg = "randomly picked a new ip from ipList: {}".format(
                        self.current_ip)
                    logger.info(msg)
            else:
                print("not using a proxy")
                try:
                    response = requests.get(url,
                                            timeout=request_timeout,
                                            headers=headers)
                    return response
                except Exception as e:
                    msg = "ip {} is unusable".format(self.current_ip)
                    logger.info(msg)
                    self.current_ip = self.getRandomOne()
                    msg = "randomly picked a new ip from ipList: {}".format(
                        self.current_ip)
                    logger.info(msg)
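The heart of the proxy branch is a requests call routed through a proxies dict, with raise_for_status() promoting bad codes into exceptions. A minimal standalone sketch with a placeholder proxy address:

import requests

proxy_ip = '1.2.3.4:8080'  # placeholder address
proxies = {'http': 'http://' + proxy_ip, 'https': 'http://' + proxy_ip}

try:
    response = requests.get('http://example.com',
                            proxies=proxies,
                            timeout=10)
    response.raise_for_status()  # promotes 4xx/5xx to requests.HTTPError
except requests.HTTPError as e:
    print('bad status; rotate to another proxy:', e)
except requests.RequestException as e:
    print('proxy or network failure; rotate to another proxy:', e)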
Example #10
    # Override of the run method.
    # If you need a custom request function, set request=your_request_function
    # in the crawl function; the framework's request is the default.
    def run(self):
        start()
        self.craw()

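Going by that comment, a custom request function would presumably be wired in through crawl(); a hypothetical sketch (my_request, its signature, and the crawl(request=...) call are assumptions drawn from the comment above, not verified framework API):

def my_request(_args, dont_filter):
    # hypothetical replacement; it should honor the same
    # (content, url) return contract as the framework's request
    ...

work_spider = WorkSpider()
work_spider.crawl(request=my_request)  # request= per the comment above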
def execute_spider():
    work_spider = WorkSpider()

    work_spider.run()

    # block until both queues are fully processed
    work_queue.join()
    save_queue.join()

    # done
    logger.info('All jobs finished, please check!')

if __name__ == '__main__':
    # same steps as execute_spider(), so just call it
    execute_spider()