Esempio n. 1
0
    def save_ip(self, response):
        """Persist a working proxy IP into MongoDB.

        `response` is a result dict produced by the validator: `url` holds the
        proxy ip, `content` the measured response time, and `meta` carries the
        originating website name plus the url the proxy was tested against.
        """
        meta = response.get('meta')
        website_name = meta.get('website_name')
        target_url = meta.get('target_url')
        response_time = response.get('content')
        _ip = response.get('url')

        msg = '[{ip}] can visit the target url [{target_url}], source is [{source}]'.format(
            ip=_ip, target_url=target_url, source=website_name)
        logger.info(msg)

        # Document for the mongodb collection; _id combines ip and target url
        # so the same proxy tested against a different target is a new record.
        insert_data = {
            '_id': _ip + '_' + target_url,
            'ip': _ip,
            'source': website_name,
            'response_time': response_time,
            'target_url': target_url,
            'insert_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        }

        # Hand the record to the pipeline for storage.
        self.pipeline.process_item(insert_data, self.collection_name)
def show_size():
    """Monitor loop: periodically log queue sizes and thread liveness.

    Runs forever; intended to run in its own (daemon) thread. Logs only while
    at least one of the two queues is non-empty.

    BUG FIX: the original mixed tabs and spaces (`while`/`if` used tabs, the
    body used spaces), which is an IndentationError; and `time.sleep(2)` was
    inside the `if`, so the loop busy-spun at 100% CPU once both queues
    drained. The sleep now runs every iteration.
    """
    while 1:
        if not work_queue.empty() or not save_queue.empty():
            msg = 'AT %s ,work queue size is [%d]' % (time.strftime('%Y-%m-%d %H:%M:%S'), work_queue.qsize())
            logger.info(msg)

            msg = 'AT %s ,save queue size is [%d]' % (time.strftime('%Y-%m-%d %H:%M:%S'), save_queue.qsize())
            logger.info(msg)

            msg = 'work threading total count is [%d], active count is [%d]' % (
                len(work_threading_list), tools.isThreadAlive(work_threading_list))
            logger.info(msg)

            msg = 'save threading total count is [%d], active count is [%d]' % (
                len(save_threading_list), tools.isThreadAlive(save_threading_list))
            logger.info(msg)

        # Unconditional sleep keeps the idle loop cheap.
        time.sleep(2)
Esempio n. 3
0
    def request(self, _args, dont_filter):
        """Fetch the url described by `_args`, with retry and optional proxy.

        `_args` may override the instance defaults for: url, sleep_time,
        time_out, retry_times, use_proxy, ip, ua_type, diy_header.

        Returns a 2-tuple:
            (content, final_url)    on success,
            ('HAS CRAWLED', url)    when dont_filter is False and url was seen,
            (None, None)            when retries are exhausted or content is empty.
        """
        # Per-request settings fall back to the instance-level defaults.
        url = _args.get('url')
        sleep_time = _args.get('sleep_time') if _args.get(
            'sleep_time') else self.sleep_time
        time_out = _args.get('time_out') if _args.get(
            'time_out') else self.time_out
        retry_times = _args.get('retry_times') if _args.get(
            'retry_times') else self.retry_times
        use_proxy = _args.get('use_proxy') if _args.get(
            'use_proxy') else self.use_proxy
        _ip = _args.get('ip') if _args.get('ip') else self.ip
        ua_type = _args.get('ua_type') if _args.get(
            'ua_type') else self.ua_type
        diy_header = _args.get('diy_header') if _args.get(
            'diy_header') else self.diy_header

        # Deduplication: skip urls already recorded in the bloom filter.
        if not dont_filter:
            check_result = self.check(url)
            if not check_result:
                return 'HAS CRAWLED', url
            else:
                msg = 'new url'
                logger.info(msg)

        if not url.startswith('http'):
            raise ValueError('url has to be started with http or https')
        if diy_header:
            header = diy_header
        else:
            host = urlparse.urlparse(url).netloc
            header = {
                'User-Agent': random.choice(PC_USER_AGENTS),
                'Host': host,
            }

            # Mobile UA pool when requested; otherwise desktop UA above.
            if ua_type == 'mobile':
                header = {
                    'User-Agent': random.choice(MOBILE_USER_AGENTS),
                    'Host': host
                }

        times = 0
        con = None
        while retry_times > 0:
            times += 1
            self.log.info('request %s, times: %d' % (url, times))
            try:
                if use_proxy:
                    ip = _ip
                    if ip:
                        proxy = {
                            'http': 'http://%s' % ip,
                            'https': 'http://%s' % ip
                        }
                        con = requests.get(url,
                                           headers=header,
                                           proxies=proxy,
                                           timeout=time_out)
                        if con.status_code not in self.status_code:
                            self.log.error('status code is %s' %
                                           con.status_code)
                            raise ValueError(
                                'status code not in the code in config.py, check your log'
                            )
                        time.sleep(sleep_time)
                    else:
                        msg = 'ip can not be none while use_proxy is True'
                        self.log.error(msg)
                        os._exit(0)

                else:
                    con = requests.get(url, headers=header, timeout=time_out)
                    if con.status_code not in self.status_code:
                        self.log.error('status code is %s' % con.status_code)
                        raise ValueError(
                            'status code not in the code in config.py, check your log'
                        )
                    time.sleep(sleep_time)

            # BUG FIX: was the Python-2-only form "except Exception, e";
            # "as" is valid on 2.6+ and required on 3.x.
            except Exception as e:
                self.log.error(e)
                retry_times -= 1
                self.log.warning('retrying request: [%s], times: %s' %
                                 (url, times))
                if times == 10:
                    self.log.error(
                        'give up retrying request: [%s], times: %s is bigger than setting'
                        % (url, times))
                    return None, None
            else:
                self.log.info('[%s] has requested successfully' % url)

                if con:
                    if not dont_filter:
                        # Record the url (md5 digest) in the bloom filter.
                        url = self.md5_url(url)
                        sbf.add(url)

                    return con.content, con.url
                else:
                    self.log.error('content is None, url is %s' % url)
                    return None, None

        # BUG FIX: the original fell off the loop returning bare None once
        # retry_times hit 0; callers unpack two values, so return a pair.
        return None, None
Esempio n. 4
0
    def request(self, _args, dont_filter):
        """Fetch the url described by `_args` via GET or POST, with retries.

        `_args` may override the instance defaults for: url, sleep_time,
        time_out, retry_times, use_proxy, ip, ua_type, diy_header, method
        and submit_data.

        Returns a 2-tuple:
            (content, final_url)    on success,
            ('HAS CRAWLED', url)    when dont_filter is False and url was seen,
            (None, None)            when retries are exhausted or content is empty.
        """
        # Per-request settings fall back to the instance-level defaults.
        url = _args.get("url")
        sleep_time = _args.get("sleep_time") if _args.get(
            "sleep_time") else self.sleep_time
        time_out = _args.get("time_out") if _args.get(
            "time_out") else self.time_out
        retry_times = _args.get("retry_times") if _args.get(
            'retry_times') else self.retry_times
        use_proxy = _args.get("use_proxy") if _args.get(
            "use_proxy") else self.use_proxy
        # BUG FIX: the original assigned _args.get("ua_type") to _ip and never
        # defined ua_type, which broke the proxy value and raised NameError at
        # the mobile-UA check below. Mirrors the sibling implementation.
        _ip = _args.get("ip") if _args.get("ip") else self.ip
        ua_type = _args.get("ua_type") if _args.get(
            "ua_type") else self.ua_type
        diy_header = _args.get("diy_header") if _args.get(
            "diy_header") else self.diy_header
        method = _args.get('method') if _args.get('method') else self.method
        post_data = _args.get('submit_data') if _args.get(
            'submit_data') else self.submit_data

        # Deduplication: skip urls already recorded in the bloom filter.
        if not dont_filter:
            check_result = self.check(url)
            if not check_result:
                return "HAS CRAWLED", url
            else:
                msg = "new url"
                logger.info(msg)

        # Only http/https urls are supported.
        if not url.startswith("http"):
            raise ValueError("url has to be started with http or https")

        if diy_header:
            header = diy_header
        else:
            # e.g. urlparse.urlparse("http://www.baidu.com").netloc
            #      -> 'www.baidu.com'
            host = urlparse.urlparse(url).netloc

            # Fake a browser header with a random desktop UA by default.
            header = {
                'User-Agent': random.choice(PC_USER_AGENTS),
                'host': host
            }
            if ua_type == 'mobile':
                header = {
                    'User-Agent': random.choice(MOBILE_USER_AGENTS),
                    'Host': host
                }
        times = 0
        con = None
        # Retry loop: one attempt per iteration until retry_times is used up.
        while retry_times > 0:
            times += 1
            self.log.info("request %s,time: %d" % (url, times))
            try:
                if use_proxy:
                    ip = _ip
                    if ip:
                        proxy = {
                            'http': 'http://%s' % ip,
                            'https': 'http://%s' % ip
                        }
                        if method == "get":
                            con = request_session.get(url,
                                                      headers=header,
                                                      proxies=proxy,
                                                      timeout=time_out,
                                                      params=post_data,
                                                      verify=False)
                        elif method == "post":
                            if post_data and isinstance(post_data, dict):
                                # BUG FIX: requests takes "headers", not
                                # "header" (was a TypeError).
                                con = request_session.post(url,
                                                           headers=header,
                                                           proxies=proxy,
                                                           timeout=time_out,
                                                           data=post_data,
                                                           verify=False)
                            else:
                                self.log.error(
                                    'while method is post, post_data must be defined and defined as dict'
                                )

                        # NOTE(review): if con is still None here (unknown
                        # method or bad post_data) the attribute access raises
                        # and the except clause treats it as a retryable error.
                        if con.status_code not in self.status_code:
                            self.log.error('status code is %s' %
                                           con.status_code)
                            raise ValueError(
                                'status code not in the code in config.py,check your log'
                            )
                        time.sleep(sleep_time)
                    else:
                        msg = 'ip can not be none while use_proxy is True'
                        self.log.error(msg)
                        os._exit(0)

                # No proxy configured: plain session request.
                else:
                    if method == "get":
                        con = request_session.get(url,
                                                  headers=header,
                                                  timeout=time_out,
                                                  params=post_data,
                                                  verify=False)
                    elif method == "post":
                        if post_data and isinstance(post_data, dict):
                            # BUG FIX: the post body was never sent; pass
                            # data=post_data like the proxy branch does.
                            con = request_session.post(url,
                                                       headers=header,
                                                       timeout=time_out,
                                                       data=post_data,
                                                       verify=False)
                        else:
                            self.log.error(
                                'while method is post,post_data must be defined and defined as a dict'
                            )
                            os._exit(0)

                    if con.status_code not in self.status_code:
                        self.log.error("status code is %s" % con.status_code)
                        raise ValueError(
                            "status code not in the code in config.py,check your log"
                        )
                    time.sleep(sleep_time)

            except Exception as e:
                self.log.error(e)
                retry_times -= 1
                # BUG FIX: the original formatted the undefined name "Url"
                # here, so every retry raised NameError.
                self.log.warning('retrying request:[%s],times:%s' %
                                 (url, times))
                if times == 10:
                    self.log.error(
                        'give up retrying request:[%s],times:%s is bigger than setting'
                        % (url, times))
                    return None, None

            # BUG FIX: the success path was nested inside the except block as
            # the else of "if times == 10", so it could never run on a
            # successful request; it belongs on the try's else clause.
            else:
                self.log.info('[%s] has requested successfully' % url)

                if con:
                    if not dont_filter:
                        # Record the url (md5 digest) in the bloom filter.
                        url = self.md5_url(url)
                        sbf.add(url)
                    return con.content, con.url
                else:
                    self.log.error('content is None,url is %s' % url)
                    return None, None

        # All retries exhausted without a successful response.
        return None, None
Esempio n. 5
0
from threads import start, work_queue, save_queue
from log_format import logger
from proxy_basic_config import url_parse_dict
from _request import valid

from get_proxies_base_spider import SpiderMain


class WorkSpider(SpiderMain):
    """Crawler entry point built on SpiderMain."""

    def __init__(self):
        super(WorkSpider, self).__init__()

    def run(self):
        # Overrides SpiderMain.run: bring up the worker/saver threads first,
        # then start crawling. A custom request function may be supplied to
        # the crawl call as request=<your_function>; otherwise the framework
        # default is used.
        start()
        self.craw()


if __name__ == '__main__':
    # Build the spider and kick off the crawl.
    work_spider = WorkSpider()
    work_spider.run()

    # Block until every queued task has been consumed.
    work_queue.join()
    save_queue.join()

    # All tasks drained; report completion.
    logger.info('All Job Finishing, Please Check!')
Esempio n. 6
0
            ip=_ip, target_url=target_url, source=website_name)
        logger.info(msg)
        # mongodb 集合名称

        insert_data = {}

        insert_data['_id'] = _ip + '_' + target_url
        insert_data['ip'] = _ip
        insert_data['source'] = website_name
        insert_data['response_time'] = response_time
        insert_data['target_url'] = target_url

        insert_data['insert_time'] = time.strftime('%Y-%m-%d %H:%M:%S')

        #  保存数入库
        self.pipeline.process_item(insert_data, self.collection_name)


if __name__ == '__main__':
    # Manual test entry point.
    spidermain = SpiderMain()
    spidermain.run()

    # Block until both queues have been fully drained.
    work_queue.join()
    save_queue.join()

    # Crawling the origin proxy sources is done.
    logger.info(
        'available proxy has been saved in your database, please check!')