Example #1
    def save_ip(self, response):
        website_name = response.get('meta').get('website_name')
        response_time = response.get('content')
        target_url = response.get('meta').get('target_url')
        _ip = response.get('url')

        msg = '[{ip}] can visit the target url [{target_url}], source is [{source}]'.format(
            ip=_ip, target_url=target_url, source=website_name)
        logger.info(msg)
        # MongoDB collection name

        insert_data = {}

        insert_data['_id'] = _ip + '_' + target_url
        insert_data['ip'] = _ip
        insert_data['source'] = website_name
        insert_data['response_time'] = response_time
        insert_data['target_url'] = target_url

        insert_data['insert_time'] = time.strftime('%Y-%m-%d %H:%M:%S')

        # save the record to the database
        self.pipeline.process_item(insert_data, self.collection_name)
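
The record is handed to self.pipeline.process_item, which is not shown in these examples. Below is a minimal sketch of a compatible pipeline, assuming a pymongo backend; the class name MongoPipeline, its constructor arguments and the upsert behaviour are assumptions, not the project's actual code:

import pymongo


class MongoPipeline(object):
    # Hedged sketch of a pipeline compatible with the call above;
    # not the project's real implementation.
    def __init__(self, host='127.0.0.1', port=27017, db_name='proxy'):
        self.client = pymongo.MongoClient(host, port)
        self.db = self.client[db_name]

    def process_item(self, item, collection_name):
        # upsert on _id so re-validating the same ip/target pair does not
        # raise a duplicate key error
        self.db[collection_name].replace_one(
            {'_id': item['_id']}, item, upsert=True)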
Example #3
    def save_ip(self, response):
        website_name = response.get('meta').get('website_name')
        response_time = response.get('content')
        target_url = response.get('meta').get('target_url')
        _ip = response.get('url')

        msg = '[{ip}] can visit the target url [{target_url}], source is [{source}]'.format(
            ip=_ip, target_url=target_url, source=website_name)
        logger.info(msg)
        # MongoDB collection name

        insert_data = {}

        insert_data['_id'] = _ip + '_' + target_url
        insert_data['ip'] = _ip
        insert_data['source'] = website_name
        insert_data['response_time'] = response_time
        insert_data['target_url'] = target_url

        insert_data['insert_time'] = time.strftime('%Y-%m-%d %H:%M:%S')

        # save the record to the database
        self.pipeline.process_item(insert_data, self.collection_name)


if __name__ == '__main__':
    # test code
    spidermain = SpiderMain()
    spidermain.run()

    # blocking
    work_queue.join()
    save_queue.join()

    # finished crawling the origin IPs
    logger.info(
        'available proxy has been saved in your database, please check!')
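
save_ip unpacks a plain dict rather than a framework response object. Judging only by the keys it reads, the validated result passed in presumably has the shape sketched below; the concrete values are illustrative and any other keys are unknown:

# Shape inferred from the keys save_ip reads; an illustration,
# not a documented contract of the framework.
validated_result = {
    'url': '1.2.3.4:8080',                 # the proxy that passed validation, stored as ip
    'content': 0.42,                       # stored as response_time
    'meta': {
        'website_name': 'example-source',  # listing site the proxy was scraped from
        'target_url': 'https://httpbin.org/ip',
    },
}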
Example #5
from spider.threads import start, work_queue, save_queue
from spider.log_format import logger
from proxy_basic_config import url_parse_dict
from _request import valid

from get_proxies_base_spider import SpiderMain


class WorkSpider(SpiderMain):
    def __init__(self):
        super(WorkSpider, self).__init__()

    # Override the run method.
    # If a custom request function is used, it can be set in the crawl call as
    # request=your_request_function; the framework's default request is used otherwise.
    def run(self):
        start()  # start the worker and saver threads
        self.craw()


if __name__ == '__main__':
    work_spider = WorkSpider()

    work_spider.run()

    # Blocking
    work_queue.join()
    save_queue.join()

    # Done
    logger.info('All Job Finishing, Please Check!')
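
The comment on run says a custom request function can be supplied in the crawl call. A hedged sketch of what that could look like, assuming the keyword is named request and the callable takes a URL; the real craw() signature is not shown in these examples:

import requests

from spider.threads import start, work_queue, save_queue
from get_proxies_base_spider import SpiderMain


def my_request(url, timeout=10):
    # hypothetical downloader built on the requests library
    resp = requests.get(url, timeout=timeout,
                        headers={'User-Agent': 'Mozilla/5.0'})
    return resp.text


class CustomRequestSpider(SpiderMain):
    def run(self):
        start()
        # the keyword name follows the comment above; treat it as an assumption
        self.craw(request=my_request)


if __name__ == '__main__':
    CustomRequestSpider().run()
    work_queue.join()
    save_queue.join()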
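
Every example finishes by calling start() and then joining work_queue and save_queue. Below is a generic sketch of the producer/consumer wiring this implies, using only the standard library; apart from the names start, work_queue and save_queue taken from the imports, everything here is illustrative and not the implementation of spider.threads:

import queue
import threading

work_queue = queue.Queue()
save_queue = queue.Queue()


def worker():
    # validation of each proxy would happen here; survivors go to the saver
    while True:
        task = work_queue.get()
        try:
            save_queue.put(task)
        finally:
            work_queue.task_done()   # lets work_queue.join() return


def saver():
    # persistence (e.g. a call like pipeline.process_item) would happen here
    while True:
        item = save_queue.get()
        try:
            pass
        finally:
            save_queue.task_done()   # lets save_queue.join() return


def start(worker_count=8):
    # daemon threads so the process can exit once both joins return
    for _ in range(worker_count):
        threading.Thread(target=worker, daemon=True).start()
    threading.Thread(target=saver, daemon=True).start()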