#!/usr/bin/env python # -*- coding:utf-8 -*- # Created by weixiong import time import scrapy from scrapy.loader import ItemLoader from common_crawler.instance.tutorial.items import NewsContext from common_crawler.instance.tutorial.spiders.BaseSpider import BaseSpider from dealer.log.logger import get_logger logger = get_logger("reuters") class ReutersSpider(BaseSpider): name = "reuters" task_domain = "cn.reuters.com" logger.info("start reuters spider") def start_requests(self): urls = [ 'https://cn.reuters.com' ] for url in urls: yield scrapy.Request(url=url, callback=self.parse) def parse(self, response): for detail_link in response.css(".story-content a::attr(href)").re(r'.*?/article/.+'): yield response.follow(detail_link, self.parse_detail)
#!/usr/bin/env python # -*- coding:utf-8 -*- # Created by weixiong import json import happybase from kafka import KafkaConsumer, TopicPartition from redis import Redis from dealer.log.logger import get_logger redis = Redis(host="localhost", port=6379, db=1) logger = get_logger("toHbase.py") def toHbase(datas): connection = happybase.Connection('localhost', autoconnect=False) connection.open() # todo:从配置读取 table = connection.table('testtable') try: for jdata in datas: try: url = jdata["url"][0] title = jdata["title"][0] content = jdata["content"][0] date = jdata["date"][0] domain = jdata["domain"][0] data_id = jdata["id"]
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Created by weixiong
import importlib
import time
import traceback
from multiprocessing import Process

from redis import Redis
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dealer.log.logger import get_logger

redis = Redis(host="localhost", port=6379, db=1)
logger = get_logger("worker")

"""
twisted 本身是多线程的,所以在外层使用多线程会 raise Exception,所以这里不用线程池
"""
# (Translation of the note above: Twisted is itself multi-threaded, so wrapping
# it in an outer thread pool raises an exception — hence no thread pool here.)


def worker(spider):
    """Run the crawl job for *spider*; currently an unimplemented no-op stub."""
    return None


# def run_task(task_class):
#     try:
#         process = CrawlerProcess(get_project_settings())
#         process.crawl(task_class)
#         process.start()
# Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html import traceback from kafka import KafkaProducer from redis import Redis import hashlib # todo:从配置读 from dealer.log.logger import get_logger redis = Redis(host="localhost", port=6379, db=1) logger = get_logger("piplines.py") class TutorialPipeline(object): def __init__(self): self.producer = None def process_item(self, item, spider): # 去重 domain = item["domain"][0] md5_domain = hashlib.md5(domain.encode()).hexdigest() md5_url = hashlib.md5(item["url"][0].encode()).hexdigest() unique_prefix = "unique:url:" if redis.sismember(unique_prefix + md5_domain, item["url"][0]): logger.warn("重复的url")
#!/usr/bin/env python # -*- coding:utf-8 -*- # Created by weixiong import time import scrapy from scrapy.loader import ItemLoader from common_crawler.instance.tutorial.spiders.BaseSpider import BaseSpider from common_crawler.instance.tutorial.items import NewsContext from dealer.log.logger import get_logger logger = get_logger("TutorialSpider") class TutorialSpider(BaseSpider): name = "tutorial" task_domain = "www.bbc.com" logger.info("start tutorial spider") def start_requests(self): urls = ['https://www.bbc.com/zhongwen/simp'] for url in urls: yield scrapy.Request(url=url, callback=self.parse) def parse(self, response): for detail_link in response.css(".title-link::attr(href)").re( r'.+?chinese-news.+|.+?world-.+|.+?business-.+'): yield response.follow(detail_link, self.parse_detail)
#!/usr/bin/env python # -*- coding:utf-8 -*- # Created by weixiong import time # from apscheduler.schedulers.blocking import BlockingScheduler # from pytz import utc from redis import Redis from dealer.log.logger import get_logger logger = get_logger("test_scheduler") redis = Redis(host="localhost", port=6379, db=1) def get_task(): raw_task_key = "crawler:task" redis_task_queue = "crawler:task:queue" while (True): interval = 10 rst_list = redis.zrange(raw_task_key, 0, -1, withscores=True) timestamp = int(time.time()) for task_withscores in rst_list: spider_class = task_withscores[0].decode("utf-8") socre = int(task_withscores[1]) if timestamp > socre: logger.info("put class %s to running queue" % spider_class) redis.lpush(redis_task_queue, spider_class) redis.zadd(raw_task_key, spider_class, timestamp + 3600 * 6) interval = 1