Example #1
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Created by weixiong
import time

import scrapy
from scrapy.loader import ItemLoader

from common_crawler.instance.tutorial.items import NewsContext
from common_crawler.instance.tutorial.spiders.BaseSpider import BaseSpider
from dealer.log.logger import get_logger

logger = get_logger("reuters")


class ReutersSpider(BaseSpider):
    name = "reuters"
    task_domain = "cn.reuters.com"
    logger.info("start reuters spider")

    def start_requests(self):
        urls = [
            'https://cn.reuters.com'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for detail_link in response.css(".story-content a::attr(href)").re(r'.*?/article/.+'):
            yield response.follow(detail_link, self.parse_detail)
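The listing is cut off before parse_detail, which the parse callback references. A minimal sketch of what such a callback might look like, assuming NewsContext carries the url/title/content/date/domain fields that Example #2 later reads; the CSS selectors are placeholders, not the original ones:

    # Hypothetical sketch -- selectors and stored values are assumptions,
    # not the original implementation.
    def parse_detail(self, response):
        loader = ItemLoader(item=NewsContext(), response=response)
        loader.add_value('url', response.url)
        loader.add_css('title', 'h1::text')                   # placeholder selector
        loader.add_css('content', '.article-body p::text')    # placeholder selector
        loader.add_value('date', str(int(time.time())))
        loader.add_value('domain', self.task_domain)
        yield loader.load_item()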
Example #2
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Created by weixiong
import json

import happybase
from kafka import KafkaConsumer, TopicPartition
from redis import Redis

from dealer.log.logger import get_logger

redis = Redis(host="localhost", port=6379, db=1)
logger = get_logger("toHbase.py")


def toHbase(datas):
    connection = happybase.Connection('localhost', autoconnect=False)
    connection.open()
    # TODO: read the table name from config
    table = connection.table('testtable')

    try:
        for jdata in datas:
            try:
                url = jdata["url"][0]
                title = jdata["title"][0]
                content = jdata["content"][0]
                date = jdata["date"][0]
                domain = jdata["domain"][0]
                data_id = jdata["id"]
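The listing breaks off here. A minimal sketch of the write that presumably follows, using happybase's Table.put; the column family 'cf' and the use of the item id as row key are assumptions, not the original code:

def put_row(table, jdata):
    # Hypothetical helper -- column family 'cf' and the row-key choice
    # are assumptions.
    table.put(str(jdata["id"]), {
        b'cf:url': jdata["url"][0].encode('utf-8'),
        b'cf:title': jdata["title"][0].encode('utf-8'),
        b'cf:content': jdata["content"][0].encode('utf-8'),
        b'cf:date': jdata["date"][0].encode('utf-8'),
        b'cf:domain': jdata["domain"][0].encode('utf-8'),
    })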
Example #3
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Created by weixiong
import importlib
import time
import traceback
from multiprocessing import Process

from redis import Redis
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dealer.log.logger import get_logger

logger = get_logger("worker")
redis = Redis(host="localhost", port=6379, db=1)

"""
    twisted 本身是多线程的,所以在外层使用多线程会 raise Exception,所以这里不用线程池
"""


def worker(spider):
    pass


# def run_task(task_class):
#     try:
#         process = CrawlerProcess(get_project_settings())
#         process.crawl(task_class)
#         process.start()
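
The commented-out run_task above hints at the intended pattern. A minimal sketch of how worker might launch each crawl, consistent with the note about Twisted: one CrawlerProcess per child process, so the reactor is started exactly once and never restarted. The function names below are illustrative, not the original implementation.

def run_crawl(task_class):
    # Runs inside the child process; start() blocks until the crawl finishes.
    process = CrawlerProcess(get_project_settings())
    process.crawl(task_class)
    process.start()


def worker_sketch(spider_class):
    # Hypothetical version of worker(): isolate the Twisted reactor in its
    # own process and wait for it to exit.
    p = Process(target=run_crawl, args=(spider_class,))
    p.start()
    p.join()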
Example #4
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import traceback

from kafka import KafkaProducer
from redis import Redis
import hashlib

# TODO: read from config
from dealer.log.logger import get_logger

redis = Redis(host="localhost", port=6379, db=1)
logger = get_logger("pipelines.py")


class TutorialPipeline(object):
    def __init__(self):
        self.producer = None

    def process_item(self, item, spider):

        # de-duplicate by URL
        domain = item["domain"][0]
        md5_domain = hashlib.md5(domain.encode()).hexdigest()
        md5_url = hashlib.md5(item["url"][0].encode()).hexdigest()
        unique_prefix = "unique:url:"
        if redis.sismember(unique_prefix + md5_domain, item["url"][0]):
            logger.warn("duplicate url")
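        # --- Hypothetical continuation (not the original code) ---
        # One way process_item might finish: mark the URL as seen, then hand
        # the item to Kafka. The topic name "crawler.news" and the JSON
        # encoding are assumptions, and json would need to be imported above.
        else:
            redis.sadd(unique_prefix + md5_domain, item["url"][0])
            if self.producer is None:
                self.producer = KafkaProducer(bootstrap_servers="localhost:9092")
            self.producer.send("crawler.news",
                               json.dumps(dict(item)).encode("utf-8"))
        return item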
Example #5
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Created by weixiong

import time

import scrapy
from scrapy.loader import ItemLoader

from common_crawler.instance.tutorial.spiders.BaseSpider import BaseSpider
from common_crawler.instance.tutorial.items import NewsContext
from dealer.log.logger import get_logger

logger = get_logger("TutorialSpider")


class TutorialSpider(BaseSpider):
    name = "tutorial"
    task_domain = "www.bbc.com"
    logger.info("start tutorial spider")

    def start_requests(self):
        urls = ['https://www.bbc.com/zhongwen/simp']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for detail_link in response.css(".title-link::attr(href)").re(
                r'.+?chinese-news.+|.+?world-.+|.+?business-.+'):
            yield response.follow(detail_link, self.parse_detail)
Example #6
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Created by weixiong
import time

# from apscheduler.schedulers.blocking import BlockingScheduler
# from pytz import utc
from redis import Redis

from dealer.log.logger import get_logger

logger = get_logger("test_scheduler")
redis = Redis(host="localhost", port=6379, db=1)


def get_task():
    raw_task_key = "crawler:task"
    redis_task_queue = "crawler:task:queue"

    while True:
        interval = 10
        rst_list = redis.zrange(raw_task_key, 0, -1, withscores=True)
        timestamp = int(time.time())
        for task_withscores in rst_list:
            spider_class = task_withscores[0].decode("utf-8")
            score = int(task_withscores[1])
            if timestamp > score:
                logger.info("put class %s to running queue" % spider_class)
                redis.lpush(redis_task_queue, spider_class)
                # redis-py 3.x expects a {member: score} mapping here
                redis.zadd(raw_task_key, {spider_class: timestamp + 3600 * 6})
                interval = 1
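
get_task only produces work (and presumably sleeps for interval seconds after each scan, though the listing is cut off). A minimal sketch of the consuming side, assuming workers pop fully qualified class paths from crawler:task:queue and resolve them with importlib; the path format and the blocking pop are assumptions:

import importlib


def consume_task():
    # Hypothetical consumer -- BLPOP blocks until the scheduler queues a task.
    _, raw = redis.blpop("crawler:task:queue")
    module_path, _, class_name = raw.decode("utf-8").rpartition(".")
    module = importlib.import_module(module_path)
    return getattr(module, class_name)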