Beispiel #1
0
class RedisSpider(RedisCallbackSpider):
    """Spider that extracts template-configured fields from each response.

    ``beforeStartUrl`` turns a raw JSON start payload into the URL to crawl;
    ``parse`` scrapes the response using the ``JobTemplateField`` rows
    returned by ``query(JobTemplateField, type=1)``.
    """
    name = 'RedisSpider'
    custom_settings = ConfigUtils.getItems("REDIS")
    redis_key = 'redisSpider:startId'

    def beforeStartUrl(self, data):
        """Return the URL to crawl for a raw JSON start payload.

        :param data: JSON text expected to decode to an object; ``None`` is
            passed through unchanged.
        :return: the payload's ``"url"`` value, or ``"http://www.baidu.com"``
            when the payload has no ``"url"`` key.
        """
        if data is None:
            return data
        payload = json.loads(data)
        return payload.get("url", "http://www.baidu.com")

    def parse(self, response):
        """Extract one value per configured template field from ``response``.

        For every field the XPath in ``fieldValue`` is evaluated, the extracted
        nodes are joined with the field's ``split`` separator, and, when the
        field defines ``regExp``, the joined text is narrowed to the regex
        match.

        :param response: the downloaded response to scrape.
        :return: dict mapping each field's ``fieldNameEn`` to its text value.
        """
        # Must be a dict: values are stored under the field name below.
        items = {}
        hxs = Selector(response)
        for jobTemplateField in query(JobTemplateField, type=1):
            fieldNameEn = jobTemplateField.fieldNameEn
            nodes = hxs.xpath(jobTemplateField.fieldValue).extract()
            separator = jobTemplateField.split
            if separator is None:
                separator = ""
            value = separator.join(nodes)
            regExp = jobTemplateField.regExp
            if regExp:
                matches = re.search(regExp, value.encode("utf8"))
                # A non-matching pattern leaves the joined text untouched.
                if matches is not None:
                    groups = matches.groups()
                    if groups:
                        # NOTE(review): the captured groups are joined using
                        # the pattern text itself as the separator, exactly as
                        # before; confirm this is intended rather than the
                        # field's ``split`` separator.
                        value = regExp.join(groups)
                    else:
                        value = matches.group()
            items[fieldNameEn] = value
        self.log('A response from %s just arrived!' % response.url)
        return items
Beispiel #2
0
class MainRedisSpider(RedisCallbackSpider):
    """Spider whose start payloads are task-job dicts.

    ``beforeStartUrl`` converts the payload into a ``TaskJob``, remembers the
    payload on the instance, caches it under the job id and returns the URL
    to crawl.
    """
    name = ConfigUtils.getSpiderPorperties(KEYMAP.MAIN_SPIDER_NAME)
    custom_settings = ConfigUtils.getItems(KEYMAP.REDIS)
    custom_settings["ITEM_PIPELINES"] = {
        'engine.pipelines.DataBaseSavePipeline': 300
    }
    custom_settings["DOWNLOADER_MIDDLEWARES"] = {
        'engine.useragent.RotateUserAgentMiddleware': 1,
        'engine.middlewares.ProxyMiddleware': 2
    }
    # Merge the MySQL settings on top (they win on key clashes). Built with
    # copy + update() because concatenating two ``.items()`` results only
    # works where ``items()`` returns lists.
    custom_settings = dict(custom_settings.items())
    custom_settings.update(ConfigUtils.getItems(KEYMAP.MYSQL).items())
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)
    id = ""

    # start_urls=["https://www.baidu.com"]
    def beforeStartUrl(self, dataDict):
        """Prepare spider state from a start payload and return its URL.

        :param dataDict: start payload; ``None`` is passed through unchanged.
        :return: the payload's ``"url"`` (``"http://www.baidu.com"`` when the
            key is absent), or ``None`` when the payload cannot be converted
            to a ``TaskJob`` or carries no ``"id"``.
        """
        if dataDict is None:
            return dataDict
        try:
            self.taskJob = RequestUtils.parseResToClass(TaskJob, dataDict)
        except Exception as e:
            logging.error("TemplateRedisSpider[beforeStartUrl:error]:%s" % (e))
            return None
        self.params = dataDict
        # Local name avoids shadowing the ``id`` builtin / class attribute.
        task_id = dataDict.get("id")
        if task_id is None:
            return None
        # The status is fetched but the check on it is currently disabled.
        status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + task_id)
        # if status!=None and str(status)!=str(TaskTable.TaskStatus.RUNNING):
        #     return None;
        url = dataDict.get("url", "http://www.baidu.com")
        self.url = url
        CacheFactory.cache("job_template_url", task_id, self.params)
        return url
Beispiel #3
0
#coding=utf-8
#Created by xutao on 2017-04-21.
# De-duplication Redis data
import redis

from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP

# Connection settings for the de-duplication Redis, read from the
# KEYMAP.DEREPLICATION configuration section at import time.
dereplicationRedisConfig = ConfigUtils.getItems(KEYMAP.DEREPLICATION)
dereHost = dereplicationRedisConfig.get(KEYMAP.REDIS_HOST)
derePort = dereplicationRedisConfig.get(KEYMAP.REDIS_PORT)
dereNamespace = dereplicationRedisConfig.get(KEYMAP.REDIS_NAMESPACE)
# NOTE(review): "Prefrex" looks like a typo of "Prefix"; the name is kept
# because other modules may import it.
derePrefrex = dereplicationRedisConfig.get(KEYMAP.REDIS_PREFIX)

# Connection pool for the de-duplication Redis; the port is converted with
# int(), so a missing port setting fails here at import time.
derePool = redis.ConnectionPool(host=dereHost, port=int(derePort))
dereRedis = redis.Redis(connection_pool=derePool)


def lpush(key, value):
    """Push ``value`` onto the list ``key`` in the de-duplication Redis.

    :return: whatever the client's ``lpush`` call returns.
    """
    pushed = dereRedis.lpush(key, value)
    return pushed


def hset(namespace, key, value):
    """Store ``value`` under field ``key`` of the hash ``namespace``.

    Uses the de-duplication Redis client.

    :return: whatever the client's ``hset`` call returns.
    """
    outcome = dereRedis.hset(namespace, key, value)
    return outcome


def hget(namespace, key):
    """Read field ``key`` of the hash ``namespace``.

    Uses the de-duplication Redis client.

    :return: whatever the client's ``hget`` call returns.
    """
    stored = dereRedis.hget(namespace, key)
    return stored

Beispiel #4
0
class DepthSpider(RedisCallbackSpider):
    """Depth-limited spider that emits an ``HtmlItem`` per fetched page.

    Each item carries the page URL, its HTML, the current depth and the
    in-domain links found on the page (only while the depth limit allows
    going deeper).
    """
    name = ConfigUtils.getSpiderPorperties(KEYMAP.DEPTH_SPIDER_NAME)
    custom_settings = ConfigUtils.getItems(KEYMAP.REDIS)
    custom_settings["ITEM_PIPELINES"] = {
        'engine.pipelines.CacheHtmlPipeline': 300
    }
    custom_settings["DOWNLOADER_MIDDLEWARES"] = {
        'engine.useragent.RotateUserAgentMiddleware': 1
    }
    # start_urls = ['http://mini.qq.com/']
    # Merge the MySQL settings on top (they win on key clashes). Built with
    # copy + update() because concatenating two ``.items()`` results only
    # works where ``items()`` returns lists.
    custom_settings = dict(custom_settings.items())
    custom_settings.update(ConfigUtils.getItems(KEYMAP.MYSQL).items())
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY)
    id = ""
    allowed_domain = None  # domain the crawl is restricted to
    rules = [Rule(LinkExtractor(allow=()), callback='parse', follow=True)]
    cur_url_depth = 1  # depth of the URL currently being crawled
    depth_limit = 3  # maximum crawl depth; 0 disables the limit (see parse)

    def beforeStartUrl(self, dataDict):
        """Prepare spider state from a start payload and return its URL.

        :param dataDict: start payload; ``None`` is passed through unchanged.
        :return: the payload's ``"url"`` (``"http://www.baidu.com"`` when the
            key is absent), or ``None`` when the payload carries no ``"id"``.
        """
        if dataDict is None:
            return dataDict
        # Local name avoids shadowing the ``id`` builtin / class attribute.
        task_id = dataDict.get("id")
        if task_id is None:
            return None
        # The returned status is not used; the call is kept because its side
        # effects cannot be judged from this file.
        RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + task_id)
        taskJobHistoryId = dataDict.get("taskJobHistoryId")
        if taskJobHistoryId:
            taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
                taskJobHistoryId)
            if taskJobHistory:
                self.taskJob = TaskJobDao.loadTaskById(
                    taskJobHistory.taskJobId)
                self.taskJobHistory = taskJobHistory
        url = dataDict.get("url", "http://www.baidu.com")
        self.url = url
        # NOTE(review): the domain is derived only once per spider instance;
        # later payloads for other sites keep the first domain. Confirm.
        if self.allowed_domain is None:
            self.allowed_domain = self.get_first_domain(self.get_domain(url))
        # Fall back to the class defaults when the payload omits the depths.
        self.cur_url_depth = dataDict.get("curUrlDepth", 1)
        self.depth_limit = dataDict.get("depthLimit", 3)
        return url

    def get_domain(self, url):
        """
        Return the host part of a URL.

        :param url: URL text; ``None`` or empty yields ``None``.
        :return: the text between ``//`` and the next ``/`` (or the end of
            the URL when there is no further ``/``); ``None`` when the URL
            contains no ``//``.
        """
        if not url:
            return None
        # First try "up to the next slash", then "everything after //".
        for pattern in (r'(?<=//).*?(?=/)', r'(?<=//).*'):
            match = re.search(pattern, url)
            if match is not None:
                return match.group()
        return None

    def get_first_domain(self, domain):
        """Return ``domain`` without its first label.

        For example ``www.baidu.com`` gives ``baidu.com``. A host with only
        two labels is returned unchanged so that ``baidu.com`` does not
        collapse to ``com``.

        :param domain: host name; ``None`` or empty yields ``None``.
        :return: the reduced domain, or ``None`` when ``domain`` has no dot.
        """
        if not domain:
            return None
        match = re.search(r'(?<=\.).*', domain)
        if match is None:
            return None
        remainder = match.group()
        # A dot-less remainder (e.g. "com") would match almost every host in
        # is_domain_allowed, so keep the full host in that case.
        return remainder if "." in remainder else domain

    def parse(self, response):
        """Build the ``HtmlItem`` for a fetched page.

        Marks the URL's status row as successful when the request meta
        carries ``urlListStatusId``, and collects in-domain links while the
        current depth is below the limit (a limit of 0 means no limit).

        :param response: the downloaded response.
        :return: the populated ``HtmlItem``, or ``None`` for an empty body.
        """
        if not response.body:
            return None
        # The id is optional, so a missing key must not raise.
        urlListStatusId = response.meta.get("urlListStatusId")
        if urlListStatusId:
            UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.SUCCESS)
        htmlItem = HtmlItem()
        htmlItem["url"] = response.url
        htmlItem["html"] = response.body
        subUrls = []
        # Links are only extracted when they will actually be followed.
        if self.depth_limit == 0 or self.cur_url_depth < self.depth_limit:
            links = LinkExtractor(allow=()).extract_links(response)
            subUrls = [
                link.url for link in links
                if self.is_domain_allowed(link.url)
            ]
        htmlItem["subUrls"] = subUrls
        htmlItem["curUrlDepth"] = self.cur_url_depth
        return htmlItem

    def is_domain_allowed(self, url):
        """
        Tell whether ``url`` belongs to the domain this crawl is limited to.

        :param url: candidate URL.
        :return: ``True`` when no domain restriction is set or when the
            allowed domain occurs in the URL's host; ``False`` otherwise.
        """
        # Lazy %-formatting so a ``None`` domain is logged instead of raising.
        logging.info("allowed_domain : %s", self.allowed_domain)
        logging.info("url : %s", url)
        if not self.allowed_domain:
            return True
        cur_url_domain = self.get_domain(url)
        # NOTE(review): this is a substring test, so "baidu.com" also
        # accepts hosts such as "notbaidu.com"; confirm that is acceptable.
        return bool(cur_url_domain) and self.allowed_domain in cur_url_domain
Beispiel #5
0
# coding=utf-8
from hashlib import md5

import redis

from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP
# General-purpose Redis data: connection settings read from the
# KEYMAP.URL_STATUS_REDIS configuration section at import time.
redisConfig = ConfigUtils.getItems(KEYMAP.URL_STATUS_REDIS)
port = redisConfig.get(KEYMAP.REDIS_PORT)
host = redisConfig.get(KEYMAP.REDIS_HOST)
namespace = redisConfig.get(KEYMAP.REDIS_NAMESPACE)
prefix = redisConfig.get(KEYMAP.REDIS_PREFIX)

# General-purpose Redis connection pool; the port is converted with int(),
# so a missing port setting fails here at import time.
pool = redis.ConnectionPool(host=host, port=int(port))
r = redis.Redis(connection_pool=pool)


def llen(key):
    """Ask the module-level Redis client for the length of the list ``key``.

    :return: whatever the client's ``llen`` call returns.
    """
    length = r.llen(key)
    return length


def keys():
    """Call the module-level Redis client's ``keys`` with no arguments.

    :return: whatever the client's ``keys`` call returns.
    """
    found = r.keys()
    return found


def lpush(key, value):
    """Push ``value`` onto the list ``key`` via the module-level Redis client.

    :return: whatever the client's ``lpush`` call returns.
    """
    pushed = r.lpush(key, value)
    return pushed