Example #1
def checkFinishJob():
    keys = RedisUtils.hkeys(
        ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY))
    for key in keys:
        temp = RedisUtils.hget(
            ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
            key)
        newJobTemplate = json.loads(temp)
        url = newJobTemplate['url']
        try:
            request = urllib2.Request(
                url=url,
                headers=(random.choice(user_agent_list))
            )
            response = urllib2.urlopen(request)
            urldate = response.headers['date']
            tempDate = newJobTemplate['urldate']
            print urldate
            print tempDate
            if urldate != tempDate:
                newJobTemplate['urldate'] = urldate

                taskJobHistoryId = newJobTemplate['taskJobHistoryId']
                taskJobHistory = Session.query(TaskJobHistory).filter(
                    TaskJobHistory.id == taskJobHistoryId,
                    TaskJobHistory.delFlag == False).order_by(
                        "create_time desc").first()
                taskJob = Session.query(TaskJob).filter(
                    TaskJob.id == taskJobHistory.taskJobId).first()
                LoggerDao.addTaskJobLogger(taskJob,LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate['id'],taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库",url=url,status=TaskStatus.RUNNING)

                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistoryId,stringify(newJobTemplate))
                RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate['id'],stringify(newJobTemplate))
        except Exception, e:
            print e
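The headers=(random.choice(user_agent_list)) call above only works if user_agent_list already holds ready-made header dicts rather than bare user-agent strings. A minimal sketch of that assumption and of the Date-header comparison checkFinishJob performs (the list contents and the check_page_changed helper are illustrative, not part of the original code):

import random
import urllib2

# Assumed shape of user_agent_list: each entry is a complete headers dict,
# which is what urllib2.Request(headers=...) expects.
user_agent_list = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
]

def check_page_changed(url, last_date):
    # Hypothetical helper mirroring checkFinishJob: fetch the url with a random
    # User-Agent and compare the HTTP Date header against the stored value.
    request = urllib2.Request(url=url, headers=random.choice(user_agent_list))
    response = urllib2.urlopen(request)
    return response.headers['date'] != last_date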
Example #2
 def loadNext(self, childJobTemplateList, item):
     if childJobTemplateList == None or len(childJobTemplateList) == 0:
         # pcInfo = Pcinfo()
         # pidList = pcInfo.getPidListByProcessName(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_NAME))
         # if pidList and len(pidList):
         #     RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + os.getpid(), 0)
         #     for pid in pidList:
         #         RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + pid, 0)
         # else:
         if llen(
                 ConfigUtils.getRedisPorperties(
                     KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
             if self.taskJob.status != TaskStatus.SUCCESS:
                 TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                TaskStatus.SUCCESS)
                 UrlDao.updateUrlStatusListByTaskJobHistoryId(
                     self.taskJobHistoryId,
                     status=UrlStatus.STOP,
                     desc="The task is over and no longer crawls on this URL"
                 )
         return
     for jobTemplate in childJobTemplateList:
         parentId = str(item.get("id"))
         taskJobParam = TaskJobParam(paramNameEn="dataParentId",
                                     paramValue=parentId)
         taskJobParamList = []
         taskJobParamList.append(taskJobParam)
         taskJobParamList.extend(self.taskJobParamList)
         CrawlerService.parseUrlAndInsertRedis(
             taskJob=self.taskJob,
             paramMap=item,
             taskJobParam=taskJobParamList,
             taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
             jobTemplate=jobTemplate)
Example #3
 def process_exception(self, request, exception, spider):
     urlListStatusId = request.meta.get("urlListStatusId")
     if urlListStatusId:
         UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
     if llen(ConfigUtils.getRedisPorperties(
             KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 and spider.taskJob.status != TaskStatus.SUCCESS:
         TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
         UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.jobTemplate.taskJobHistoryId, status=UrlStatus.STOP,
                                                      desc="The task is over and no longer crawls on this URL")
     logger.info("process_exception ProxyMiddleware")
     return None
Example #4
 def process_item(self, item, spider):
     try:
         curUrl = item["url"]
         subUrls = item["subUrls"]
         taskJob = spider.taskJob
         self.save_to_hdfs(taskJob.id, taskJob.databaseId, item["html"])
         taskJobHistory = spider.taskJobHistory
         if subUrls and len(subUrls) > 0:
             parentUrlDepth = item["curUrlDepth"]
             for url in subUrls:
                 newTaskJob = ClassCopy.copyToNewInstances(taskJob, TaskJob)
                 newTaskJob.url = url
                 newTaskJob.curUrlDepth = parentUrlDepth + 1
                 newTaskJob.parentUrl = curUrl
                 CrawlerService.parseUrlAndInsertRedis(newTaskJob, taskJobHistory=taskJobHistory)
         else:
             if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                 if taskJob.status != TaskStatus.SUCCESS:
                     TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                     UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.taskJobHistory.id, status=UrlStatus.STOP,
                                                                  desc="depth spider is over")
         return item
     except Exception, e:
         logger.exception("CacheHtmlPipeline:" + str(e))
Example #5
class MainRedisSpider(RedisCallbackSpider):
    name = ConfigUtils.getSpiderPorperties(KEYMAP.MAIN_SPIDER_NAME)
    custom_settings = ConfigUtils.getItems(KEYMAP.REDIS)
    custom_settings["ITEM_PIPELINES"] = {
        'engine.pipelines.DataBaseSavePipeline': 300
    }
    custom_settings["DOWNLOADER_MIDDLEWARES"] = {
        'engine.useragent.RotateUserAgentMiddleware': 1,
        'engine.middlewares.ProxyMiddleware': 2
    }
    custom_settings = dict(custom_settings.items() +
                           ConfigUtils.getItems(KEYMAP.MYSQL).items())
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)
    id = ""

    # start_urls=["https://www.baidu.com"]
    def beforeStartUrl(self, dataDict):
        if (dataDict == None):
            return dataDict
        try:
            self.taskJob = RequestUtils.parseResToClass(TaskJob, dataDict)
        except Exception, e:
            logging.error("TemplateRedisSpider[beforeStartUrl:error]:%s" % (e))
            return None
        self.params = dataDict
        id = dataDict.get("id")
        if id == None:
            return
        status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + id)
        # if status!=None and str(status)!=str(TaskTable.TaskStatus.RUNNING):
        #     return None;
        url = dataDict["url"] if dataDict.has_key(
            "url") else "http://www.baidu.com"
        self.url = url
        CacheFactory.cache("job_template_url", id, self.params)
        return url
Example #6
    def next_requests(self):
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop

        if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
            reactor.stop()
        """Returns a request to be scheduled or none."""

        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            redis_key = fetch_one(self.redis_key)
            taskJobHistoryId = redis_key
            if taskJobHistoryId != None:
                taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
                if taskJobHistory:
                    taskJobId = taskJobHistory.taskJobId
                    taskJob = TaskJobDao.loadTaskById(taskJobId)
                    if taskJob and taskJob.status == TaskStatus.PAUSE:
                        RedisUtils.lpush(
                            ConfigUtils.getRedisPorperties(
                                KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                            taskJobHistoryId)
                        break
                else:
                    break
            else:
                break
            if hashswitch:
                if str(localIP) != str(tjs.get_node(redis_key)):
                    RedisUtils.lpush(self.redis_key, redis_key)
                    return

            redis_key = self.redis_key + "_" + redis_key
            orginData = fetch_one(redis_key)
            data = None
            # data = fetch_one(self.redis_key)
            try:
                logging.info("orginData==" + orginData)
                orginData = json.loads(orginData)
                orginData["taskJobHistoryId"] = taskJobHistoryId
                data = self.beforeStartUrl(orginData)
            except Exception, e:
                logging.error("Error e:")
                logging.error(e)
                logging.error(orginData)
                break
            if not data:
                # Queue empty.
                logging.warning('********dataUrl is null*************')
                break
            try:
                req = self.make_request_from_data(data)
                # req.replace(meta={"id":"123"})
                req.meta["id"] = orginData.get("id")
                req.meta["dataParentId"] = orginData.get("dataParentId")
                req.meta["taskJobHistoryId"] = orginData.get(
                    "taskJobHistoryId")
                req.meta["url"] = orginData.get("url")
                urlListStatusId = req.meta["urlListStatusId"] = orginData.get(
                    "urlListStatusId")
            except Exception, e:
                logging.error("make_request_from_data:e:" + e)
                break
Example #7
from beans.TaskTable import TaskStatus
from beans.UrlTable import UrlClazz, UrlStatus
from dao import TaskJobDao
from dao import UrlDao
from dao.TaskJobDao import loadTaskJobHistoryById
from utils import LicenseUtils

from utils import ConfigUtils
from utils import RedisUtils
from utils.ConfigUtils import KEYMAP
from utils.ExportExcelUtils import get_ip
from utils.HashFilter import YHash

hashswitch = ConfigUtils.getRedisPorperties(
    KEYMAP.DISTRIBUTED_SPIDER_SWITCH) == str(True)
NodeList = ConfigUtils.getRedisPorperties(
    KEYMAP.DISTRIBUTED_SPIDER_NODE_LIST).split(',')
localIP = get_ip()
tjs = YHash(NodeList, int(ConfigUtils.getRedisPorperties(KEYMAP.VIRTUAL_NODE)))


class RedisCallbackSpider(RedisCrawlSpider):
    def beforeStartUrl(self, data):
        return data

    def next_requests(self):
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
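The module-level setup above builds a consistent-hash ring (YHash) over the configured node list, and next_requests in Example #6 compares tjs.get_node(redis_key) with the local IP to decide whether this node should process a popped key. A minimal sketch of that check, assuming get_node returns the IP of the node responsible for the key; should_handle_locally is an illustrative helper, not part of the original code:

# Sketch of the distribution check used in next_requests (Example #6).
def should_handle_locally(redis_key):
    if not hashswitch:
        # Distribution disabled: every node handles every key.
        return True
    return str(localIP) == str(tjs.get_node(redis_key))

# Keys that belong to another node are pushed back onto the queue:
#     if not should_handle_locally(redis_key):
#         RedisUtils.lpush(self.redis_key, redis_key)
#         return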
Example #8
    def _do_upinsert(self, item):
        now = str(datetime.now())
        data = item["data"]
        url = item["url"]
        jobTemplateFieldList = item["jobTemplateFieldList"]
        jobTemplate = item["jobTemplate"]
        self.dataParentId = jobTemplate.dataParentId if hasattr(
            jobTemplate, "dataParentId") else None
        extraData = jobTemplate.extraData
        self.taskJob = item["taskJob"]
        # searchTaskJob = item["searchTaskJob"]
        taskJobHistroy = item["taskJobHistroy"]
        self.taskJobHistoryId = jobTemplate.taskJobHistoryId
        taskJobHistroyId = str(taskJobHistroy.id)
        paramMap = {}
        self.taskJobParamList = []
        if taskJobHistroy != None:
            self.taskJobParamList.append(
                TaskJobParam(paramNameEn="task_job_id_sequence",
                             paramValue=taskJobHistroyId))
            paramMap["task_job_id_sequence"] = taskJobHistroyId
        # if searchTaskJob!=None:
        #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
        #     paramMap[searchTaskJob.name] = searchTaskJob.name
        # self.taskJobParamList = []
        # if self.taskJobHistoryId!=None:
        #     self.taskJobParamList=CacheFactory.get("task_job_param", self.taskJobHistoryId)
        # if self.taskJobParamList!=None:
        #     for taskJobParam in self.taskJobParamList:
        #         paramMap[taskJobParam.paramNameEn]=taskJobParam.paramValue
        tableName = jobTemplate.tableName
        jobTemplateId = jobTemplate.id
        databaseId = jobTemplate.databaseId if (
            jobTemplate.databaseId != "-1"
            and jobTemplate.databaseId != None) else self.taskJob.databaseId
        db = self.dbclient.getConnection(databaseId)

        if db == None:
            logging.warning('db is null,please check it with databaseid :%s' %
                            databaseId)
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no db")
            return
        sqlArray = []
        if data == None or len(data) == 0:
            logging.warning(
                'insert data not exist,please retry crawler or check template or check error'
            )
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no data")
            return
        logging.info('----pipelines insert data-----%s' % str(data))
        for d in data:
            d["task_job_url"] = url
            if self.dataParentId != None:
                d["parent_id"] = self.dataParentId
            d["id"] = str(uuid.uuid1())
            if self.dbclient.db_type == 'kafka':
                d['TemplateName'] = jobTemplate.name
                d['UrlStatus'] = 0
                d['Timestamps'] = int(time.time())
            if self.dbclient.db_type == 'hdfs' or self.dbclient.db_type == 'mongodb':
                sqlArray.append(
                    db.insert(jobTemplate.id, tableName, d, paramMap))
            else:
                sqlArray.append(db.insert(tableName, d, paramMap))
            if jobTemplateId != None:
                try:
                    childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                        jobTemplateId)
                    self.loadNext(childJobTemplateList,
                                  dict(extraData.items() + d.items()))
                except Exception, e:
                    logging.error(e.message)
Example #9
def parseUrlAndInsertRedis(taskJob,
                           paramMap={},
                           taskJobParam=None,
                           taskJobHistory=None,
                           jobTemplate=None):
    if TaskType.DEPTH == str(taskJob.type):
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY) +
                "_" + taskJobHistory.id, stringify(taskJob))
    else:
        url = taskJob.url
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam != None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id)
        if jobTemplateParamList != None and len(jobTemplateParamList) > 0:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory != None:
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if taskJobParamList == None or len(taskJobParamList) <= 0:
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)

            # if bloomfilter_check(taskJob.id, renderUrl):
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            # try:
            #     request = urllib2.Request(
            #         url=url,
            #         headers={
            #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            #     )
            #     response = urllib2.urlopen(request)
            #     urldate = response.headers['date']
            # except Exception:
            #     pass
            #     print Exception
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            # setattr(newJobTemplate, "urldate", urldate)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # if (hashswitch):
            #     tempList.append(stringify(newJobTemplate))
            # else:
            # mainId.append(stringify(newJobTemplate))
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                paramMap = dict(paramMap.items() + data.items())
                renderUrl = RenderUtils.render(url, paramMap)
                # if bloomfilter_check(taskJob.id, renderUrl):
                newJobTemplate = ClassCopy.copyToNewInstances(
                    jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(
                    url=renderUrl,
                    parentUrl=paramMap.get("task_job_url"),
                    jobTemplateId=jobTemplate.id,
                    jobTemplateParentId=jobTemplate.parentId,
                    taskJobId=taskJob.id,
                    taskJobHistoryId=taskJobHistoryId)
                # try:
                #     request = urllib2.Request(
                #         url=url,
                #         headers={
                #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                #         }
                #     )
                #     response = urllib2.urlopen(request)
                #     urldate = response.headers['date']
                # except Exception:
                #     pass
                #     print Exception
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                # setattr(newJobTemplate, "urldate", urldate)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                # if (hashswitch):
                #     tempList.append(newJobTemplate)
                # else:
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" +
                    taskJobHistoryId, stringify(newJobTemplate))
                # mainId.append(stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
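parseUrlAndInsertRedis fans each rendered url out across three Redis structures: a list of taskJobHistory ids under the assist key, a per-history list of serialized job templates under assistKey_historyId, and a hash of finished templates under the finish key, which checkFinishJob in Example #1 later scans. A minimal sketch of that layout with plain redis-py calls, assuming RedisUtils is a thin wrapper around a redis client; the literal key names are placeholders for the configured properties, and new_job_template is simplified to a dict here:

import json
import redis

r = redis.StrictRedis()
# Placeholders for ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
# and ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY).
assist_key = "assist_spider"
finish_key = "finish_spider"

def enqueue(task_job_history_id, new_job_template):
    payload = json.dumps(new_job_template)                     # stringify(newJobTemplate)
    r.lpush(assist_key, task_job_history_id)                   # queue of history ids
    r.lpush(assist_key + "_" + task_job_history_id, payload)   # per-history queue of templates
    r.hset(finish_key, new_job_template["id"], payload)        # hash scanned by checkFinishJob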
Example #10
from utils import ClassCopy
from utils import ConfigUtils
from utils import RedisUtils
from utils import RenderUtils
from utils.ConfigUtils import KEYMAP
from utils.DBClient import DbClient
from utils.JsonUtils import stringify
from utils.RedisUtils import BloomFilter
from utils.HashFilter import YHash
from utils.RedisUtils import lpush
import urllib
import urllib2

bloomfilter = BloomFilter()

NodeListStr = ConfigUtils.getRedisPorperties(
    KEYMAP.DISTRIBUTED_SPIDER_NODE_LIST)
NodeList = NodeListStr.split(',')
hashConsistency = YHash(
    NodeList, int(ConfigUtils.getRedisPorperties(KEYMAP.VIRTUAL_NODE)))
switch = ConfigUtils.getRedisPorperties(KEYMAP.DISTRIBUTED_SPIDER_SWITCH)
hashswitch = switch == str(True)
tempList = []
nodePool = []
mainId = []


def paraseJobTemplateList(jobTemplateParamList, paramMap, loopFlag=False):
    paramList = []
    length = len(jobTemplateParamList)
    newJobTemplateParamList = []
    if jobTemplateParamList != None and length > 0:
Example #11
class DepthSpider(RedisCallbackSpider):
    name = ConfigUtils.getSpiderPorperties(KEYMAP.DEPTH_SPIDER_NAME)
    custom_settings = ConfigUtils.getItems(KEYMAP.REDIS)
    custom_settings["ITEM_PIPELINES"] = {
        'engine.pipelines.CacheHtmlPipeline': 300
    }
    custom_settings["DOWNLOADER_MIDDLEWARES"] = {
        'engine.useragent.RotateUserAgentMiddleware': 1
    }
    # start_urls = ['http://mini.qq.com/']
    custom_settings = dict(custom_settings.items() +
                           ConfigUtils.getItems(KEYMAP.MYSQL).items())
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY)
    id = ""
    allowed_domain = None  # domains allowed to be crawled
    rules = [Rule(LinkExtractor(allow=()), callback='parse', follow=True)]
    cur_url_depth = 1  # depth of the current url
    depth_limit = 3  # maximum crawl depth

    def beforeStartUrl(self, dataDict):
        if (dataDict == None):
            return dataDict
        id = dataDict.get("id")
        if id == None:
            return
        status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + id)
        taskJobHistoryId = dataDict.get("taskJobHistoryId")
        if taskJobHistoryId:
            taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
                taskJobHistoryId)
            if taskJobHistory:
                taskJobId = taskJobHistory.taskJobId
                self.taskJob = TaskJobDao.loadTaskById(taskJobId)
                self.taskJobHistory = taskJobHistory
        url = dataDict["url"] if dataDict.has_key(
            "url") else "http://www.baidu.com"
        self.url = url
        if self.allowed_domain is None:
            self.allowed_domain = self.get_first_domain(self.get_domain(url))
        self.cur_url_depth = dataDict.get("curUrlDepth")
        self.depth_limit = dataDict.get("depthLimit") if dataDict.has_key(
            "depthLimit") else 3
        return url

    def get_domain(self, url):
        """
        获取url中的域名
        :param url: 
        :return: 
        """
        pattern = r'(?<=//).*?(?=/)'
        result = re.findall(pattern, url)
        if result and len(result) > 0:
            return result[0]
        else:
            pattern = r'(?<=//).*'
            result = re.findall(pattern, url)
            if result and len(result) > 0:
                return result[0]
            else:
                return None

    def get_first_domain(self, domain):
        """获取域名中的一级域名,比如www.baidu.com中的baidu.com"""
        pattern = r'(?<=\.).*'
        result = re.findall(pattern, domain)
        if result and len(result) > 0:
            return result[0]

    def parse(self, response):
        if response.body:
            urlListStatusId = response.meta["urlListStatusId"]
            if urlListStatusId:
                UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.SUCCESS)
            htmlItem = HtmlItem()
            htmlItem["url"] = response.url
            htmlItem["html"] = response.body
            subUrls = []
            URLgroup = LinkExtractor(allow=()).extract_links(response)
            if (self.cur_url_depth < self.depth_limit
                    and self.depth_limit != 0) or self.depth_limit == 0:
                for URL in URLgroup:
                    if self.is_domain_allowed(URL.url):
                        subUrls.append(URL.url)
            htmlItem["subUrls"] = subUrls
            # htmlItem["taskJob"]=self.taskJob
            # htmlItem["taskJobHistory"] = self.taskJobHistory
            htmlItem["curUrlDepth"] = self.cur_url_depth
            return htmlItem

    def is_domain_allowed(self, url):
        """
        判断当前url是否属于允许爬取的域名范围内
        :param url: 
        :return: 
        """
        logging.info("allowed_domain : " + self.allowed_domain)
        logging.info("url : " + url)
        if self.allowed_domain:
            cur_url_domain = self.get_domain(url)
            if cur_url_domain and self.allowed_domain in cur_url_domain:
                return True
            else:
                return False
        else:
            return True
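The get_domain and get_first_domain helpers above rely on lookaround regexes; a quick standalone illustration of what they are expected to return (the sample url is arbitrary):

import re

url = "http://www.baidu.com/s?wd=test"
domain = re.findall(r'(?<=//).*?(?=/)', url)[0]     # 'www.baidu.com'
first_level = re.findall(r'(?<=\.).*', domain)[0]   # 'baidu.com'
print domain, first_level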
Example #12
class AssistRedisSpider(MainRedisSpider):
    name = ConfigUtils.getSpiderPorperties(KEYMAP.ASSIST_SPIDER_NAME)
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)