Example #1
def checkFinishJob():
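    # For every finished job stored in Redis, re-fetch its page and, if the Date header changed,
    # update the stored template and push it back onto the assist-spider queues.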
    keys=RedisUtils.hkeys(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY))
    for key in keys :
        temp=RedisUtils.hget(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), key)
        newJobTemplate=json.loads(temp)
        url=newJobTemplate['url']
        try:
            request = urllib2.Request(
                url=url,
                headers=(random.choice(user_agent_list))
            )
            response = urllib2.urlopen(request)
            urldate = response.headers['date']
            tempDate= newJobTemplate['urldate']
            print urldate
            print tempDate
            if urldate == tempDate:
                pass
            else:
                newJobTemplate['urldate']=urldate

                taskJobHistoryId = newJobTemplate['taskJobHistoryId']
                taskJobHistory=Session.query(TaskJobHistory).filter(TaskJobHistory.id==taskJobHistoryId,TaskJobHistory.delFlag==False).order_by(" create_time desc").first()
                taskJob=Session.query(TaskJob).filter(TaskJob.id==taskJobHistory.taskJobId).first()
                LoggerDao.addTaskJobLogger(taskJob,LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate['id'],taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库",url=url,status=TaskStatus.RUNNING)

                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistoryId,stringify(newJobTemplate))
                RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate['id'],stringify(newJobTemplate))
        except Exception,e:
            pass
            print e
Example #2
def export(tablename, resultDict, title):
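    # Export the result rows to an .xls workbook under web/static/excel and return its download URL.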

    # Reset the cursor position
    # Fetch all results
    # Get the field names from MySQL
    workbook = xlwt.Workbook(encoding='utf8')
    sheet = workbook.add_sheet('table', cell_overwrite_ok=True)

    # Write the header fields
    for field in range(0, len(title)):
        sheet.write(0, field, title[field])

    # Fetch and write the data rows

    for row in range(1, len(resultDict) + 1):
        for col in range(0, len(title)):
            sheet.write(row, col, u'%s' % resultDict[row - 1][col])

    # path2= os.getcwd()
    ExcelFile = tablename + '_' + str(random.randint(100000, 999999)) + '.xls'

    workbook.save("web/static/excel/" + ExcelFile)

    ip = get_ip()

    path = ip + ":" + ConfigUtils.getWebPorperties(KEYMAP.PORT)

    return 'http://' + path + '/static/excel/' + ExcelFile
Example #3
 def loadNext(self, childJobTemplateList, item):
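     # With no child templates left, mark the task finished once the main spider queue drains;
     # otherwise queue every child template together with the parent record id.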
     if childJobTemplateList == None or len(childJobTemplateList) == 0:
         # pcInfo = Pcinfo()
         # pidList = pcInfo.getPidListByProcessName(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_NAME))
         # if pidList and len(pidList):
         #     RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + os.getpid(), 0)
         #     for pid in pidList:
         #         RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + pid, 0)
         # else:
         if llen(
                 ConfigUtils.getRedisPorperties(
                     KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
             if self.taskJob.status != TaskStatus.SUCCESS:
                 TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                TaskStatus.SUCCESS)
                 UrlDao.updateUrlStatusListByTaskJobHistoryId(
                     self.taskJobHistoryId,
                     status=UrlStatus.STOP,
                     desc="The task is over and no longer crawls on this URL"
                 )
         return
     for jobTemplate in childJobTemplateList:
         parentId = str(item.get("id"))
         taskJobParam = TaskJobParam(paramNameEn="dataParentId",
                                     paramValue=parentId)
         taskJobParamList = []
         taskJobParamList.append(taskJobParam)
         taskJobParamList.extend(self.taskJobParamList)
         CrawlerService.parseUrlAndInsertRedis(
             taskJob=self.taskJob,
             paramMap=item,
             taskJobParam=taskJobParamList,
             taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
             jobTemplate=jobTemplate)
Example #4
class RedisSpider(RedisCallbackSpider):
    name = 'RedisSpider'
    custom_settings = ConfigUtils.getItems("REDIS")
    redis_key = 'redisSpider:startId'

    def beforeStartUrl(self, data):
        if (data == None):
            return data
        dict = json.loads(data)
        return dict["url"] if dict.has_key("url") else "http://www.baidu.com"

    def parse(self, response):
        items = {}
        hxs = Selector(response)
        jobTemplateFieldList = query(JobTemplateField, type=1)
        for jobTemplateField in jobTemplateFieldList:
            fieldNameEn = jobTemplateField.fieldNameEn
            fieldValue = jobTemplateField.fieldValue
            node = hxs.xpath(fieldValue).extract()
            split = jobTemplateField.split if jobTemplateField.split != None else ""
            value = split.join(node)
            value = value if value != None else ""
            regExp = jobTemplateField.regExp
            if regExp != None and regExp != "":
                pattern = re.compile(regExp)
                matches = pattern.search(value.encode("utf8"))
                if matches != None and len(matches.groups()) > 0:
                    value = regExp.join(matches.groups())
                elif matches != None and len(matches.groups()) == 0:
                    value = matches.group()
            items[fieldNameEn] = value
        self.log('A response from %s just arrived!' % response.url)
        return items
Example #5
 def process_request(self, request, spider):
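     # Render the request with PhantomJS when the job template asks for it,
     # or run a form login first when needLogin is set.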
     driver = None
     logging.info("*************PhontomJsMiddleware*************")
     jobTemplateList = CacheFactory.get("job_template_by_url", request.url)
     if jobTemplateList != None and len(jobTemplateList) > 0:
         jobTemplate = jobTemplateList[0]
     else:
         jobTemplate = spider.jobTemplate
     # jobTemplate = spider.jobTemplate
     if jobTemplate:
         if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin:
             userName = jobTemplate.userName
             password = jobTemplate.passWord
             unameId = jobTemplate.unameElementId
             passwordId = jobTemplate.pwdElementId
             submitId = jobTemplate.submitElementId
             return self.login(request=request, username=userName, password=password, username_id=unameId,
                               password_id=passwordId, submit_id=submitId)
         if jobTemplate.phantomjsFlag and mutex.acquire():
             try:
                 logging.info("*************PhontomJsMiddleware*************" + request.url)
                 driver = webdriver.PhantomJS(executable_path=ConfigUtils.getSpiderPorperties(
                     ConfigUtils.KEYMAP.PHANTOMJS))  # executable_path='D:\\developTools\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'
                 capabilities = self.get_desired_capabilities(spider)
                 driver.start_session(capabilities)
                 driver.set_page_load_timeout(30)
                 driver.set_script_timeout(30)
                 driver.set_window_size(1000, 10000)  # make the window large to cope with sites that lazy-load content
                 driver.get(request.url)
                 time.sleep(int(jobTemplate.sleepTime))
                 body = driver.page_source
                 logging.info("PhantomJS is visiting " + request.url)
                 htmlResponse = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
                 # driver.close()
                 # driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
                 # driver.quit()
                 return htmlResponse
             except Exception, e:
                 urlListStatusId = request.meta.get("urlListStatusId")
                 if urlListStatusId:
                     UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(e))
                 logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
                 # try:
                 #     if driver!=None:
                 #         logging.exception("time out visiting==>%s,%s"%(request.url,str(e)))
                 #         # driver.close()
                 #         driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
                 #         driver.quit()
                 # except Exception,e:
                 #     logging.error("451e:" + str(e))
                 # return
             finally:
                 # try:
                 #     driver.close()
                 # except Exception, e:
                 #     logging.error("452e:" + str(e))
                 try:
Example #6
def parseResponse(errorCode,dict=None,flag=False):
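    # Serialize the error code and payload to JSON and wrap them in a Flask response, optionally with a CORS header.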
    result=stringify(parseResponseJson(errorCode,dict))
    if flag:
        return result
    rsp = make_response(result)
    allowCrossOrigin=eval(ConfigUtils.getWebPorperties("ALLOW_CROSS_ORIGIN"))
    if allowCrossOrigin:
        rsp.headers['Access-Control-Allow-Origin'] = '*'
    rsp.mimetype = 'application/json;charset=utf-8'
    return rsp
Example #7
 def process_exception(self, request, exception, spider):
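     # Record the url failure and mark the whole task finished once the main spider queue is empty.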
     urlListStatusId = request.meta.get("urlListStatusId")
     if urlListStatusId:
         UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
     if llen(ConfigUtils.getRedisPorperties(
             KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 and spider.taskJob.status != TaskStatus.SUCCESS:
         TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
         UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.jobTemplate.taskJobHistoryId, status=UrlStatus.STOP,
                                                      desc="The task is over and no longer crawls on this URL")
     logger.info("process_exception ProxyMiddleware")
     return None
Example #8
 def __init__(self):
     """
     init
     :return:
     """
     self.dbconfig = []
     self.settings = get_project_settings()
     self.dbparms = dict(
         # MYSQL_HOST="127.0.0.1"
         # MYSQL_DBNAME = "wyy"
         # MYSQL_USER = "******"
         # MYSQL_PASSWORD = "******"
         host=ConfigUtils.getMysqlPorperties(ConfigUtils.KEYMAP.MYSQL_HOST),
         dbname=ConfigUtils.getMysqlPorperties(
             ConfigUtils.KEYMAP.MYSQL_DBNAME),
         username=ConfigUtils.getMysqlPorperties(
             ConfigUtils.KEYMAP.MYSQL_USERNAME),
         password=ConfigUtils.getMysqlPorperties(
             ConfigUtils.KEYMAP.MYSQL_PASSWD),
         charset='utf8',
         cursorclass=MySQLdb.cursors.DictCursor,
     )
     self.db_type = None
Example #9
class MainRedisSpider(RedisCallbackSpider):  #
    name = ConfigUtils.getSpiderPorperties(KEYMAP.MAIN_SPIDER_NAME)
    custom_settings = ConfigUtils.getItems(KEYMAP.REDIS)
    custom_settings["ITEM_PIPELINES"] = {
        'engine.pipelines.DataBaseSavePipeline': 300
    }
    custom_settings["DOWNLOADER_MIDDLEWARES"] = {
        'engine.useragent.RotateUserAgentMiddleware': 1,
        'engine.middlewares.ProxyMiddleware': 2
    }
    custom_settings = dict(custom_settings.items() +
                           ConfigUtils.getItems(KEYMAP.MYSQL).items())
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)
    id = ""

    # start_urls=["https://www.baidu.com"]
    def beforeStartUrl(self, dataDict):
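        # Parse the queued dict into a TaskJob, cache its parameters and return the url to crawl.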
        if (dataDict == None):
            return dataDict
        try:
            self.taskJob = RequestUtils.parseResToClass(TaskJob, dataDict)
        except Exception, e:
            logging.error("TemplateRedisSpider[beforeStartUrl:error]:%s" % (e))
            return None
        self.params = dataDict
        id = dataDict.get("id")
        if id == None:
            return
        status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + id)
        # if status!=None and str(status)!=str(TaskTable.TaskStatus.RUNNING):
        #     return None;
        url = dataDict["url"] if dataDict.has_key(
            "url") else "http://www.baidu.com"
        self.url = url
        CacheFactory.cache("job_template_url", id, self.params)
        return url
Example #10
 def process_request(self, request, spider):
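     # Same flow as the PhantomJS middleware, but the page is rendered with a Chrome driver instead.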
     driver = None
     logging.info("*************ChromeMiddleware*************")
     jobTemplateList = CacheFactory.get("job_template_by_url", request.url)
     if jobTemplateList != None and len(jobTemplateList) > 0:
         jobTemplate = jobTemplateList[0]
     else:
         jobTemplate = spider.jobTemplate
     if jobTemplate:
         if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin:
             userName = jobTemplate.userName
             password = jobTemplate.passWord
             unameId = jobTemplate.unameElementId
             passwordId = jobTemplate.pwdElementId
             submitId = jobTemplate.submitElementId
             return self.login(request=request, username=userName, password=password, username_id=unameId,
                               password_id=passwordId, submit_id=submitId)
         if jobTemplate.chromeFlag and mutex.acquire():
             try:
                 driver = webdriver.Chrome(
                     executable_path=ConfigUtils.getSpiderPorperties(ConfigUtils.KEYMAP.CHROME))
                 capabilities = self.get_desired_capabilities(spider)
                 driver.start_session(capabilities)
                 driver.set_page_load_timeout(30)
                 driver.set_script_timeout(30)
                 driver.set_window_size(1000, 10000)  # make the window large to cope with sites that lazy-load content
                 driver.get(request.url)
                 time.sleep(int(jobTemplate.sleepTime))
                 body = driver.page_source
                 logging.info("PhantomJS is visiting " + request.url)
                 htmlResponse = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
                 driver.quit()
                 return htmlResponse
             except Exception, e:
                 urlListStatusId = request.meta.get("urlListStatusId")
                 if urlListStatusId:
                     UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(e))
                 logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
             finally:
                 try:
Example #11
 def process_item(self, item, spider):
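     # Save the crawled html, queue each sub-url one depth level deeper,
     # and finish the task when the queue is empty and no sub-urls remain.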
     try:
         curUrl = item["url"]
         subUrls = item["subUrls"]
         taskJob = spider.taskJob
         self.save_to_hdfs(taskJob.id,taskJob.databaseId,item["html"])
         taskJobHistory = spider.taskJobHistory
         if subUrls and len(subUrls)>0:
             parentUrlDepth = item["curUrlDepth"]
             for url in subUrls:
                 newTaskJob = ClassCopy.copyToNewInstances(taskJob,TaskJob)
                 newTaskJob.url=url
                 newTaskJob.curUrlDepth=parentUrlDepth+1
                 newTaskJob.parentUrl = curUrl
                 CrawlerService.parseUrlAndInsertRedis(newTaskJob, taskJobHistory=taskJobHistory)
         else:
             if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                 if taskJob.status != TaskStatus.SUCCESS:
                     TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                     UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.taskJobHistory.id, status=UrlStatus.STOP,
                                                                  desc="depth spider is over")
         return item
     except Exception,e:
         logger.exception("CacheHtmlPipeline:"+str(e))
Example #12
#coding=utf-8
#Created by xutao on 2017-05-08.
import sys
import threading

from customize_app.publisher import WebSocket as websocket
from utils import ConfigUtils

reload(sys)
sys.setdefaultencoding('utf-8')

ip = ConfigUtils.getPorperties(ConfigUtils.KEYMAP.WEBSOCKET,
                               ConfigUtils.KEYMAP.WEBSOCKET_IP)
port = ConfigUtils.getPorperties(ConfigUtils.KEYMAP.WEBSOCKET,
                                 ConfigUtils.KEYMAP.WEBSOCKET_PORT)


def init():
    t = threading.Thread(target=websocket.start, args=(ip, int(port)))
    # t.start()
Example #13
#coding=utf-8
#Created by xutao on 2017-04-14.
import logging

from Settings import LoggerLevel
from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP
from web import start
from utils.LogUtils import LoggingFormatter, LoggingHandle
if __name__ == "__main__":
    # logging.basicConfig(level=LoggerLevel,
    #                     format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
    formater = LoggingFormatter(filename='startFlask')
    handle = LoggingHandle()
    handle.setLevel(LoggerLevel)
    handle.setFormatter(formater)
    logging.getLogger('').addHandler(handle)
    start(host=ConfigUtils.getWebPorperties(KEYMAP.HOST),
          port=int(ConfigUtils.getWebPorperties(KEYMAP.PORT)))
Example #14
def start():
    app.run(host=ConfigUtils.getWebPorperties("CONFIG_HOST"),
            port=5002,
            threaded=True)
Example #15
    def _do_upinsert(self, item):
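        # Write every crawled record to the configured storage backend (SQL, kafka, hdfs or mongodb)
        # and queue child templates for the next crawl level.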
        now = str(datetime.now())
        data = item["data"]
        url = item["url"]
        jobTemplateFieldList = item["jobTemplateFieldList"]
        jobTemplate = item["jobTemplate"]
        self.dataParentId = jobTemplate.dataParentId if hasattr(
            jobTemplate, "dataParentId") else None
        extraData = jobTemplate.extraData
        self.taskJob = item["taskJob"]
        # searchTaskJob = item["searchTaskJob"]
        taskJobHistroy = item["taskJobHistroy"]
        self.taskJobHistoryId = jobTemplate.taskJobHistoryId
        taskJobHistroyId = str(taskJobHistroy.id)
        paramMap = {}
        self.taskJobParamList = []
        if taskJobHistroy != None:
            self.taskJobParamList.append(
                TaskJobParam(paramNameEn="task_job_id_sequence",
                             paramValue=taskJobHistroyId))
            paramMap["task_job_id_sequence"] = taskJobHistroyId
        # if searchTaskJob!=None:
        #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
        #     paramMap[searchTaskJob.name] = searchTaskJob.name
        # self.taskJobParamList = []
        # if self.taskJobHistoryId!=None:
        #     self.taskJobParamList=CacheFactory.get("task_job_param", self.taskJobHistoryId)
        # if self.taskJobParamList!=None:
        #     for taskJobParam in self.taskJobParamList:
        #         paramMap[taskJobParam.paramNameEn]=taskJobParam.paramValue
        tableName = jobTemplate.tableName
        jobTemplateId = jobTemplate.id
        databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else self.taskJob.databaseId
        db = self.dbclient.getConnection(databaseId)

        if db == None:
            logging.warning('db is null,please check it with databaseid :%s' %
                            databaseId)
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no db")
            return
        sqlArray = []
        if data == None or len(data) == 0:
            logging.warning(
                'insert data not exist,please retry crawler or check template or check error'
            )
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no data")
            return
        logging.info('----pipelines insert data-----%s' % str(data))
        for d in data:
            d["task_job_url"] = url
            if self.dataParentId != None:
                d["parent_id"] = self.dataParentId
            d["id"] = str(uuid.uuid1())
            if self.dbclient.db_type == 'kafka':
                d['TemplateName'] = jobTemplate.name
                d['UrlStatus'] = 0
                d['Timestamps'] = int(time.time())
            if self.dbclient.db_type == 'hdfs' or self.dbclient.db_type == 'mongodb':
                sqlArray.append(
                    db.insert(jobTemplate.id, tableName, d, paramMap))
            else:
                sqlArray.append(db.insert(tableName, d, paramMap))
            if jobTemplateId != None:
                try:
                    childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                        jobTemplateId)
                    self.loadNext(childJobTemplateList,
                                  dict(extraData.items() + d.items()))
                except Exception, e:
                    logging.error(e.message)
Example #16
def parseUrlAndInsertRedis(taskJob,
                           paramMap={},
                           taskJobParam=None,
                           taskJobHistory=None,
                           jobTemplate=None):
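    # Depth tasks go straight onto the depth-spider queue; other tasks render the url with each
    # parameter set and push a job template copy onto the assist and finish queues.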
    if TaskType.DEPTH == str(taskJob.type):
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY) +
                "_" + taskJobHistory.id, stringify(taskJob))
    else:
        url = taskJob.url
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam != None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(
            jobTemplate.id)
        if jobTemplateParamList != None and len(jobTemplateParamList) > 0:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory != None:
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if taskJobParamList == None or len(taskJobParamList) <= 0:
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)

            # if bloomfilter_check(taskJob.id, renderUrl):
            newJobTemplate = ClassCopy.copyToNewInstances(
                jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            # try:
            #     request = urllib2.Request(
            #         url=url,
            #         headers={
            #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            #     )
            #     response = urllib2.urlopen(request)
            #     urldate = response.headers['date']
            # except Exception:
            #     pass
            #     print Exception
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            # setattr(newJobTemplate, "urldate", urldate)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob,
                                       LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # if (hashswitch):
            #     tempList.append(stringify(newJobTemplate))
            # else:
            # mainId.append(stringify(newJobTemplate))
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                + "_" + taskJobHistoryId, stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id, stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                paramMap = dict(paramMap.items() + data.items())
                renderUrl = RenderUtils.render(url, paramMap)
                # if bloomfilter_check(taskJob.id, renderUrl):
                newJobTemplate = ClassCopy.copyToNewInstances(
                    jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(
                    url=renderUrl,
                    parentUrl=paramMap.get("task_job_url"),
                    jobTemplateId=jobTemplate.id,
                    jobTemplateParentId=jobTemplate.parentId,
                    taskJobId=taskJob.id,
                    taskJobHistoryId=taskJobHistoryId)
                # try:
                #     request = urllib2.Request(
                #         url=url,
                #         headers={
                #             'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                #         }
                #     )
                #     response = urllib2.urlopen(request)
                #     urldate = response.headers['date']
                # except Exception:
                #     pass
                #     print Exception
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                # setattr(newJobTemplate, "urldate", urldate)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob,
                                           LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                # if (hashswitch):
                #     tempList.append(newJobTemplate)
                # else:
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY), taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" +
                    taskJobHistoryId, stringify(newJobTemplate))
                # mainId.append(stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.FINISH_SPIDER_REDIS_KEY), newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
Example #17
from utils import ClassCopy
from utils import ConfigUtils
from utils import RedisUtils
from utils import RenderUtils
from utils.ConfigUtils import KEYMAP
from utils.DBClient import DbClient
from utils.JsonUtils import stringify
from utils.RedisUtils import BloomFilter
from utils.HashFilter import YHash
from utils.RedisUtils import lpush
import urllib
import urllib2

bloomfilter = BloomFilter()

NodeListStr = ConfigUtils.getRedisPorperties(
    KEYMAP.DISTRIBUTED_SPIDER_NODE_LIST)
NodeList = NodeListStr.split(',')
hashConsistency = YHash(
    NodeList, int(ConfigUtils.getRedisPorperties(KEYMAP.VIRTUAL_NODE)))
switch = ConfigUtils.getRedisPorperties(KEYMAP.DISTRIBUTED_SPIDER_SWITCH)
hashswitch = switch == str(True)
tempList = []
nodePool = []
mainId = []


def paraseJobTemplateList(jobTemplateParamList, paramMap, loopFlag=False):
    paramList = []
    length = len(jobTemplateParamList)
    newJobTemplateParamList = []
    if jobTemplateParamList != None and length > 0:
Example #18
#coding=utf-8
import json
import os
from flask import request
from werkzeug.utils import secure_filename
from beans.SeriesEntity import JobTemplateSerialize
from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP
from utils.ResponseUtils import InvalidAPIUsage
from sqlalchemy.orm.attributes import InstrumentedAttribute
import urllib2
import logging
ALLOWED_EXTENSIONS = ConfigUtils.getPorperties(KEYMAP.UPLOAD,
                                               KEYMAP.ALLOW_FILES)
UPLOAD_FOLDER = ConfigUtils.getPorperties(KEYMAP.UPLOAD, KEYMAP.UPLOAD_FOLDER)


def loadParams():
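    # Read request parameters from the JSON body for POST requests, or from the query string otherwise.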
    params = {}
    try:
        if (request.method == "POST"):
            data = urllib2.unquote(str(request.data))  # avoid garbled Chinese parameters
            params = json.loads(data)
        else:
            return request.args
    except Exception, e:
        # logging.error('requst %s error --reason: %s' % (request.url,e.message))
        InvalidAPIUsage(e.message, 400)
    if not isinstance(params, dict):
        return {}
    return params
Example #19
    def next_requests(self):
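        # Pop a task-job-history id from Redis, skip paused jobs, and build a Request from the stored job template data.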
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop

        if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
            reactor.stop()
        """Returns a request to be scheduled or none."""

        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            redis_key = fetch_one(self.redis_key)
            taskJobHistoryId = redis_key
            if taskJobHistoryId != None:
                taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
                if taskJobHistory:
                    taskJobId = taskJobHistory.taskJobId
                    taskJob = TaskJobDao.loadTaskById(taskJobId)
                    if taskJob and taskJob.status == TaskStatus.PAUSE:
                        RedisUtils.lpush(
                            ConfigUtils.getRedisPorperties(
                                KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                            taskJobHistoryId)
                        break
                else:
                    break
            else:
                break
            if hashswitch:
                if str(localIP) != str(tjs.get_node(redis_key)):
                    RedisUtils.lpush(self.redis_key, redis_key)
                    return

            redis_key = self.redis_key + "_" + redis_key
            orginData = fetch_one(redis_key)
            data = None
            # data = fetch_one(self.redis_key)
            try:
                logging.info("orginData==" + orginData)
                orginData = json.loads(orginData)
                orginData["taskJobHistoryId"] = taskJobHistoryId
                data = self.beforeStartUrl(orginData)
            except Exception, e:
                logging.error("Error e:")
                logging.error(e)
                logging.error(orginData)
                break
            if not data:
                # Queue empty.
                logging.warning('********dataUrl is null*************')
                break
            try:
                req = self.make_request_from_data(data)
                # req.replace(meta={"id":"123"})
                req.meta["id"] = orginData.get("id")
                req.meta["dataParentId"] = orginData.get("dataParentId")
                req.meta["taskJobHistoryId"] = orginData.get(
                    "taskJobHistoryId")
                req.meta["url"] = orginData.get("url")
                urlListStatusId = req.meta["urlListStatusId"] = orginData.get(
                    "urlListStatusId")
            except Exception, e:
                logging.error("make_request_from_data:e:" + e)
                break
Example #20
#coding=utf-8
import logging
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
import sys

import Settings
from Settings import LoggerLevel, SqlLoggerLevel
from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP
reload(sys)
sys.setdefaultencoding('utf-8')
db_config = {
    'host': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_HOST),
    'user': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_USERNAME),
    'passwd': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_PASSWD),
    'db': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_DBNAME),
    'charset': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_ENCODE),
    'port': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_PORT)
}
baseEngine = create_engine(
    'mysql://%s:%s@%s:%s/%s?charset=%s' %
    (db_config['user'], db_config['passwd'], db_config['host'],
     db_config['port'], db_config['db'], db_config['charset']),
    pool_recycle=1800,
    echo=False,
    encoding='utf8',
    isolation_level="READ COMMITTED",
    convert_unicode=True)
logging.basicConfig(
    level=LoggerLevel,
Example #21
#coding=utf-8
#Created by xutao on 2017-04-21.
# Redis instance used for de-duplication data
import redis

from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP

dereplicationRedisConfig = ConfigUtils.getItems(KEYMAP.DEREPLICATION)
dereHost = dereplicationRedisConfig.get(KEYMAP.REDIS_HOST)
derePort = dereplicationRedisConfig.get(KEYMAP.REDIS_PORT)
dereNamespace = dereplicationRedisConfig.get(KEYMAP.REDIS_NAMESPACE)
derePrefrex = dereplicationRedisConfig.get(KEYMAP.REDIS_PREFIX)

# connection pool for the de-duplication Redis
derePool = redis.ConnectionPool(host=dereHost, port=int(derePort))
dereRedis = redis.Redis(connection_pool=derePool)


def lpush(key, value):
    return dereRedis.lpush(key, value)


def hset(namespace, key, value):
    return dereRedis.hset(namespace, key, value)


def hget(namespace, key):
    return dereRedis.hget(namespace, key)

Example #22
# coding=utf-8
from hashlib import md5

import redis

from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP
# regular Redis data
redisConfig = ConfigUtils.getItems(KEYMAP.URL_STATUS_REDIS)
port = redisConfig.get(KEYMAP.REDIS_PORT)
host = redisConfig.get(KEYMAP.REDIS_HOST)
namespace = redisConfig.get(KEYMAP.REDIS_NAMESPACE)
prefix = redisConfig.get(KEYMAP.REDIS_PREFIX)

# regular Redis connection pool
pool = redis.ConnectionPool(host=host, port=int(port))
r = redis.Redis(connection_pool=pool)


def llen(key):
    return r.llen(key)


def keys():
    return r.keys()


def lpush(key, value):
    return r.lpush(key, value)

Example #23
from beans.TaskTable import TaskStatus
from beans.UrlTable import UrlClazz, UrlStatus
from dao import TaskJobDao
from dao import UrlDao
from dao.TaskJobDao import loadTaskJobHistoryById
from utils import LicenseUtils

from utils import ConfigUtils
from utils import RedisUtils
from utils.ConfigUtils import KEYMAP
from utils.ExportExcelUtils import get_ip
from utils.HashFilter import YHash
from utils import RedisUtils

hashswitch = ConfigUtils.getRedisPorperties(
    KEYMAP.DISTRIBUTED_SPIDER_SWITCH) == str(True)
NodeList = ConfigUtils.getRedisPorperties(
    KEYMAP.DISTRIBUTED_SPIDER_NODE_LIST).split(',')
localIP = get_ip()
tjs = YHash(NodeList, int(ConfigUtils.getRedisPorperties(KEYMAP.VIRTUAL_NODE)))


class RedisCallbackSpider(RedisCrawlSpider):
    def beforeStartUrl(self, data):
        return data

    def next_requests(self):
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
Example #24
class DepthSpider(RedisCallbackSpider):
    name = ConfigUtils.getSpiderPorperties(KEYMAP.DEPTH_SPIDER_NAME)
    custom_settings = ConfigUtils.getItems(KEYMAP.REDIS)
    custom_settings["ITEM_PIPELINES"] = {
        'engine.pipelines.CacheHtmlPipeline': 300
    }
    custom_settings["DOWNLOADER_MIDDLEWARES"] = {
        'engine.useragent.RotateUserAgentMiddleware': 1
    }
    # start_urls = ['http://mini.qq.com/']
    custom_settings = dict(custom_settings.items() +
                           ConfigUtils.getItems(KEYMAP.MYSQL).items())
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY)
    id = ""
    allowed_domain = None  # domains allowed to be crawled
    rules = [Rule(LinkExtractor(allow=()), callback='parse', follow=True)]
    cur_url_depth = 1  # depth of the current url
    depth_limit = 3  # maximum crawl depth

    def beforeStartUrl(self, dataDict):
        if (dataDict == None):
            return dataDict
        id = dataDict.get("id")
        if id == None:
            return
        status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + id)
        taskJobHistoryId = dataDict.get("taskJobHistoryId")
        if taskJobHistoryId:
            taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
                taskJobHistoryId)
            if taskJobHistory:
                taskJobId = taskJobHistory.taskJobId
                self.taskJob = TaskJobDao.loadTaskById(taskJobId)
                self.taskJobHistory = taskJobHistory
        url = dataDict["url"] if dataDict.has_key(
            "url") else "http://www.baidu.com"
        self.url = url
        if self.allowed_domain is None:
            self.allowed_domain = self.get_first_domain(self.get_domain(url))
        self.cur_url_depth = dataDict.get("curUrlDepth")
        self.depth_limit = dataDict.get("depthLimit") if dataDict.has_key(
            "depthLimit") else 3
        return url

    def get_domain(self, url):
        """
        获取url中的域名
        :param url: 
        :return: 
        """
        pattern = r'(?<=//).*?(?=/)'
        result = re.findall(pattern, url)
        if result and len(result) > 0:
            return result[0]
        else:
            pattern = r'(?<=//).*'
            result = re.findall(pattern, url)
            if result and len(result) > 0:
                return result[0]
            else:
                return None

    def get_first_domain(self, domain):
        """获取域名中的一级域名,比如www.baidu.com中的baidu.com"""
        pattern = r'(?<=\.).*'
        result = re.findall(pattern, domain)
        if result and len(result) > 0:
            return result[0]

    def parse(self, response):
        if response.body:
            urlListStatusId = response.meta["urlListStatusId"]
            if urlListStatusId:
                UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.SUCCESS)
            htmlItem = HtmlItem()
            htmlItem["url"] = response.url
            htmlItem["html"] = response.body
            subUrls = []
            URLgroup = LinkExtractor(allow=()).extract_links(response)
            if (self.cur_url_depth < self.depth_limit
                    and self.depth_limit != 0) or self.depth_limit == 0:
                for URL in URLgroup:
                    if self.is_domain_allowed(URL.url):
                        subUrls.append(URL.url)
            htmlItem["subUrls"] = subUrls
            # htmlItem["taskJob"]=self.taskJob
            # htmlItem["taskJobHistory"] = self.taskJobHistory
            htmlItem["curUrlDepth"] = self.cur_url_depth
            return htmlItem

    def is_domain_allowed(self, url):
        """
        判断当前url是否属于允许爬取的域名范围内
        :param url: 
        :return: 
        """
        logging.info("allowed_domain : " + self.allowed_domain)
        logging.info("url : " + url)
        if self.allowed_domain:
            cur_url_domain = self.get_domain(url)
            if cur_url_domain and self.allowed_domain in cur_url_domain:
                return True
            else:
                return False
        else:
            return True
Example #25
# coding=utf-8
# Created by yupengcheng on 2017-08-02.
import logging
import socket
import os
import time
from kazoo.client import KazooClient, KazooState
from kazoo.exceptions import NodeExistsException
from Settings import KEYMAP
from utils import ConfigUtils

zookeeper_hosts = ConfigUtils.getZookeeperHosts(KEYMAP.ZOOKEEPER_HOSTS)
SPIDER_SERVER_NODE = '/spiderServer'


def get_ip():
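    # Determine the local outbound IP address by opening a UDP socket towards a public host.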
    try:
        csock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        csock.connect(('8.8.8.8', 80))
        (addr, port) = csock.getsockname()
        csock.close()
        return addr
    except socket.error:
        return "127.0.0.1"


def add_node_to_zookeeper():
    zk = KazooClient(hosts=zookeeper_hosts)

    @zk.add_listener
    def zookeeper_listener(state):
Example #26
class AssistRedisSpider(MainRedisSpider):
    name = ConfigUtils.getSpiderPorperties(KEYMAP.ASSIST_SPIDER_NAME)
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
    pass