def checkFinishJob():
    keys = RedisUtils.hkeys(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY))
    for key in keys:
        temp = RedisUtils.hget(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY), key)
        newJobTemplate = json.loads(temp)
        url = newJobTemplate['url']
        try:
            request = urllib2.Request(
                url=url,
                headers=(random.choice(user_agent_list))
            )
            response = urllib2.urlopen(request)
            urldate = response.headers['date']
            tempDate = newJobTemplate['urldate']
            print urldate
            print tempDate
            if urldate != tempDate:
                newJobTemplate['urldate'] = urldate
                taskJobHistoryId = newJobTemplate['taskJobHistoryId']
                taskJobHistory = Session.query(TaskJobHistory).filter(
                    TaskJobHistory.id == taskJobHistoryId,
                    TaskJobHistory.delFlag == False).order_by(" create_time desc").first()
                taskJob = Session.query(TaskJob).filter(TaskJob.id == taskJobHistory.taskJobId).first()
                LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate['id'],
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库",
                                           url=url,
                                           status=TaskStatus.RUNNING)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                                 taskJobHistoryId)
                RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)
                                 + "_" + taskJobHistoryId, stringify(newJobTemplate))
                RedisUtils.hset(ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                                newJobTemplate['id'], stringify(newJobTemplate))
        except Exception, e:
            print e
def export(tablename, resultDict, title):
    # Reset the cursor position
    # Fetch all results
    # Get the field names from MySQL
    workbook = xlwt.Workbook(encoding='utf8')
    sheet = workbook.add_sheet('table', cell_overwrite_ok=True)
    # Write the field (column) names into row 0.
    for field in range(0, len(title)):
        sheet.write(0, field, title[field])
    # Fetch and write the data rows.
    for row in range(1, len(resultDict) + 1):
        for col in range(0, len(title)):
            sheet.write(row, col, u'%s' % resultDict[row - 1][col])
    # path2 = os.getcwd()
    ExcelFile = tablename + '_' + str(random.randint(100000, 999999)) + '.xls'
    workbook.save("web/static/excel/" + ExcelFile)
    ip = get_ip()
    path = ip + ":" + ConfigUtils.getWebPorperties(KEYMAP.PORT)
    return 'http://' + path + '/static/excel/' + ExcelFile
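# Illustrative only (not part of the original source): a minimal sketch of calling
# export(); the table name, rows and column titles below are made-up sample values.
if __name__ == "__main__":
    sample_title = ['id', 'name']                  # column headers written into row 0
    sample_rows = [(1, u'Tom'), (2, u'Jerry')]     # one tuple per data row
    # Writes web/static/excel/user_table_<random>.xls and returns its download URL,
    # e.g. 'http://<host>:<port>/static/excel/user_table_123456.xls'
    print export('user_table', sample_rows, sample_title)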
def loadNext(self, childJobTemplateList, item):
    if childJobTemplateList == None or len(childJobTemplateList) == 0:
        # pcInfo = Pcinfo()
        # pidList = pcInfo.getPidListByProcessName(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_NAME))
        # if pidList and len(pidList):
        #     RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + os.getpid(), 0)
        #     for pid in pidList:
        #         RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + pid, 0)
        # else:
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId,
                    status=UrlStatus.STOP,
                    desc="The task is over and no longer crawls on this URL")
        return
    for jobTemplate in childJobTemplateList:
        parentId = str(item.get("id"))
        taskJobParam = TaskJobParam(paramNameEn="dataParentId", paramValue=parentId)
        taskJobParamList = [taskJobParam]
        taskJobParamList.extend(self.taskJobParamList)
        CrawlerService.parseUrlAndInsertRedis(
            taskJob=self.taskJob,
            paramMap=item,
            taskJobParam=taskJobParamList,
            taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
            jobTemplate=jobTemplate)
class RedisSpider(RedisCallbackSpider):
    name = 'RedisSpider'
    custom_settings = ConfigUtils.getItems("REDIS")
    redis_key = 'redisSpider:startId'

    def beforeStartUrl(self, data):
        if data == None:
            return data
        dict = json.loads(data)
        return dict["url"] if dict.has_key("url") else "http://www.baidu.com"

    def parse(self, response):
        items = {}  # field name -> extracted value
        hxs = Selector(response)
        jobTemplateFieldList = query(JobTemplateField, type=1)
        for jobTemplateField in jobTemplateFieldList:
            fieldNameEn = jobTemplateField.fieldNameEn
            fieldValue = jobTemplateField.fieldValue
            node = hxs.xpath(fieldValue).extract()
            split = jobTemplateField.split if jobTemplateField.split != None else ""
            value = split.join(node)
            value = value if value != None else ""
            regExp = jobTemplateField.regExp
            if regExp != None and regExp != "":
                pattern = re.compile(regExp)
                matches = pattern.search(value.encode("utf8"))
                if matches != None and len(matches.groups()) > 0:
                    value = regExp.join(matches.groups())
                elif matches != None and len(matches.groups()) == 0:
                    value = matches.group()
            items[fieldNameEn] = value
        self.log('A response from %s just arrived!' % response.url)
        return items
def process_request(self, request, spider):
    driver = None
    logging.info("*************PhontomJsMiddleware*************")
    jobTemplateList = CacheFactory.get("job_template_by_url", request.url)
    if jobTemplateList != None and len(jobTemplateList) > 0:
        jobTemplate = jobTemplateList[0]
    else:
        jobTemplate = spider.jobTemplate
    # jobTemplate = spider.jobTemplate
    if jobTemplate:
        if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin:
            userName = jobTemplate.userName
            password = jobTemplate.passWord
            unameId = jobTemplate.unameElementId
            passwordId = jobTemplate.pwdElementId
            submitId = jobTemplate.submitElementId
            return self.login(request=request, username=userName, password=password,
                              username_id=unameId, password_id=passwordId, submit_id=submitId)
        if jobTemplate.phantomjsFlag and mutex.acquire():
            try:
                logging.info("*************PhontomJsMiddleware*************" + request.url)
                driver = webdriver.PhantomJS(
                    executable_path=ConfigUtils.getSpiderPorperties(ConfigUtils.KEYMAP.PHANTOMJS))
                # executable_path='D:\\developTools\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'
                capabilities = self.get_desired_capabilities(spider)
                driver.start_session(capabilities)
                driver.set_page_load_timeout(30)
                driver.set_script_timeout(30)
                # Use a large window so that lazy-loaded content gets rendered.
                driver.set_window_size(1000, 10000)
                driver.get(request.url)
                time.sleep(int(jobTemplate.sleepTime))
                body = driver.page_source
                logging.info("PhantomJS is visiting " + request.url)
                htmlResponse = HtmlResponse(driver.current_url, body=body,
                                            encoding='utf-8', request=request)
                # driver.close()
                # driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
                # driver.quit()
                return htmlResponse
            except Exception, e:
                urlListStatusId = request.meta.get("urlListStatusId")
                if urlListStatusId:
                    UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(e))
                logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
                # try:
                #     if driver != None:
                #         logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
                #         # driver.close()
                #         driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
                #         driver.quit()
                # except Exception, e:
                #     logging.error("451e:" + str(e))
                # return
            finally:
                # try:
                #     driver.close()
                # except Exception, e:
                #     logging.error("452e:" + str(e))
                try:
def parseResponse(errorCode, dict=None, flag=False):
    result = stringify(parseResponseJson(errorCode, dict))
    if flag:
        return result
    rsp = make_response(result)
    allowCrossOrigin = eval(ConfigUtils.getWebPorperties("ALLOW_CROSS_ORIGIN"))
    if allowCrossOrigin:
        rsp.headers['Access-Control-Allow-Origin'] = '*'
    rsp.mimetype = 'application/json;charset=utf-8'
    return rsp
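# Illustrative only (not from the original source): a sketch of how a Flask view
# might return parseResponse(); the route name and payload are assumptions.
#
#   @app.route('/api/ping')
#   def ping():
#       # JSON Response object, with Access-Control-Allow-Origin when enabled in config
#       return parseResponse(0, {"msg": "ok"})
#       # parseResponse(0, {"msg": "ok"}, flag=True) would return the raw JSON string instead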
def process_exception(self, request, exception, spider):
    urlListStatusId = request.meta.get("urlListStatusId")
    if urlListStatusId:
        UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
    if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 \
            and spider.taskJob.status != TaskStatus.SUCCESS:
        TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
        UrlDao.updateUrlStatusListByTaskJobHistoryId(
            spider.jobTemplate.taskJobHistoryId,
            status=UrlStatus.STOP,
            desc="The task is over and no longer crawls on this URL")
    logger.info("process_exception ProxyMiddleware")
    return None
def __init__(self):
    """
    init
    :return:
    """
    self.dbconfig = []
    self.settings = get_project_settings()
    self.dbparms = dict(
        # MYSQL_HOST = "127.0.0.1"
        # MYSQL_DBNAME = "wyy"
        # MYSQL_USER = "******"
        # MYSQL_PASSWORD = "******"
        host=ConfigUtils.getMysqlPorperties(ConfigUtils.KEYMAP.MYSQL_HOST),
        dbname=ConfigUtils.getMysqlPorperties(ConfigUtils.KEYMAP.MYSQL_DBNAME),
        username=ConfigUtils.getMysqlPorperties(ConfigUtils.KEYMAP.MYSQL_USERNAME),
        password=ConfigUtils.getMysqlPorperties(ConfigUtils.KEYMAP.MYSQL_PASSWD),
        charset='utf8',
        cursorclass=MySQLdb.cursors.DictCursor,
    )
    self.db_type = None
class MainRedisSpider(RedisCallbackSpider):
    # name = ConfigUtils.getSpiderPorperties(KEYMAP.MAIN_SPIDER_NAME)
    custom_settings = ConfigUtils.getItems(KEYMAP.REDIS)
    custom_settings["ITEM_PIPELINES"] = {
        'engine.pipelines.DataBaseSavePipeline': 300
    }
    custom_settings["DOWNLOADER_MIDDLEWARES"] = {
        'engine.useragent.RotateUserAgentMiddleware': 1,
        'engine.middlewares.ProxyMiddleware': 2
    }
    custom_settings = dict(custom_settings.items() + ConfigUtils.getItems(KEYMAP.MYSQL).items())
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)
    id = ""
    # start_urls = ["https://www.baidu.com"]

    def beforeStartUrl(self, dataDict):
        if dataDict == None:
            return dataDict
        try:
            self.taskJob = RequestUtils.parseResToClass(TaskJob, dataDict)
        except Exception, e:
            logging.error("TemplateRedisSpider[beforeStartUrl:error]:%s" % (e))
            return None
        self.params = dataDict
        id = dataDict.get("id")
        if id == None:
            return
        status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + id)
        # if status != None and str(status) != str(TaskTable.TaskStatus.RUNNING):
        #     return None
        url = dataDict["url"] if dataDict.has_key("url") else "http://www.baidu.com"
        self.url = url
        CacheFactory.cache("job_template_url", id, self.params)
        return url
def process_request(self, request, spider):
    driver = None
    logging.info("*************ChromeMiddleware*************")
    jobTemplateList = CacheFactory.get("job_template_by_url", request.url)
    if jobTemplateList != None and len(jobTemplateList) > 0:
        jobTemplate = jobTemplateList[0]
    else:
        jobTemplate = spider.jobTemplate
    if jobTemplate:
        if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin:
            userName = jobTemplate.userName
            password = jobTemplate.passWord
            unameId = jobTemplate.unameElementId
            passwordId = jobTemplate.pwdElementId
            submitId = jobTemplate.submitElementId
            return self.login(request=request, username=userName, password=password,
                              username_id=unameId, password_id=passwordId, submit_id=submitId)
        if jobTemplate.chromeFlag and mutex.acquire():
            try:
                driver = webdriver.Chrome(
                    executable_path=ConfigUtils.getSpiderPorperties(ConfigUtils.KEYMAP.CHROME))
                capabilities = self.get_desired_capabilities(spider)
                driver.start_session(capabilities)
                driver.set_page_load_timeout(30)
                driver.set_script_timeout(30)
                # Use a large window so that lazy-loaded content gets rendered.
                driver.set_window_size(1000, 10000)
                driver.get(request.url)
                time.sleep(int(jobTemplate.sleepTime))
                body = driver.page_source
                logging.info("Chrome is visiting " + request.url)
                htmlResponse = HtmlResponse(driver.current_url, body=body,
                                            encoding='utf-8', request=request)
                driver.quit()
                return htmlResponse
            except Exception, e:
                urlListStatusId = request.meta.get("urlListStatusId")
                if urlListStatusId:
                    UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(e))
                logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
            finally:
                try:
def process_item(self, item, spider):
    try:
        curUrl = item["url"]
        subUrls = item["subUrls"]
        taskJob = spider.taskJob
        self.save_to_hdfs(taskJob.id, taskJob.databaseId, item["html"])
        taskJobHistory = spider.taskJobHistory
        if subUrls and len(subUrls) > 0:
            parentUrlDepth = item["curUrlDepth"]
            for url in subUrls:
                newTaskJob = ClassCopy.copyToNewInstances(taskJob, TaskJob)
                newTaskJob.url = url
                newTaskJob.curUrlDepth = parentUrlDepth + 1
                newTaskJob.parentUrl = curUrl
                CrawlerService.parseUrlAndInsertRedis(newTaskJob, taskJobHistory=taskJobHistory)
        else:
            if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        spider.taskJobHistory.id,
                        status=UrlStatus.STOP,
                        desc="depth spider is over")
        return item
    except Exception, e:
        logger.exception("CacheHtmlPipeline:" + str(e))
#coding=utf-8
#Created by xutao on 2017-05-08.
import sys
import threading

from customize_app.publisher import WebSocket as websocket
from utils import ConfigUtils

reload(sys)
sys.setdefaultencoding('utf-8')

ip = ConfigUtils.getPorperties(ConfigUtils.KEYMAP.WEBSOCKET, ConfigUtils.KEYMAP.WEBSOCKET_IP)
port = ConfigUtils.getPorperties(ConfigUtils.KEYMAP.WEBSOCKET, ConfigUtils.KEYMAP.WEBSOCKET_PORT)


def init():
    t = threading.Thread(target=websocket.start, args=(ip, int(port)))
    # t.start()
#coding=utf-8
#Created by xutao on 2017-04-14.
import logging

from Settings import LoggerLevel
from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP
from web import start
from utils.LogUtils import LoggingFormatter, LoggingHandle

if __name__ == "__main__":
    # logging.basicConfig(level=LoggerLevel,
    #                     format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
    formater = LoggingFormatter(filename='startFlask')
    handle = LoggingHandle()
    handle.setLevel(LoggerLevel)
    handle.setFormatter(formater)
    logging.getLogger('').addHandler(handle)
    start(host=ConfigUtils.getWebPorperties(KEYMAP.HOST),
          port=int(ConfigUtils.getWebPorperties(KEYMAP.PORT)))
def start():
    app.run(host=ConfigUtils.getWebPorperties("CONFIG_HOST"), port=5002, threaded=True)
def _do_upinsert(self, item):
    now = str(datetime.now())
    data = item["data"]
    url = item["url"]
    jobTemplateFieldList = item["jobTemplateFieldList"]
    jobTemplate = item["jobTemplate"]
    self.dataParentId = jobTemplate.dataParentId if hasattr(jobTemplate, "dataParentId") else None
    extraData = jobTemplate.extraData
    self.taskJob = item["taskJob"]
    # searchTaskJob = item["searchTaskJob"]
    taskJobHistroy = item["taskJobHistroy"]
    self.taskJobHistoryId = jobTemplate.taskJobHistoryId
    taskJobHistroyId = str(taskJobHistroy.id)
    paramMap = {}
    self.taskJobParamList = []
    if taskJobHistroy != None:
        self.taskJobParamList.append(
            TaskJobParam(paramNameEn="task_job_id_sequence", paramValue=taskJobHistroyId))
        paramMap["task_job_id_sequence"] = taskJobHistroyId
    # if searchTaskJob != None:
    #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
    #     paramMap[searchTaskJob.name] = searchTaskJob.name
    # self.taskJobParamList = []
    # if self.taskJobHistoryId != None:
    #     self.taskJobParamList = CacheFactory.get("task_job_param", self.taskJobHistoryId)
    # if self.taskJobParamList != None:
    #     for taskJobParam in self.taskJobParamList:
    #         paramMap[taskJobParam.paramNameEn] = taskJobParam.paramValue
    tableName = jobTemplate.tableName
    jobTemplateId = jobTemplate.id
    databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else self.taskJob.databaseId
    db = self.dbclient.getConnection(databaseId)
    if db == None:
        logging.warning('db is null,please check it with databaseid :%s' % databaseId)
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId, status=UrlStatus.STOP, desc="no db")
        return
    sqlArray = []
    if data == None or len(data) == 0:
        logging.warning('insert data not exist,please retry crawler or check template or check error')
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId, status=UrlStatus.STOP, desc="no data")
        return
    logging.info('----pipelines insert data-----%s' % str(data))
    for d in data:
        d["task_job_url"] = url
        if self.dataParentId != None:
            d["parent_id"] = self.dataParentId
        d["id"] = str(uuid.uuid1())
        if self.dbclient.db_type == 'kafka':
            d['TemplateName'] = jobTemplate.name
            d['UrlStatus'] = 0
            d['Timestamps'] = int(time.time())
        if self.dbclient.db_type == 'hdfs' or self.dbclient.db_type == 'mongodb':
            sqlArray.append(db.insert(jobTemplate.id, tableName, d, paramMap))
        else:
            sqlArray.append(db.insert(tableName, d, paramMap))
        if jobTemplateId != None:
            try:
                childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(jobTemplateId)
                self.loadNext(childJobTemplateList, dict(extraData.items() + d.items()))
            except Exception, e:
                logging.error(e.message)
def parseUrlAndInsertRedis(taskJob, paramMap={}, taskJobParam=None, taskJobHistory=None, jobTemplate=None):
    if TaskType.DEPTH == str(taskJob.type):
        if bloomfilter_check(taskJob.id, taskJob.url):
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY),
                taskJobHistory.id)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY) + "_" + taskJobHistory.id,
                stringify(taskJob))
    else:
        url = taskJob.url
        taskJobParamList = TaskJobDao.queryTaskJobParam(taskJob.id)
        if taskJobParam != None:
            if isinstance(taskJobParam, list):
                taskJobParamList.extend(taskJobParam)
            else:
                taskJobParamList.append(taskJobParam)
        jobTemplateParamList = TemplateDao.queryJobTemplateParamByJobTemplateId(jobTemplate.id)
        if jobTemplateParamList != None and len(jobTemplateParamList) > 0:
            taskJobParamList.extend(jobTemplateParamList)
        if taskJobHistory != None:
            jobTemplateParamTaskJob = JobTemplateParam(
                paramNameEn="task_job_id_sequence",
                paramValue=str(taskJobHistory.id))
            jobTemplateParamList.append(jobTemplateParamTaskJob)
        if taskJobParamList == None or len(taskJobParamList) <= 0:
            if str(taskJob.type) == TaskType.BATCH:
                url = jobTemplate.url
            renderUrl = RenderUtils.render(url, paramMap)
            # if bloomfilter_check(taskJob.id, renderUrl):
            newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
            taskJobHistoryId = taskJobHistory.id
            urlListStatus = UrlClazz(url=jobTemplate.url,
                                     parentUrl=paramMap.get("task_job_url"),
                                     jobTemplateId=jobTemplate.id,
                                     jobTemplateParentId=jobTemplate.parentId,
                                     taskJobId=taskJob.id,
                                     taskJobHistoryId=taskJobHistoryId)
            # try:
            #     request = urllib2.Request(
            #         url=url,
            #         headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'})
            #     response = urllib2.urlopen(request)
            #     urldate = response.headers['date']
            # except Exception:
            #     pass
            #     print Exception
            setattr(newJobTemplate, "taskJobId", taskJob.id)
            setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
            setattr(newJobTemplate, "url", renderUrl)
            setattr(newJobTemplate, "extraData", paramMap)
            # setattr(newJobTemplate, "urldate", urldate)
            setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
            LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.URL_TO_REDIS,
                                       jobTemplateId=newJobTemplate.id,
                                       taskJobHistoryId=taskJobHistoryId,
                                       content=u"redis_入库",
                                       url=renderUrl,
                                       status=TaskStatus.RUNNING)
            # if hashswitch:
            #     tempList.append(stringify(newJobTemplate))
            # else:
            #     mainId.append(stringify(newJobTemplate))
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                taskJobHistoryId)
            RedisUtils.lpush(
                ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistoryId,
                stringify(newJobTemplate))
            RedisUtils.hset(
                ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                newJobTemplate.id,
                stringify(newJobTemplate))
            saveUrlListStatus(urlListStatus)
        else:
            for data in paraseJobTemplateList(taskJobParamList, paramMap):
                if str(taskJob.type) == TaskType.BATCH:
                    url = jobTemplate.url
                parentId = paramMap.get("dataParentId")
                paramMap = dict(paramMap.items() + data.items())
                renderUrl = RenderUtils.render(url, paramMap)
                # if bloomfilter_check(taskJob.id, renderUrl):
                newJobTemplate = ClassCopy.copyToNewInstances(jobTemplate, JobTemplate)
                taskJobHistoryId = taskJobHistory.id
                urlListStatus = UrlClazz(url=renderUrl,
                                         parentUrl=paramMap.get("task_job_url"),
                                         jobTemplateId=jobTemplate.id,
                                         jobTemplateParentId=jobTemplate.parentId,
                                         taskJobId=taskJob.id,
                                         taskJobHistoryId=taskJobHistoryId)
                # try:
                #     request = urllib2.Request(
                #         url=url,
                #         headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'})
                #     response = urllib2.urlopen(request)
                #     urldate = response.headers['date']
                # except Exception:
                #     pass
                #     print Exception
                setattr(newJobTemplate, "taskJobId", taskJob.id)
                setattr(newJobTemplate, "taskJobHistoryId", taskJobHistoryId)
                setattr(newJobTemplate, "url", renderUrl)
                setattr(newJobTemplate, "dataParentId", parentId)
                setattr(newJobTemplate, "extraData", paramMap)
                # setattr(newJobTemplate, "urldate", urldate)
                setattr(newJobTemplate, "urlListStatusId", urlListStatus.id)
                LoggerDao.addTaskJobLogger(taskJob, LoggerDao.LoggerType.URL_TO_REDIS,
                                           jobTemplateId=newJobTemplate.id,
                                           taskJobHistoryId=taskJobHistoryId,
                                           content=u"redis_入库_多参数",
                                           url=renderUrl,
                                           status=TaskStatus.RUNNING)
                # if hashswitch:
                #     tempList.append(newJobTemplate)
                # else:
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                    taskJobHistoryId)
                RedisUtils.lpush(
                    ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY) + "_" + taskJobHistoryId,
                    stringify(newJobTemplate))
                # mainId.append(stringify(newJobTemplate))
                RedisUtils.hset(
                    ConfigUtils.getRedisPorperties(KEYMAP.FINISH_SPIDER_REDIS_KEY),
                    newJobTemplate.id,
                    stringify(newJobTemplate))
                saveUrlListStatus(urlListStatus)
from utils import ClassCopy
from utils import ConfigUtils
from utils import RedisUtils
from utils import RenderUtils
from utils.ConfigUtils import KEYMAP
from utils.DBClient import DbClient
from utils.JsonUtils import stringify
from utils.RedisUtils import BloomFilter
from utils.HashFilter import YHash
from utils.RedisUtils import lpush
import urllib
import urllib2

bloomfilter = BloomFilter()
NodeListStr = ConfigUtils.getRedisPorperties(KEYMAP.DISTRIBUTED_SPIDER_NODE_LIST)
NodeList = NodeListStr.split(',')
hashConsistency = YHash(NodeList, int(ConfigUtils.getRedisPorperties(KEYMAP.VIRTUAL_NODE)))
switch = ConfigUtils.getRedisPorperties(KEYMAP.DISTRIBUTED_SPIDER_SWITCH)
hashswitch = switch == str(True)
tempList = []
nodePool = []
mainId = []


def paraseJobTemplateList(jobTemplateParamList, paramMap, loopFlag=False):
    paramList = []
    length = len(jobTemplateParamList)
    newJobTemplateParamList = []
    if jobTemplateParamList != None and length > 0:
#coding=utf-8
import json
import os

from flask import request
from werkzeug.utils import secure_filename

from beans.SeriesEntity import JobTemplateSerialize
from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP
from utils.ResponseUtils import InvalidAPIUsage
from sqlalchemy.orm.attributes import InstrumentedAttribute
import urllib2
import logging

ALLOWED_EXTENSIONS = ConfigUtils.getPorperties(KEYMAP.UPLOAD, KEYMAP.ALLOW_FILES)
UPLOAD_FOLDER = ConfigUtils.getPorperties(KEYMAP.UPLOAD, KEYMAP.UPLOAD_FOLDER)


def loadParams():
    params = {}
    try:
        if request.method == "POST":
            # Unquote the raw body to avoid garbled Chinese characters in the parameters.
            data = urllib2.unquote(str(request.data))
            params = json.loads(data)
        else:
            return request.args
    except Exception, e:
        # logging.error('requst %s error --reason: %s' % (request.url, e.message))
        InvalidAPIUsage(e.message, 400)
    if not isinstance(params, dict):
        return {}
    return params
def next_requests(self):
    use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
    fetch_one = self.server.spop if use_set else self.server.lpop
    if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
        reactor.stop()
    """Returns a request to be scheduled or none."""
    # XXX: Do we need to use a timeout here?
    found = 0
    while found < self.redis_batch_size:
        redis_key = fetch_one(self.redis_key)
        taskJobHistoryId = redis_key
        if taskJobHistoryId != None:
            taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
            if taskJobHistory:
                taskJobId = taskJobHistory.taskJobId
                taskJob = TaskJobDao.loadTaskById(taskJobId)
                if taskJob and taskJob.status == TaskStatus.PAUSE:
                    RedisUtils.lpush(
                        ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                        taskJobHistoryId)
                    break
            else:
                break
        else:
            break
        if hashswitch:
            if str(localIP) != str(tjs.get_node(redis_key)):
                RedisUtils.lpush(self.redis_key, redis_key)
                return
        redis_key = self.redis_key + "_" + redis_key
        orginData = fetch_one(redis_key)
        data = None
        # data = fetch_one(self.redis_key)
        try:
            logging.info("orginData==" + orginData)
            orginData = json.loads(orginData)
            orginData["taskJobHistoryId"] = taskJobHistoryId
            data = self.beforeStartUrl(orginData)
        except Exception, e:
            logging.error("Error e:")
            logging.error(e)
            logging.error(orginData)
            break
        if not data:
            # Queue empty.
            logging.warning('********dataUrl is null*************')
            break
        try:
            req = self.make_request_from_data(data)
            # req.replace(meta={"id": "123"})
            req.meta["id"] = orginData.get("id")
            req.meta["dataParentId"] = orginData.get("dataParentId")
            req.meta["taskJobHistoryId"] = orginData.get("taskJobHistoryId")
            req.meta["url"] = orginData.get("url")
            urlListStatusId = req.meta["urlListStatusId"] = orginData.get("urlListStatusId")
        except Exception, e:
            logging.error("make_request_from_data:e:" + str(e))
            break
#coding=utf-8
import logging
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
import sys

import Settings
from Settings import LoggerLevel, SqlLoggerLevel
from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP

reload(sys)
sys.setdefaultencoding('utf-8')

db_config = {
    'host': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_HOST),
    'user': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_USERNAME),
    'passwd': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_PASSWD),
    'db': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_DBNAME),
    'charset': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_ENCODE),
    'port': ConfigUtils.getMysqlPorperties(KEYMAP.MYSQL_PORT)
}

baseEngine = create_engine(
    'mysql://%s:%s@%s:%s/%s?charset=%s' %
    (db_config['user'], db_config['passwd'], db_config['host'],
     db_config['port'], db_config['db'], db_config['charset']),
    pool_recycle=1800,
    echo=False,
    encoding='utf8',
    isolation_level="READ COMMITTED",
    convert_unicode=True)

logging.basicConfig(
    level=LoggerLevel,
#coding=utf-8
#Created by xutao on 2017-04-21.
#Redis connection used for de-duplicating crawl data.
import redis

from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP

dereplicationRedisConfig = ConfigUtils.getItems(KEYMAP.DEREPLICATION)
dereHost = dereplicationRedisConfig.get(KEYMAP.REDIS_HOST)
derePort = dereplicationRedisConfig.get(KEYMAP.REDIS_PORT)
dereNamespace = dereplicationRedisConfig.get(KEYMAP.REDIS_NAMESPACE)
derePrefrex = dereplicationRedisConfig.get(KEYMAP.REDIS_PREFIX)

#Connection pool for the de-duplication Redis instance.
derePool = redis.ConnectionPool(host=dereHost, port=int(derePort))
dereRedis = redis.Redis(connection_pool=derePool)


def lpush(key, value):
    return dereRedis.lpush(key, value)


def hset(namespace, key, value):
    return dereRedis.hset(namespace, key, value)


def hget(namespace, key):
    return dereRedis.hget(namespace, key)
# coding=utf-8
from hashlib import md5

import redis

from utils import ConfigUtils
from utils.ConfigUtils import KEYMAP

# Ordinary (non de-duplication) Redis connection.
redisConfig = ConfigUtils.getItems(KEYMAP.URL_STATUS_REDIS)
port = redisConfig.get(KEYMAP.REDIS_PORT)
host = redisConfig.get(KEYMAP.REDIS_HOST)
namespace = redisConfig.get(KEYMAP.REDIS_NAMESPACE)
prefix = redisConfig.get(KEYMAP.REDIS_PREFIX)

# Ordinary Redis connection pool.
pool = redis.ConnectionPool(host=host, port=int(port))
r = redis.Redis(connection_pool=pool)


def llen(key):
    return r.llen(key)


def keys():
    return r.keys()


def lpush(key, value):
    return r.lpush(key, value)
from beans.TaskTable import TaskStatus
from beans.UrlTable import UrlClazz, UrlStatus
from dao import TaskJobDao
from dao import UrlDao
from dao.TaskJobDao import loadTaskJobHistoryById
from utils import LicenseUtils
from utils import ConfigUtils
from utils import RedisUtils
from utils.ConfigUtils import KEYMAP
from utils.ExportExcelUtils import get_ip
from utils.HashFilter import YHash

hashswitch = ConfigUtils.getRedisPorperties(KEYMAP.DISTRIBUTED_SPIDER_SWITCH) == str(True)
NodeList = ConfigUtils.getRedisPorperties(KEYMAP.DISTRIBUTED_SPIDER_NODE_LIST).split(',')
localIP = get_ip()
tjs = YHash(NodeList, int(ConfigUtils.getRedisPorperties(KEYMAP.VIRTUAL_NODE)))


class RedisCallbackSpider(RedisCrawlSpider):
    def beforeStartUrl(self, data):
        return data

    def next_requests(self):
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
class DepthSpider(RedisCallbackSpider):
    name = ConfigUtils.getSpiderPorperties(KEYMAP.DEPTH_SPIDER_NAME)
    custom_settings = ConfigUtils.getItems(KEYMAP.REDIS)
    custom_settings["ITEM_PIPELINES"] = {
        'engine.pipelines.CacheHtmlPipeline': 300
    }
    custom_settings["DOWNLOADER_MIDDLEWARES"] = {
        'engine.useragent.RotateUserAgentMiddleware': 1
    }
    # start_urls = ['http://mini.qq.com/']
    custom_settings = dict(custom_settings.items() + ConfigUtils.getItems(KEYMAP.MYSQL).items())
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.DEPTH_SPIDER_REDIS_KEY)
    id = ""
    allowed_domain = None  # domain the spider is allowed to crawl
    rules = [Rule(LinkExtractor(allow=()), callback='parse', follow=True)]
    cur_url_depth = 1  # depth of the current url
    depth_limit = 3  # maximum crawl depth

    def beforeStartUrl(self, dataDict):
        if dataDict == None:
            return dataDict
        id = dataDict.get("id")
        if id == None:
            return
        status = RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + id)
        taskJobHistoryId = dataDict.get("taskJobHistoryId")
        if taskJobHistoryId:
            taskJobHistory = TaskJobDao.loadTaskJobHistoryById(taskJobHistoryId)
            if taskJobHistory:
                taskJobId = taskJobHistory.taskJobId
                self.taskJob = TaskJobDao.loadTaskById(taskJobId)
                self.taskJobHistory = taskJobHistory
        url = dataDict["url"] if dataDict.has_key("url") else "http://www.baidu.com"
        self.url = url
        if self.allowed_domain is None:
            self.allowed_domain = self.get_first_domain(self.get_domain(url))
        self.cur_url_depth = dataDict.get("curUrlDepth")
        self.depth_limit = dataDict.get("depthLimit") if dataDict.has_key("depthLimit") else 3
        return url

    def get_domain(self, url):
        """
        Extract the domain part of a url.
        :param url:
        :return:
        """
        pattern = r'(?<=//).*?(?=/)'
        result = re.findall(pattern, url)
        if result and len(result) > 0:
            return result[0]
        else:
            pattern = r'(?<=//).*'
            result = re.findall(pattern, url)
            if result and len(result) > 0:
                return result[0]
            else:
                return None

    def get_first_domain(self, domain):
        """Get the first-level domain of a host name, e.g. baidu.com from www.baidu.com."""
        pattern = r'(?<=\.).*'
        result = re.findall(pattern, domain)
        if result and len(result) > 0:
            return result[0]

    def parse(self, response):
        if response.body:
            urlListStatusId = response.meta["urlListStatusId"]
            if urlListStatusId:
                UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.SUCCESS)
            htmlItem = HtmlItem()
            htmlItem["url"] = response.url
            htmlItem["html"] = response.body
            subUrls = []
            URLgroup = LinkExtractor(allow=()).extract_links(response)
            if (self.cur_url_depth < self.depth_limit and self.depth_limit != 0) or self.depth_limit == 0:
                for URL in URLgroup:
                    if self.is_domain_allowed(URL.url):
                        subUrls.append(URL.url)
            htmlItem["subUrls"] = subUrls
            # htmlItem["taskJob"] = self.taskJob
            # htmlItem["taskJobHistory"] = self.taskJobHistory
            htmlItem["curUrlDepth"] = self.cur_url_depth
            return htmlItem

    def is_domain_allowed(self, url):
        """
        Check whether the url falls inside the domain the spider is allowed to crawl.
        :param url:
        :return:
        """
        logging.info("allowed_domain : " + str(self.allowed_domain))
        logging.info("url : " + url)
        if self.allowed_domain:
            cur_url_domain = self.get_domain(url)
            if cur_url_domain and self.allowed_domain in cur_url_domain:
                return True
            else:
                return False
        else:
            return True
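# Illustrative only (not from the original source): expected behaviour of the two
# domain helpers above, given their regular expressions.
#
#   spider.get_domain("http://www.baidu.com/news/index.html")  -> "www.baidu.com"
#   spider.get_domain("http://www.baidu.com")                  -> "www.baidu.com"
#   spider.get_first_domain("www.baidu.com")                   -> "baidu.com"
#
# is_domain_allowed() then accepts any url whose host contains that first-level
# domain, so sub-domains such as "news.baidu.com" stay inside the crawl scope.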
# coding=utf-8
# Created by yupengcheng on 2017-08-02.
import logging
import socket
import os
import time

from kazoo.client import KazooClient, KazooState
from kazoo.exceptions import NodeExistsException

from Settings import KEYMAP
from utils import ConfigUtils

zookeeper_hosts = ConfigUtils.getZookeeperHosts(KEYMAP.ZOOKEEPER_HOSTS)
SPIDER_SERVER_NODE = '/spiderServer'


def get_ip():
    try:
        csock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        csock.connect(('8.8.8.8', 80))
        (addr, port) = csock.getsockname()
        csock.close()
        return addr
    except socket.error:
        return "127.0.0.1"


def add_node_to_zookeeper():
    zk = KazooClient(hosts=zookeeper_hosts)

    @zk.add_listener
    def zookeeper_listener(state):
class AssistRedisSpider(MainRedisSpider):
    name = ConfigUtils.getSpiderPorperties(KEYMAP.ASSIST_SPIDER_NAME)
    redis_key = ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY)