def getNextListPageUrl(self, response):
    requestUrl = []
    self.max_deepth -= 1
    if self.max_deepth < 1:
        logging.info("*********max_deepth : %s *****" % self.max_deepth)
        return requestUrl

    # logging.info("*********next_request_url : %s *****" % self.next_request_url)
    nextListPageURL = self.safeParse(response, self.next_request_url)
    # logging.info("*********next_page_url_prefix : %s *****" % self.next_page_url_prefix)
    if self.next_page_url_prefix:
        nextListPageURL = self.appendDomain(nextListPageURL, self.next_page_url_prefix, False)
    else:
        nextListPageURL = self.appendDomain(nextListPageURL, response.url)

    logging.info("*********nextListPageURL : %s *****" % nextListPageURL)
    if nextListPageURL:
        requestUrl.append(
            Request(nextListPageURL,
                    headers={'Referer': REFERER},
                    callback=self.parse,
                    dont_filter=True))
    return requestUrl
def index(self):
    beginTime = int(time.time())
    records = self.getRecords()
    if not records:
        logging.info('no data need sync!!')
        return False

    syncOverData = syncCrawlInfos(records)
    for record in records:
        uniqueCode = record['unique_code']
        if uniqueCode in syncOverData:
            print "sync success %s " % uniqueCode
            updateSql = "update " + self.tableName + " set `is_sync` = 1,`sync_times` = `sync_times`+1 where `unique_code` = '" + uniqueCode + "' "
        else:
            print "sync fail %s " % uniqueCode
            updateSql = "update " + self.tableName + " set `sync_times` = `sync_times`+1 where `unique_code` = '" + uniqueCode + "' "
        self.db.executeSql(updateSql)

    logging.info('--------------sync records cost time : %s ' % (int(time.time()) - beginTime))
    logging.info('--------------sync records success num : %s' % len(syncOverData))
    logging.info('--------------sync records success : %s' % syncOverData)
    logging.info('--------------sync records fail num : %s' % (len(records) - len(syncOverData)))
    return True
def main():
    try:
        runSpider = RunSpider()
        runSpider.run()
        logging.info("----------runSpider end-----------")
    except Exception, e:
        logging.info("----------runSpider main function Exception : %s-----" % e)
def process_item(self, item, spider):
    if not item:
        logging.info('--------item is empty : %s' % item)
        return True

    rule_id = item['rule_id']
    public_time = int(time.time())
    create_time = int(time.time())
    img_url = json.dumps(item['img_url'])
    description = item['description']
    if not description:
        return True

    title = item['title'].decode('utf8')[0:255].encode('utf8')
    insertData = {
        'source_url': item['source_url'],
        'unique_code': toMd5(item['source_url']),
        'rule_id': rule_id,
        'title': title,
        'description': description,
        'img_url': img_url,
        'public_time': public_time,
        'create_time': create_time
    }
    self.db.insert(self.tableName, insertData)
    return True
def process_item(self, item, spider):
    if not item:
        logging.info('--------item is empty : %s' % item)
        return True

    create_time = int(time.time())
    img_url = json.dumps(item['img_url'])
    if (not item['description']) and (not item['content']):
        return True

    title = item['title'].decode('utf8')[0:255].encode('utf8')
    insertData = {
        'source_url': item['source_url'],
        'unique_code': toMd5(item['source_url']),
        'rule_id': item['rule_id'],
        'title': title,
        'description': item['description'],
        'content': item['content'],
        'img_url': img_url,
        'source_score': item['source_score'],
        'is_sync': '0',
        'public_time': item['public_time'],
        'create_time': create_time
    }
    insertOk = self.db.insert(self.tableName, insertData)
    if (not insertOk) and spider.is_duplicate:
        self.db.update(self.tableName, insertData, "unique_code = '" + insertData['unique_code'] + "'")
        logging.info('========update.unique_code : %s' % insertData['unique_code'])
    return True
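# Illustration only (not part of the pipeline above): the title is sliced
# after .decode('utf8') so the 255 limit counts unicode characters rather than
# bytes; slicing the raw utf-8 string could cut a multi-byte character in half
# and leave invalid utf-8 for the database column. The sample bytes below are
# hypothetical.
def _title_truncation_sketch():
    raw = '\xe4\xb8\xad\xe6\x96\x87' * 200                  # utf-8 bytes, 3 bytes per character
    by_chars = raw.decode('utf8')[0:255].encode('utf8')     # same slice the pipeline uses
    return by_chars.decode('utf8')                          # still valid utf-8, 255 characters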
def parse(self, response):
    """Parse the list page."""
    last_md5 = ''
    if self.isFirstListPage:
        checkText = self.safeParse(response, self.checkTxtXpath)
        last_md5 = toMd5(checkText)
        logging.info("*********last_md5 : %s self.last_md5 : %s*****" % (last_md5, self.last_md5))

    if (not self.is_duplicate) and OPEN_MD5_CHECK and self.isFirstListPage and last_md5 == self.last_md5:
        yield []
    else:
        for request in self.getDetailPageUrls(response):
            yield request

        # fetch the next list page url
        if not self.isDone:
            for request in self.getNextListPageUrl(response):
                yield request

    # sync the md5 checksum & last_id
    if self.isFirstListPage:
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
        self.isFirstListPage = False
def parse(self, response):
    """Parse the list page."""
    last_md5 = ''
    if self.isFirstListPage:
        checkText = self.safeParse(response, self.checkTxtXpath)
        last_md5 = toMd5(checkText)
        logging.info("*********last_md5 : %s self.last_md5 : %s*****" % (last_md5, self.last_md5))

    if self.isFirstListPage and last_md5 == self.last_md5:
        yield []
    else:
        for request in self.getDetailPageUrls(response):
            yield request

        # fetch the next list page url
        if not self.isDone:
            for request in self.getNextListPageUrl(response):
                yield request

    # sync the md5 checksum & last_id
    if self.isFirstListPage:
        syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
        self.isFirstListPage = False
def getDetailPageUrls(self, response):
    detailUrls = [
        self.appendDomain(t.encode('utf-8'), response.url)
        for t in self.safeParse(response, self.rule, True, False)
    ]

    # batch-check whether the urls are duplicates
    logging.info("*********detailUrls : %s *****" % detailUrls)
    detailUrlsByFilter = self.distinctRequestUrls(detailUrls)
    logging.info("*********detailUrlsByFilter : %s *****" % detailUrlsByFilter)
    if len(detailUrls) < 1 or len(detailUrlsByFilter) != len(detailUrls):
        self.isDone = True

    requestUrl = []
    if detailUrlsByFilter:
        for detailUrl in detailUrlsByFilter:
            requestUrl.append(
                Request(detailUrl,
                        headers={'Referer': REFERER},
                        callback=self.parse_detail_page,
                        dont_filter=True))
    return requestUrl
def run(self):
    while True:
        self.syncDagrame()
        logging.info("---------------sleep %s seconds " % MAIN_LOOP_SLEEP_TIME)
        time.sleep(MAIN_LOOP_SLEEP_TIME)
def appendDomain(self, url, domain=''):
    parsed_uri = urlparse.urlparse(domain)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    logging.info("*********append before : %s *****" % url)
    if isinstance(url, (buffer, str)) and not self.url_domain_pattern.match(url):
        url = urlparse.urljoin(domain, url)
    return url
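# Illustration only: appendDomain relies on urlparse.urljoin after reducing the
# reference url to its scheme + netloc, so relative links get the site prefix
# while absolute urls pass through unchanged. The urls below are hypothetical.
def _append_domain_sketch():
    import urlparse
    parsed_uri = urlparse.urlparse('http://example.com/news/list?page=2')
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)    # 'http://example.com/'
    relative = urlparse.urljoin(domain, '/article/123.html')          # 'http://example.com/article/123.html'
    absolute = urlparse.urljoin(domain, 'http://other.com/a.html')    # absolute url is kept as-is
    return relative, absolute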
def syncDagrame(self):
    """Sync data to the online service."""
    if int(time.time()) - self.beginTime > RUN_SYNC_INTERVAL_TIME:
        logging.info("**********sync crawl infos ************")
        sync = SyncCrawlInfos()
        sync.index()
        self.beginTime = int(time.time())
def mainLoop():
    """Main loop: catch exceptions and restart the sync."""
    while True:
        try:
            sync = syncDagrame()
            sync.run()
        except Exception, e:
            logging.info("---------------main loop exception : %s " % e)
def mainLoop():
    """Main loop: catch exceptions and restart the rss crawler."""
    while True:
        try:
            rss = RssPool()
            rss.run()
        except Exception, e:
            logging.info("---------------main loop exception : %s " % e)
def start_requests(self):
    spiderConfig = getCrawlNoRssRequest()
    if not spiderConfig:
        return []

    self.initConfig(spiderConfig)
    logging.info("*********meta******%s****************" % spiderConfig)
    return [Request(spiderConfig.get('start_urls', '')[0], callback=self.parse, dont_filter=True)]
def getCrawlNoRssRequestLength():
    try:
        http = HttpRequest()
        url = requst_norss_length_url
        response = http.setUrl(url).setBody({}).encrypt([]).post()
        res = json.loads(response)['data']
        if res == 'null':
            res = None
        # callers use the returned length to decide whether to start spiders
        return res
    except Exception, e:
        logging.info("-----%s-----" % e)
        return None
def filterAndPackageDgrate(self):
    if not OPEN_REDIS_DISTINCT:
        return self.item

    uniqueCodeList = self.item.keys()
    repeatUniqueCode = requstDistinct(uniqueCodeList)
    logging.info('------------distinct before : %s ' % uniqueCodeList)
    for i, unique in enumerate(repeatUniqueCode):
        del self.item[unique]
    logging.info('------------distinct after : %s ' % self.item.keys())
    return self.item
def process_item(self, item):
    if not item:
        logging.info('------------page not crawl data ')
        return True

    self.item = item
    insertDataList = self.filterAndPackageDgrate()
    for index in insertDataList:
        self.db.insert(self.tableName, insertDataList[index])
    return True
def process_item(self, item, spider):
    if not item:
        logging.info('-----------------------list page repeat ')
        return True

    self.item = item
    insertDataList = self.filterAndPackageDgrate()
    for index in insertDataList:
        self.db.insert(self.tableName, insertDataList[index])
    return True
def addRssSpider(self):
    configList = getCrawlRssRequest()
    if not configList:
        self.start = True
        return True

    try:
        spider = CommonFeedRss()
        self.pool.spawn(spider.run, configList)
    except Exception, e:
        logging.info("------------------add spider exception : %s " % e)
def run(self, config):
    self.initConfig(config)
    d = feedparser.parse(config.get('start_urls', '')[0])

    # md5 check
    last_md5 = toMd5(d.entries)
    logging.info("*********last_md5 : %s self.last_md5 : %s*****" % (last_md5, self.last_md5))
    if OPEN_MD5_CHECK and self.last_md5 == last_md5:
        return True

    self.parse(d)  # parse the rss
    syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})
def getNextListPageUrl(self, response):
    logging.info("*********next_request_url : %s *****" % self.next_request_url)
    nextListPageURL = self.appendDomain(
        self.safeParse(response, self.next_request_url), response.url)  # .encode('utf-8'))
    logging.info("*********nextListPageURL : %s *****" % nextListPageURL)

    requestUrl = []
    if nextListPageURL:
        requestUrl.append(
            Request(nextListPageURL,
                    headers={'Referer': REFERER},
                    callback=self.parse,
                    dont_filter=True))
    return requestUrl
def syncLastMd5(params):
    try:
        http = HttpRequest()
        url = sync_last_md5_url
        response = http.setUrl(url).setBody(params).encrypt([]).post()
        res = json.loads(response)['data']
        if res == 'null':
            res = None
        return res
    except Exception, e:
        print e
        logging.info("-----%s-----" % e)
        return None
def getCrawlRssRequest(params={}):
    try:
        http = HttpRequest()
        url = request_rss_url
        response = http.setUrl(url).setBody(params).encrypt([]).post()
        res = json.loads(response)["data"]
        if res == "null":
            res = None
        # addRssSpider expects the rss config list back
        return res
    except Exception, e:
        print e
        logging.info("-----%s-----" % e)
        return None
def parse_detail_page(self, response):
    logging.info('--------------------parse detail page-----------')
    item = XmlFeedItem()
    item['title'] = self.safeParse(response, self.titleXpath)

    imageAndDescriptionInfos = self.parseDescriptionAndImages(response)
    item['img_url'] = imageAndDescriptionInfos['img_url']
    item['description'] = imageAndDescriptionInfos['description']

    item['public_time'] = self.safeParse(response, self.pubDateXpath)
    item['source_url'] = self.appendDomain(self.safeParse(response, self.guidXpath), response.url)
    item['rule_id'] = self.rule_id
    yield item
def requstDistinct(hashCode):
    try:
        http = HttpRequest()
        url = requst_distinct_url
        hashCode = ",".join(hashCode)
        body = {"field": hashCode}
        encryptFields = []
        response = http.setUrl(url).setBody(body).encrypt(encryptFields).post()
        res = json.loads(response)["data"]
        if not res:
            return []
        return res
    except Exception, e:
        res = []
        logging.info("-----------%s-------" % e)
        return res
def run(self):
    while True:
        if (not self.start) and (not self.pool.full()):
            self.addRssSpider()
            # self.syncDagrame()
            continue

        self.start = False
        if self.pool.free_count() < RSS_MAX_POOL_NUM:
            logging.info("---------------join run ")
            self.pool.join()
        else:
            logging.info("---------------no data, sleep %s seconds " % MAIN_LOOP_SLEEP_TIME)
            time.sleep(MAIN_LOOP_SLEEP_TIME)
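# Illustration only: the pool used above is assumed to behave like
# gevent.pool.Pool (spawn / full / free_count / join). A minimal sketch of the
# same pattern with a hypothetical worker function:
def _rss_pool_sketch(run_once, pool_size=5):
    import gevent.pool
    pool = gevent.pool.Pool(pool_size)      # RSS_MAX_POOL_NUM-style capacity
    while not pool.full():
        pool.spawn(run_once)                # one greenlet per rss config
    pool.join()                             # wait for the whole batch to finish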
def parse_detail_page(self, response):
    logging.info('--------------------parse detail page-----------')
    item = CrawlItem()
    item['title'] = self.safeParse(response, self.titleXpath)

    imageAndContentInfos = self.parseContentAndImages(response)
    item['img_url'] = imageAndContentInfos['img_url']
    item['content'] = imageAndContentInfos['content']
    item['description'] = self.parseDescription(imageAndContentInfos['content'])

    item['source_score'] = self.parse_score(response)
    item['public_time'] = self.safeParse(response, self.pubDateXpath)
    item['source_url'] = response.url
    item['rule_id'] = self.rule_id
    yield item
def syncCrawlInfos(dataList):
    try:
        http = HttpRequest()
        http.setTimeout(900)
        url = sync_crawl_infos_url
        sqlList = json.dumps(dataList)
        body = {"sql": sqlList, "checksum": toMd5(sqlList)}
        encryptFields = []
        headerDict = {"Content-Encoding": "gzip", "Accept-Encoding": "gzip"}
        response = http.setUrl(url).setBody(body).setHeader(headerDict).encrypt(encryptFields).post()
        res = json.loads(response)["data"]
        if not res:
            return []
        return res
    except Exception, e:
        res = []
        logging.info("-----------%s-------" % e)
        return res
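# Illustration only: the request body pairs the serialized records with an md5
# checksum so the receiving endpoint can verify the payload arrived intact.
# A hypothetical server-side check, assuming toMd5 is a plain md5 hexdigest:
def _verify_sync_payload(body):
    import hashlib
    import json
    if hashlib.md5(body['sql']).hexdigest() != body['checksum']:
        return None                          # reject a corrupted payload
    return json.loads(body['sql'])           # list of records to apply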
def process_item(self, item, spider):
    if not item:
        logging.info('-----------------------list page repeat : %s' % item)
        return True

    public_time = int(time.time())
    create_time = int(time.time())
    for i in xrange(0, len(item['url'])):
        insertData = {
            'title': item['title'][i],
            'url': item['url'][i],
            'unique_code': toMd5(item['url'][i]),
            'share_num': item['share_num'][i],
            'rss_num': item['rss_num'][i],
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
    return True
def run(self):
    while True:
        num = getCrawlNoRssRequestLength()
        logging.info("********need deal request num : %s " % num)
        if not num:
            if self.runNum >= 1:
                logging.info("*****************size:%s********runNum:%s********" % (self.size, self.runNum))
                self.runSpider()
            break
        else:
            self.initSpider()
            if self.runNum >= self.size:
                logging.info("*****************size:%s********runNum:%s********" % (self.size, self.runNum))
                self.runSpider()
                break
def startScript():
    times = 0
    # beginTime = int(time.time())
    while True:
        try:
            times += 1
            num = getCrawlNoRssRequestLength()
            logging.info("**********need deal request num :%s************" % num)
            if not num:
                logging.info("**********sleep:%s************" % MAIN_LOOP_SLEEP_TIME)
                time.sleep(MAIN_LOOP_SLEEP_TIME)
            else:
                os.system('python runSpider.py')

            # if times > RUN_SYNC_INTERVAL_TIMES or int(time.time()) - beginTime > RUN_SYNC_INTERVAL_TIME:
            #     logging.info("**********sync crawl infos ************")
            #     sync = SyncCrawlInfos()
            #     sync.index()
            #     times = 0
            #     beginTime = int(time.time())
        except Exception, e:
            logging.info("--------------%s------------" % e)