def _next_request(self, spider):
    try:
        slot = self.slots[spider]
    except KeyError:
        return

    if self.paused:
        slot.nextcall.schedule(5)
        return

    while not self._needs_backout(spider):
        if not self._next_request_from_scheduler(spider):
            break

    if slot.start_requests and not self._needs_backout(spider):
        try:
            request = slot.start_requests.next()
        except StopIteration:
            slot.start_requests = None
        except Exception:
            log.err(None, 'Obtaining request from start requests', spider=spider)
        else:
            if request:
                self.crawl(request, spider)
def __init__(self, fresh=False, nosentry=False, *args, **kwargs):
    super(TorrentsSpider, self).__init__(*args, **kwargs)
    try:
        cfg = ConfigParser()
        cfg.readfp(open('scrapy.properties'))
        self.db = pymongo.MongoClient(cfg.get('mongo', 'host'),
                                      int(cfg.get('mongo', 'port'))).torrents
        self.db_sources = pymongo.MongoClient(cfg.get('mongo', 'host_sources'),
                                              int(cfg.get('mongo', 'port_sources'))).foofind
        self.load_srcs_tables()
        self.sentry = raven.Client(cfg.get('sentry', 'dsn')) \
            if not nosentry and 'sentry' in cfg.sections() else None
    except IOError as e:  # file not found
        log.err('Configuration file not found: %s' % 'scrapy.properties')
        raise
    except NoSectionError as e:  # section missing
        log.err(str(e))
        raise
    self.fresh = fresh  # are we only picking fresh links?
    self.first_unknown_site = True  # used to warn at the 1st unknown site
def parseMobileProductDetail(self, response):
    err("555555555555555555555555")
    head = response.meta
    product_size_info = ProductSizeItem()
    product_sale_value = ProductSaleValueItem()
    size_json = re.search(
        "priceInfo\":(.*?),\"resultCode\":0,\"wanrentuanInfo", response.body)
    if size_json:
        size_json = size_json.group(1)
        size_json = size_json.decode("gbk")
        size_dict = json.loads(size_json)
        for skuid in size_dict:
            sku = size_dict[skuid]
            price = sku['price']['amount']
            print "skuid[" + str(skuid) + "] price[" + str(price) + "]"
            product_size_info['productId'] = response.meta['productId']
            product_size_info['taskId'] = response.meta['taskId']
            product_size_info['skuId'] = skuid
            product_size_info['promot_price'] = price
            yield product_size_info
    # capture the whole number, not just the last digit
    sale_num = re.search("sellCount\":([0-9]+)", response.body)
    value_num = re.search("rateCounts\":([0-9]+)", response.body)
    if value_num:
        product_sale_value["value_num"] = int(value_num.group(1))
    else:
        product_sale_value["value_num"] = -1
    if sale_num:
        product_sale_value["sale_num"] = int(sale_num.group(1))
    else:
        product_sale_value["sale_num"] = -1
    product_sale_value["productId"] = response.meta["productId"]
    product_sale_value["taskId"] = response.meta["taskId"]
    yield product_sale_value
def spider_closed(self, spider, reason):
    if hasattr(spider, 'debug') and spider.debug:
        log.msg(utils.Y(u'disable logger'), level=log.WARNING)
        return
    if hasattr(spider, 'logger'):
        try:
            from pymongo import uri_parser, MongoClient
            uri = spider.logger
            if not uri:
                return
            log.msg('post bot stats to <{}>'.format(uri))
            cnn, db, tbl = utils.connect_uri(uri)
            ago = self.stats.get_value('start_time', datetime.utcnow())
            now = datetime.utcnow()
            self.stats.set_value('finish_time', now, spider=spider)
            self.stats.set_value('elapsed_time', (now - ago).total_seconds(), spider=spider)
            self.stats.set_value('finish_reason', reason, spider=spider)
            self.stats.set_value('bot_ip', utils.get_ipaddr('eth0'))
            self.stats.set_value('bot_name', self.crawler.settings.get('BOT_NAME', 'unknown'))
            self.stats.set_value('spider_name', spider.name)
            self.stats.set_value('config_path', spider.config)
            self.stats.set_value('job_id', os.getenv('SCRAPY_JOB', None))
            tbl.insert({k.replace('.', '_'): v
                        for k, v in self.stats.get_stats().iteritems()})
            cnn.close()
        except Exception as ex:
            log.err('cannot post bot stats')
def start_requests(self):
    err("00000000000000000000000000")
    while True:
        conn = httplib.HTTPConnection("182.92.67.121", "8888")
        task_url = "/gettask?spider_name=" + self.spider_name + "&spider_type=" + self.name
        conn.request('GET', task_url)
        response_result = conn.getresponse().read()
        if response_result.find("taskId") == -1:
            continue
        if response_result.find("productId") == -1:
            continue
        if response_result.find("keyword") == -1:
            continue
        conn.close()
        response_dic = json.loads(response_result)
        pc_start_url = self.tmall_fp_url_prefix + response_dic[0]["keyword"] + self.tmall_fp_url_suffix
        head = {}
        seller_shop_name = response_dic[0]["keyword"]
        head["seller_shop_name"] = self.get_seller_shop_name(seller_shop_name)
        head["taskId"] = response_dic[0]["taskId"]
        head["list_url"] = pc_start_url
        print pc_start_url
        yield Request(pc_start_url, callback=self.parse, meta=head, dont_filter=True)
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and returns
    Failures instead of exceptions.
    """
    dont_log = named.pop('dont_log', None)
    spider = named.get('spider', None)
    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        try:
            response = robustApply(receiver, signal=signal, sender=sender,
                                   *arguments, **named)
            if isinstance(response, Deferred):
                log.msg("Cannot return deferreds from signal handler: %s" %
                        receiver, log.ERROR, spider=spider)
        except dont_log:
            result = Failure()
        except Exception:
            result = Failure()
            log.err(result, "Error caught on signal handler: %s" % receiver,
                    spider=spider)
        else:
            result = response
        responses.append((receiver, result))
    return responses
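# Usage sketch for send_catch_log() above. This is a hypothetical example, not
# part of the original snippet: the signal object, the receiver and the item
# payload are made up for illustration. Receivers are connected through
# PyDispatcher as usual; send_catch_log() returns (receiver, result) pairs in
# which a failing receiver yields a twisted.python.failure.Failure instead of
# letting the exception propagate.
from pydispatch import dispatcher
from twisted.python.failure import Failure

item_scraped = object()  # a signal is just a unique sentinel object

def on_item_scraped(item, spider, **kwargs):
    if item is None:
        raise ValueError("empty item")  # caught by send_catch_log and wrapped in a Failure
    return "stored"

dispatcher.connect(on_item_scraped, signal=item_scraped)

for receiver, result in send_catch_log(signal=item_scraped, item={"id": 1}, spider=None):
    if isinstance(result, Failure):
        print("receiver %r failed: %s" % (receiver, result.getErrorMessage()))
    else:
        print("receiver %r returned %r" % (receiver, result))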
def parse_product_list(self, response):
    err("33333333333333333333333")
    head = response.meta
    hxs = HtmlXPathSelector(response)
    product_detail_url = hxs.select(
        "//dd[@class='thumb']/../dt[@class='photo']/a/@href").extract()
    for letter in product_detail_url:
        yield Request("http:" + letter, callback=self.product, meta=head)
def kaka(self, response):
    product = ProductItem()
    hxs = HtmlXPathSelector(response)
    he = response.meta
    # product_id_pattern="/([0-9]+)/"
    product_object = hxs.select("//div[@class='product-frame']/div[@class='table-holder']/table[@class='table-info']/tbody/tr/td[@class='discount']")
    err("===========================================")
    for index in range(len(product_object)):
        # product_productId=re.search(product_id_pattern,response.url)
        product_productId = product_object[index].select("./../td[@class='first']/div[1]/@id").extract()
        product['productId'] = product_productId[0].replace("radio_", "")
        product['brandId'] = he['code']
        product_name = hxs.select("//div[@class='product-frame']/h2/text()").extract()
        product['name'] = product_name[0]
        product_price = product_object[index].select("./../td[@class='last price']/text()").extract()
        product_price_one = product_price[0].replace(u"¥", "")
        if "," in product_price_one:
            product['price'] = product_price_one.replace(",", "")
        else:
            product['price'] = product_price_one
        product_standard = product_object[index].select("./../td[@class='first']/div[2]/label/text()").extract()
        product['standard'] = product_standard[0].strip()
        product_introduce = hxs.select("//div[@class='tab-content']/div[@id='tab1_3']/div[" + str(index + 1) + "]/ul[@class='desc-ul']/li/text()").extract()
        product['introduce'] = "\n".join(product_introduce)
        yield product
def handle_spider_error(self, _failure, request, spider, propagated_failure=None):
    referer = request.headers.get('Referer', None)
    msg = "Spider error processing <%s> (referer: <%s>)" % (request.url, referer)
    log.err(_failure, msg, spider=spider)
    stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
                    spider=spider)
def spider_closed(self, spider, reason):
    if self.enabled and hasattr(spider, 'logger'):
        try:
            uri = spider.logger
            if not uri:
                return
            log.msg('post bot stats to <{}>'.format(uri))
            cnn, db, tbl = utils.connect_uri(uri)
            ago = self.stats.get_value('start_time', datetime.utcnow())
            now = datetime.utcnow()
            self.stats.set_value('finish_time', now, spider=spider)
            self.stats.set_value('elapsed_time', (now - ago).total_seconds(), spider=spider)
            self.stats.set_value('finish_reason', reason, spider=spider)
            self.stats.set_value('bot_ip', utils.get_ipaddr())
            self.stats.set_value('bot_name', self.crawler.settings.get('BOT_NAME', 'unknown'))
            self.stats.set_value('spider_name', spider.name)
            self.stats.set_value('config_path', spider.config)
            self.stats.set_value('job_id', os.getenv('SCRAPY_JOB', None))
            tbl.insert({k.replace('.', '_'): v
                        for k, v in self.stats.get_stats().iteritems()})
            cnn.close()
        except Exception as ex:
            log.err('cannot post bot stats')
def test_err_noargs(self):
    try:
        a = 1 / 0
    except:
        log.err()
    self.failUnless('Traceback' in self.logged())
    self.failUnless('ZeroDivisionError' in self.logged())
def media_downloaded(self, response, request, info):
    referer = request.headers.get('Referer')

    if response.status != 200:
        log.msg(format='File (code: %(status)s): Error downloading file from %(request)s referred in <%(referer)s>',
                level=log.WARNING, spider=info.spider,
                status=response.status, request=request, referer=referer)
        raise FileException('download-error')

    if not response.body:
        log.msg(format='File (empty-content): Empty file from %(request)s referred in <%(referer)s>: no-content',
                level=log.WARNING, spider=info.spider,
                request=request, referer=referer)
        raise FileException('empty-content')

    status = 'cached' if 'cached' in response.flags else 'downloaded'
    log.msg(format='File (%(status)s): Downloaded file from %(request)s referred in <%(referer)s>',
            level=log.DEBUG, spider=info.spider,
            status=status, request=request, referer=referer)
    self.inc_stats(info.spider, status)

    try:
        path = self.file_path(request, response=response, info=info)
        checksum = self.file_downloaded(response, request, info)
    except FileException as exc:
        whyfmt = 'File (error): Error processing file from %(request)s referred in <%(referer)s>: %(errormsg)s'
        log.msg(format=whyfmt, level=log.WARNING, spider=info.spider,
                request=request, referer=referer, errormsg=str(exc))
        raise
    except Exception as exc:
        whyfmt = 'File (unknown-error): Error processing file from %(request)s referred in <%(referer)s>'
        log.err(None, whyfmt % {'request': request, 'referer': referer}, spider=info.spider)
        raise FileException(str(exc))

    return {'url': request.url, 'path': path, 'checksum': checksum}
def parse(self, response):
    site = Selector(response)
    base_xpath = site.xpath('//div[@id="resultList"]/div[@class="el"]')
    detail_urls = []
    log.msg("current page is {0}".format(response.url), level=log.INFO)
    for var in base_xpath:
        try:
            item = Job51CrawlerItem()
            item['job_id'] = var.xpath('p[1]/input[1]/@value').extract()[0]
            # if is_job_id_exists(item['job_id']): continue
            item['job_name'] = var.xpath('p[1]/span[1]/a[1]/@title').extract()[0]
            item['job_url'] = var.xpath('p[1]/span[1]/a[1]/@href').extract()[0]
            detail_urls.append(item['job_url'])
            item['company_name'] = var.xpath('span[@class="t2"][1]/a[1]/@title').extract()[0]
            item['company_url'] = var.xpath('span[@class="t2"][1]/a[1]/@href').extract()[0]
            try:
                item['job_address'] = var.xpath('span[@class="t3"][1]/text()').extract()[0]
            except:
                item['job_address'] = ''
            try:
                item['job_salary'] = var.xpath('span[@class="t4"][1]/text()').extract()[0]
            except:
                item['job_salary'] = ''
            item['pub_date'] = var.xpath('span[@class="t5"][1]/text()').extract()[0]
            salary_temp = self.salary_unicode2int(item['job_salary'])
            if salary_temp is None:
                salary_low = 0
                salary_high = 0
            else:
                salary_low = salary_temp.get('low')
                salary_high = salary_temp.get('high')
            item['salary_low'] = salary_low
            item['salary_high'] = salary_high
            yield scrapy.Request(item['job_url'], meta={'item': item}, callback=self.parse_detail)
        except Exception as e:
            log.err(e)
def process_item(self, item, spider):
    if item.__class__ == HouseItem:
        try:
            self.cursor.execute("""select * from house where id = %s""", item["id"])
            ret = self.cursor.fetchone()
            if ret:
                self.cursor.execute(
                    """update house set h_name = %s, detail_url = %s, community_name = %s,
                       area = %s, pattern = %s, latitude = %s, longitude = %s, remark = %s
                       where id = %s""",
                    (item['h_name'], item['detail_url'], item['community_name'], item['area'],
                     item['pattern'], item['latitude'], item['longitude'], item['remark'],
                     item['id']))
                self.insert_or_update_house_daily(mode=2)
            else:
                self.cursor.execute(
                    """insert into house(id, h_name, detail_url, community_name, area,
                       pattern, latitude, longitude, remark)
                       value (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                    (item['id'], item['h_name'], item['detail_url'], item['community_name'],
                     item['area'], item['pattern'], item['latitude'], item['longitude'],
                     item['remark']))
                self.insert_or_update_house_daily(mode=3)
            self.connect.commit()
        except Exception as error:
            log.err(error)
            print(error)
    # always hand the item on so later pipelines keep receiving it
    return item
def _next_request(self, spider):
    slot = self.slot
    if not slot:
        return

    if self.paused:
        slot.nextcall.schedule(5)
        return

    while not self._needs_backout(spider):
        if not self._next_request_from_scheduler(spider):
            break

    if slot.start_requests and not self._needs_backout(spider):
        try:
            request = next(slot.start_requests)
        except StopIteration:
            slot.start_requests = None
        except Exception as exc:
            slot.start_requests = None
            log.err(None, 'Obtaining request from start requests', spider=spider)
        else:
            self.crawl(request, spider)

    if self.spider_is_idle(spider) and slot.close_if_idle:
        self._spider_idle(spider)
def media_downloaded(self, response, request, info):
    referer = request.headers.get('Referer')

    if response.status != 200:
        log.msg(format='Image (code: %(status)s): Error downloading image from %(request)s referred in <%(referer)s>',
                level=log.WARNING, spider=info.spider,
                status=response.status, request=request, referer=referer)
        raise ImageException

    if not response.body:
        log.msg(format='Image (empty-content): Empty image from %(request)s referred in <%(referer)s>: no-content',
                level=log.WARNING, spider=info.spider,
                request=request, referer=referer)
        raise ImageException

    status = 'cached' if 'cached' in response.flags else 'downloaded'
    log.msg(format='Image (%(status)s): Downloaded image from %(request)s referred in <%(referer)s>',
            level=log.DEBUG, spider=info.spider,
            status=status, request=request, referer=referer)
    self.inc_stats(info.spider, status)

    try:
        key = self.image_key(request.url)
        checksum = self.image_downloaded(response, request, info)
    except ImageException as ex:
        log.err('image_downloaded hook failed: %s' % ex,
                level=log.WARNING, spider=info.spider)
        raise
def _next_request(self, spider):
    slot = self.slot
    if not slot:
        return

    if self.paused:
        slot.nextcall.schedule(5)
        return

    while not self._needs_backout(spider):
        if not self._next_request_from_scheduler(spider):
            break

    if slot.start_requests and not self._needs_backout(spider):
        try:
            request = next(slot.start_requests)
        except StopIteration:
            slot.start_requests = None
        except Exception as exc:
            log.err(None, 'Obtaining request from start requests', spider=spider)
        else:
            self.crawl(request, spider)

    if self.spider_is_idle(spider) and slot.close_if_idle:
        self._spider_idle(spider)
def _sql(self):
    if not self["table_name"] or not self["table_action"]:
        log.err("SQL: table_name or table_action is None")
        return None
    _sql_str = None  # returned unchanged if neither branch builds a statement
    if self["table_action"] == "insert":
        n_str = ""
        v_str = ""
        for key_name in self["table_keys"]:
            if self.get(key_name):
                n_str += key_name + ","
                v_str += "'%s'," % str(self.get(key_name)).replace("'", "\\'")
        if v_str:
            # note: no assignment list follows ON DUPLICATE KEY UPDATE here
            _sql_str = "Insert into %s (%s) values (%s) ON DUPLICATE KEY UPDATE " % (
                self["table_name"], n_str[:-1], v_str[:-1])
            # _sql_str = "REPLACE into %s (%s) values (%s) where not exists(select * from %s where %s.%s = %s )" % (
            #     self["table_action"], self["table_name"], n_str[:-1], v_str[:-1],
            #     self["table_name"], self["table_name"], self["table_primray_key"],
            #     self.get(self["table_primray_key"]))
    elif self["table_action"] == "update" and self["table_primray_key"] and self.get(self["table_primray_key"]):
        v_str = ""
        for key_name in self["table_keys"].keys():
            if key_name != self["table_primray_key"] and self.get(key_name):
                # separate the SET assignments with commas
                v_str += key_name + "='" + str(self.get(key_name)).replace("'", "\\'") + "',"
        if v_str:
            _sql_str = "UPDATE %s SET %s WHERE %s=%s " % (
                self["table_name"], v_str[:-1], self["table_primray_key"],
                self.get(self["table_primray_key"]))
    return _sql_str
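# Illustration of the SQL string _sql() builds for the insert action. The item
# configuration and values below are hypothetical, purely for illustration:
#
#   item["table_name"]   = "products"
#   item["table_action"] = "insert"
#   item["table_keys"]   = ["sku", "title"]
#   item["sku"], item["title"] = "A-1", "it's new"
#
# _sql() would return approximately:
#
#   Insert into products (sku,title) values ('A-1','it\'s new') ON DUPLICATE KEY UPDATE 
#
# MySQL requires an assignment list after ON DUPLICATE KEY UPDATE, so the caller
# is expected to append one (e.g. "sku=VALUES(sku), title=VALUES(title)") before
# executing the statement.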
def process_item(self, item, spider):
    if isinstance(item, NovelItem):
        # novelItem = dict(item)
        # self.novel_coll.insert(novelItem)
        try:
            self.cursor.execute(
                """insert into novel_novel(id_book, book_name, author, category_id, status,
                   image, description, novel_url, update_time)
                   value (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (item['id_book'], item['book_name'], item['author'], item['category_id'],
                 item['status'], item['image'], item['description'], item['novel_url'],
                 item['update_time']))
            self.conn.commit()
        except Exception as e:
            err(e)
    elif isinstance(item, ChapterItem):
        # chapterItem = dict(item)
        # self.chapter_coll.insert(chapterItem)
        try:
            self.cursor.execute(
                """insert into novel_chapter(title, chapter_url, content, book_id, insert_num)
                   value (%s, %s, %s, %s, %s)""",
                (item['title'], item['chapter_url'], item['content'], item['book_id'],
                 item['insert_num']))
            self.conn.commit()
        except Exception as e:
            print(e)
    return item
def fetch_errback(self, twisted_request, failure):
    msg = ("The request to the web-server failed. "
           "The crawler engine returned an error: %s" % failure.getErrorMessage())
    log.err(failure)
    finish_request(twisted_request, error=msg)
def process_item(self, item, spider):
    sql = ''
    params = ''
    if isinstance(item, VesselsItem):
        sql = ('insert into expected_vessels(today, date, ata_eta, vessel, cargo, quantity, ie, agent) '
               'values (%s, %s, %s, %s, %s, %s, %s, %s)')
        params = (item['today'], item['date'], item['ata_eta'], item['vessel'],
                  item['cargo'], item['quantity'], item['ie'], item['agent'])
    elif isinstance(item, MovementItem):
        sql = ('insert into shipping_movement(date, state, vessel_name, berth_allotted, pilot_boarding_time) '
               'values (%s, %s, %s, %s, %s)')
        params = (item['date'], item['state'], item['vessel_name'],
                  item['berth_allotted'], item['pilot_boarding_time'])
    elif isinstance(item, PositionItem):
        sql = '''insert into vessel_position(date, berth, vessel, ie, fc, date_of_berthing,
                 cargo, quantity, day_s_handling, up_to_date_hanfling, balance,
                 load_or_discharge_port, agent)
                 values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
        params = (item['date'], item['berth'], item['vessel'], item['ie'], item['fc'],
                  item['date_of_berthing'], item['cargo'], item['quantity'],
                  item['day_s_handling'], item['up_to_date_hanfling'], item['balance'],
                  item['load_or_discharge_port'], item['agent'])
    try:
        self.cursor.execute(sql, params)
        self.connect.commit()
    except Exception as error:
        log.err(error)
    return item
def test_err_noargs(self):
    try:
        a = 1 / 0
    except:
        log.err()
    self.assertIn('Traceback', self.logged())
    self.assertIn('ZeroDivisionError', self.logged())
def messi(self, response):
    head = response.meta
    for letter in self.kaka():
        index = letter.find('search/')
        head['code'] = letter[index + 7]
        err("================================")
        yield Request(letter, callback=self.cluo, meta=head)
def item_completed(self, results, item, info):
    """Called per item when all media requests have been processed"""
    if self.LOG_FAILED_RESULTS:
        for success, result in results:
            if not success:
                log.err(result, '%s found errors processing %s' %
                        (self.__class__.__name__, item))
    return item
def cluo(self, response):
    hea = response.meta
    hxs = HtmlXPathSelector(response)
    total = TotalItem()
    for letter in hxs.select("//div[@class='clear']/div[@class='ymppsy06 pleft30']/div[@class='ymppsy04']"):
        total_src = letter.select("./div[@class='ymppsy05']/a/img/@src").extract()
        total['src'] = total_src[0]
        total_china = letter.select("./div[@class='txtCenter lh17 ptop3']/text()[1]").extract()
        total['china'] = total_china[0]
        total_english = letter.select("./div[@class='txtCenter lh17 ptop3']/text()[2]").extract()
        total['english'] = total_english[0]
        total['name'] = total['china'] + ".jpg"
        total_hold = letter.select("./div[@class='lh20 pleft25 ptop5']/span[3]/text()").extract()
        if total_hold:
            total['hold'] = total_hold[0]
        else:
            total['hold'] = "0"
        path = "/dir/image_b/" + total['name']
        urllib.urlretrieve(total['src'], path)
        err("++++++++++++++++++++++++++++++++++++++++")
        yield total
    for lett in hxs.select("//div[@class='ymppfy01 clear txtRright']/a[@class='ymppfy02']/text()").extract():
        if re.search("[1-9]", lett):
            newurl = "http://hzp.rayli.com.cn/brandlist/search/" + hea['code'] + "_3_" + lett + ".html"
            err("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
            if newurl not in self.set_url:
                self.set_url.append(newurl)
                yield Request(newurl, callback=self.cluo, meta=hea)
def item_completed(self, results, item, info):
    if self.LOG_FAILED_RESULTS:
        msg = '%s found errors processing %s' % (self.__class__.__name__, item)
        for ok, value in results:
            if not ok:
                log.err(value, msg, spider=info.spider)

    bookfile_paths_urls = [(x['path'], x['url']) for ok, x in results if ok]
    bookfile_path_url = list_first_item(bookfile_paths_urls)
    if bookfile_path_url:
        item['book_file'] = os.path.join(os.path.abspath(self.bookfile_store),
                                         bookfile_path_url[0])
        item['book_file_url'] = bookfile_path_url[1]
        return item
    else:
        if self.item_download[item['original_url']]:
            next = list_first_item(self.item_download[item['original_url']])
            self.item_download[item['original_url']] = self.item_download[item['original_url']][1:]
            return Request(next)
        else:
            return item
def media_downloaded(self, response, request, info):
    referer = request.headers.get("Referer")

    if response.status != 200:
        log.msg(format="Image (code: %(status)s): Error downloading image from %(request)s referred in <%(referer)s>",
                level=log.WARNING, spider=info.spider,
                status=response.status, request=request, referer=referer)
        raise ImageException("download-error")

    if not response.body:
        log.msg(format="Image (empty-content): Empty image from %(request)s referred in <%(referer)s>: no-content",
                level=log.WARNING, spider=info.spider,
                request=request, referer=referer)
        raise ImageException("empty-content")

    status = "cached" if "cached" in response.flags else "downloaded"
    log.msg(format="Image (%(status)s): Downloaded image from %(request)s referred in <%(referer)s>",
            level=log.DEBUG, spider=info.spider,
            status=status, request=request, referer=referer)
    self.inc_stats(info.spider, status)

    try:
        key = self.image_key(request.url)
        result_hash = self.image_downloaded(response, request, info)
        checksum = result_hash["checksum"]
        width = result_hash["width"]
        height = result_hash["height"]
        size = result_hash["size"]
        self.inc_image_size(info.spider, size)
    except ImageException as exc:
        whyfmt = "Image (error): Error processing image from %(request)s referred in <%(referer)s>: %(errormsg)s"
        log.msg(format=whyfmt, level=log.WARNING, spider=info.spider,
                request=request, referer=referer, errormsg=str(exc))
        raise
    except Exception as exc:
        whyfmt = "Image (unknown-error): Error processing image from %(request)s referred in <%(referer)s>"
        log.err(None, whyfmt % {"request": request, "referer": referer}, spider=info.spider)
        raise ImageException(str(exc))

    return {"url": request.url, "path": key, "checksum": checksum,
            "width": width, "height": height, "size": size}
def full_run_required(self):
    if not os.path.exists(os.path.join(HERE, 'chainreactioncycles_products.csv')):
        log.err("Does not exist")
        return True
    # run full only on Mondays
    return datetime.now().weekday() == 0
def item_completed(self, results, item, info):
    """Called per item when all media requests have been processed"""
    if self.LOG_FAILED_RESULTS:
        msg = '%s found errors processing %s' % (self.__class__.__name__, item)
        for ok, value in results:
            if not ok:
                log.err(value, msg, spider=info.spider)
    return item
def product_list(self, response):
    err("**************************")
    head = response.meta
    for letter in self.seller_name:
        list_url = "http://" + letter + ".tmall.hk/search.htm"
        head["list_url"] = list_url
        head["seller_shop_name"] = self.get_seller_shop_name(letter)
        yield Request(list_url, callback=self.get_product_page, meta=head)
def process_item(self, item, spider):
    try:
        line = item['name'] + "," + item['address'] + "\n"
        self.file.write(line)
    except BaseException as x:
        print 'Error:', x
        print item
        log.err(x.message)
def handle_spider_error(self, _failure, request, response, spider):
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
        return
    log.err(_failure, "Spider error processing %s" % request, spider=spider)
    send_catch_log(signal=signals.spider_error, failure=_failure, response=response,
                   spider=spider)
    stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
                    spider=spider)
def kaka(self):
    url = []
    string_a = "http://hzp.rayli.com.cn/brandlist/search/"
    mu = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
          'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    for letter in mu:
        url.append(string_a + letter + "_3_1.html")
    self.set_url.extend(url)
    err("########################################")
    return url
def get_product_page(self, response):
    err("%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
    hea = response.meta
    hxs = HtmlXPathSelector(response)
    list_page_num = hxs.select("//b[@class='ui-page-s-len']/text()").extract()
    list_page_num = list_page_num[0].split("/")[1]
    for le in range(1, int(list_page_num) + 1):
        product_list_url = hea["list_url"] + "?pageNo=" + str(le)
        yield Request(product_list_url, callback=self.parse_product_list, meta=hea)
def pyproducer(topic, _key, _value):
    log.msg('sending message!')
    producer = KafkaProducer(
        bootstrap_servers='115.231.103.59:9092,115.231.103.212:9092,115.231.103.60:9092',
        retries=3, api_version='0.8.2')
    future = producer.send(topic, key=_key, value=_value)
    try:
        record_metadata = future.get(timeout=10)
    except Exception as e:
        # Decide what to do if produce request failed...
        log.err(str(e))
def handle_spider_error(self, _failure, request, response, spider):
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
        return
    log.err(_failure, "Spider error processing %s" % request, spider=spider)
    self.signals.send_catch_log(signal=signals.spider_error, failure=_failure,
                                response=response, spider=spider)
    self.crawler.stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
                                 spider=spider)
def open_spider(self, spider):
    if hasattr(spider, 'mongo'):
        try:
            uri = spider.mongo
            log.msg('connect <{}>'.format(uri))
            self.cnn, self.db, self.tbl = utils.connect_uri(uri)
            return
        except Exception as ex:
            log.err('cannot connect to mongodb: {}'.format(ex))
    self.cnn = self.db = None
def item_completed(self, results, item, info):
    if self.LOG_FAILED_RESULTS:
        msg = '%s found errors processing %s' % (self.__class__.__name__, item)
        for ok, value in results:
            if not ok:
                log.err(value, msg, spider=info.spider)
    image_paths = [x['path'] for ok, x in results if ok]
    image_path = list_first_item(image_paths)
    item['book_covor_image_path'] = os.path.join(
        os.path.abspath(self.images_store), image_path) if image_path else ""
    return item
def open_spider(self, spider):
    if hasattr(spider, 'zmq'):
        try:
            from utils.api import MessageSender
            uri = spider.zmq
            log.msg('connect <{}>'.format(uri))
            self.sender = MessageSender(uri)
            return
        except Exception as ex:
            log.err('cannot connect to zmq: {}'.format(ex))
    self.sender = None
def parse(self, response):
    search_result = re.search('"&token=([0-9a-z]{32})"', response.body_as_unicode())
    if search_result:
        token = search_result.groups()[0]
        token_item = Token()
        token_item['type'] = self.name
        token_item['token'] = token
        token_item['token_date'] = datetime.datetime.now().strftime('%Y%m%d')
        yield token_item
    else:
        log.err('token get error!')
        yield None
def open_spider(self, spider):
    if hasattr(spider, 'zmq'):
        try:
            self.zmq = __import__('zmq')
            uri = spider.zmq
            log.msg('connect <{}>'.format(uri))
            self.ctx, self.skt, _ = utils.connect_uri(uri)
            return
        except Exception as ex:
            log.err('cannot connect to zmq: {}'.format(ex))
    self.ctx = self.skt = None
def parse_page2(self, response):
    urls = None
    try:
        try:
            urls = response.xpath('//div[@class="blog_title"]/a/@href').extract()
        except:
            urls = response.xpath('//span[@class="atc_title"]/strong/a/@href').extract()
    except:
        log.err('Cannot extract any blog urls from %s' % response.url)
    for url in urls:
        # log.msg('scraped new post:' + url)
        yield scrapy.Request(url, callback=self.parse_page3)
def open_spider(self, spider):
    if hasattr(spider, 'mysql'):
        try:
            uri = spider.mysql
            log.msg('connect <{}>'.format(uri))
            self.cnn, _, self.tbl = utils.connect_uri(uri)
            self.cur = self.cnn.cursor()
            return
        except Exception as ex:
            traceback.print_exc()
            log.err('cannot connect to mysql: {}'.format(ex))
    self.cnn = self.cur = None
def _next_request(self, spider):
    slot = self.slot
    if not slot:
        return

    if self.paused:
        slot.nextcall.schedule(5)
        return

    while not self._needs_backout(spider):
        if not self._next_request_from_scheduler(spider):
            break

    if slot.start_requests and not self._needs_backout(spider):
        try:
            request = next(slot.start_requests)
        except StopIteration:
            slot.start_requests = None
        except Exception as exc:
            slot.start_requests = None
            log.err(None, 'Obtaining request from start requests', spider=spider)
        else:
            self.crawl(request, spider)
    elif not slot.start_requests:
        if self.spider_is_idle(spider):
            self.locker.acquire()
            try:
                if not self._next_request_from_scheduler(spider):
                    log.msg(message='Request queue is empty, get request from start requests',
                            _level=log.INFO)
                    assert spider in self.open_spiders, \
                        "Spider %r not opened when crawling: %s" % (spider.name, request)
                    start_requests = spider.start_requests()
                    while True:
                        try:
                            request = next(start_requests)
                        except StopIteration:
                            break
                        except Exception as exc:
                            log.err(None, 'Obtaining request from start requests', spider=spider)
                            break
                        else:
                            self.schedule(request, spider)
                            slot.nextcall.schedule()
            finally:
                self.locker.release()

    if self.spider_is_idle(spider) and slot.close_if_idle:
        self._spider_idle(spider)
def fetch_callback(self, response):
    request = response.meta['twisted_request']
    result_response = dict(status=response.status,
                           headers=response.headers.to_string())
    if response.status != 200:
        finish_request(request, response=result_response)
        return
    if not isinstance(response, (HtmlResponse, XmlResponse)):
        msg = "Non-html response: %s" % response.headers.get('content-type', 'no content type')
        finish_request(request, error=msg)
        return
    try:
        params = response.meta['slyd_request_params']
        original_html = extract_html(response)
        cleaned_html = html4annotation(original_html, response.url)
        # we may want to include some headers
        fingerprint = request_fingerprint(response.request)
        result_response = dict(status=response.status,
                               headers=response.headers.to_string())
        result = dict(page=cleaned_html, original=original_html, fp=fingerprint,
                      response=result_response)
        # HACKY: return the spider but also return the template specs.
        # We need them to map the template_id to the template name.
        spider, templates = self.create_spider(request.project, request.auth_info, params)
        if spider is not None:
            items = []
            links = []
            for value in spider.parse(response):
                if isinstance(value, Request):
                    links.append(value.url)
                elif isinstance(value, DictItem):
                    value['_template_name'] = self._get_template_name(value['_template'], templates)
                    items.append(value._values)
                else:
                    raise ValueError("Unexpected type %s from spider" % type(value))
            result['items'] = items
            result['links'] = links
        finish_request(request, **result)
    except Exception as ex:
        log.err(ex)
        finish_request(request, response=result_response,
                       error="unexpected internal error: %s" % ex)
def _check_propagated_failure(self, spider_failure, propagated_failure, request, spider):
    """Log and silence the bugs raised outside of spiders, but still allow
    spiders to be notified about general failures while downloading spider
    generated requests
    """
    # ignored requests are commonly propagated exceptions that are safe to silence
    if isinstance(spider_failure.value, IgnoreRequest):
        return
    elif spider_failure is propagated_failure:
        log.err(spider_failure, 'Unhandled error propagated to spider', spider=spider)
        return  # stop propagating this error
    else:
        return spider_failure  # exceptions raised in the spider code
def parse_resume(self, response):
    resume = response.meta['resume']
    data = json.loads(response.body)['resumeHtml']
    try:
        log.msg('http://www.zhaopingou.com/resume/detail?source=1&resumeId=%s' % (data['id']))
        resume['url'] = 'http://www.zhaopingou.com/resume/detail?source=1&resumeId=%s' % (data['id'])
        resume['photo'] = data['resumeImg']
        resume['birthday'] = data['birthday']
        resume['hometown'] = data['residence']
        resume['age'] = data['age']
        resume['live_city'] = data['address']
        resume['degree'] = data['degreesName']
        if data.has_key('evaluate'):
            resume['self_intro'] = data['evaluate']
        resume['exp_mode'] = ''
        resume['exp_city'] = data['hopeAddress']
        log.msg(resume['exp_city'])
        resume['exp_pos'] = data['hopePosition']
        resume['exp_industry'] = data['hopeIndustry']
        resume['exp_salary'] = data['hopeSalary']
        tmplist = []
        tmp_dict = {}
        tmp_dict['company'] = data['last_company']
        tmp_dict['position'] = data['last_company_pname']
        tmp_dict['time_range'] = data['last_company_time']
        tmplist.append(tmp_dict)
        resume['work_exp'] = json.dumps(tmplist, ensure_ascii=False)
        resume['work_exp_ds'] = ''
        resume['edu_exp'] = ''
        resume['lang_skill'] = ''
        resume['cert'] = ''
        resume['ability'] = data['skills']
        resume['showme'] = ''
        resume['refresh_time'] = data['crate_time']
        # resume['html_filepath'] = 'http://www.zhaopingou.com' + data['html_filepath']
        # resume['doc_filepath'] = 'http://www.zhaopingou.com' + data['doc_filepath']
        yield resume
    except Exception as e:
        log.err(e)