def get_sitemaps(self):
    """Fetch the sitemap index and return the product sitemap entries.

    Requests ``self.SITEMAP_INDEX`` through ``crawle.quick_request`` with
    ``redirects=1``, runs ``self.LOC_RE.findall`` over the response body,
    and keeps only the matches that contain ``'product'``.

    Returns:
        A list of the ``LOC_RE`` matches containing ``'product'``, in the
        order they appear in the index body.

    Side effects:
        If the response status is not 200, prints a message and terminates
        the process via ``sys.exit(1)``.
    """
    index_response = crawle.quick_request(self.SITEMAP_INDEX, redirects=1)
    if index_response.response_status != 200:
        print('Could not get index: %d' % index_response.response_status)
        sys.exit(1)

    # Only the product sitemaps are of interest; drop every other entry.
    product_sitemaps = []
    for location in self.LOC_RE.findall(index_response.response_body):
        if 'product' in location:
            product_sitemaps.append(location)
    return product_sitemaps
def get_product_ids(self):
    """Collect item ids from every product sitemap into ``self.item_ids``.

    Each entry returned by ``self.get_sitemaps()`` is requested through
    ``crawle.quick_request`` with ``redirects=1``. The response body is
    treated as gzip data, decompressed in memory, and every
    ``self.ITEM_ID_RE`` match found in it is appended to ``self.item_ids``.

    Returns:
        None. Results accumulate in ``self.item_ids``.

    Side effects:
        On the first sitemap whose response status is not 200, prints the
        status plus the request and response URLs, then terminates the
        process via ``sys.exit(1)``.
    """
    for sitemap_url in self.get_sitemaps():
        response = crawle.quick_request(sitemap_url, redirects=1)
        if response.response_status != 200:
            print('Error fetching sitemap: %d' % response.response_status)
            # Show both URLs so a redirect to an unexpected place is visible.
            print('%s %s' % (response.request_url, response.response_url))
            sys.exit(1)

        # Wrap the raw body in a file-like object so GzipFile can read it.
        compressed = StringIO(response.response_body)
        decompressed = gzip.GzipFile(fileobj=compressed).read()
        found_ids = self.ITEM_ID_RE.findall(decompressed)
        self.item_ids.extend(found_ids)