class RestaurantSpider(CrawlSpider):
    """Spider that starts at the domiciliosbogota.com home page and emits one
    Restaurant item for every "domicilios-..." page linked from it."""

    name = "RestaurantSpider"
    allowed_domains = ["domiciliosbogota.com"]
    start_urls = ('http://www.domiciliosbogota.com/', )
    productLinkGetter = ProductLinkGetter()

    # Only the home page itself is routed to parseMain by the crawl rules.
    rules = [
        Rule(
            LinkExtractor(allow=(r"http://www\.domiciliosbogota\.com/$")),
            callback='parseMain',
        )
    ]

    def parseMain(self, response):
        """Handle the home page.

        Builds a RestaurantIDsGetter from the response and stores it on the
        spider for later use by parseRestaurants, then yields one Request per
        link on the page whose URL matches the "domicilios-" pattern.
        """
        self.restaurantIDsGetter = RestaurantIDsGetter(response)
        restaurantLinks = LinkExtractor(
            allow=(r"http\:\/\/www\.domiciliosbogota\.com\/domicilios\-.*")
        ).extract_links(response)
        for restaurantLink in restaurantLinks:
            yield Request(restaurantLink.url, callback=self.parseRestaurants)

    def parseRestaurants(self, response):
        """Build and return a Restaurant item from a single restaurant page.

        The id is looked up through the RestaurantIDsGetter stored by
        parseMain, keyed by "/" followed by the last "/"-separated segment of
        the response URL. Every other field comes from a RestaurantSelector
        built on the response.
        """
        sel = RestaurantSelector(response)
        item = Restaurant()

        item["url"] = response.url
        item["name"] = sel.getName()

        lastSegment = response.url.split("/")[-1]
        item["id"] = self.restaurantIDsGetter.getID("/" + lastSegment)

        # Remaining fields, filled in this exact order from the selector.
        remainingFields = (
            ("deliveryTimeInMinutes", sel.getDeliveryTimeInMinutes),
            ("minOrderPrice", sel.getMinOrderPrice),
            ("deliveryCost", sel.getDeliveryCost),
            ("payMethods", sel.getPayMethods),
            ("menu", sel.getMenuCategories),
            ("tagCategories", sel.getTagCategories),
            ("averagePunctuation", sel.getAveragePunctuation),
            ("quantityOfComments", sel.getQuantityOfComments),
        )
        for fieldName, getter in remainingFields:
            item[fieldName] = getter()

        return item
class ProductSpider(CrawlSpider):
    """Spider that follows every link on domiciliosbogota.com and, in
    addition, requests the URLs that ProductLinkGetter.getLink derives from
    the "id" attribute of <li> elements, emitting a Product per response."""

    name = 'Product'
    allowed_domains = ['domiciliosbogota.com']
    start_urls = ['http://www.domiciliosbogota.com/']
    productLinkGetter = ProductLinkGetter()

    rules = (
        # Plain site-wide crawl: follow everything, no callback.
        Rule(LinkExtractor(allow=()), follow=True),
        # Treat <li id="..."> values as link sources; getLink post-processes
        # each raw attribute value before it is used as a link.
        Rule(
            LinkExtractor(
                allow=(),
                canonicalize=False,
                tags="li",
                attrs=("id",),
                process_value=productLinkGetter.getLink,
            ),
            callback='parseProduct',
            follow=True,
        ),
    )

    def parseProduct(self, response):
        """Return a Product whose "product" field holds the response body
        decoded as JSON."""
        item = Product()
        decodedBody = json.loads(response.body)
        item["product"] = decodedBody
        return item
def testReturnNoneWhenLinkIsNotNumeric(self):
    # A URL whose last segment ("cat-12345") is not purely numeric is
    # expected to make getLink return None.
    getter = ProductLinkGetter()
    nonNumericUrl = "http://www.domiciliosbogota.com/cat-12345"
    actualLink = getter.getLink(nonNumericUrl)
    self.assertEqual(None, actualLink)
def testGetUUrlFromProductID(self):
    # A URL ending in a numeric segment is expected to be rewritten to the
    # "establecimientos/producto/<id>" URL for that same number.
    getter = ProductLinkGetter()
    numericUrl = "http://www.domiciliosbogota.com/12345"
    actualLink = getter.getLink(numericUrl)
    self.assertEqual(
        "http://www.domiciliosbogota.com/establecimientos/producto/12345",
        actualLink,
    )