def root_category_subpages(self, response):
    """
    Collect the sub categories linked from a category page.

    For every matching link a ForumItem is loaded (its ``parent`` taken from
    the ``parent`` entry of the request meta) and a Request for that link,
    with ``?cutoff=100`` appended, is queued with this same method as the
    callback, so nested categories are walked recursively.

    Returns the list of items and requests followed by whatever
    ``self.thread_list`` returns for this response and its parent id.
    """
    page = HtmlXPathSelector(response)
    links = page.select(
        '//tr[@class="forum"]/td[@class="c_forum"]/strong/a')

    collected = []
    for link in links:
        loader = ForumLoader(ForumItem(), link)
        loader.add_xpath('zeta_id', '@href')
        loader.add_xpath('title', 'text()')
        loader.add_value('parent', response.request.meta['parent'])
        forum = loader.load_item()
        collected.append(forum)

        # Follow the sub category's own page; the freshly loaded forum
        # becomes the parent of anything found there.
        target = link.select('@href').extract()[0] + "?cutoff=100"
        follow_up = Request(
            target,
            meta={'parent': forum['zeta_id']},
            callback=self.root_category_subpages)
        collected.append(follow_up)

    # thread_list is invoked directly on this response rather than being
    # scheduled as a separate request per sub category.
    threads = self.thread_list(response, response.request.meta['parent'])
    return collected + threads
def root_index(self, response):
    """
    Crawl the main index to get all root categories.

    Each root category link is loaded as a ForumItem (no parent is set) and
    a Request for its page, with ``?cutoff=100`` appended, is queued with
    ``self.root_category_subpages`` as the callback. The category's
    ``zeta_id`` is passed along as ``parent`` in the request meta.

    Returns a list interleaving the loaded items and their requests; the
    list is empty when no root category link matches.
    """
    import logging

    hxs = HtmlXPathSelector(response)

    # Get all root categories on the page, store them and then fire off a
    # request to visit their specific subpage so we can get the subcats.
    root_cats = hxs.select('//table[@class="cat_head"]//h2[not(@class)]/a')

    if not root_cats:
        # Previously this printed "ERROR" and opened an interactive console
        # via code.interact(), which blocks the crawl. Report the problem
        # through logging instead and fall through to return an empty list.
        logging.getLogger(__name__).error(
            "root_index: no root categories found on %s", response.url)

    items_and_reqs = []
    for cat_selector in root_cats:
        # Load root category as a ForumItem (with no parent).
        cat_load = ForumLoader(ForumItem(), cat_selector)
        cat_load.add_xpath('zeta_id', '@href')
        cat_load.add_xpath('title', 'text()')
        cat = cat_load.load_item()
        items_and_reqs.append(cat)

        # Send off a request to process the root category subpage,
        # including the parent forum in the meta.
        url = cat_selector.select('@href').extract()[0] + "?cutoff=100"
        req = Request(url,
                      meta={'parent': cat['zeta_id']},
                      callback=self.root_category_subpages)
        items_and_reqs.append(req)

    return items_and_reqs
def root_category_subpages(self, response):
    """
    Collect the sub categories linked from a root category page.

    For every matching link a ForumItem is loaded (its ``parent`` taken from
    the ``parent`` entry of the request meta) and a Request for that link is
    queued with ``self.thread_list`` as the callback, carrying the new
    forum's ``zeta_id`` under the ``forum`` meta key.

    Returns a list interleaving the loaded items and their requests.
    """
    page = HtmlXPathSelector(response)
    links = page.select('//tr[@class="forum"]/td[@class="c_forum"]/strong/a')

    collected = []
    for link in links:
        loader = ForumLoader(ForumItem(), link)
        loader.add_xpath('zeta_id', '@href')
        loader.add_xpath('title', 'text()')
        loader.add_value('parent', response.request.meta['parent'])
        forum = loader.load_item()
        collected.append(forum)

        # Queue the sub category page itself, tagged with the forum it
        # belongs to.
        target = link.select('@href').extract()[0]
        follow_up = Request(
            target,
            meta={'forum': forum['zeta_id']},
            callback=self.thread_list)
        collected.append(follow_up)

    return collected
def root_index(self, response):
    """
    Collect the root categories linked from the main index page.

    Each matching link is loaded as a ForumItem (no parent is set) and a
    Request for its page is queued with ``self.root_category_subpages`` as
    the callback, carrying the category's ``zeta_id`` under the ``parent``
    meta key.

    Returns a list interleaving the loaded items and their requests.
    """
    page = HtmlXPathSelector(response)
    links = page.select('//table[@class="cat_head"]//h2[not(@class)]/a')

    collected = []
    for link in links:
        # A root category has no parent, so only id and title are loaded.
        loader = ForumLoader(ForumItem(), link)
        loader.add_xpath('zeta_id', '@href')
        loader.add_xpath('title', 'text()')
        forum = loader.load_item()
        collected.append(forum)

        # Visit the category's own page so its sub categories can be found.
        target = link.select('@href').extract()[0]
        follow_up = Request(
            target,
            meta={'parent': forum['zeta_id']},
            callback=self.root_category_subpages)
        collected.append(follow_up)

    return collected