Example #1
0
    def root_category_subpages(self, response):
        """
        Collect the sub categories linked from a category page.

        For every matching link a ForumItem is loaded (its ``parent`` taken
        from the ``parent`` entry of the request meta) and a Request is
        queued that feeds the linked page back into this same callback.
        Whatever ``self.thread_list`` returns for this response is appended
        to the resulting list.
        """
        page = HtmlXPathSelector(response)
        results = []
        for link in page.select(
                '//tr[@class="forum"]/td[@class="c_forum"]/strong/a'):
            loader = ForumLoader(ForumItem(), link)
            loader.add_xpath('zeta_id', '@href')
            loader.add_xpath('title', 'text()')
            loader.add_value('parent', response.request.meta['parent'])
            forum = loader.load_item()
            # Recurse into the linked page; the freshly loaded forum id is
            # handed down as the parent for anything found there.
            url = link.select('@href').extract()[0] + "?cutoff=100"
            follow_up = Request(url,
                                meta={'parent': forum['zeta_id']},
                                callback=self.root_category_subpages)
            results.extend((forum, follow_up))
        threads = self.thread_list(response, response.request.meta['parent'])
        return results + threads
Example #2
0
 def root_index(self, response):
     """
     Crawl the main index to get all root categories.

     Every root category link on the page is loaded as a ForumItem (with no
     parent) and followed by a Request for the linked page, handled by
     ``root_category_subpages`` with the category id passed as ``parent``
     in the request meta.

     Returns a list of the items and requests in page order.  When no root
     category link is found an error is logged and an empty list is
     returned.
     """
     import logging  # local import: only the level constant is needed

     hxs = HtmlXPathSelector(response)
     # Get all root categories on the page, store them and then fire off a
     # request to go visit their specific subpage so we can get the subcats.
     root_cats = hxs.select('//table[@class="cat_head"]//h2[not(@class)]/a')
     if not root_cats:
         # This used to print "ERROR" and open an interactive console, which
         # stalls the crawl waiting on stdin.  Report the problem instead.
         # NOTE(review): assumes self is a Scrapy spider exposing log().
         self.log("No root categories found on %s" % response.url,
                  level=logging.ERROR)
         return []
     items_and_reqs = []
     for cat_selector in root_cats:
         # Load root category as a ForumItem (with no parent)
         cat_load = ForumLoader(ForumItem(), cat_selector)
         cat_load.add_xpath('zeta_id', '@href')
         cat_load.add_xpath('title', 'text()')
         cat = cat_load.load_item()
         items_and_reqs.append(cat)
         # Send off a request to process the root category subpage,
         # including the parent forum in the meta.
         req = Request(cat_selector.select('@href').extract()[0] +
                       "?cutoff=100",
                       meta={'parent': cat['zeta_id']},
                       callback=self.root_category_subpages)
         items_and_reqs.append(req)
     return items_and_reqs
Example #3
0
 def root_category_subpages(self, response):
     """
     Collect the sub categories linked from a root category page.

     For every matching link a ForumItem is loaded (its ``parent`` taken
     from the ``parent`` entry of the request meta) and a Request for the
     linked page is queued with ``self.thread_list`` as its callback.
     """
     page = HtmlXPathSelector(response)
     links = page.select('//tr[@class="forum"]/td[@class="c_forum"]/strong/a')
     results = []
     for link in links:
         loader = ForumLoader(ForumItem(), link)
         loader.add_xpath('zeta_id', '@href')
         loader.add_xpath('title', 'text()')
         loader.add_value('parent', response.request.meta['parent'])
         forum = loader.load_item()
         # Queue the linked page for thread_list, passing the forum id
         # along in the request meta.
         url = link.select('@href').extract()[0]
         thread_req = Request(url,
                              meta={'forum': forum['zeta_id']},
                              callback=self.thread_list)
         results += [forum, thread_req]
     return results
Example #4
0
 def root_index(self, response):
     """
     Collect the root categories linked from the main index.

     Every matching link is loaded as a ForumItem without a parent and is
     followed by a Request for the linked page, handled by
     ``root_category_subpages`` with the category id passed as ``parent``
     in the request meta.
     """
     page = HtmlXPathSelector(response)
     results = []
     for link in page.select('//table[@class="cat_head"]//h2[not(@class)]/a'):
         # Root categories have no parent, so only id and title are loaded.
         loader = ForumLoader(ForumItem(), link)
         loader.add_xpath('zeta_id', '@href')
         loader.add_xpath('title', 'text()')
         root_forum = loader.load_item()
         # Visit the category page next so its sub categories can be
         # collected with this category recorded as their parent.
         url = link.select('@href').extract()[0]
         subpage_req = Request(url,
                               meta={'parent': root_forum['zeta_id']},
                               callback=self.root_category_subpages)
         results.extend((root_forum, subpage_req))
     return results