def parse_search(self, response):
    """Handle a JSON search response.

    Yields one ``scrapy.http.Request`` per entry in the response's
    ``'projects'`` list, routed to ``parse_project`` with the project's
    JSON record attached as ``meta['json']``.  After all projects, yields
    whatever ``generate_search_request`` builds for ``meta['index'] + 1``
    (presumably the next page of search results).
    """
    common.dump_response(self.settings, response)

    payload = json.loads(response.body_as_unicode())
    for project in payload['projects']:
        project_url = project['urls']['web']['project']
        yield scrapy.http.Request(
            project_url,
            callback=self.parse_project,
            meta={'json': project},
        )

    # Queue the follow-up search request only once this page's projects
    # have all been emitted.
    next_index = response.meta['index'] + 1
    yield self.generate_search_request(next_index)
def parse_search(self, response):
    """Handle a JSON search response.

    Yields one ``scrapy.http.Request`` per entry in the response's
    ``'projects'`` list, routed to ``parse_project`` with the project's
    JSON record attached as ``meta['json']``.  After all projects, yields
    whatever ``generate_search_request`` builds for ``meta['index'] + 1``
    (presumably the next page of search results).
    """
    common.dump_response(self.settings, response)

    payload = json.loads(response.body_as_unicode())
    for project in payload['projects']:
        project_url = project['urls']['web']['project']
        yield scrapy.http.Request(
            project_url,
            callback=self.parse_project,
            meta={'json': project},
        )

    # Queue the follow-up search request only once this page's projects
    # have all been emitted.
    next_index = response.meta['index'] + 1
    yield self.generate_search_request(next_index)
def parse(self, response):
    """Extract game titles from a listing page and follow its pagination.

    Yields a ``MobyItem`` whose ``'value'`` is the link text of every
    ``/game/`` anchor inside the ``mof_object_list`` table, then a
    ``scrapy.http.Request`` (handled by this same method) for every link
    found in the first ``mobFooter`` div.
    """
    common.dump_response(self.settings, response)
    page = Selector(response)

    title_nodes = page.xpath(
        '//table[@id="mof_object_list"]//a[contains(@href, "/game/")]/text()'
    )
    for title_node in title_nodes:
        item = MobyItem()
        item['value'] = title_node.extract()
        yield item

    # Only the first mobFooter div is used; any further matches are ignored.
    footers = page.xpath('//div[@class="mobFooter"]')
    if footers:
        for href in footers[0].xpath('.//a/@href'):
            target = urljoin_rfc(response.url, href.extract())
            yield scrapy.http.Request(target, callback=self.parse)
def parse(self, response):
    """Extract game titles from a listing page and follow its pagination.

    Yields a ``MobyItem`` whose ``'value'`` is the link text of every
    ``/game/`` anchor inside the ``mof_object_list`` table, then a
    ``scrapy.http.Request`` (handled by this same method) for every link
    found in the first ``mobFooter`` div.
    """
    common.dump_response(self.settings, response)
    page = Selector(response)

    title_nodes = page.xpath(
        '//table[@id="mof_object_list"]//a[contains(@href, "/game/")]/text()'
    )
    for title_node in title_nodes:
        item = MobyItem()
        item['value'] = title_node.extract()
        yield item

    # Only the first mobFooter div is used; any further matches are ignored.
    footers = page.xpath('//div[@class="mobFooter"]')
    if footers:
        for href in footers[0].xpath('.//a/@href'):
            target = urljoin_rfc(response.url, href.extract())
            yield scrapy.http.Request(target, callback=self.parse)
def parse_project(self, response):
    """Build a ``KickstarterItem`` from a project page.

    Title, currency, goal and deadline come from the JSON record passed in
    ``response.meta['json']``; the raw text is scraped from the page's
    ``full-description`` and ``short_blurb`` divs.

    Returns a one-element list containing the populated item.
    """
    common.dump_response(self.settings, response)
    project = response.meta['json']
    page = Selector(response)

    item = KickstarterItem()
    item['title'] = project['name']
    item['currency'] = project['currency']
    item['goal'] = float(project['goal'])
    item['date'] = int(project['deadline'])

    # Strip the HTML tags from the description here, while we are still in
    # the scrapy context and have the relevant utilities at hand: only the
    # text nodes are kept, joined by single spaces.
    description = ' '.join(
        node.extract()
        for node in page.xpath('//div[@class="full-description"]//text()')
    )
    blurb = ' '.join(
        node.extract()
        for node in page.xpath('//div[@class="short_blurb"]//text()')
    )
    item['rawtext'] = description + ' ' + blurb

    item['web'] = response.url
    return [item]
def parse_project(self, response):
    """Build a ``KickstarterItem`` from a project page.

    Title, currency, goal and deadline come from the JSON record passed in
    ``response.meta['json']``; the raw text is scraped from the page's
    ``full-description`` and ``short_blurb`` divs.

    Returns a one-element list containing the populated item.
    """
    common.dump_response(self.settings, response)
    project = response.meta['json']
    page = Selector(response)

    item = KickstarterItem()
    item['title'] = project['name']
    item['currency'] = project['currency']
    item['goal'] = float(project['goal'])
    item['date'] = int(project['deadline'])

    # Strip the HTML tags from the description here, while we are still in
    # the scrapy context and have the relevant utilities at hand: only the
    # text nodes are kept, joined by single spaces.
    description = ' '.join(
        node.extract()
        for node in page.xpath('//div[@class="full-description"]//text()')
    )
    blurb = ' '.join(
        node.extract()
        for node in page.xpath('//div[@class="short_blurb"]//text()')
    )
    item['rawtext'] = description + ' ' + blurb

    item['web'] = response.url
    return [item]