def parse(self, response):
    """Yield one item per police report found in a Django XML dump.

    The input is the dump of a CMS that ran on django-nonrel and Google
    App Engine (written around 2010; App Engine became too expensive).

    Each ``stag.augmentedarticlescraperpolizeiberlin`` object supplies the
    source URL and the district.  Its ``article.article`` reference is
    resolved by primary key to the object that holds the publication
    date, headline and body.

    Objects whose referenced article has no ``date_published`` text are
    logged and skipped, so a single dangling reference does not abort the
    rest of the dump.

    Raises:
        ValueError: while iterating, if a publication date is not
            formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    logging.info("Parsed XML-Input-File %s", response.url)

    ##
    # bvg is still online, we get e/t from there by recrawling recursively
    for dj_au_obj in response.xpath(
            '/django-objects/object'
            '[@model = "stag.augmentedarticlescraperpolizeiberlin"]'):
        # Evaluate the source_url XPath once; URL and id both come from it.
        source_url = dj_au_obj.xpath('field[@name = "source_url"]/text()')
        district = dj_au_obj.xpath(
            'field[@name = "district"]/text()').extract_first()

        ##
        # select the payload-article
        # now we need to resolve the id for more data
        article_id = dj_au_obj.xpath(
            'field[@to = "article.article"]/text()').extract_first()
        # NOTE(review): the lookup matches on pk alone; an object of another
        # model sharing that pk would match too -- confirm pks are unique
        # across the dump.
        dj_art_obj = response.xpath(
            '/django-objects/object[@pk = "%s"]' % article_id)
        pub_date = dj_art_obj.xpath(
            'field[@name = "date_published"]/text()').extract_first()
        if pub_date is None:
            # strptime(None, ...) would raise TypeError and end the whole
            # generator; skip just this object instead.
            logging.warning(
                "Skipping %s: no date_published for article %s",
                source_url.extract_first(), article_id)
            continue

        parts = {
            # The literal string 'False' is treated as "no district".
            'place': None if district == 'False' else district,
            'author': 'Polizei Berlin',
            'source_name': 'polizei',
            'source_url': source_url.extract_first(),
            # Raw string: '\d' and '\.' are regex escapes, not str escapes.
            'source_id': source_url.re_first(r'(\d+)/index\.html'),
            'time': datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S"),
            'headline': dj_art_obj.xpath(
                'field[@name = "name"]/text()').extract_first(),
            'body': dj_art_obj.xpath(
                'field[@name = "content"]/text()').extract_first(),
        }

        item_loader = BerlinItemLoader()
        item_loader.add_value(None, parts)
        yield item_loader.load_item()
def parse(self, response):
    """Yield one item per ``stag.augmentedarticlescraperpolizeiberlin`` object.

    For every such object in the XML document behind *response*, the
    source URL and district are read from the object itself, and the
    publication date, headline and body from the object its
    ``article.article`` field points to (looked up by primary key).

    The names ``dj_au_obj``, ``dj_art_obj`` and ``pub_date`` are bound
    here from *response*; reading them unbound raised ``NameError`` as
    soon as the generator was iterated.

    Objects whose referenced article has no ``date_published`` text are
    logged and skipped.

    Raises:
        ValueError: while iterating, if a publication date is not
            formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    augmented_objects = response.xpath(
        '/django-objects/object'
        '[@model = "stag.augmentedarticlescraperpolizeiberlin"]')

    for dj_au_obj in augmented_objects:
        # One XPath evaluation serves both source_url and source_id.
        source_url = dj_au_obj.xpath('field[@name = "source_url"]/text()')

        place = dj_au_obj.xpath(
            'field[@name = "district"]/text()').extract_first()
        if place == 'False':
            # The literal string 'False' is treated as "no district".
            place = None

        # Resolve the referenced article to get date, headline and body.
        article_id = dj_au_obj.xpath(
            'field[@to = "article.article"]/text()').extract_first()
        dj_art_obj = response.xpath(
            '/django-objects/object[@pk = "%s"]' % article_id)
        pub_date = dj_art_obj.xpath(
            'field[@name = "date_published"]/text()').extract_first()
        if pub_date is None:
            # Without a date strptime() would raise TypeError and stop the
            # generator; drop only this object.
            logging.warning(
                "Skipping %s: no date_published for article %s",
                source_url.extract_first(), article_id)
            continue

        parts = {}
        parts['place'] = place
        parts['author'] = 'Polizei Berlin'
        parts['source_name'] = 'polizei'
        parts['source_url'] = source_url.extract_first()
        # Raw string keeps '\d' and '\.' as regex escapes.
        parts['source_id'] = source_url.re_first(r'(\d+)/index\.html')
        parts['time'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S")
        parts['headline'] = dj_art_obj.xpath(
            'field[@name = "name"]/text()').extract_first()
        parts['body'] = dj_art_obj.xpath(
            'field[@name = "content"]/text()').extract_first()

        item_loader = BerlinItemLoader()
        item_loader.add_value(None, parts)
        yield item_loader.load_item()
def parse_item_page(self, response):
    """Assemble the item for a single report page.

    Starts from the ``parts`` dict carried in ``response.meta``, sets its
    place (via ``self.parse_item_page_place``) and author, and lets the
    loader pull headline and body out of the ``div.article`` node.

    Returns the loaded item.
    """
    ##
    # loader scoped to the article node
    article = response.css('div.article')
    loader = BerlinItemLoader(selector=article)

    ##
    # values carried along in response.meta, completed with place and author
    fields = response.meta['parts']
    fields.update(
        place=self.parse_item_page_place(response),
        author='Polizei Berlin',
    )
    loader.add_value(None, fields)

    ##
    # fields taken from the markup
    for field_name, css_query in (
            ('headline', 'h1.title::text'),
            ('body', 'div.textile'),
    ):
        loader.add_css(field_name, css_query)

    return loader.load_item()
def parse(self, response):
    """Yield one item per ``stag.augmentedarticlescraperpolizeiberlin`` object.

    Each such object in the XML document behind *response* provides the
    source URL and district.  Its ``article.article`` field holds the
    primary key of the object that provides publication date, headline
    and body.

    ``dj_au_obj``, ``dj_art_obj`` and ``pub_date`` are derived from
    *response* inside this function; they were read without being bound,
    which raised ``NameError`` on the first iteration.

    Objects whose referenced article has no ``date_published`` text are
    logged and skipped.

    Raises:
        ValueError: while iterating, if a publication date is not
            formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    for dj_au_obj in response.xpath(
            '/django-objects/object'
            '[@model = "stag.augmentedarticlescraperpolizeiberlin"]'):
        url_nodes = dj_au_obj.xpath('field[@name = "source_url"]/text()')
        district = dj_au_obj.xpath(
            'field[@name = "district"]/text()').extract_first()

        # Follow the reference to the object carrying the article payload.
        article_id = dj_au_obj.xpath(
            'field[@to = "article.article"]/text()').extract_first()
        dj_art_obj = response.xpath(
            '/django-objects/object[@pk = "%s"]' % article_id)
        pub_date = dj_art_obj.xpath(
            'field[@name = "date_published"]/text()').extract_first()
        if pub_date is None:
            # strptime(None, ...) raises TypeError, which would end the
            # generator and lose every remaining object.
            logging.warning(
                "Skipping %s: no date_published for article %s",
                url_nodes.extract_first(), article_id)
            continue

        parts = {
            # The literal string 'False' is treated as "no district".
            'place': None if district == 'False' else district,
            'author': 'Polizei Berlin',
            'source_name': 'polizei',
            'source_url': url_nodes.extract_first(),
            # Raw string: '\d' and '\.' are regex escapes, not str escapes.
            'source_id': url_nodes.re_first(r'(\d+)/index\.html'),
            'time': datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S"),
            'headline': dj_art_obj.xpath(
                'field[@name = "name"]/text()').extract_first(),
            'body': dj_art_obj.xpath(
                'field[@name = "content"]/text()').extract_first(),
        }

        item_loader = BerlinItemLoader()
        item_loader.add_value(None, parts)
        yield item_loader.load_item()
def parse(self, response):
    """Yield one item per police report found in a Django XML dump.

    The input is the dump of a CMS that ran on django-nonrel and Google
    App Engine (written around 2010; App Engine became too expensive).

    Each ``stag.augmentedarticlescraperpolizeiberlin`` object supplies the
    source URL and the district.  Its ``article.article`` reference is
    resolved by primary key to the object that holds the publication
    date, headline and body.

    Objects whose referenced article has no ``date_published`` text are
    logged and skipped, so a single dangling reference does not abort the
    rest of the dump.

    Raises:
        ValueError: while iterating, if a publication date is not
            formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    logging.info("Parsed XML-Input-File %s", response.url)

    ##
    # bvg is still online, we get e/t from there by recrawling recursively
    for dj_au_obj in response.xpath(
            '/django-objects/object'
            '[@model = "stag.augmentedarticlescraperpolizeiberlin"]'):
        # Evaluate the source_url XPath once; URL and id both come from it.
        source_url = dj_au_obj.xpath('field[@name = "source_url"]/text()')
        district = dj_au_obj.xpath(
            'field[@name = "district"]/text()').extract_first()

        ##
        # select the payload-article
        # now we need to resolve the id for more data
        article_id = dj_au_obj.xpath(
            'field[@to = "article.article"]/text()').extract_first()
        # NOTE(review): the lookup matches on pk alone; an object of another
        # model sharing that pk would match too -- confirm pks are unique
        # across the dump.
        dj_art_obj = response.xpath(
            '/django-objects/object[@pk = "%s"]' % article_id)
        pub_date = dj_art_obj.xpath(
            'field[@name = "date_published"]/text()').extract_first()
        if pub_date is None:
            # strptime(None, ...) would raise TypeError and end the whole
            # generator; skip just this object instead.
            logging.warning(
                "Skipping %s: no date_published for article %s",
                source_url.extract_first(), article_id)
            continue

        parts = {
            # The literal string 'False' is treated as "no district".
            'place': None if district == 'False' else district,
            'author': 'Polizei Berlin',
            'source_name': 'polizei',
            'source_url': source_url.extract_first(),
            # Raw string: '\d' and '\.' are regex escapes, not str escapes.
            'source_id': source_url.re_first(r'(\d+)/index\.html'),
            'time': datetime.strptime(pub_date, "%Y-%m-%d %H:%M:%S"),
            'headline': dj_art_obj.xpath(
                'field[@name = "name"]/text()').extract_first(),
            'body': dj_art_obj.xpath(
                'field[@name = "content"]/text()').extract_first(),
        }

        item_loader = BerlinItemLoader()
        item_loader.add_value(None, parts)
        yield item_loader.load_item()
def parse_item_page(self, response):
    """Turn one report page into an item.

    The ``parts`` dict found in ``response.meta`` is completed with the
    place returned by ``self.parse_item_page_place`` and a fixed author,
    then handed to a loader bound to the page's ``div.article`` node,
    which also extracts headline and body by CSS.

    Returns the loaded item.
    """
    ##
    # loader bound to the article node
    article_node = response.css('div.article')
    page_loader = BerlinItemLoader(selector=article_node)

    ##
    # values carried along in response.meta
    known = response.meta['parts']
    place = self.parse_item_page_place(response)
    known['place'] = place
    known['author'] = 'Polizei Berlin'
    page_loader.add_value(None, known)

    ##
    # values read from the markup
    page_loader.add_css('headline', 'h1.title::text')
    page_loader.add_css('body', 'div.textile')

    item = page_loader.load_item()
    return item
def parse_item_page(self, response):
    """Build an item from a detail page.

    Works on the ``div.article__body`` node of *response*: author and
    headline are taken from its ``.moment-info dd`` entries, the place
    from ``self.parse_item_page_place`` and the body from
    ``div.moment-message``.  These are merged into the ``parts`` dict
    carried in ``response.meta`` before the item is loaded.

    Returns the loaded item.

    Raises:
        KeyError: if ``response.meta`` has no ``'parts'`` entry.
        ValueError: if the node does not contain exactly five
            ``.moment-info dd`` entries.
    """
    selector = response.css('div.article__body')
    item_loader = BerlinItemLoader(selector=selector)

    ##
    # the simple parts
    parts = response.meta['parts']

    # The five <dd> entries are unpacked by position; only the fourth
    # (author) and fifth (headline) are used here.
    _place_from_text, _raw_time, _unwanted, author, headline = [
        dd.xpath('string(.)') for dd in selector.css('.moment-info dd')
    ]

    # Fall back to "-" when the author text is missing or empty.  The
    # earlier key-presence check ran after the key had been assigned, so
    # it could never be true.
    parts['author'] = author.extract_first() or "-"
    parts['headline'] = headline.extract_first()

    ##
    parts['place'] = self.parse_item_page_place(selector)

    ##
    # and load
    item_loader.add_value(None, parts)
    item_loader.add_css('body', 'div.moment-message')
    return item_loader.load_item()