def get_bio(self, response, values):
    """Extract the biography text, with a fallback on the details block.

    First joins the supplied strings in reverse order; when cleaning that
    yields nothing, falls back to the text nodes preceding the first <br>
    inside the person_details div.
    """
    bio = text.clean('\n'.join(reversed(values)))
    if bio:
        return bio
    fallback_nodes = response.xpath(
        './/div[@id="person_details"]/div/br[1]/preceding-sibling::text()'
    ).extract()
    return text.clean(' '.join(reversed(fallback_nodes)))
def refine_item(self, response, item):
    """Fill in name and bio from the wiki article body.

    Returns None (dropping the item) unless the article's first paragraph
    has at least three child nodes with a <span> third child carrying text.
    """
    content = response.xpath('.//div[@id="mw-content-text"]/div[2]')
    nodes = content.xpath('./p/child::node()')
    # Require a third child node that is a <span> element.
    if len(nodes) < 3:
        return None
    if nodes[2].xpath('local-name()').extract() != ['span']:
        return None
    name_texts = nodes[2].xpath('.//text()').extract()
    if not name_texts:
        return None
    item['bio'] = text.clean_extract(content, './/text()')
    item['name'] = text.clean(nodes[1].extract() + ' ' + name_texts[0])
    return super(MusiciansSpider, self).refine_item(response, item)
def refine_item(self, response, item):
    """Populate the item's name and bio, or drop it by returning None.

    The name is expected inside a <span> that is the third child node of
    the article's first paragraph; without it the item is discarded.
    """
    content = response.xpath('.//div[@id="mw-content-text"]/div[2]')
    children = content.xpath('./p/child::node()')
    third_is_span = (
        len(children) >= 3
        and children[2].xpath('local-name()').extract() == ['span']
    )
    if not third_is_span:
        return None
    texts = children[2].xpath('.//text()').extract()
    if not texts:
        return None
    item['bio'] = text.clean_extract(content, './/text()')
    item['name'] = text.clean(children[1].extract() + ' ' + texts[0])
    return super(MusiciansSpider, self).refine_item(response, item)
def parse_person(self, response):
    """Parse an artist page into a corpus item and follow its biography page.

    Yields a Request for '<url>biography' carrying the partially filled
    item in its meta dict, to be completed by parse_bio.
    """
    item = WebSourcesCorpusItem()
    item['url'] = response.url
    # extract_first() returns None when nothing matches; guard it instead
    # of crashing with AttributeError on .strip()
    raw_name = response.css('span.name::text').extract_first()
    item['name'] = raw_name.strip() if raw_name else None
    item['bio'] = []
    item['bio'].append(
        clean(fromstring(
            '\n'.join(response.css('div.description').extract())
        ).text_content())
    )
    # There is some semi-structured data available in key-value pairs,
    # as <dt> and <dd> tags
    semi_structured = response.css('div#tab_content_artist_summary')
    keys = [k.strip()
            for k in semi_structured.xpath('//dt//text()').extract()
            if k.strip()]
    values = [v.strip()
              for v in semi_structured.xpath('//dd//text()').extract()
              if v.strip()]
    # Pair keys with values into single-entry dicts. A concrete list
    # (instead of a lazy map/filter object on Python 3) keeps the field
    # serializable and reusable downstream.
    item['other'] = [{k: v} for k, v in zip(keys, values)]
    request = Request(item['url'] + 'biography', self.parse_bio)
    request.meta['item'] = item
    yield request
def refine_item(self, response, item):
    """Collect structured fields from the 'stammdaten' box and page sections.

    Each <ul> in the master-data box is keyed by its closest preceding
    <h4>; each generic section contributes its title and list entries.
    """
    other = {}
    # Master-data box: label every bullet list with the nearest <h4> above it
    bullet_lists = response.xpath(
        './/div[@id="stammdaten"]/div[contains(@class, "text")]//ul'
    )
    for bullet_list in bullet_lists:
        label = bullet_list.xpath('preceding-sibling::h4/text()').extract()[-1]
        entries = [
            text.clean_extract(entry, './/text()', sep=' ')
            for entry in bullet_list.xpath('li')
        ]
        other[label] = entries
    # Generic sections: first inner div holds the title, second the entries
    for section in response.xpath('.//div[@class="section"]'):
        heading = text.clean_extract(section, 'div[1]//text()')
        entries = [
            text.clean_extract(entry, './/text()')
            for entry in section.xpath('div[2]/ul/li')
        ]
        if entries:
            other[heading] = entries
    item['other'] = other
    item['name'] = text.clean(item['name'].replace('\t', ' '))
    return super(AcademiaNetSpider, self).refine_item(response, item)
def parse_person(self, response):
    """Build a corpus item from an artist page, then request its biography.

    The item travels to parse_bio via request.meta['item'].
    """
    item = WebSourcesCorpusItem()
    item['url'] = response.url
    # Pages without a span.name make extract_first() return None, so a
    # bare .strip() would raise AttributeError — handle that case.
    name_text = response.css('span.name::text').extract_first()
    item['name'] = name_text.strip() if name_text is not None else None
    item['bio'] = []
    description_html = '\n'.join(response.css('div.description').extract())
    item['bio'].append(clean(fromstring(description_html).text_content()))
    # There is some semi-structured data available in key-value pairs,
    # as <dt> and <dd> tags
    summary = response.css('div#tab_content_artist_summary')
    keys = [k.strip()
            for k in summary.xpath('//dt//text()').extract()
            if k.strip()]
    values = [v.strip()
              for v in summary.xpath('//dd//text()').extract()
              if v.strip()]
    # Materialize the key/value pairing as a real list: on Python 3,
    # map()/filter() would leave one-shot lazy iterators here, which
    # breaks later serialization of the item.
    item['other'] = [{key: value} for key, value in zip(keys, values)]
    request = Request(item['url'] + 'biography', self.parse_bio)
    request.meta['item'] = item
    yield request
def text_from_node(self, node):
    """Return the cleaned textual content of a selector node.

    Element nodes (those reporting a local name) have their descendant
    text joined with spaces; bare text nodes are cleaned directly.
    """
    is_element = bool(node.xpath('local-name()').extract())
    if is_element:
        return text.clean_extract(node, './/text()', sep=' ')
    return text.clean(node.extract())
def finalize(self, item):
    """Serialize the 'other' field and clean bio and name before output."""
    if 'other' in item:
        item['other'] = json.dumps(item['other'])
    item['bio'] = text.clean(item['bio'])
    # Drop everything after the last comma of the name.
    # NOTE(review): a name containing no comma becomes empty here —
    # assumes upstream names always carry a trailing comma part; confirm.
    name_parts = item['name'].split(',')
    item['name'] = text.clean(','.join(name_parts[:-1]))
    return item
def clean_nu(self, response, strings):
    """Join the strings, replace newlines with spaces, and clean the
    result with the unicode flag of text.clean disabled."""
    joined = ' '.join(strings)
    return text.clean(joined.replace('\n', ' '), False)
def clean(self, response, strings, unicode=True):
    """Utility to clean strings; usable from within your selectors.

    Joins *strings* with spaces and forwards the *unicode* flag to
    text.clean.
    """
    joined = ' '.join(strings)
    return text.clean(joined, unicode)
def text_from_node(self, node):
    """Clean and return the text carried by the given selector node."""
    if not node.xpath('local-name()').extract():
        # Plain text node: clean its raw extraction
        return text.clean(node.extract())
    # Element node: gather all descendant text joined by spaces
    return text.clean_extract(node, './/text()', sep=' ')