def get_bio(self, response, values):
    """Return a cleaned biography string.

    Prefers the pre-collected *values* (joined newest-last); falls back to
    the text nodes preceding the first <br> in the person-details div.
    """
    bio = text.clean('\n'.join(reversed(values)))
    if bio:
        return bio
    # Fallback: scrape the free text straight out of the details block.
    fallback_parts = response.xpath(
        './/div[@id="person_details"]/div/br[1]/preceding-sibling::text()'
    ).extract()
    return text.clean(' '.join(reversed(fallback_parts)))
Example #2
0
 def refine_item(self, response, item):
     """Fill in 'name' and 'bio' from the wiki content div.

     Returns None when the page does not match the expected layout
     (fewer than three <p> child nodes, third child not a <span>, or an
     empty name); otherwise delegates to the parent refine_item.
     """
     content = response.xpath('.//div[@id="mw-content-text"]/div[2]')
     children = content.xpath('./p/child::node()')
     # Guard clauses: bail out early on any structural mismatch.
     if len(children) < 3:
         return None
     if children[2].xpath('local-name()').extract() != ['span']:
         return None
     name = children[2].xpath('.//text()').extract()
     if not name:
         return None
     item['bio'] = text.clean_extract(content, './/text()')
     item['name'] = text.clean(children[1].extract() + ' ' + name[0])
     return super(MusiciansSpider, self).refine_item(response, item)
Example #3
0
 def refine_item(self, response, item):
     """Extract 'name' and 'bio' from the article body, or give up.

     A page is considered well-formed when the content <p> has at least
     three child nodes and the third one is a <span> carrying the name.
     Malformed pages yield None; good pages are passed up the chain.
     """
     content = response.xpath('.//div[@id="mw-content-text"]/div[2]')
     children = content.xpath('./p/child::node()')
     third_is_span = (
         len(children) >= 3
         and children[2].xpath('local-name()').extract() == ['span']
     )
     if not third_is_span:
         return None
     name = children[2].xpath('.//text()').extract()
     if not name:
         return None
     item['bio'] = text.clean_extract(content, './/text()')
     item['name'] = text.clean(children[1].extract() + ' ' + name[0])
     return super(MusiciansSpider, self).refine_item(response, item)
 def parse_person(self, response):
     """Parse a person page into an item, then chain a request for the bio.

     Yields a Request for the '/biography' sub-page with the partially
     filled item attached in request.meta['item'].
     """
     item = WebSourcesCorpusItem()
     item['url'] = response.url
     # default='' avoids AttributeError when the name node is missing
     item['name'] = response.css('span.name::text').extract_first(default='').strip()
     item['bio'] = []
     item['bio'].append(clean(fromstring('\n'.join(response.css('div.description').extract())).text_content()))
     # There is some semi-structured data available in key-value pairs, as <dt> and <dd> tags
     semi_structured = response.css('div#tab_content_artist_summary')
     # Materialize real lists: on Python 3, filter()/map() return one-shot
     # lazy iterators that break re-iteration and JSON serialization.
     keys = [k.strip() for k in semi_structured.xpath('//dt//text()').extract() if k.strip()]
     values = [v.strip() for v in semi_structured.xpath('//dd//text()').extract() if v.strip()]
     # Pair keys with values; zip truncates to the shorter list rather than
     # padding with None (the old two-sequence map() padded, producing
     # garbage {None: v} / {k: None} entries on length mismatch).
     item['other'] = [{k: v} for k, v in zip(keys, values)]
     request = Request(item['url'] + 'biography', self.parse_bio)
     request.meta['item'] = item
     yield request
Example #5
0
    def refine_item(self, response, item):
        """Collect structured profile data into item['other'] and tidy the name.

        Pulls <h4>-labelled lists from the #stammdaten block and titled
        lists from every .section div, then normalizes whitespace in the
        person's name before handing off to the parent refine_item.
        """
        item['other'] = {}

        stammdaten_lists = response.xpath(
            './/div[@id="stammdaten"]/div[contains(@class, "text")]//ul'
        )
        for ul in stammdaten_lists:
            # The closest preceding <h4> is the label for this list.
            field = ul.xpath('preceding-sibling::h4/text()').extract()[-1]
            item['other'][field] = [
                text.clean_extract(li, './/text()', sep=' ')
                for li in ul.xpath('li')
            ]

        for section in response.xpath('.//div[@class="section"]'):
            title = text.clean_extract(section, 'div[1]//text()')
            entries = [
                text.clean_extract(li, './/text()')
                for li in section.xpath('div[2]/ul/li')
            ]
            # Only record sections that actually contain list entries.
            if entries:
                item['other'][title] = entries

        item['name'] = text.clean(item['name'].replace('\t', ' '))

        return super(AcademiaNetSpider, self).refine_item(response, item)
Example #6
0
 def parse_person(self, response):
     """Parse a person page into an item, then chain a request for the bio.

     Yields a Request for the '/biography' sub-page with the partially
     filled item attached in request.meta['item'].
     """
     item = WebSourcesCorpusItem()
     item['url'] = response.url
     # default='' avoids AttributeError when the name node is missing
     item['name'] = response.css(
         'span.name::text').extract_first(default='').strip()
     item['bio'] = []
     item['bio'].append(
         clean(
             fromstring('\n'.join(
                 response.css(
                     'div.description').extract())).text_content()))
     # There is some semi-structured data available in key-value pairs, as <dt> and <dd> tags
     semi_structured = response.css('div#tab_content_artist_summary')
     # Materialize real lists: on Python 3, filter()/map() return one-shot
     # lazy iterators that break re-iteration and JSON serialization.
     keys = [
         k.strip() for k in semi_structured.xpath('//dt//text()').extract()
         if k.strip()
     ]
     values = [
         v.strip() for v in semi_structured.xpath('//dd//text()').extract()
         if v.strip()
     ]
     # Pair keys with values; zip truncates to the shorter list rather than
     # padding with None (the old two-sequence map() padded, producing
     # garbage {None: v} / {k: None} entries on length mismatch).
     item['other'] = [{k: v} for k, v in zip(keys, values)]
     request = Request(item['url'] + 'biography', self.parse_bio)
     request.meta['item'] = item
     yield request
Example #7
0
 def text_from_node(self, node):
     """Return cleaned text for *node*.

     Element nodes (those with a local name) get their descendant text
     extracted and cleaned; bare text nodes are cleaned directly.
     """
     if node.xpath('local-name()').extract():
         return text.clean_extract(node, './/text()', sep=' ')
     return text.clean(node.extract())
Example #8
0
 def finalize(self, item):
     """Last-pass normalization of a scraped item.

     JSON-encodes the optional 'other' mapping, cleans the bio, and
     strips everything after the final comma from the name.
     """
     if 'other' in item:
         item['other'] = json.dumps(item['other'])
     item['bio'] = text.clean(item['bio'])
     # Drop the trailing comma-separated component of the name.
     name_parts = item['name'].split(',')[:-1]
     item['name'] = text.clean(','.join(name_parts))
     return item
Example #9
0
 def clean_nu(self, response, strings):
     """Join *strings*, flatten newlines, and clean without unicode handling."""
     joined = ' '.join(strings)
     return text.clean(joined.replace('\n', ' '), False)
Example #10
0
 def clean_nu(self, response, strings):
     """Space-join the given strings and clean them (unicode handling off)."""
     flattened = ' '.join(strings).replace('\n', ' ')
     return text.clean(flattened, False)
Example #11
0
 def clean(self, response, strings, unicode=True):
     """ Utility function to clean strings. Can be used within your selectors
     """
     joined = ' '.join(strings)
     return text.clean(joined, unicode)
Example #12
0
 def text_from_node(self, node):
     """Clean and return the textual content of *node*.

     Uses full descendant-text extraction for element nodes and a plain
     clean for text nodes (which have no local name).
     """
     is_element = bool(node.xpath('local-name()').extract())
     return (text.clean_extract(node, './/text()', sep=' ') if is_element
             else text.clean(node.extract()))
Example #13
0
 def finalize(self, item):
     """Finish an item: serialize 'other', clean 'bio', trim the name."""
     # 'other' is optional; serialize it only when present.
     if 'other' in item:
         item['other'] = json.dumps(item['other'])
     item['bio'] = text.clean(item['bio'])
     # Keep only the part of the name before its last comma.
     trimmed_name = ','.join(item['name'].split(',')[:-1])
     item['name'] = text.clean(trimmed_name)
     return item