Example #1
    def parse_detail(self, response):
        item = None
        for each in response.xpath(
                './/div[@class="tiInherit"]/parent::div/*')[3:]:
            content = each.xpath('child::node()')
            # a leading <span> marks the start of a new person entry
            if content and content[0].xpath('local-name()').extract() == ['span']:
                if item:
                    yield self.finalize(item)

                item = WebSourcesCorpusItem(
                    url=response.url,
                    name=' '.join(self.text_from_node(c) for c in content[:3]),
                    bio=text.clean_extract(each, './/text()', sep=' '),
                )

                if each.xpath('./i'):
                    item['other'] = {
                        'profession': text.clean_extract(each, './i//text()')
                    }

                assert item['name'] and len(item['name']) > 3
            elif item:
                item['bio'] += '\n' + text.clean_extract(
                    each, './/text()', sep=' ')

        if item:
            yield self.finalize(item)
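
All of these examples fill a WebSourcesCorpusItem, whose definition is not part of the listing. A minimal sketch consistent with the fields the spiders actually populate might look like this (everything below is inferred from usage, not the project's real code):

import scrapy

class WebSourcesCorpusItem(scrapy.Item):
    # field names inferred from the spiders' usage on this page
    url = scrapy.Field()
    name = scrapy.Field()
    birth = scrapy.Field()
    death = scrapy.Field()
    bio = scrapy.Field()
    other = scrapy.Field()
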
Example #2
    def parse_detail(self, response):
        for each in response.xpath(
                './/div[@id="headerContainer"]/following-sibling::div//p'):
            yield WebSourcesCorpusItem(
                url=response.url,
                name=text.clean_extract(each, './span//text()'),
                bio=text.clean_extract(each, './/text()', sep=' '),
            )
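
The text.clean_extract helper these spiders rely on is not shown either. Judging from its call sites (a selector, an XPath expression, an optional sep), a minimal reconstruction could be:

def clean_extract(selector, xpath, sep=''):
    # hypothetical sketch: extract the text nodes matched by the XPath,
    # join them with `sep`, and collapse stray whitespace
    parts = [p.strip() for p in selector.xpath(xpath).extract()]
    return sep.join(p for p in parts if p).strip() or None
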
Example #3
    def parse(self, response):
        current_item = None

        for p in response.xpath('.//div[@id="mw-content-text"]/p'):
            content = p.xpath('child::node()')
            # a leading <a> marks the start of a new entry
            if content and content[0].xpath('local-name()').extract() == ['a']:
                if current_item is not None:
                    if 'other' in current_item:
                        current_item['other'] = json.dumps(
                            current_item['other'])
                    yield current_item

                current_item = WebSourcesCorpusItem(
                    url=text.clean_extract(content[0], '@href'),
                    name=text.clean_extract(content[0], 'text()'),
                    bio=' '.join(
                        text.clean_extract(c, './/text()')
                        for c in content[1:]))
            else:
                # paragraphs without a leading <a> may still start a new
                # entry when they open with "Name (dates)"
                txt = p.xpath('text()').extract_first() or ''
                m = re.match(
                    ur'([^(]{,50})\((about )?(B\.C\. )?(\d+| ) ?- ?(\d+| )\)',
                    txt)
                if m:
                    # guard: the first matching paragraph may precede any item
                    if current_item is not None:
                        if 'other' in current_item:
                            current_item['other'] = json.dumps(
                                current_item['other'])
                        yield current_item
                    current_item = WebSourcesCorpusItem(
                        url=response.url,
                        name=m.group(1).strip(),
                        birth=(m.group(3) or '') + m.group(4),
                        death=(m.group(3) or '') + m.group(5),
                        bio=text.clean_extract(p, './/text()'),
                    )
                elif current_item is not None:
                    current_item['bio'] += text.clean_extract(p, './/text()')

        if current_item is not None:
            if 'other' in current_item:
                current_item['other'] = json.dumps(current_item['other'])
            yield current_item
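
The date regex above is dense; an invented sample line shows what the groups capture and how birth and death are assembled:

import re

txt = u'Sophocles (B.C. 496 - 406)'  # invented input, for illustration only
m = re.match(ur'([^(]{,50})\((about )?(B\.C\. )?(\d+| ) ?- ?(\d+| )\)', txt)
print m.group(1).strip()               # Sophocles
print (m.group(3) or '') + m.group(4)  # B.C. 496
print (m.group(3) or '') + m.group(5)  # B.C. 406
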
Example #4
    def parse_person(self, response):
        item = WebSourcesCorpusItem()
        item['url'] = response.url
        name_and_dates = response.css('h2::text').extract_first()
        dates = re.search(r'\((\d{4})[^\d](\d{4})\)', name_and_dates, re.UNICODE)
        item['name'] = re.search(r'[^\d]+', name_and_dates, re.UNICODE).group().strip(' (')
        if dates:
            item['birth'] = dates.group(1)
            item['death'] = dates.group(2)
        # fromstring is lxml.html.fromstring; text_content() strips the markup
        item['bio'] = fromstring(response.css('div.biographyContent').extract_first()).text_content().strip()
        yield item
Example #5
    def parse_detail(self, response):
        for each in response.xpath(
                './/div[@id="headerContainer"]/following-sibling::p'):
            item = WebSourcesCorpusItem(
                url=response.url,
                bio=text.clean_extract(each, './/text()', sep=' '),
            )

            if each.xpath('./a'):
                item['name'] = text.clean_extract(each, './a[1]//text()')

            if 'name' in item or item['bio']:
                yield item
Example #6
    def row_to_item(self, row):
        """ Converts a single row, result of the join between all tables, into a finished item
        """
        this_year = datetime.date.today().year

        cleaned = []
        for i, field in enumerate(row):
            if not field:
                cleaned.append(field)
            elif field[0] == '"' and field[-1] == '"':
                cleaned.append(field[1:-1].replace('\\"', '"'))
            elif i == 5 or i == 7:  # birth_year and death_year columns
                try:
                    n = int(field)
                    # the source estimates the death year of living people as
                    # birth year + 100; we don't want that placeholder data here
                    if n > this_year:
                        raise ValueError()
                    else:
                        cleaned.append(field)
                except ValueError:
                    cleaned.append(None)
            else:
                cleaned.append(field)

        url, name, bio1, bio2, nationality, birth_year, birth_place, death_year, death_place, gender = cleaned
        url = url.replace('http://vocab.getty.edu/ulan/',
                          'http://vocab.getty.edu/page/ulan/')
        gender = {
            'http://vocab.getty.edu/aat/300189557': 'female',
            'http://vocab.getty.edu/aat/300189559': 'male',
        }.get(gender)

        return WebSourcesCorpusItem(url=url,
                                    name=name,
                                    birth=birth_year,
                                    death=death_year,
                                    bio=u'{}\n{}'.format(
                                        bio1 or '', bio2 or '').strip(),
                                    other=json.dumps({
                                        'birth_place': birth_place,
                                        'death_place': death_place,
                                        'gender': gender,
                                        'nationality': nationality,
                                    }))
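
The positional layout row_to_item expects is easiest to see with a concrete row; the one below is entirely invented, but its column order matches the unpacking above:

# hypothetical row, for illustration only
row = [
    'http://vocab.getty.edu/ulan/500000000',  # url (invented id)
    '"Doe, Jane"',                            # name, quoted in the dump
    'Flemish painter', None,                  # bio1, bio2
    'Flemish',                                # nationality
    '1590', 'Antwerp',                        # birth_year, birth_place
    '1652', 'Brussels',                       # death_year, death_place
    'http://vocab.getty.edu/aat/300189557',   # gender URI, maps to 'female'
]
item = self.row_to_item(row)  # called from inside the spider class
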
Example #7
    def parse_person(self, response):
        item = WebSourcesCorpusItem()
        item['url'] = response.url
        name = clean_extract(response,
                             "//h1[contains(@class, 'header')]//text()")
        if name:
            item['name'] = name
        else:
            logging.debug("No name found for item with URL '%s'" % item['url'])
        bio_nodes = response.xpath("//li[contains(., 'BIOGRAPHY')]").extract()
        if bio_nodes:
            item['bio'] = fromstring(
                '\n'.join(bio_nodes)).text_content().strip()
        else:
            logging.debug("No raw text biography found for %s" % item['name'])
        item['other'] = {}
        keys = response.css('li#info td.fieldnameback')
        if keys:
            for key_node in keys:
                key_text = key_node.xpath('.//text()').extract_first()
                # Take the first following <td> sibling of the key node as the value
                value = key_node.xpath('./following-sibling::td[1]')
                if value:
                    people_links = value.xpath(
                        ".//a[contains(@href, 'getperson')]")
                    if people_links:
                        logging.debug("Values with links found for key '%s'" %
                                      key_text)
                        item['other'][key_text] = []
                        for person in people_links:
                            name = person.xpath('.//text()').extract_first()
                            link = person.xpath('@href').extract_first()
                            item['other'][key_text].append(
                                {name: response.urljoin(link)})
                    else:
                        literal_value = clean_extract(value, './/text()')
                        item['other'][key_text] = literal_value
                else:
                    logging.debug("No value found for key '%s'" % key_text)
        else:
            logging.debug("No semi-structured data found for '%s'" %
                          item['name'])
        yield item
Example #8
    def parse_person(self, response):
        item = WebSourcesCorpusItem()
        item['url'] = response.url
        item['name'] = response.css('span.name::text').extract_first().strip()
        item['bio'] = []
        item['bio'].append(
            clean(
                fromstring('\n'.join(
                    response.css(
                        'div.description').extract())).text_content()))
        # There is some semi-structured data available in key-value pairs, as <dt> and <dd> tags
        semi_structured = response.css('div#tab_content_artist_summary')
        keys = filter(None, [
            k.strip() for k in semi_structured.xpath('//dt//text()').extract()
        ])
        values = filter(None, [
            v.strip() for v in semi_structured.xpath('//dd//text()').extract()
        ])
        # Pair keys with values as a list of {key: value} dicts
        # (Python 2 map pads the shorter sequence with None)
        item['other'] = map(lambda x, y: {x: y}, keys, values)
        request = Request(item['url'] + 'biography', self.parse_bio)
        request.meta['item'] = item
        yield request
Example #9
    def parse_detail(self, response):
        artist_id = response.url.split('/')[-1]

        keys = response.xpath('.//div[@id="member_info"]//dt')
        values = response.xpath('.//div[@id="member_info"]//dd')
        info = dict((text.clean_extract(k, './/text()'),
                     text.clean_extract(v, './/text()'))
                    for k, v in zip(keys, values))

        item = WebSourcesCorpusItem(
            url=response.url,
            name=info.pop('Real/full name:'),
            other=info,
        )

        yield Request('http://www.metal-archives.com/artist/read-more/id/' +
                      artist_id,
                      self.parse_extern,
                      meta={
                          'item': item,
                          'field': 'bio',
                          'aid': artist_id
                      })
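
Both parse_person and parse_detail above hand the half-built item to a second callback through request.meta; those callbacks are not included in the listing. A minimal sketch of what parse_extern might do, assuming the read-more endpoint returns a fragment of biography text:

def parse_extern(self, response):
    # hypothetical reconstruction: recover the item carried in the request
    # meta and fill the field it was queued for ('bio' in the request above)
    item = response.meta['item']
    item[response.meta['field']] = text.clean_extract(
        response, './/text()', sep=' ')
    yield item
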
Example #10
    def parse_fellow(self, response):
        yield WebSourcesCorpusItem(
            url=response.url,
            bio=text.clean_extract(response, './/div[@class="expandableBio"]//text()'),
            other=json.dumps(response.meta)
        )