def get_bio(self, response, values):
    """Extract the biography text, with a fallback on the details block.

    First joins the supplied strings in reverse order; when cleaning that
    yields nothing, falls back to the text nodes preceding the first <br>
    inside the person_details div.
    """
    bio = text.clean('\n'.join(reversed(values)))
    if bio:
        return bio
    fallback_nodes = response.xpath(
        './/div[@id="person_details"]/div/br[1]/preceding-sibling::text()'
    ).extract()
    return text.clean(' '.join(reversed(fallback_nodes)))
def refine_item(self, response, item):
    """Fill in name and bio from the wiki article body.

    Returns None (dropping the item) unless the article's first paragraph
    has at least three child nodes with a <span> third child carrying text.
    """
    content = response.xpath('.//div[@id="mw-content-text"]/div[2]')
    nodes = content.xpath('./p/child::node()')
    # Require a third child node that is a <span> element.
    if len(nodes) < 3:
        return None
    if nodes[2].xpath('local-name()').extract() != ['span']:
        return None
    name_texts = nodes[2].xpath('.//text()').extract()
    if not name_texts:
        return None
    item['bio'] = text.clean_extract(content, './/text()')
    item['name'] = text.clean(nodes[1].extract() + ' ' + name_texts[0])
    return super(MusiciansSpider, self).refine_item(response, item)
def refine_item(self, response, item):
    """Populate the item's name and bio, or drop it by returning None.

    The name is expected inside a <span> that is the third child node of
    the article's first paragraph; without it the item is discarded.
    """
    content = response.xpath('.//div[@id="mw-content-text"]/div[2]')
    children = content.xpath('./p/child::node()')
    third_is_span = (
        len(children) >= 3
        and children[2].xpath('local-name()').extract() == ['span']
    )
    if not third_is_span:
        return None
    texts = children[2].xpath('.//text()').extract()
    if not texts:
        return None
    item['bio'] = text.clean_extract(content, './/text()')
    item['name'] = text.clean(children[1].extract() + ' ' + texts[0])
    return super(MusiciansSpider, self).refine_item(response, item)
def parse_person(self, response):
    """Parse an artist page into a corpus item and follow its biography page.

    Yields a Request for '<url>biography' carrying the partially filled
    item in its meta dict, to be completed by parse_bio.
    """
    item = WebSourcesCorpusItem()
    item['url'] = response.url
    # extract_first() returns None when nothing matches; guard it instead
    # of crashing with AttributeError on .strip()
    raw_name = response.css('span.name::text').extract_first()
    item['name'] = raw_name.strip() if raw_name else None
    item['bio'] = []
    item['bio'].append(
        clean(fromstring(
            '\n'.join(response.css('div.description').extract())
        ).text_content())
    )
    # There is some semi-structured data available in key-value pairs,
    # as <dt> and <dd> tags
    semi_structured = response.css('div#tab_content_artist_summary')
    keys = [k.strip()
            for k in semi_structured.xpath('//dt//text()').extract()
            if k.strip()]
    values = [v.strip()
              for v in semi_structured.xpath('//dd//text()').extract()
              if v.strip()]
    # Pair keys with values into single-entry dicts. A concrete list
    # (instead of a lazy map/filter object on Python 3) keeps the field
    # serializable and reusable downstream.
    item['other'] = [{k: v} for k, v in zip(keys, values)]
    request = Request(item['url'] + 'biography', self.parse_bio)
    request.meta['item'] = item
    yield request
def refine_item(self, response, item):
    """Collect structured fields from the 'stammdaten' box and page sections.

    Each <ul> in the master-data box is keyed by its closest preceding
    <h4>; each generic section contributes its title and list entries.
    """
    other = {}
    # Master-data box: label every bullet list with the nearest <h4> above it
    bullet_lists = response.xpath(
        './/div[@id="stammdaten"]/div[contains(@class, "text")]//ul'
    )
    for bullet_list in bullet_lists:
        label = bullet_list.xpath('preceding-sibling::h4/text()').extract()[-1]
        entries = [
            text.clean_extract(entry, './/text()', sep=' ')
            for entry in bullet_list.xpath('li')
        ]
        other[label] = entries
    # Generic sections: first inner div holds the title, second the entries
    for section in response.xpath('.//div[@class="section"]'):
        heading = text.clean_extract(section, 'div[1]//text()')
        entries = [
            text.clean_extract(entry, './/text()')
            for entry in section.xpath('div[2]/ul/li')
        ]
        if entries:
            other[heading] = entries
    item['other'] = other
    item['name'] = text.clean(item['name'].replace('\t', ' '))
    return super(AcademiaNetSpider, self).refine_item(response, item)
def parse_person(self, response):
    """Build a corpus item from an artist page, then request its biography.

    The item travels to parse_bio via request.meta['item'].
    """
    item = WebSourcesCorpusItem()
    item['url'] = response.url
    # Pages without a span.name make extract_first() return None, so a
    # bare .strip() would raise AttributeError — handle that case.
    name_text = response.css('span.name::text').extract_first()
    item['name'] = name_text.strip() if name_text is not None else None
    item['bio'] = []
    description_html = '\n'.join(response.css('div.description').extract())
    item['bio'].append(clean(fromstring(description_html).text_content()))
    # There is some semi-structured data available in key-value pairs,
    # as <dt> and <dd> tags
    summary = response.css('div#tab_content_artist_summary')
    keys = [k.strip()
            for k in summary.xpath('//dt//text()').extract()
            if k.strip()]
    values = [v.strip()
              for v in summary.xpath('//dd//text()').extract()
              if v.strip()]
    # Materialize the key/value pairing as a real list: on Python 3,
    # map()/filter() would leave one-shot lazy iterators here, which
    # breaks later serialization of the item.
    item['other'] = [{key: value} for key, value in zip(keys, values)]
    request = Request(item['url'] + 'biography', self.parse_bio)
    request.meta['item'] = item
    yield request
def text_from_node(self, node):
    """Return the cleaned textual content of a selector node.

    Element nodes (those reporting a local name) have their descendant
    text joined with spaces; bare text nodes are cleaned directly.
    """
    is_element = bool(node.xpath('local-name()').extract())
    if is_element:
        return text.clean_extract(node, './/text()', sep=' ')
    return text.clean(node.extract())
def finalize(self, item):
    """Serialize the 'other' field and clean bio and name before output."""
    if 'other' in item:
        item['other'] = json.dumps(item['other'])
    item['bio'] = text.clean(item['bio'])
    # Drop everything after the last comma of the name.
    # NOTE(review): a name containing no comma becomes empty here —
    # assumes upstream names always carry a trailing comma part; confirm.
    name_parts = item['name'].split(',')
    item['name'] = text.clean(','.join(name_parts[:-1]))
    return item
def clean_nu(self, response, strings):
    """Join the strings, replace newlines with spaces, and clean the
    result with the unicode flag of text.clean disabled."""
    joined = ' '.join(strings)
    return text.clean(joined.replace('\n', ' '), False)
def clean(self, response, strings, unicode=True):
    """Utility to clean strings; usable from within your selectors.

    Joins *strings* with spaces and forwards the *unicode* flag to
    text.clean.
    """
    joined = ' '.join(strings)
    return text.clean(joined, unicode)
def text_from_node(self, node):
    """Clean and return the text carried by the given selector node."""
    if not node.xpath('local-name()').extract():
        # Plain text node: clean its raw extraction
        return text.clean(node.extract())
    # Element node: gather all descendant text joined by spaces
    return text.clean_extract(node, './/text()', sep=' ')