def parse_item(self, response):
    """Parse a job-detail page.

    Enriches the item carried in ``response.meta['item']`` with the
    'SECURITY CLEARANCE' value scraped from the detail-info table,
    then yields the item.
    """
    # BUG FIX: Logger.debug() does not accept a `log_level` kwarg
    # (stdlib logging would raise TypeError — the level is implied by
    # the .debug call itself). Also use lazy %-style args.
    self.logger.debug('parse_item %s', response.url)
    item = response.meta['item']
    # NOTE(review): no parser is pinned, so bs4 picks whichever parser
    # is installed (lxml/html5lib/html.parser) — results may vary by
    # environment; confirm and pin if that matters.
    soup = bs4.BeautifulSoup(response.body)
    # FIXME: uncomment when not debugging
    #item['description'] = soup.select('div.jobdetail')[0].text
    infodict = table2dict(soup, 'div#jobinfo2')
    item['clearance'] = infodict.get('SECURITY CLEARANCE')
    yield item
 def parse_item(self,  response):
     item = super(UsajobsSpider, self).parse_item(response)
     soup = bs4.BeautifulSoup(response.body)          
     # item['description'] = soup.select('div.jobdetail')[0].text # with soup, plan text
     try:
         item['description'] = response.css('div.jobdetail')[0].extract() 
     except IndexError:
         append(self.fail_url_path, 'failed to parse:' + response.url)
     else:
         infodict = table2dict(soup,  'div#jobinfo2')
         item['clearance'] = infodict.get('SECURITY CLEARANCE')
     yield item
 def parse(self, response):
     super(UsajobsSpider,  self).parse(response)
     soup = bs4.BeautifulSoup(response.body)
     soupitems = soup.select('div#jobResultNew')    
     if len(soupitems) < 1:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for soupitem in soupitems:
         item = self.init_item(response)
         item['item_url'] = self.base_url + soupitem.select('a.jobTitleLink')[0].attrs.get('href')
         item['title'] = soupitem.select('a.jobTitleLink')[0].text
         item['short_description'] = soupitem.select('p.summary')[0].text.strip()
         details = table2dict(soupitem,  'table.joaResultsDetailsTable')
         item['company'] = details.get('Agency',  '')
         location_region = details.get('Location(s)',  '').split(', ')
         item['locality'] = location_region[0]
         try:
             item['region'] = location_region[1]
         except IndexError:
             pass
         item['salary'] = details.get('Salary',  '')
         item['department'] = details.get('Department',  '')
         # data not available in this website
         item['published']= ''
         self.logger.debug('title %s' % item['title'])
         yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )
     # next = soup.select('a.nextPage') # with soup
     next = response.css('a.nextPage::attr(href)').extract()
     if next:
         self.logger.debug('next url: %s' % self.base_url + next[0])
         yield Request(
             # self.base_url + next[0]['href'],  # with soup
             self.base_url + next[0], 
             callback=self.parse, 
             meta={'keyword': response.meta['keyword'],  'location': response.meta['location']}
         )
     else:
         self.logger.debug('no next url')
 def parse(self, response):
     self.logger.debug('in parse', log_level=log.DEBUG)
     soup = bs4.BeautifulSoup(response.body)
     soupitems = soup.select('div#jobResultNew')    
     for soupitem in soupitems:
         item = ScrapyscrappersItem()
         item['keyword'] = response.meta['keyword']
         item['date_search'] = current_datetime()
         item['item_url'] = self.base_url + soupitem.select('a.jobTitleLink')[0].attrs.get('href')
         item['title'] = soupitem.select('a.jobTitleLink')[0].text
         item['short_description'] = soupitem.select('p.summary')[0].text.strip()
         details = table2dict(soupitem,  'table.joaResultsDetailsTable')
         item['company'] = details.get('Agency',  '')
         location_region = details.get('Location(s)',  '').split(', ')
         item['locality'] = location_region[0]
         try:
             item['region'] = location_region[1]
         except IndexError:
             pass
         item['salary'] = details.get('Salary',  '')
         item['department'] = details.get('Department',  '')
         # item.published = ''
         self.logger.debug('title %s' % item['title'], log_level=log.DEBUG)
         yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )