Example #1
 def parse(self, response):
     super(ClearedconnectionsSpider, self).parse(response)
     table = response.xpath('//table[@class="jstext"]')
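     # the first three rows of the table are headers/filters (assumed), so they are skipped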
     rows = table.xpath('.//tr')[3:]
     if len(rows) < 1:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for row in rows:
         item = self.init_item(response)
         cols = row.xpath('.//td')
         col_url = cols[1]
         onclick = col_url.xpath('.//a/@onclick').extract()[0]
         m = re.search("PopupViewWindow\('(.+?)'", onclick)
         if m:
             url = m.group(1)
         item[
             'item_url'] = 'https://www.clearedconnections.com/JobSeekerX/' + url
         item['title'] = col_url.xpath('.//a/text()').extract()[0]
         col_published = cols[0]
         published = col_published.xpath('.//p/text()').extract()[0].strip()
         item['published'] = datetime2datetimestr(
             datetime.strptime(published, '%m/%d/%Y'))
         col_company = cols[3]
         try:
             item['company'] = col_company.xpath(
                 './/a/text()').extract()[0].strip()
         except IndexError:
             item['company'] = col_company.xpath(
                 './/p/text()').extract()[0].strip()
         col_loc = cols[2]
         loc = col_loc.xpath('./text()').extract()[0]
         try:
             item['locality'], item['region'], _ = loc.split('-')
         except ValueError:
             # the location string has no region part
             item['locality'] = loc
             item['region'] = ''
         # data not available in this website
         item['short_description'] = ''
         item['salary'] = ''
         item['department'] = ''
         self.logger.debug('title %s' % item['title'])
         yield Request(item['item_url'],
                       callback=self.parse_item,
                       meta={'item': item})
     next_page = response.css('.pagination').xpath(
         './/a[contains(.,"Next")]/@href').extract()
     if next_page:
         self.logger.debug('next url: %s' % (self.base_url + next_page[0]))
         yield Request(self.base_url + next_page[0],
                       callback=self.parse,
                       meta={
                           'keyword': response.meta['keyword'],
                           'location': response.meta['location']
                       })
     else:
         self.logger.debug('no next url')
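
All three examples rely on two helpers that are not shown on this page: append, which records failed URLs, and datetime2datetimestr, which serializes a datetime for the item pipeline. A minimal sketch of what they might look like; the bodies and the timestamp format are assumptions, not taken from the source:

 # Hypothetical helpers: the names match the calls above, the bodies are guesses.
 def append(path, line):
     # append one line to a log file (used here for the fail-URL log)
     with open(path, 'a') as f:
         f.write(line + '\n')

 def datetime2datetimestr(dt):
     # serialize a datetime to a string (assumed ISO-like format)
     return dt.strftime('%Y-%m-%d %H:%M:%S')
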
Example #2
 def parse(self, response):
     super(ClearedjobsSpider, self).parse(response)
     table = response.xpath('//table[@class="search_res"]//tbody')
     rows = table.xpath('.//tr')
     if len(rows) < 1:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for row in rows:
         item = self.init_item(response)
         cols = row.xpath('.//td')
         col0 = cols[0]
         url = col0.xpath('.//a[@class="search"]/@href').extract()[0]
         item['item_url'] = self.base_url + "/" + url
         # a text could have span tag inside
         item['title'] = col0.xpath('string(.//a[@class="search"])').extract()[0].strip()
         item['company'] = col0.xpath('.//div[@class="info"]//a/text()').extract()[0]
         try:
             item['clearance'] = col0.xpath('.//div[@class="desc"]/text()').extract()[0]
         except IndexError:
             item['clearance'] = ''
         published = col0.xpath('.//div[@class=""]/text()').extract()[0].replace('Posted - ', '')
         # parse the date scraped from the page instead of a hardcoded one
         item['published'] = datetime2datetimestr(datetime.strptime(published, '%B %d, %Y'))
         col1 = cols[1]
         loc = col1.xpath('./text()').extract()[0].strip()
         try:
             item['locality'], item['region'] = loc.split(', ')
         except ValueError:
             item['locality'] = loc
             item['region'] = ''
         # data not available in this website
         item['short_description'] = ''
         item['department'] = ''
         self.logger.debug('title %s' % item['title'])
         yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})
     next_page = response.css('.navbar_bottom').xpath('.//a[text()=">"]/@href').extract()
     if next_page:
         # the href is sometimes root-relative and sometimes bare; normalize it
         href = next_page[0] if next_page[0].startswith('/') else '/' + next_page[0]
         self.logger.debug('next url: %s' % (self.base_url + href))
         yield Request(
             self.base_url + href,
             callback=self.parse,
             meta={'keyword': response.meta['keyword'], 'location': response.meta['location']}
         )
     else:
         self.logger.debug('no next url')
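
Every example also calls self.init_item(response), defined on a shared base spider that is not shown here. A plausible minimal version, assuming the base spider seeds each item with the search context carried in the request meta (everything beyond the keys used above is hypothetical):

 # Hypothetical base-spider helper; the real implementation is not shown.
 def init_item(self, response):
     item = {}  # the real spiders presumably use a scrapy.Item subclass
     # carry the search context through from the request meta
     item['keyword'] = response.meta.get('keyword', '')
     item['location'] = response.meta.get('location', '')
     return item
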
Example #3
 def parse(self, response):
     super(ClearancejobsSpider, self).parse(response)
     results = response.css('#search-results')  # the results container div
     rows = results.css('.cj-search-result-item')
     if len(rows) < 1:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for row in rows:
         item = self.init_item(response)
         item['title'] = row.xpath('.//strong[@class="cj-search-result-item-title"]/a/text()').extract()[0].strip()
         item['item_url'] = row.xpath('.//strong[@class="cj-search-result-item-title"]/a/@href').extract()[0]
         try:
             item['company'] = row.xpath('.//span[@class="cj-company-name"]/a/text()').extract()[0].strip()
         except IndexError:
             item['company'] = ''
         # the location is the second text node of the primary-info block
         location = row.css('.cj-job-primary-info::text').extract()[1].strip()
         try:
             item['locality'], item['region'] = location.split(', ')
         except ValueError:
             item['locality'] = location
             item['region'] = ''
         # attributes not provided
         item['short_description'] = ''
         item['salary'] = ''
         item['department'] = ''
         # index 1 skips the leading label text node (assumed layout)
         updated = row.xpath('.//span[@class="cj-text-sm cj-color-mediumgray"]//text()').extract()[1].strip()
         try:
             dt = datetime.strptime(updated, '%m/%d/%y')
         except ValueError:
             # FIXME: a ValueError is not necessarily the "Today" label;
             # other strings such as "Yesterday" would also fall through to here
             dt = datetime.now()
         item['published'] = datetime2datetimestr(dt)
         item['clearance'] = row.css('.cj-card-data::text').extract()[1]
         self.logger.debug('title %s' % item['title'])
         yield Request(item['item_url'], callback=self.parse_item, meta={'item': item})
     # FIXME: pagination is JavaScript-driven; the next button carries no <a>
     # href, so this selector finds nothing and crawling stops after one page
     next_page = response.xpath('//button[@class="cj-table-pagination-next"]/a/@href').extract()
     if next_page:
         self.logger.debug('next url: %s' % next_page[0])
         yield Request(
             next_page[0],
             callback=self.parse,
             meta={'keyword': response.meta['keyword'], 'location': response.meta['location']}
         )
     else:
         self.logger.debug('no next url')
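
A general fragility shared by all three spiders: every field is pulled with .extract()[0], which raises IndexError whenever a node is missing, so each optional field needs its own try/except. Newer Scrapy selectors also offer .get() and .getall(), where a missing node yields None instead of an exception. A sketch of Example #3's title/company extraction rewritten that way (parse_row is a hypothetical helper, not part of the original spiders):

 # Defensive variant of the per-row extraction from Example #3.
 def parse_row(self, row, response):
     item = self.init_item(response)
     title_link = row.xpath('.//strong[@class="cj-search-result-item-title"]/a')
     item['title'] = (title_link.xpath('./text()').get() or '').strip()
     item['item_url'] = title_link.xpath('./@href').get()
     # .get() returns None when the company link is absent; default to ''
     item['company'] = (row.xpath('.//span[@class="cj-company-name"]/a/text()').get() or '').strip()
     return item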