Esempio n. 1
0
 def parse_item(self,  response):
     """Add the job-detail markup to the item returned by the parent class.

     The first node matching ``.cj-job-details`` becomes
     ``item['description']``. When nothing matches, the response URL is
     passed to ``append`` together with ``self.fail_url_path`` and the item
     is yielded without a description.
     """
     item = super(ClearancejobsSpider, self).parse_item(response)
     detail_nodes = response.css('.cj-job-details').extract()  # div
     if detail_nodes:
         item['description'] = detail_nodes[0]
     else:
         append(self.fail_url_path, 'failed to parse:' + response.url)
     yield item
 def parse_item(self,  response):
     """Add the ``div.detail`` markup to the item returned by the parent class.

     When the page has no ``div.detail`` node, the response URL is passed to
     ``append`` together with ``self.fail_url_path`` and the item is yielded
     without a description.
     """
     item = super(SimplyhiredSpider, self).parse_item(response)
     try:
         # Alternative selector for plain text: 'div.description-full::text'
         first_detail = response.css('div.detail')[0]
     except IndexError:
         append(self.fail_url_path, 'failed to parse:' + response.url)
     else:
         item['description'] = first_detail.extract()
     yield item
Esempio n. 3
0
 def parse_item(self,  response):
     """Add description and salary to the item returned by the parent class.

     The description is the first ``.view_long`` node; the salary is the text
     of the first sibling ``div`` following the one containing "Salary:"
     inside ``.view_job``. If either lookup comes back empty, the response
     URL is passed to ``append`` with ``self.fail_url_path``; a description
     that was already stored stays on the item.
     """
     item = super(ClearedjobsSpider, self).parse_item(response)
     salary_xpath = './/div//div[contains(.,"Salary:")]/following-sibling::div[1]/text()'
     try:
         item['description'] = response.css('.view_long').extract()[0]
         salary_values = response.css('.view_job').xpath(salary_xpath).extract()
         item['salary'] = salary_values[0].strip()
     except IndexError:
         append(self.fail_url_path, 'failed to parse:' + response.url)
     yield item
Esempio n. 4
0
 def parse_item(self, response):
     """Fill ``description`` and ``salary`` on the parent class's item.

     The salary is only looked up once a ``.view_long`` description has been
     found. Whenever the description or the salary is missing, the response
     URL is passed to ``append`` with ``self.fail_url_path``. The item is
     yielded in every case.
     """
     item = super(ClearedjobsSpider, self).parse_item(response)
     long_views = response.css('.view_long').extract()
     salary_texts = []
     if long_views:
         item['description'] = long_views[0]
         salary_cells = response.css('.view_job').xpath(
             './/div//div[contains(.,"Salary:")]/following-sibling::div[1]/text()'
         )
         salary_texts = salary_cells.extract()
     if salary_texts:
         item['salary'] = salary_texts[0].strip()
     else:
         # Reached for a missing description as well as a missing salary.
         append(self.fail_url_path, 'failed to parse:' + response.url)
     yield item
Esempio n. 5
0
 def handle_response(self, response):
     """Record the response URL in the OK or FAIL URL log.

     A status of 200 sends the URL to ``append`` with the
     ``LOG_OK_URL_FULLPATH`` setting; any other status is logged at debug
     level and sent, prefixed with the status code, to ``append`` with the
     ``LOG_FAIL_URL_FULLPATH`` setting.

     Any exception raised while doing so is logged with its traceback and
     swallowed, so this method never propagates an error to its caller.
     """
     self.logger.debug("in handle response")
     try:
         if response.status == 200:
             append(settings.get("LOG_OK_URL_FULLPATH"), response.url)
         else:
             self.logger.debug("error http code: %s", response.status)
             append(settings.get("LOG_FAIL_URL_FULLPATH"), str(response.status) + ": " + response.url)
     # ``except X as e`` is valid on Python 2.6+ and Python 3; the older
     # ``except X, e`` spelling is a syntax error on Python 3.
     except Exception as e:
         self.logger.exception(e)
Esempio n. 6
0
 def parse(self, response):
     """Parse a ClearedConnections search-result page.

     Yields one ``Request`` per result row (callback ``parse_item``, with the
     partially filled item in ``meta['item']``) and, when the pagination bar
     contains a "Next" link, one ``Request`` for the following page that
     carries the current ``keyword`` and ``location`` meta values forward.

     When the results table has no data rows, the response URL is passed to
     ``append`` with ``self.fail_url_path`` and nothing is yielded.
     """
     super(ClearedconnectionsSpider, self).parse(response)
     table = response.xpath('//table[@class="jstext"]')
     # The first three <tr> elements are skipped -- presumably header rows;
     # confirm against the live markup.
     rows = table.xpath('.//tr')[3:]
     if not rows:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for row in rows:
         item = self.init_item(response)
         cols = row.xpath('.//td')
         col_url = cols[1]
         onclick = col_url.xpath('.//a/@onclick').extract()[0]
         m = re.search(r"PopupViewWindow\('(.+?)'", onclick)
         if not m:
             # Skip rows whose onclick handler carries no popup URL: without
             # this guard ``url`` would be unbound on the first row or would
             # silently reuse the previous row's value.
             self.logger.debug('no item url in onclick: %s', onclick)
             append(self.fail_url_path, 'failed to parse:' + response.url)
             continue
         item['item_url'] = (
             'https://www.clearedconnections.com/JobSeekerX/' + m.group(1))
         item['title'] = col_url.xpath('.//a/text()').extract()[0]
         col_published = cols[0]
         published = col_published.xpath('.//p/text()').extract()[0].strip()
         item['published'] = datetime2datetimestr(
             datetime.strptime(published, '%m/%d/%Y'))
         col_company = cols[3]
         try:
             item['company'] = col_company.xpath(
                 './/a/text()').extract()[0].strip()
         except IndexError:
             # Company name is not a link: fall back to the plain <p> text.
             item['company'] = col_company.xpath(
                 './/p/text()').extract()[0].strip()
         col_loc = cols[2]
         loc = col_loc.xpath('./text()').extract()[0]
         try:
             # Only a location with exactly three '-' separated parts is split.
             item['locality'], item['region'], _ = loc.split('-')
         except ValueError:
             item['locality'] = loc
         # data not available in this website
         item['short_description'] = ''
         item['salary'] = ''
         item['department'] = ''
         self.logger.debug('title %s', item['title'])
         yield Request(item['item_url'],
                       callback=self.parse_item,
                       meta={'item': item})
     # Named ``next_hrefs`` so the builtin ``next`` is not shadowed.
     next_hrefs = response.css('.pagination').xpath(
         './/a[contains(.,"Next")]/@href').extract()
     if next_hrefs:
         next_url = self.base_url + next_hrefs[0]
         self.logger.debug('next url: %s', next_url)
         yield Request(next_url,
                       callback=self.parse,
                       meta={
                           'keyword': response.meta['keyword'],
                           'location': response.meta['location']
                       })
     else:
         self.logger.debug('no next url')
 def parse_item(self,  response):
     """Add description and clearance to the item returned by the parent class.

     The description is the first ``div.viewjob`` node. Only when it exists
     is the ``LabelValueTable`` table converted with ``tablexpath2dict`` and
     its 'Security Clearances' entry (default ``''``) stored as the
     clearance. Otherwise the response URL is passed to ``append`` with
     ``self.fail_url_path``.
     """
     item = super(ClearedconnectionsSpider, self).parse_item(response)
     job_views = response.xpath('//div[@class="viewjob"]').extract()
     if not job_views:
         append(self.fail_url_path, 'failed to parse:' + response.url)
     else:
         item['description'] = job_views[0]
         label_table = response.xpath('//table[@class="LabelValueTable"]')
         item['clearance'] = tablexpath2dict(label_table).get('Security Clearances',  '')
     yield item
Esempio n. 8
0
 def parse_item(self,  response):
     """Add description and clearance to the item returned by the parent class.

     The description is the markup of the first ``div.jobdetail`` node. When
     it is found, the page is also parsed with BeautifulSoup so that
     ``table2dict`` can read ``div#jobinfo2``; its 'SECURITY CLEARANCE' entry
     (``None`` when absent) becomes the clearance. When the description is
     missing, the response URL is passed to ``append`` with
     ``self.fail_url_path``.
     """
     item = super(UsajobsSpider, self).parse_item(response)
     try:
         # Keeps the HTML markup; a plain-text variant would be
         # soup.select('div.jobdetail')[0].text.
         item['description'] = response.css('div.jobdetail')[0].extract()
     except IndexError:
         append(self.fail_url_path, 'failed to parse:' + response.url)
     else:
         # Build the soup only here: it is needed solely for the info table,
         # so pages that fail above no longer pay for a full document parse.
         soup = bs4.BeautifulSoup(response.body)
         infodict = table2dict(soup,  'div#jobinfo2')
         item['clearance'] = infodict.get('SECURITY CLEARANCE')
     yield item
    def parse(self, response):
        """Parse a SimplyHired search-result page with BeautifulSoup.

        Yields one ``Request`` per ``div.job`` element (callback
        ``parse_item``, with the partially filled item in ``meta['item']``)
        and, when an ``a.next`` link exists, one ``Request`` for the next
        result page that carries the current ``keyword`` and ``location``
        meta values forward.

        When the page has no ``div.job`` element, the response URL is passed
        to ``append`` with ``self.fail_url_path`` and nothing is yielded.
        """
        super(SimplyhiredSpider,  self).parse(response)
        soup = bs4.BeautifulSoup(response.body)
        soupitems = soup.select('div.job')
        if not soupitems:
            append(self.fail_url_path, 'no data:' + response.url)
            return
        for soupitem in soupitems:
            item = self.init_item(response)
            # First link under div.tools that actually has an href.
            hrefs = [a.attrs.get('href') for a in soupitem.select('div.tools > a')]
            item['item_url'] = [href for href in hrefs if href][0]
            item['title'] = soupitem.select('h2')[0].text.strip()
            try:
                item['company'] = soupitem.h4.text
            except AttributeError:
                # No <h4> tag: the company field is left untouched.
                pass
            # Each optional element is looked up once and reused.
            locality = soupitem.find('span', itemprop="addressLocality")
            if locality:
                item['locality'] = locality.text
            region = soupitem.find('span', itemprop="addressRegion")
            if region:
                item['region'] = region.text
            # A missing description paragraph yields an empty string instead
            # of an AttributeError that would abort the whole page.
            description = soupitem.find('p', itemprop="description")
            item['short_description'] = description.text if description else ''
            item['published'] = timeago2datetimestr(item['date_search'],  soupitem.select('span.ago')[0].text)
            # data not available in this website
            item['salary'] = ''
            item['clearance'] = ''
            item['department'] = ''
            self.logger.debug('title %s', item['title'])
            yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )

        # Named ``next_hrefs`` so the builtin ``next`` is not shadowed.
        next_hrefs = response.css('a.next::attr(href)').extract()
        if next_hrefs:
            self.logger.debug('next url: %s', next_hrefs[0])
            yield Request(
                next_hrefs[0],
                callback=self.parse,
                meta={'keyword': response.meta['keyword'],  'location': response.meta['location']}
            )
        else:
            self.logger.debug('no next url')
Esempio n. 10
0
 def parse(self, response):
     """Parse a ClearedJobs search-result page.

     Yields one ``Request`` per row of the ``search_res`` table (callback
     ``parse_item``, with the partially filled item in ``meta['item']``) and,
     when the bottom navigation bar has a ">" link, one ``Request`` for the
     next page that carries the current ``keyword`` and ``location`` meta
     values forward.

     When the table has no rows, the response URL is passed to ``append``
     with ``self.fail_url_path`` and nothing is yielded.
     """
     super(ClearedjobsSpider,  self).parse(response)
     table = response.xpath('//table[@class="search_res"]//tbody')
     rows = table.xpath('.//tr')
     if not rows:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for row in rows:
         item = self.init_item(response)
         cols = row.xpath('.//td')
         col0 = cols[0]
         url = col0.xpath('.//a[@class="search"]/@href').extract()[0]
         item['item_url'] = self.base_url + "/" + url
         # string() is used because the link text may contain a nested span
         item['title'] = col0.xpath('string(.//a[@class="search"])').extract()[0].strip()
         item['company'] = col0.xpath('.//div[@class="info"]//a/text()').extract()[0]
         try:
             item['clearance'] = col0.xpath('.//div[@class="desc"]/text()').extract()[0]
         except IndexError:
             # No clearance text on this row: the field is left untouched.
             pass
         published = col0.xpath('.//div[@class=""]/text()').extract()[0].replace('Posted - ', '').strip()
         # Parse the date scraped from the row. The extracted text used to be
         # ignored in favour of the constant 'July 8, 2015', so every item
         # carried the same published date.
         try:
             published_dt = datetime.strptime(published, '%B %d, %Y')
         except ValueError:
             self.logger.debug('unparsable published date: %r', published)
             item['published'] = ''
         else:
             item['published'] = datetime2datetimestr(published_dt)
         col1 = cols[1]
         loc = col1.xpath('./text()').extract()[0].strip()
         try:
             item['locality'], item['region'] = loc.split(', ')
         except ValueError:
             item['locality'] = loc
             item['region'] = ''
         # data not available in this website
         item['short_description'] = ''
         item['department'] = ''
         self.logger.debug('title %s', item['title'])
         yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )
     # Named ``next_hrefs`` so the builtin ``next`` is not shadowed.
     next_hrefs = response.css('.navbar_bottom').xpath('.//a[text()=">"]/@href').extract()
     if next_hrefs:
         href = next_hrefs[0]
         # Insert a '/' only when the href does not already start with one.
         if href.startswith('/'):
             next_url = self.base_url + href
         else:
             next_url = self.base_url + '/' + href
         self.logger.debug('next url: %s', next_url)
         yield Request(
             next_url,
             callback=self.parse,
             meta={'keyword': response.meta['keyword'],  'location': response.meta['location']}
         )
     else:
         self.logger.debug('no next url')
Esempio n. 11
0
 def parse_item(self, response):
     """Fill ``description`` and ``clearance`` on the parent class's item.

     On success the first ``div.viewjob`` node is stored as the description
     and the 'Security Clearances' entry of the ``LabelValueTable`` table
     (via ``tablexpath2dict``, default ``''``) as the clearance. If the
     ``div.viewjob`` node is missing, the response URL is passed to
     ``append`` with ``self.fail_url_path`` and neither field is set.
     """
     item = super(ClearedconnectionsSpider, self).parse_item(response)
     try:
         description = response.xpath('//div[@class="viewjob"]').extract()[0]
     except IndexError:
         append(self.fail_url_path, 'failed to parse:' + response.url)
     else:
         item['description'] = description
         details = tablexpath2dict(
             response.xpath('//table[@class="LabelValueTable"]'))
         item['clearance'] = details.get('Security Clearances', '')
     yield item
Esempio n. 12
0
 def parse(self, response):
     """Parse a Glassdoor search-result page.

     Yields one ``Request`` per schema.org JobPosting element inside
     ``.jlGrid`` (callback ``parse_item``, with the partially filled item in
     ``meta['item']``) and, when a ``.next a`` link exists, one ``Request``
     for the next page that carries the current ``keyword`` and ``location``
     meta values forward.

     When no posting is found, the response URL is passed to ``append`` with
     ``self.fail_url_path`` and nothing is yielded.
     """
     super(GlassdoorSpider, self).parse(response)

     def first_text(node, expr):
         # First result of an xpath expression, stripped of whitespace.
         return node.xpath(expr).extract()[0].strip()

     # Alternative container selector: '.standardJobListings'
     grid = response.css('.jlGrid')
     postings = grid.xpath('.//div[@itemtype="http://schema.org/JobPosting"]')
     if not postings:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for posting in postings:
         item = self.init_item(response)
         href = posting.xpath('.//h3[@itemprop="title"]/a/@href').extract()[0]
         item['item_url'] = self.base_url + href
         # string() is used because the link text may contain a nested span
         item['title'] = first_text(
             posting, 'string(.//h3[@itemprop="title"]/a)')
         item['company'] = first_text(
             posting, 'string(.//span[@class="employerName"])')
         posted_ago = posting.css('.logo').css(
             '.minor::text').extract()[0].strip()
         item['published'] = timeago2datetimestr(item['date_search'],
                                                 posted_ago)
         item['short_description'] = first_text(
             posting, 'string(.//p[@itemprop="description"])')
         place = first_text(
             posting, 'string(.//span[@itemprop="addressLocality"])')
         place_parts = place.split(', ')
         if len(place_parts) == 2:
             item['locality'], item['region'] = place_parts
         else:
             item['locality'] = place
             item['region'] = ''
         # data not available in this website
         for missing_field in ('salary', 'clearance', 'department'):
             item[missing_field] = ''
         self.logger.debug('title %s' % item['title'])
         yield Request(item['item_url'],
                       callback=self.parse_item,
                       meta={'item': item})
     next_links = response.css('.next a::attr(href)').extract()
     if not next_links:
         self.logger.debug('no next url')
     else:
         next_url = self.base_url + next_links[0]
         self.logger.debug('next url: %s' % next_url)
         yield Request(
             next_url,
             callback=self.parse,
             meta={
                 'keyword': response.meta['keyword'],
                 'location': response.meta['location']
             })
Esempio n. 13
0
 def parse(self, response):
     """Parse a ClearanceJobs search-result page.

     Yields one ``Request`` per ``.cj-search-result-item`` inside
     ``#search-results`` (callback ``parse_item``, with the partially filled
     item in ``meta['item']``) and, if the pagination xpath returns an href,
     one ``Request`` for the next page that carries the current ``keyword``
     and ``location`` meta values forward.

     When no result item is found, the response URL is passed to ``append``
     with ``self.fail_url_path`` and nothing is yielded.
     """
     super(ClearancejobsSpider,  self).parse(response)
     result_items = response.css('#search-results').css('.cj-search-result-item')
     if not result_items:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     title_link = './/strong[@class="cj-search-result-item-title"]/a'
     for result in result_items:
         item = self.init_item(response)
         item['title'] = result.xpath(title_link + '/text()').extract()[0].strip()
         item['item_url'] = result.xpath(title_link + '/@href').extract()[0]
         company_names = result.xpath('.//span[@class="cj-company-name"]/a/text()').extract()
         item['company'] = company_names[0].strip() if company_names else ''
         # The second text node of the primary-info element holds the location.
         location = result.css('.cj-job-primary-info::text').extract()[1].strip()
         location_parts = location.split(', ')
         if len(location_parts) == 2:
             item['locality'], item['region'] = location_parts
         else:
             item['locality'] = location
             item['region'] = ''
         # attributes not provided
         item['short_description'] = ''
         item['salary'] = ''
         item['department'] = ''
         updated = result.xpath('.//span[@class="cj-text-sm cj-color-mediumgray"]//text()').extract()[1].strip()
         try:
             published_dt = datetime.strptime(updated, '%m/%d/%y')
         except ValueError:
             # FIXME: any non-date text (e.g. "Today", but possibly also
             # "Yesterday") ends up here and is treated as the current time.
             published_dt = datetime.now()
         item['published'] = datetime2datetimestr(published_dt)
         item['clearance'] = result.css('.cj-card-data::text').extract()[1]
         self.logger.debug('title %s' % item['title'])
         yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )
     # FIXME: the next-page control is not a link but javascript
     next_links = response.xpath('button[@class="cj-table-pagination-next"]/a/@href').extract()
     if not next_links:
         self.logger.debug('no next url')
     else:
         next_url = next_links[0]
         self.logger.debug('next url: %s' % next_url)
         yield Request(
             next_url,
             callback=self.parse,
             meta={'keyword': response.meta['keyword'],  'location': response.meta['location']}
         )
Esempio n. 14
0
 def parse_item(self,  response):
     """Add the ``<article>`` markup to the item returned by the parent class.

     The first ``//article`` node (HTML tags included) becomes
     ``item['description']``. When the page has none, the response URL is
     passed to ``append`` with ``self.fail_url_path``. The ``clearance``
     field is not provided by this site and is not touched here.
     """
     item = super(CareerbuilderSpider,  self).parse_item(response)
     articles = response.xpath('//article').extract()
     if articles:
         # To drop the HTML tags instead, bs4's
         # BeautifulSoup(response.body).select('article')[0].text could be used.
         item['description'] = articles[0]
     else:
         append(self.fail_url_path, 'failed to parse:' + response.url)
     yield item
Esempio n. 15
0
 def parse(self, response):
     """Parse a ClearedConnections search-result page.

     Yields one ``Request`` per result row (callback ``parse_item``, with the
     partially filled item in ``meta['item']``) and, when the pagination bar
     contains a "Next" link, one ``Request`` for the following page that
     carries the current ``keyword`` and ``location`` meta values forward.

     When the results table has no data rows, the response URL is passed to
     ``append`` with ``self.fail_url_path`` and nothing is yielded.
     """
     super(ClearedconnectionsSpider,  self).parse(response)
     table = response.xpath('//table[@class="jstext"]')
     # The first three <tr> elements are skipped -- presumably header rows;
     # confirm against the live markup.
     rows = table.xpath('.//tr')[3:]
     if not rows:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for row in rows:
         item = self.init_item(response)
         cols = row.xpath('.//td')
         col_url = cols[1]
         onclick = col_url.xpath('.//a/@onclick').extract()[0]
         m = re.search(r"PopupViewWindow\('(.+?)'", onclick)
         if m is None:
             # No popup URL in the onclick handler: record the failure and
             # skip the row, rather than hitting an unbound ``url`` (first
             # row) or reusing the URL of the previous row.
             self.logger.debug('no item url in onclick: %s', onclick)
             append(self.fail_url_path, 'failed to parse:' + response.url)
             continue
         url = m.group(1)
         item['item_url'] = 'https://www.clearedconnections.com/JobSeekerX/' + url
         item['title'] = col_url.xpath('.//a/text()').extract()[0]
         col_published = cols[0]
         published = col_published.xpath('.//p/text()').extract()[0].strip()
         item['published'] = datetime2datetimestr(datetime.strptime(published, '%m/%d/%Y'))
         col_company = cols[3]
         try:
             item['company'] = col_company.xpath('.//a/text()').extract()[0].strip()
         except IndexError:
             # Company name is not a link: fall back to the plain <p> text.
             item['company'] = col_company.xpath('.//p/text()').extract()[0].strip()
         col_loc = cols[2]
         loc = col_loc.xpath('./text()').extract()[0]
         try:
             # Only a location with exactly three '-' separated parts is split.
             item['locality'], item['region'], _ = loc.split('-')
         except ValueError:
             item['locality'] = loc
         # data not available in this website
         item['short_description'] = ''
         item['salary'] = ''
         item['department'] = ''
         self.logger.debug('title %s', item['title'])
         yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )
     # Named ``next_hrefs`` so the builtin ``next`` is not shadowed.
     next_hrefs = response.css('.pagination').xpath('.//a[contains(.,"Next")]/@href').extract()
     if next_hrefs:
         next_url = self.base_url + next_hrefs[0]
         self.logger.debug('next url: %s', next_url)
         yield Request(
             next_url,
             callback=self.parse,
             meta={'keyword': response.meta['keyword'],  'location': response.meta['location']}
         )
     else:
         self.logger.debug('no next url')
Esempio n. 16
0
    def parse(self, response):
        """Parse a CareerBuilder search-result page.

        Yields one ``Request`` per ``.gs-job-result-abstract`` element
        (callback ``parse_item``, with the partially filled item in
        ``meta['item']``) and, when a ``.JL_MXDLPagination2_next`` link
        exists, one ``Request`` for the next page that carries the current
        ``keyword`` and ``location`` meta values forward.

        When no result element is found, the response URL is passed to
        ``append`` with ``self.fail_url_path`` and nothing is yielded.
        """
        super(CareerbuilderSpider,  self).parse(response)
        rows = response.css('.gs-job-result-abstract')
        if not rows:
            append(self.fail_url_path, 'no data:' + response.url)
            return
        for row in rows:
            item = self.init_item(response)
            item['item_url'] = row.css('.jt').xpath('@href').extract()[0]
            item['title'] = row.css('.jt::text').extract()[0]
            item['short_description'] = row.xpath('.//span[@itemprop="description"]/text()').extract()[0]
            # Alternative source for the company name:
            # './/td[@itemprop="hiringOrganization"]/a/text()'
            item['company'] = row.xpath('.//a/@companyname').extract()[0]
            location = row.xpath('.//div[@itemprop="jobLocation"]/span/text()').extract()[0]
            try:
                # The text before ' - ' is stored as region, the text after
                # it as locality.
                item['region'], item['locality'] = location.split(' - ')
            except ValueError:
                item['locality'] = location
            teaser = [i.strip() for i in row.xpath('.//div[contains(@id, "pnlTeaser")]/p/text()').extract()[0].split('|')]
            # The salary is everything after the first ':' of the second
            # teaser part. partition() keeps values that contain further
            # colons intact and gives '' rather than an IndexError when the
            # part has no colon at all.
            item['salary'] = ''
            if len(teaser) == 2:
                _, colon, amount = teaser[1].partition(':')
                if colon:
                    item['salary'] = amount
            item['department'] = ''
            ago = row.css('.jl_rslt_posted_cell span::text').extract()[0]
            item['published'] = timeago2datetimestr(item['date_search'], ago)
            self.logger.debug('title %s', item['title'])
            yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )
        # Named ``next_hrefs`` so the builtin ``next`` is not shadowed.
        next_hrefs = response.css('.JL_MXDLPagination2_next').xpath('@href').extract()
        if next_hrefs:
            self.logger.debug('next url: %s', next_hrefs[0])
            yield Request(
                next_hrefs[0],
                callback=self.parse,
                meta={'keyword': response.meta['keyword'],  'location': response.meta['location']}
            )
        else:
            self.logger.debug('no next url')
Esempio n. 17
0
    def parse(self, response):
        """Parse a Glassdoor search-result page.

        For every schema.org JobPosting element found under ``.jlGrid`` a
        ``Request`` to the posting URL is yielded (callback ``parse_item``,
        item passed in ``meta['item']``). If a ``.next a`` link is present,
        a ``Request`` for the next page follows, re-using the ``keyword``
        and ``location`` values of the current response's meta.

        When no posting is found, the response URL is passed to ``append``
        with ``self.fail_url_path`` and nothing is yielded.
        """
        super(GlassdoorSpider,  self).parse(response)
        # Alternative container selector: '.standardJobListings'
        job_grid = response.css('.jlGrid')
        job_rows = job_grid.xpath('.//div[@itemtype="http://schema.org/JobPosting"]')
        if not job_rows:
            append(self.fail_url_path, 'no data:' + response.url)
            return
        for job_row in job_rows:
            item = self.init_item(response)
            title_link = './/h3[@itemprop="title"]/a'
            relative_url = job_row.xpath(title_link + '/@href').extract()[0]
            item['item_url'] = self.base_url + relative_url
            # string() is used because the link text may contain a nested span
            titles = job_row.xpath('string(' + title_link + ')').extract()
            item['title'] = titles[0].strip()
            employers = job_row.xpath('string(.//span[@class="employerName"])').extract()
            item['company'] = employers[0].strip()
            age_texts = job_row.css('.logo').css('.minor::text').extract()
            item['published'] = timeago2datetimestr(item['date_search'],  age_texts[0].strip())
            summaries = job_row.xpath('string(.//p[@itemprop="description"])').extract()
            item['short_description'] = summaries[0].strip()
            address = job_row.xpath('string(.//span[@itemprop="addressLocality"])').extract()[0].strip()
            address_parts = address.split(', ')
            if len(address_parts) != 2:
                item['locality'] = address
                item['region'] = ''
            else:
                item['locality'], item['region'] = address_parts
            # data not available in this website
            item['salary'] = ''
            item['clearance'] = ''
            item['department'] = ''
            self.logger.debug('title %s' % item['title'])
            yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )
        next_hrefs = response.css('.next a::attr(href)').extract()
        if next_hrefs:
            following_page = self.base_url + next_hrefs[0]
            self.logger.debug('next url: %s' % following_page)
            carried_meta = {'keyword': response.meta['keyword'],  'location': response.meta['location']}
            yield Request(following_page, callback=self.parse, meta=carried_meta)
        else:
            self.logger.debug('no next url')
Esempio n. 18
0
 def parse(self, response):
     """Parse a USAJobs search-result page with BeautifulSoup.

     Yields one ``Request`` per ``div#jobResultNew`` element (callback
     ``parse_item``, with the partially filled item in ``meta['item']``) and,
     when an ``a.nextPage`` link exists, one ``Request`` for the next page
     that carries the current ``keyword`` and ``location`` meta values
     forward.

     When no result element is found, the response URL is passed to
     ``append`` with ``self.fail_url_path`` and nothing is yielded.
     """
     super(UsajobsSpider,  self).parse(response)
     soup = bs4.BeautifulSoup(response.body)
     results = soup.select('div#jobResultNew')
     if not results:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for result in results:
         item = self.init_item(response)
         title_link = result.select('a.jobTitleLink')[0]
         item['item_url'] = self.base_url + title_link.attrs.get('href')
         item['title'] = title_link.text
         item['short_description'] = result.select('p.summary')[0].text.strip()
         details = table2dict(result,  'table.joaResultsDetailsTable')
         item['company'] = details.get('Agency',  '')
         place_parts = details.get('Location(s)',  '').split(', ')
         item['locality'] = place_parts[0]
         # The region is only set when the location has a second part.
         if len(place_parts) > 1:
             item['region'] = place_parts[1]
         item['salary'] = details.get('Salary',  '')
         item['department'] = details.get('Department',  '')
         # data not available in this website
         item['published'] = ''
         self.logger.debug('title %s' % item['title'])
         yield Request(item['item_url'],  callback=self.parse_item, meta={'item': item} )
     # With soup the equivalent lookup would be soup.select('a.nextPage').
     next_links = response.css('a.nextPage::attr(href)').extract()
     if not next_links:
         self.logger.debug('no next url')
     else:
         next_url = self.base_url + next_links[0]
         self.logger.debug('next url: %s' % next_url)
         yield Request(
             next_url,
             callback=self.parse,
             meta={'keyword': response.meta['keyword'],  'location': response.meta['location']}
         )
Esempio n. 19
0
 def parse(self, response):
     """Parse a ClearedJobs search-result page.

     Yields one ``Request`` per row of the ``search_res`` table (callback
     ``parse_item``, with the partially filled item in ``meta['item']``) and,
     when the bottom navigation bar has a ">" link, one ``Request`` for the
     next page that carries the current ``keyword`` and ``location`` meta
     values forward.

     When the table has no rows, the response URL is passed to ``append``
     with ``self.fail_url_path`` and nothing is yielded.
     """
     super(ClearedjobsSpider, self).parse(response)
     table = response.xpath('//table[@class="search_res"]//tbody')
     rows = table.xpath('.//tr')
     if not rows:
         append(self.fail_url_path, 'no data:' + response.url)
         return
     for row in rows:
         item = self.init_item(response)
         cols = row.xpath('.//td')
         col0 = cols[0]
         url = col0.xpath('.//a[@class="search"]/@href').extract()[0]
         item['item_url'] = self.base_url + "/" + url
         # string() is used because the link text may contain a nested span
         item['title'] = col0.xpath(
             'string(.//a[@class="search"])').extract()[0].strip()
         item['company'] = col0.xpath(
             './/div[@class="info"]//a/text()').extract()[0]
         try:
             item['clearance'] = col0.xpath(
                 './/div[@class="desc"]/text()').extract()[0]
         except IndexError:
             # No clearance text on this row: the field is left untouched.
             pass
         published = col0.xpath(
             './/div[@class=""]/text()').extract()[0].replace(
                 'Posted - ', '').strip()
         # Use the date scraped from the row. It used to be discarded and
         # replaced by the constant 'July 8, 2015' for every item.
         try:
             published_dt = datetime.strptime(published, '%B %d, %Y')
         except ValueError:
             self.logger.debug('unparsable published date: %r', published)
             item['published'] = ''
         else:
             item['published'] = datetime2datetimestr(published_dt)
         col1 = cols[1]
         loc = col1.xpath('./text()').extract()[0].strip()
         try:
             item['locality'], item['region'] = loc.split(', ')
         except ValueError:
             item['locality'] = loc
             item['region'] = ''
         # data not available in this website
         item['short_description'] = ''
         item['department'] = ''
         self.logger.debug('title %s', item['title'])
         yield Request(item['item_url'],
                       callback=self.parse_item,
                       meta={'item': item})
     # Named ``next_hrefs`` so the builtin ``next`` is not shadowed.
     next_hrefs = response.css('.navbar_bottom').xpath(
         './/a[text()=">"]/@href').extract()
     if next_hrefs:
         href = next_hrefs[0]
         # Insert a '/' only when the href does not already start with one.
         separator = '' if href.startswith('/') else '/'
         next_url = self.base_url + separator + href
         self.logger.debug('next url: %s', next_url)
         yield Request(next_url,
                       callback=self.parse,
                       meta={
                           'keyword': response.meta['keyword'],
                           'location': response.meta['location']
                       })
     else:
         self.logger.debug('no next url')