Example #1
0
    def parse(self, response):
        """Parse an Indeed search-results page.

        The page mixes four result-row variants, each with its own
        ``class`` attribute and slightly different inner markup.  Every
        variant follows the same pattern: title and url from an anchor,
        company from a primary text node with a fallback to an ``<a>``
        child when the primary text is blank.  The original repeated the
        whole scrape loop four times; this version drives one loop from
        a table of (row xpath, anchor, company, fallback) tuples.

        Yields IndeedItem objects with 'title', 'url' and 'company'.
        """

        def clean_first(sel, path):
            # Collapse whitespace runs in the first matched text node.
            # Raises IndexError when nothing matches, exactly like the
            # original inline ``extract()[0]`` calls.
            return " ".join(sel.xpath(path).extract()[0].split())

        variants = [
            ('//div[@class="row  result"]', 'a',
             'div[@class="sjcl"]/span[@class="company"]/text()',
             'div[@class="sjcl"]/span[@class="company"]/a/text()'),
            ('//div[@class="  row  result"]', 'h2/a',
             'span[@class="company"]/span/text()',
             'span[@class="company"]/span/a/text()'),
            ('//div[@class="row sjlast result"]', 'a',
             'div[@class="sjcl"]/span/text()',
             'div[@class="sjcl"]/span/a/text()'),
            ('//div[@class="lastRow  row  result"]', 'h2/a',
             'span[@class="company"]/span/text()',
             'span[@class="company"]/span/a/text()'),
        ]
        for row_xpath, anchor, company_path, company_fallback in variants:
            for question in Selector(response).xpath(row_xpath):
                item = IndeedItem()
                item['title'] = question.xpath(anchor + '/@title').extract()[0]
                item['url'] = question.xpath(anchor + '/@href').extract()[0]
                item['company'] = clean_first(question, company_path)
                if not item['company']:
                    # Company name is sometimes wrapped in a link.
                    item['company'] = clean_first(question, company_fallback)
                yield item
    def parse(self, response):
        """Scrape job rows from an Indeed results page.

        Returns the list of scraped IndeedItem objects.  The original
        built the list but only printed it (with Python-2-only ``print``
        syntax) and implicitly returned None, so the items were lost.
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.xpath('//div[@class="row  result"]')
        items = []
        for site in sites:
            item = IndeedItem(company='none')

            item['job_title'] = site.xpath('h2/a/@title').extract()
            item['link_url'] = site.xpath('h2/a/@href').extract()

            # item['location'] = site.xpath(
            #     'span[@class="location"]/span/text()').extract()

            # Not all entries have a company; normalise the missing case
            # to a one-element list so downstream code sees one shape.
            company = site.xpath("span[@class='company']/text()").extract()
            item['company'] = company if company else [u'']

            # NOTE(review): the leading // makes this match against the
            # whole document, not this row -- confirm that is intended.
            item['summary'] = site.xpath(
                "//table/tr/td/span[@class='summary']").extract()
            item['source'] = site.xpath(
                "table/tr/td/span[@class='source']/text()").extract()
            item['found_date'] = site.xpath(
                "table/tr/td/span[@class='date']/text()").extract()
            items.append(item)
        # print(x) with a single argument behaves the same under both
        # Python 2 and 3; the original bare ``print items`` is Py2-only.
        print(items)
        return items
Example #3
0
	def parse_detail_page(self, response):
		"""Parse one Indeed job-detail page and yield a single IndeedItem."""
		print("=" * 50)
		print(response.url)

		title = response.xpath('//b[@class="jobtitle"]/font/text()').extract_first()
		company = response.xpath('//span[@class="company"]/text()').extract_first()
		location = response.xpath('//span[@class="location"]/text()').extract_first()
		summary = response.xpath('//span[@id="job_summary"]').extract()

		print("title", title)
		print("company", company)
		print("location", location)

		# extract_first() returns None when nothing matches -- it never
		# raises IndexError, so the original try/except was dead code and
		# the intended "" default never applied.  Apply it explicitly.
		reviews = response.xpath('//span[@class="slNoUnderline"]/text()').extract_first()
		if reviews is None:
			reviews = ""

		# Guard the None case instead of the original bare ``except``,
		# which would also have swallowed unrelated errors.
		salary = response.xpath('//span[@class="no-wrap"]/text()').extract_first()
		salary = salary.strip() if salary is not None else ""

		item = IndeedItem()

		item['title'] = title
		item['company'] = company
		item['reviews'] = reviews
		item['location'] = location
		item['salary'] = salary
		item['summary'] = summary

		yield item
Example #4
0
    def parse_indeed_results(self, response):
        """Extract title, company, address and description from a job page."""
        self.log("PAPA")
        #LIMIT
        """
        self.item_count += 1
        if self.item_count > 10:
            raise CloseSpider('item_exceeded')
        """

        item = IndeedItem()

        # TITLE -- extract_first() may return None when the header is
        # missing; default to '' so .strip() cannot raise AttributeError.
        title = response.xpath(
            '//h3[contains(@class,"JobInfoHeader")]/text()').extract_first()
        item['title'] = (title or '').strip()

        # COMPANY -- per the indexing below, the first text node is used
        # as the name and the last as the address.  Guard against an
        # empty node-set (the original raised IndexError there).
        company_data = response.xpath(
            '//div[contains(@class,"InlineCompanyRating")]//text()').extract()
        if company_data:
            item['company'] = company_data[0].strip()
            item['address'] = company_data[-1].strip()

        # DESCRIPTION -- join all text nodes and drop newlines.
        description = ' '.join(response.xpath(
            '//div[contains(@id,"jobDescriptionText")]//text()').extract())
        item['description'] = description.replace("\n", "").encode('utf-8')

        yield item
Example #5
0
    def parse(self, response):
        """Yield one IndeedItem per result row, then follow pagination.

        Bug fixed: the pagination block was indented inside the per-row
        loop, re-yielding the same next-page requests once for every
        result row; it now runs once per page.
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.xpath('//div[contains(@class,"row")]')

        for site in sites:
            item = IndeedItem()

            # Company name may or may not be wrapped in a link.
            company = site.xpath(
                ".//span[@class='company']//a/text()").extract_first()
            if not company:
                company = site.xpath(
                    ".//span[@class='company']/text()").extract_first()

            # Default to '' so .strip() cannot raise on a missing name.
            item['company'] = (company or '').strip()

            # title
            title = site.xpath(
                './/a[@data-tn-element="jobTitle"]/@title[1]').extract_first()

            item['title'] = title

            # indeed url
            link = site.xpath(
                ".//span[@class='company']//a/@href").extract_first()
            if link:
                item['link'] = 'https://www.indeed.com' + link

            yield item

        # What to crawl next (once per page, not once per row).
        next_to_crawl = hxs.xpath(
            '//span[@class="pn"]/parent::a/@href').extract()
        for href in next_to_crawl:
            yield Request(response.urljoin(href))
Example #6
0
    def parse(self, response):
        """Yield an IndeedItem for each result row on the page.

        Fixes over the original:
          * per-row xpaths are now relative (``.//``) -- the absolute
            ``//`` form matched the whole document on every iteration,
            so each row produced identical data;
          * ``yield item`` moved inside the loop (it was dedented, so
            only the last row was ever emitted);
          * the hand-rolled ka/kn index bookkeeping (which re-read index
            ``ka`` in the else branch) is replaced with an iterator over
            the linked names;
          * no longer shadows the builtin ``list``.
        """
        rows = response.xpath("//div[contains(@class,'row')]")

        for row in rows:
            title = row.xpath(".//a[@data-tn-element='jobTitle']/@title").extract()
            linked_names = iter(
                row.xpath(".//span[@class='company']/a/text()").extract())
            raw_names = row.xpath(".//span[@class='company']/text()").extract()
            location = row.xpath(".//span[@class='location']/text()").extract()

            company = []
            for name in raw_names:
                if name == '\n    ':
                    # Placeholder text: the real name is in the <a> child.
                    company.append(next(linked_names).strip('    \n'))
                else:
                    company.append(name.strip('    \n'))

            item = IndeedItem()
            item['title'] = title
            item['company'] = company
            item['location'] = location
            yield item
Example #7
0
 def process_spider_output(self, response, result, spider):
     """Spider-middleware hook: de-duplicate already-visited requests.

     Requests flagged with FILTER_VISITED whose visit id is already in
     the per-spider context are replaced by a stub IndeedItem with
     visit_status='old'; items passing through are tagged with the
     originating request's visit id and visit_status='new'.
     """
     context = getattr(spider, 'context', {})
     visited_ids = context.setdefault(self.CONTEXT_KEY, {})
     ret = []
     for x in result:
         visited = False
         if isinstance(x, Request):
             if self.FILTER_VISITED in x.meta:
                 visit_id = self._visited_id(x)
                 if visit_id in visited_ids:
                     # NOTE(review): log.msg / BaseItem are legacy scrapy
                     # (Python 2 era) APIs -- left untouched here.
                     log.msg("Ignoring already visited: %s" % x.url,
                             level=log.INFO,
                             spider=spider)
                     visited = True
         elif isinstance(x, BaseItem):
             # Record the originating request's visit id on fresh items.
             visit_id = self._visited_id(response.request)
             if visit_id:
                 visited_ids[visit_id] = True
                 x['visit_id'] = visit_id
                 x['visit_status'] = 'new'
         if visited:
             ret.append(IndeedItem(visit_id=visit_id, visit_status='old'))
         else:
             ret.append(x)
     return ret
    def parse(self, response):
        """Walk every unified result card and yield a populated IndeedItem,
        logging progress messages onto the spider's queue as it goes."""
        card_xpath = '//div[@class="jobsearch-SerpJobCard unifiedRow row result"]'
        for card in response.xpath(card_xpath):
            # Initialize the item.
            items = IndeedItem()
            self.Q.put('Initialized......')

            # Job title from the result-card heading anchor.
            items['jobTitle'] = card.xpath('.//h2/a/@title').extract()[0]
            self.Q.put('title parsed......')

            # Company name: the plain text node is blank when the name is
            # wrapped in a link, in which case the <a> child holds it.
            plain_name = card.xpath(
                './/div/span[@class="company"]/text()').extract()[0].strip()
            if plain_name == '':
                items['companyName'] = card.xpath(
                    './/div/span[@class="company"]/a/text()').extract()[0].strip()
            else:
                items['companyName'] = plain_name
            self.Q.put('companyName parsed......')

            # Posting URL from the same heading anchor.
            items['url'] = card.xpath('.//h2/a/@href').extract()[0]
            self.Q.put('url parsed......')

            self.Q.put(
                f"\n{items['jobTitle']}\n{items['companyName']}\n{items['url']}\n"
            )
            yield items
	def parse_item(self, response):
		"""Scrape result rows and chain one request per row to parse_next_site.

		The original mixed tabs and spaces inconsistently (the self.log
		line and the request.meta line would not parse); indentation is
		normalised to tabs here with the logic left unchanged.
		"""
		self.log('\n Crawling  %s\n' % response.url)
		hxs = HtmlXPathSelector(response)
		sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
		#sites = hxs.select("//div[@class='row ']")
		items = []
		for site in sites:
			item = IndeedItem(company='none')

			item['job_title'] = site.select('h2/a/@title').extract()
			link_url = site.select('h2/a/@href').extract()
			item['link_url'] = link_url
			item['crawl_url'] = response.url
			item['location'] = site.select("span[@class='location']/text()").extract()

			# Not all entries have a company
			if site.select("span[@class='company']/text()").extract() == []:
				item['company'] = [u'']
			else:
				item['company'] = site.select("span[@class='company']/text()").extract()

			item['summary'] = site.select("//table/tr/td/span[@class='summary']").extract()
			item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
			item['found_date'] = site.select("table/tr/td/span[@class='date']/text()").extract()
			#item['source_url'] = self.get_source(link_url)
			# Hand the partially-filled item to the detail-page callback.
			request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
			request.meta['item'] = item
			yield request

			items.append(item)
		return
Example #10
0
    def parse(self, response):
        """Yield one IndeedItem holding every non-blank company string."""
        item = IndeedItem()

        raw_texts = response.xpath('//*[@class="company"]//text()').extract()
        item['company'] = [text.strip() for text in raw_texts if text.strip()]

        yield item
Example #11
0
    def parse_individual_job(self, response):
        """Scrape one job-detail page into an IndeedItem.

        Pulls title, company, location, salary, job type, the full
        description text plus its headings/bullets, posting age and the
        company's star rating, then yields the populated item.
        """
        job_title = response.xpath(
            '//h3[@class="icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"]/text()'
        ).extract()
        company_name = response.xpath(
            '//div[@class="icl-u-lg-mr--sm icl-u-xs-mr--xs"]//text()').extract(
            )[0]
        # Last <div> child holds the location; the tags are stripped by hand.
        location = response.xpath(
            '//div[@class="jobsearch-InlineCompanyRating icl-u-xs-mt--xs  jobsearch-DesktopStickyContainer-companyrating"]/div'
        ).extract()[-1].replace('<div>', '').replace('</div>', '')
        salary = response.xpath(
            '//span[@class="icl-u-xs-mr--xs"]/text()').extract_first()
        job_type = response.xpath(
            '//span[@class="jobsearch-JobMetadataHeader-item  icl-u-xs-mt--xs"]/text()'
        ).extract_first()
        job_summary = response.xpath(
            '//div[@class="jobsearch-jobDescriptionText"]//text()').extract()
        job_headings_text = response.xpath(
            '//div[@class="jobsearch-jobDescriptionText"]/p/b//text()'
        ).extract()
        job_bullets_text = response.xpath(
            '//div[@class="jobsearch-jobDescriptionText"]/ul/li/text()'
        ).extract()
        # NOTE(review): extract_first() may return None here, which makes
        # .replace() raise AttributeError -- confirm the metadata footer
        # is always present on the pages this spider visits.
        day_posted = response.xpath(
            '//div[@class="jobsearch-JobMetadataFooter"]/text()'
        ).extract_first().replace(' - ', '')
        # The ratings widget exposes two <meta content=...> values; index
        # 0 is parsed as the star value, index 1 as the rating count.
        try:
            num_ratings = int(
                response.xpath(
                    '//div[@class="icl-Ratings icl-Ratings--gold icl-Ratings--sm"]/meta/@content'
                ).extract()[1])
        except Exception:
            num_ratings = 0
        try:
            rating_out_of_5 = float(
                response.xpath(
                    '//div[@class="icl-Ratings icl-Ratings--gold icl-Ratings--sm"]/meta/@content'
                ).extract()[0])
        except Exception:
            rating_out_of_5 = None

        item = IndeedItem()
        item['job_title'] = job_title
        item['company_name'] = company_name
        item['location'] = location
        item['salary'] = salary
        item['job_type'] = job_type
        item['job_summary'] = job_summary
        item['job_headings_text'] = job_headings_text
        item['job_bullets_text'] = job_bullets_text
        item['num_ratings'] = num_ratings
        item['rating_out_of_5'] = rating_out_of_5
        item['day_posted'] = day_posted

        yield item
    def parse_data(self, response):
        """Zip result titles, links and company markup column-wise and
        yield one IndeedItem per triple (BeautifulSoup strips the company
        span's markup down to plain text)."""
        title_list = response.xpath('//h2/a/@title').extract()
        href_list = response.xpath('//h2/a/@href').extract()
        company_list = response.xpath('//span[@class="company"]/span').extract()

        for job_title, job_href, company_html in zip(title_list, href_list,
                                                     company_list):
            item = IndeedItem()
            item['title'] = job_title
            item['link'] = job_href
            item['company'] = bs(company_html).get_text().strip()
            yield item
Example #13
0
    def store_item(self, data_dict):
        """Build an IndeedItem from a dict of raw scraped fields.

        Copies the raw fields through unchanged, then derives:
        search_location (from the search URL's 'l' query parameter),
        indeed_job_key (from the job URL's 'jk' parameter), review
        star/count numbers, a low/high salary range and a post date.
        Returns the populated item.
        """
        item = IndeedItem()

        # Raw scraped information
        item['search_page_url'] = data_dict['search_page_url']
        item['indeed_url'] = data_dict['indeed_url']
        item['job_title'] = data_dict['job_title']
        item['company_name'] = data_dict['company_name']
        item['company_url'] = data_dict['company_url']
        item['company_reviews'] = data_dict['company_reviews']
        item['job_location'] = data_dict['job_location']
        item['job_description'] = data_dict['job_description']
        item['original_url'] = data_dict['original_url']
        item['posted_when'] = data_dict['posted_when']
        item['salary'] = data_dict['salary']

        # Calculated information
        parsed = urlparse(data_dict['search_page_url'])
        try:
            # parse_qs(...).get('l') returns None when the location
            # parameter is absent; the original let that TypeError
            # propagate while guarding the identical 'jk' case below.
            item['search_location'] = unquote(
                parse_qs(parsed.query).get('l')[0])
        except (TypeError, KeyError):
            logging.error(f'Problem with {data_dict["search_page_url"]}')

        parsed = urlparse(data_dict['indeed_url'])
        try:
            item['indeed_job_key'] = parse_qs(parsed.query).get('jk')[0]
        except (TypeError, KeyError):
            logging.error(f'Problem with {data_dict["indeed_url"]}')

        if data_dict['company_reviews']:
            # e.g. "4.1 out of 5 from 1,234 employee ratings"
            num_stars, _, num_reviews = re.findall(
                r'^([\d.]+) out of (\d) from ([\d,]+) employee rating',
                data_dict['company_reviews'])[0]
            item['num_stars'] = float(num_stars)
            item['num_reviews'] = int(num_reviews.replace(',', ''))

        if data_dict['salary']:
            # First and last dollar amounts bound the advertised range.
            salary_range = re.findall(r'\$([\d,]+)', data_dict['salary'])
            job_salary_low = salary_range[0]
            job_salary_high = salary_range[-1]
            item['job_salary_low'] = int(job_salary_low.replace(',', ''))
            item['job_salary_high'] = int(job_salary_high.replace(',', ''))

        if data_dict['posted_when']:
            if data_dict['posted_when'] in ['Just posted', 'Today']:
                days_ago = 0
            else:
                days_ago = int(
                    re.findall(r'(\d+)', data_dict['posted_when'])[0])
            post_date = datetime.datetime.now() - datetime.timedelta(
                days=days_ago)
            item['post_date'] = post_date.date()

        return item
    def parse_data(self, response):
        """For each (title, link, company) column triple, build a partial
        IndeedItem and yield a detail-page request carrying it in meta."""
        title_list = response.xpath('//h2/a/@title').extract()
        href_list = response.xpath('//h2/a/@href').extract()
        company_list = response.xpath('//span[@class="company"]/span').extract()

        for job_title, job_href, company_html in zip(title_list, href_list,
                                                     company_list):
            item = IndeedItem()
            item['title'] = job_title
            item['url'] = response.urljoin(job_href)
            item['company'] = bs(company_html).get_text().strip()
            detail_request = scrapy.Request(item['url'],
                                            callback=self.parse_job)
            detail_request.meta['item'] = item
            yield detail_request
Example #15
0
    def parse(self, response):
        """Drive Selenium through 20 listing pages collecting job URLs,
        then visit each URL and yield an IndeedItem from its detail page.
        """
        self.driver.get(response.url)
        urls = []

        # Collect job links from 20 consecutive listing pages.
        for i in range(1, 20):

            # self.driver.get(response.url)
            response = TextResponse(url=self.driver.current_url,
                                    body=self.driver.page_source,
                                    encoding='utf-8')
            self.driver.implicitly_wait(10)

            # Up to 30 results per page, addressed by positional index.
            for j in range(1, 31):
                result = response.xpath('//*[@class="col-md-9"]/div[1]/div[' +
                                        str(j) + ']/h3/a/@href')
                urls.extend(result)

            next_page = self.driver.find_element_by_xpath(
                '//*[@title="Go to next page"]')
            next_page.click()

        for href in urls:
            # print(x) behaves the same under Python 2 and 3; the
            # original bare ``print href`` was Python-2-only syntax.
            print(href)
            url = href.extract()
            self.driver.get(url)
            response = TextResponse(url=self.driver.current_url,
                                    body=self.driver.page_source,
                                    encoding='utf-8')
            item = IndeedItem()

            for sel in response.xpath('//div[@class="col-md-5 col-lg-6"]'):
                item['job_title'] = sel.xpath(
                    '//div[@class="col-md-5 col-lg-6"]/h1/text()').extract()
                item['location'] = sel.xpath(
                    '//div[@class="col-md-5 col-lg-6"]/ul/li[2]/text()'
                ).extract()
                item['company_name'] = sel.xpath(
                    '//div[@class="col-md-5 col-lg-6"]/ul/li[1]/a/text()'
                ).extract()

            for sel_1 in response.xpath('//*[@id="bd"]/div/div[1]'):
                item['job_type'] = sel_1.xpath(
                    '//div[2]/div/div[2]/span/text()').extract()
                item['job_salary'] = sel_1.xpath(
                    '//div[3]/div/div[2]/span/text()').extract()

            yield item

        self.driver.close()
Example #16
0
    def parse_item(self, response):
        print(response)
        print('\n Crawling  %s\n' % response.url)
        hxs = Selector(response)
        sites = hxs.select("//div[@class='  row  result'] | //div[@class='row  result'] | //div[@class='lastRow  row  result'] | //div[@class='row sjlast result']")
        print (len(sites))
        items = []
#--------------------------------------------------------------------------------------------#
        for site in sites:
            item = IndeedItem(company='none')
            #print(site.select("descendant::a[@data-tn-element='jobTitle']/@href").extract())
            item['job_title'] = site.select("descendant::a[@data-tn-element='jobTitle']/text()").extract()
            link_url= site.select("descendant::a[@data-tn-element='jobTitle']/@href").extract()
            item['link_url'] = link_url[0]
            item['crawl_url'] = response.url
            # Not all entries have a company
            if  site.select("descendant::span[@class='company']/span/text()").extract() == []:
                if site.select("descendant::span[@class='company']/span/a/text()").extract() == []:
                    if len(site.select("descendant::span[@class='company']/a/text()").extract()) == 1:
                        item['company'] = site.select("descendant::span[@class='company']/a/text()").extract()
                    else:
                        item['company'] = site.select("descendant::span[@class='company']/text()").extract()
                else:
                    item['company'] = site.select("descendant::span[@class='company']/a/text()").extract()
                item['salary'] = site.select("descendant::div[@class='sjcl']/div/text()").extract()
                item['location'] = site.select("descendant::span[@class='location']/text()").extract()
            else:
                if len(site.select("descendant::span[@class='company']/span/a/text()").extract()) == 1:
                    item['company'] = site.select("descendant::span[@class='company']/span/a/text()").extract()
                else:
                    item['company'] = site.select("descendant::span[@class='company']/span/text()").extract()    
                item['salary'] = site.select("descendant::td[@class='snip']/nobr/text()").extract()
                item['location'] = site.select("descendant::span[@class='location']/span/text()").extract()
            tempSource = str(site.select("descendant::div[@class='result-link-bar']/script/text()").extract()[0]).split('=',1)
            tempSource = tempSource[1]
            tempSource = json.loads(tempSource[:-1])
            try:
                item['source'] = tempSource['source']
            except KeyError, e:
                item['source'] = []
            try:
                item['sponsored'] = tempSource['sponsored']
            except KeyError, e:
                item['sponsored'] = []
Example #17
0
  def parse_item(self, response):
    """Scrape result rows (minus two sponsored ads) and chain one
    detail-page request per row to parse_next_site."""
    '''
    import pdb
    pdb.set_trace()
    '''


    self.log('\n Crawling  %s\n' % response.url)
    hxs = HtmlXPathSelector(response)
    sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
    #sites = hxs.select("//div[@class='row ']")
    items = []

    #Skip top two sponsored ads
    # NOTE(review): sites[:-2] drops the LAST two rows, not the top two
    # described by the comment above -- sites[2:] may have been intended.
    for site in sites[:-2]:
      item = IndeedItem(company='none')

      item['job_title'] = site.select('h2/a/@title').extract()
      link_url= site.select('h2/a/@href').extract()
      item['link_url'] = link_url
      item['crawl_url'] = response.url
      item['location'] = site.select("span[@itemprop='jobLocation']/span[@class='location']/span[@itemprop='addressLocality']/text()").extract()
      # Not all entries have a company
      company_name = site.select("span[@class='company']/span[@itemprop='name']/text()").extract()
      if company_name == []:
        item['company'] = [u'']
      else:
        item['company'] = company_name

      item['summary'] =site.select("table/tr/td/div/span[@class='summary']/text()").extract()
      #item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
      item['found_date'] =site.select("table/tr/td/span[@class='date']/text()").extract()
      #item['source_url'] = self.get_source(link_url)


      # Chain the partially-filled item to the detail-page callback.
      if len(item['link_url']):
        request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
        request.meta['item'] = item

        yield request


    return
 def parse_jobs(self, response):
     """Return one IndeedItem per result row.

     Fixes over the original:
       * absolute ``//`` xpaths inside the per-row loop matched the whole
         document every iteration (and normalize-space() on a node-set
         collapses to the first match only), so every row yielded the
         same values -- xpaths are now scoped to the row;
       * ``items`` was re-initialised on every row and ``return items``
         sat inside the loop, ending after the first iteration -- results
         now accumulate and are returned once at the end.
     """
     items = []
     for sel in response.xpath("//div[contains(@class, 'row')]"):
         job = sel.xpath(
             'normalize-space(.//a[contains(@data-tn-element, "jobTitle")])'
         ).extract_first()
         city = sel.xpath(
             'normalize-space(.//span[@class="location"])').extract_first()
         company = sel.xpath(
             'normalize-space(.//span[@class="company"]|.//span[@itemprop = "hiringOrganization"])'
         ).extract_first()
         description = sel.xpath(
             'normalize-space(.//span[@class="summary"])').extract_first()
         if not job:
             # Row carries no job title (e.g. an ad container); skip it.
             continue
         position = IndeedItem()
         position['jobs'] = job.strip()
         position['city'] = (city or '').strip()
         position['company'] = (company or '').strip()
         position['description'] = (description or '').strip()
         items.append(position)
     return items
    def parse(self, response):
        """Append newly-seen job postings to items.csv and e-mail a digest.

        Reads the URLs already recorded in items.csv, scrapes the current
        results page, appends any unseen (url, title) pairs to the file
        and sends one e-mail summarising the new postings (if any).
        """
        jobs = Selector(response).xpath('//*[@id="resultsCol"]/div[@class="  row  result"]')
        file_path = os.path.join(os.getcwd(), 'items.csv')
        job_data = []
        mail_msg = ''

        with open(file_path, 'r+') as data_file:
            f_csv = csv.DictReader(data_file)
            # DictReader consumes the header row itself; the original
            # extra next(f_csv) silently discarded the first data row,
            # so the first recorded job was re-reported on every run.
            for row in f_csv:
                job_data.append(row['url'])

            # The reader has exhausted the file, so writes append here.
            for job in jobs:
                item = IndeedItem()
                item['title'] = job.xpath('h2/a[@class="turnstileLink"]/text()').extract()[0]
                item['url'] = 'http://www.indeed.com.mx' + job.xpath('h2/a[@class="turnstileLink"]/@href').extract()[0]

                if item['url'] not in job_data:
                    data_file.write(item['url'] + ',' + item['title'] + '\n')
                    mail_msg += '%s - %s \n\n' % (item['title'], item['url'])

        if mail_msg != '':
            send_email(mail_msg)
    def parse_item(self, response):
        """Scrape each result-row variant and chain detail-page requests.

        NOTE(review): control flow here looks suspect and is documented
        as-is: ``return item`` sits inside both the ``for`` loop (ending
        iteration after the first row) and a generator function, and the
        summary/date/request logic only runs when a company was found.
        """
        self.log('\n Crawling  %s\n' % response.url)
        hxs = Selector(response)
        sites = hxs.xpath(
            "//div[@class='  row  result' or @class='lastRow  row  result' or @class='row sjlast result' or @class='row  result']"
        )
        items = []
        for site in sites:
            item = IndeedItem(company='none')

            item['job_title'] = site.xpath('h2/a/@title').extract()
            link_url = site.xpath('h2/a/@href').extract()
            item['link_url'] = link_url
            item['crawl_url'] = response.url
            item['location'] = site.xpath(
                "span[@itemprop='jobLocation']//text()").extract()

            # not all entries have a company name
            if site.xpath("span[@class='company']//text()").extract() == []:
                item['company'] = [u'']
            else:
                item['company'] = site.xpath(
                    "span[@class='company']//text()").extract()
                item['summary'] = site.xpath(
                    "table//span[@class='summary']/text()").extract()
                item['found_date'] = site.xpath(
                    "table/tr/td//span[@class='date']/text()").extract()
                # Hand the partially-filled item to the detail callback.
                request = Request("http://www.indeed.com" +
                                  item['link_url'][0],
                                  callback=self.parse_next_site)
                request.meta['item'] = item
                yield request

                items.append(item)
            return item
        return item
    def parse_single(self, response):
        """Scrape every review card on an Indeed company-reviews page and
        yield one IndeedItem per review, then follow the next-page link
        back into this callback.
        """
        reviews = response.xpath('//div[@class = "cmp-Review"]')

        # Company name comes from the page header, shared by all reviews.
        company = response.xpath('//div[@class = "cmp-CompactHeaderLayout-nameContainer"]//text()').extract_first()

        for review in reviews:

            item = IndeedItem()

            item['company'] = company

            item['rating'] = review.xpath(".//div[@class = 'cmp-ReviewRating-text']/text()").extract_first()

            item['date'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()]").extract_first()

            item['content'] = review.xpath(".//span[@itemprop = 'reviewBody']//span[@class = 'cmp-NewLineToBr-text']/text()").extract()

            item['position'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']//meta[@itemprop='name']/@content").extract_first()

            # Title is either plain text or a link, depending on the card.
            if len(review.xpath(".//div[@class = 'cmp-Review-title']/text()")):
                item['title'] = review.xpath(".//div[@class = 'cmp-Review-title']/text()").extract_first()
            else:
                item['title'] = review.xpath(".//a[@class = 'cmp-Review-titleLink']/text()").extract_first()

            # Location/status live at different positions in the author
            # line depending on how many of them are rendered as links.
            if len(review.xpath(".//a[@class = 'cmp-ReviewAuthor-link']")) == 2 :
                item['location'] = review.xpath(".//a[@class = 'cmp-ReviewAuthor-link'][2]/text()").extract_first()
                item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[2]").extract_first()

            elif len(review.xpath(".//a[@class = 'cmp-ReviewAuthor-link']")) == 1:
                if review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-2]").extract_first() != ' - ':
                    item['location'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-2]").extract_first()
                    item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-4]").extract_first()
                else:
                    item['location'] = review.xpath(".//a[@class = 'cmp-ReviewAuthor-link'][1]/text()").extract_first()
                    item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[3]").extract_first()
            else:
                item['location'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[5]").extract_first()
                item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[3]").extract_first()

            # NOTE(review): the fixed [0]..[4] indexing below assumes all
            # five sub-rating widgets are present; an IndexError follows
            # if a card renders fewer -- confirm against live pages.
            subrating = review.xpath(".//div[@class = 'cmp-SubRating']//div[@class = 'cmp-RatingStars-starsFilled']/@style").extract()

            item['work_life_rating'] = subrating[0]

            item['benefits_rating'] = subrating[1]

            item['security_rating'] = subrating[2]

            item['management_rating'] = subrating[3]

            item['culture_rating'] = subrating[4]
            # Star-width style maps to the rating value:
            # 3px=0
            # 15px=1
            # 27px=2
            # 39px=3
            # 51px=4
            # 63px=5

            if len(review.xpath(".//div[@class = 'cmp-ReviewProsCons-prosText']//span[@class = 'cmp-NewLineToBr-text']/text()")):
                item['Pros'] = review.xpath(".//div[@class = 'cmp-ReviewProsCons-prosText']//span[@class = 'cmp-NewLineToBr-text']/text()").extract_first()
            else:
                item['Pros'] = 'NaN'

            if len(review.xpath(".//div[@class = 'cmp-ReviewProsCons-consText']//span[@class = 'cmp-NewLineToBr-text']/text()")):
                item['Cons'] = review.xpath(".//div[@class = 'cmp-ReviewProsCons-consText']//span[@class = 'cmp-NewLineToBr-text']/text()").extract_first()
            else:
                item['Cons'] = 'NaN'

            if len(review.xpath(".//span[text()='Yes']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()")):
                item['helpful'] = review.xpath(".//span[text()='Yes']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()").extract_first()
            else:
                item['helpful'] = 0
            
            if len(review.xpath(".//span[text()='No']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()")):
                item['helpless'] = review.xpath(".//span[text()='No']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()").extract_first()
            else:
                item['helpless'] = 0

            yield item

        # Paginate: re-enter this callback for the next reviews page.
        if len(response.xpath("//a[@data-tn-element = 'next-page']/@href")):
            next_url = response.xpath("//a[@data-tn-element = 'next-page']/@href").extract_first()

            yield scrapy.Request("https://www.indeed.com" + next_url ,callback = self.parse_single)
Exemple #22
0
    def parse(self, response):
        """Extract up to 10 job postings from an Indeed search-results page.

        For each result index x (0-9) this reads the title, summary,
        location, date, salary and company, persists a global job counter
        to ./static/counter, and yields a follow-up Request for the
        posting's detail page with the partially-filled item attached in
        request.meta['item'] (completed by self.parse_original_url).
        """
        for x in range(0, 10):
            # Only set conditionally below, so they need explicit defaults.
            range_lower = None
            range_upper = None
            salary_description = None
            job_money_unchanged = None

            job_title = response.xpath('//h2/a/text()')[x].extract()
            job_description = response.css('span.summary::text')[x].extract()
            job_location = response.css('span.location span::text')[x].extract()
            job_date = response.css('span.date::text')[x].extract()

            try:
                job_money = str(response.css('td.snip nobr::text')[x].extract())
                job_money_unchanged = job_money

                # Strip the pay-period suffix and remember which one it was.
                if re.search(r' a year', job_money):
                    job_money = job_money.split(' a year')[0]
                    salary_description = 'a year'
                if re.search(r' an hour', job_money):
                    job_money = job_money.split(' an hour')[0]
                    salary_description = 'an hour'
                if re.search(r'-', job_money):
                    range_lower = job_money.split(" - ")[0]
                    range_upper = job_money.split(" - ")[1]
            except IndexError:
                # No salary cell for this result (or a dash without the
                # expected " - " separator) — leave the salary fields unset.
                job_money = None

            # The company name lives in a sibling div whose position is
            # offset by 4 from the result index.  Prefer the linked form,
            # fall back to plain text, then to a placeholder.
            try:
                job_company = response.xpath('//div[' + str(x + 4) + ']/span[1]/span/a/text()').extract_first()
                if job_company is None:
                    job_company = response.xpath('//div[' + str(x + 4) + ']/span[1]/span/text()').extract_first()
                if job_company is None:
                    job_company = "Nothing"
            except Exception:
                job_company = None
            try:
                image_link = response.xpath('//div[' + str(x + 4) + ']/span[1]/span/a/@href').extract_first()
            except Exception:
                image_link = None

            half_link = response.xpath('//h2/a').xpath("@href")[x].extract()
            full_link = "http://au.indeed.com" + half_link

            # Persist a monotonically increasing job counter across runs.
            global main_counter
            main_counter = main_counter + 1
            with open('./static/counter', 'w') as f:
                f.write(str(main_counter))

            item = IndeedItem()
            item['jobNumber'] = main_counter
            item['job_title'] = job_title
            item['job_description'] = job_description
            item['job_location'] = job_location
            item['job_company'] = job_company
            # NOTE(review): item uses the module-level loaded_date, not the
            # per-row job_date scraped above — preserved from the original.
            global loaded_date
            item['job_date'] = loaded_date
            item['job_money'] = job_money
            item['range_upper'] = range_upper
            item['job_money_unchanged'] = job_money_unchanged
            item['range_lower'] = range_lower
            item['salary_description'] = salary_description
            item['image_link'] = image_link

            request = scrapy.Request(full_link, callback=self.parse_original_url)
            request.meta['item'] = item
            yield request
Exemple #23
0
 def parse(self, response):
     """Yield a single item carrying the crawled page's URL as a string."""
     yield IndeedItem(test=str(response.url))
Exemple #24
0
    def parse_item(self, response):
        """Parse an Indeed results page, yielding one item per job row.

        Indeed renders result rows under four slightly different class
        names with matching markup variations.  Each entry in ``variants``
        maps a row selector to the xpaths valid for that markup; rows are
        processed variant by variant, in the same order as the original
        four copy-pasted loops, so yield order is unchanged.
        """
        # (rows xpath, title xp, url xp, company xp, company fallback xp, city xp)
        variants = [
            ('//div[@class="row  result"]',
             'a/@title', 'a/@href',
             'div[@class="sjcl"]/span[@class="company"]/text()',
             'div[@class="sjcl"]/span[@class="company"]/a/text()',
             'div[@class="sjcl"]/span[@class="location"]/text()'),
            ('//div[@class="  row  result"]',
             'h2/a/@title', 'h2/a/@href',
             'span[@class="company"]/span/text()',
             'span[@class="company"]/span/a/text()',
             'span[@itemprop="jobLocation"]/span/span/text()'),
            ('//div[@class="row sjlast result"]',
             'a/@title', 'a/@href',
             'div[@class="sjcl"]/span/text()',
             'div[@class="sjcl"]/span/a/text()',
             'div[@class="sjcl"]/span[@class="location"]/text()'),
            ('//div[@class="lastRow  row  result"]',
             'h2/a/@title', 'h2/a/@href',
             'span[@class="company"]/span/text()',
             'span[@class="company"]/span/a/text()',
             'span[@itemprop="jobLocation"]/span/span/text()'),
        ]
        for rows_xp, title_xp, url_xp, company_xp, company_alt_xp, city_xp in variants:
            for question in Selector(response).xpath(rows_xp):
                yield self._build_item(question, title_xp, url_xp,
                                       company_xp, company_alt_xp, city_xp)

    def _build_item(self, question, title_xp, url_xp, company_xp,
                    company_alt_xp, city_xp):
        """Build one IndeedItem from a single result-row selector."""
        item = IndeedItem()
        item['title'] = question.xpath(title_xp).extract()[0]
        # Result-page hrefs are site-relative.
        item['url'] = 'http://indeed.com' + question.xpath(url_xp).extract()[0]
        # Collapse runs of whitespace in the company name.
        item['company'] = " ".join(
            question.xpath(company_xp).extract()[0].split())
        if not item['company']:
            # Some rows wrap the company name in an anchor.
            item['company'] = " ".join(
                question.xpath(company_alt_xp).extract()[0].split())
        item['words'] = text_cleaner(item['url'])
        item['city'] = question.xpath(city_xp).extract()[0]
        item['city'], item['state'] = state_extract(item['city'])
        return item