def parse(self, response):
    """Parse an Indeed search-results page and yield one IndeedItem per row.

    Indeed renders result rows under four different class combinations,
    each nesting the title/url/company nodes slightly differently.  The
    original code repeated the whole extraction loop four times; the
    per-variant XPaths are now table-driven.

    Bug fix: ``extract()[0]`` raised IndexError whenever a row was
    missing a node — replaced with ``extract_first(default='')``.
    """
    # (row xpath, title/url link prefix, primary company xpath, fallback company xpath)
    variants = [
        ('//div[@class="row result"]', 'a',
         'div[@class="sjcl"]/span[@class="company"]/text()',
         'div[@class="sjcl"]/span[@class="company"]/a/text()'),
        ('//div[@class=" row result"]', 'h2/a',
         'span[@class="company"]/span/text()',
         'span[@class="company"]/span/a/text()'),
        ('//div[@class="row sjlast result"]', 'a',
         'div[@class="sjcl"]/span/text()',
         'div[@class="sjcl"]/span/a/text()'),
        ('//div[@class="lastRow row result"]', 'h2/a',
         'span[@class="company"]/span/text()',
         'span[@class="company"]/span/a/text()'),
    ]
    for row_xpath, link_prefix, company_xpath, company_fallback in variants:
        for question in Selector(response).xpath(row_xpath):
            item = IndeedItem()
            item['title'] = question.xpath(
                link_prefix + '/@title').extract_first(default='')
            item['url'] = question.xpath(
                link_prefix + '/@href').extract_first(default='')
            # Collapse runs of whitespace in the company name.
            item['company'] = " ".join(
                question.xpath(company_xpath).extract_first(default='').split())
            if not item['company']:
                # Some rows wrap the company name in an <a> tag instead.
                item['company'] = " ".join(
                    question.xpath(company_fallback).extract_first(default='').split())
            yield item
def parse(self, response):
    """Parse an Indeed results page into a list of IndeedItems.

    Bug fix: the scraped items were only printed with a Python-2 print
    statement (a SyntaxError under Python 3) and then discarded; they
    are now returned so Scrapy actually processes them.
    """
    hxs = HtmlXPathSelector(response)
    sites = hxs.xpath('//div[@class="row result"]')
    items = []
    for site in sites:
        item = IndeedItem(company='none')
        item['job_title'] = site.xpath('h2/a/@title').extract()
        item['link_url'] = site.xpath('h2/a/@href').extract()
        # item['location'] = site.xpath(
        #     'span[@class="location"]/span/text()').extract()
        # Not all entries have a company
        if site.xpath("span[@class='company']/text()").extract() == []:
            item['company'] = [u'']
        else:
            item['company'] = site.xpath(
                "span[@class='company']/text()").extract()
        item['summary'] = site.xpath(
            "//table/tr/td/span[@class='summary']").extract()
        item['source'] = site.xpath(
            "table/tr/td/span[@class='source']/text()").extract()
        item['found_date'] = site.xpath(
            "table/tr/td/span[@class='date']/text()").extract()
        items.append(item)
    return items
def parse_detail_page(self, response):
    """Parse a single Indeed job-detail page and yield one IndeedItem.

    Bug fixes: ``extract_first()`` returns None and never raises
    IndexError, so the ``except IndexError`` around reviews was dead
    code; the bare ``except:`` around salary silently swallowed the
    AttributeError raised by ``None.strip()``.  Both are now explicit
    None guards.
    """
    print("=" * 50)
    print(response.url)
    title = response.xpath('//b[@class="jobtitle"]/font/text()').extract_first()
    company = response.xpath('//span[@class="company"]/text()').extract_first()
    location = response.xpath('//span[@class="location"]/text()').extract_first()
    summary = response.xpath('//span[@id="job_summary"]').extract()
    print("title", title)
    print("company", company)
    print("location", location)
    # Missing node -> empty string rather than None.
    reviews = response.xpath(
        '//span[@class="slNoUnderline"]/text()').extract_first() or ""
    salary = response.xpath('//span[@class="no-wrap"]/text()').extract_first()
    salary = salary.strip() if salary is not None else ""
    item = IndeedItem()
    item['title'] = title
    item['company'] = company
    item['reviews'] = reviews
    item['location'] = location
    item['salary'] = salary
    item['summary'] = summary
    yield item
def parse_indeed_results(self, response):
    """Extract title, company, address and description from a job page.

    Bug fixes: ``title.strip()`` crashed with AttributeError when the
    header was absent, and ``company_data[0]``/``[-1]`` raised
    IndexError on an empty node-set.  Both are now guarded.
    """
    self.log("PAPA")
    #LIMIT
    """
    self.item_count += 1
    if self.item_count > 10:
        raise CloseSpider('item_exceeded')
    """
    # To extract elements, add them here
    item = IndeedItem()
    #TITLE
    title = response.xpath(
        '//h3[contains(@class,"JobInfoHeader")]/text()').extract_first()
    item['title'] = title.strip() if title else ''
    #COMPANY — first text node is the name, last is the address.
    company_data = response.xpath(
        '//div[contains(@class,"InlineCompanyRating")]//text()').extract()
    item['company'] = company_data[0].strip() if company_data else ''
    item['address'] = company_data[-1].strip() if company_data else ''
    #DESCRIPTION
    description = response.xpath(
        '//div[contains(@id,"jobDescriptionText")]//text()').extract()
    description = ' '.join(description)
    # NOTE(review): .encode() stores bytes, not str — presumably a
    # Python-2 leftover; kept as-is to avoid changing downstream pipelines.
    item['description'] = description.replace("\n", "").encode('utf-8')
    yield item
def parse(self, response):
    """Parse result rows into items and follow pagination links.

    Bug fix: when both company selectors missed, ``company.strip()``
    raised AttributeError on None — now guarded.
    """
    hxs = HtmlXPathSelector(response)
    sites = hxs.xpath('//div[contains(@class,"row")]')
    for site in sites:
        item = IndeedItem()
        # Prefer the linked company name, fall back to the plain text node.
        company = site.xpath(
            ".//span[@class='company']//a/text()").extract_first()
        if not company:
            company = site.xpath(
                ".//span[@class='company']/text()").extract_first()
        item['company'] = company.strip() if company else ''
        # title
        title = site.xpath(
            './/a[@data-tn-element="jobTitle"]/@title[1]').extract_first()
        item['title'] = title
        # indeed url
        link = site.xpath(
            ".//span[@class='company']//a/@href").extract_first()
        if link:
            item['link'] = 'https://www.indeed.com' + link
        yield item
    # what to crawl next
    next_to_crawl = hxs.xpath(
        '//span[@class="pn"]/parent::a/@href').extract()
    for i in next_to_crawl:
        url = response.urljoin(i)
        yield Request(url)
def parse(self, response):
    """Parse result rows, merging linked and plain company names per row.

    Bug fixes: the per-row XPaths used absolute '//' paths, which select
    from the WHOLE document rather than the current row (every row got
    the first page-wide match) — now relative './/'.  The else branch
    stripped ``companynoa[ka]`` instead of ``companynoa[kn]``,
    corrupting entries once the two indices diverged.  The manual
    ``kn += 1`` was a no-op (the for loop rebinds kn) and is removed,
    and the local no longer shadows the builtin ``list``.
    """
    rows = response.xpath("//div[contains(@class,'row')]")
    for row in rows:
        title = row.xpath(".//a[@data-tn-element='jobTitle']/@title").extract()
        companyina = row.xpath(".//span[@class='company']/a/text()").extract()
        companynoa = row.xpath(".//span[@class='company']/text()").extract()
        location = row.xpath(".//span[@class='location']/text()").extract()
        ka = 0
        for kn in range(len(companynoa)):
            if companynoa[kn] == '\n ':
                # Placeholder text node: the real name lives in an <a> tag.
                companynoa[kn] = companyina[ka].strip(' \n')
                ka += 1
            else:
                companynoa[kn] = companynoa[kn].strip(' \n')
        company = companynoa
        item = IndeedItem()
        item['title'] = title
        item['company'] = company
        item['location'] = location
        yield item
def process_spider_output(self, response, result, spider):
    """Spider-middleware hook: drop already-visited requests and tag items.

    Requests flagged with FILTER_VISITED whose visit id was seen before
    are replaced by 'old' marker items; fresh items get a new visit id
    and 'new' status.  Everything else passes through untouched.
    """
    spider_context = getattr(spider, 'context', {})
    visited_ids = spider_context.setdefault(self.CONTEXT_KEY, {})
    output = []
    for entry in result:
        if isinstance(entry, Request) and self.FILTER_VISITED in entry.meta:
            visit_id = self._visited_id(entry)
            if visit_id in visited_ids:
                log.msg("Ignoring already visited: %s" % entry.url,
                        level=log.INFO, spider=spider)
                # Emit a marker item instead of re-crawling.
                output.append(IndeedItem(visit_id=visit_id, visit_status='old'))
                continue
        elif isinstance(entry, BaseItem):
            visit_id = self._visited_id(response.request)
            if visit_id:
                visited_ids[visit_id] = True
                entry['visit_id'] = visit_id
                entry['visit_status'] = 'new'
        output.append(entry)
    return output
def parse(self, response):
    """Parse unified-row job cards, pushing progress messages onto self.Q.

    Bug fix: ``extract()[0]`` raised IndexError on cards missing a
    node — replaced with ``extract_first(default='')``.
    """
    for clickCard in response.xpath(
            '//div[@class="jobsearch-SerpJobCard unifiedRow row result"]'):
        # Initialize the item
        items = IndeedItem()
        self.Q.put('Initialized......')
        # jobTitle
        items['jobTitle'] = clickCard.xpath(
            './/h2/a/@title').extract_first(default='')
        self.Q.put('title parsed......')
        # companyName: plain text node first, <a> text as fallback.
        company = clickCard.xpath(
            './/div/span[@class="company"]/text()').extract_first(default='').strip()
        if company == '':
            company = clickCard.xpath(
                './/div/span[@class="company"]/a/text()').extract_first(default='').strip()
        items['companyName'] = company
        self.Q.put('companyName parsed......')
        # url
        items['url'] = clickCard.xpath('.//h2/a/@href').extract_first(default='')
        self.Q.put('url parsed......')
        self.Q.put(
            f"\n{items['jobTitle']}\n{items['companyName']}\n{items['url']}\n"
        )
        yield items
def parse_item(self, response):
    """Parse a results page and chase each job's detail page.

    Bug fixes: ``items.append(item)`` after the yield was dead code (the
    list was never returned), and ``link_url[0]`` raised IndexError when
    a row had no link — now guarded, matching the sibling parser.
    """
    self.log('\n Crawling %s\n' % response.url)
    hxs = HtmlXPathSelector(response)
    sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
    #sites = hxs.select("//div[@class='row ']")
    for site in sites:
        item = IndeedItem(company='none')
        item['job_title'] = site.select('h2/a/@title').extract()
        link_url = site.select('h2/a/@href').extract()
        item['link_url'] = link_url
        item['crawl_url'] = response.url
        item['location'] = site.select(
            "span[@class='location']/text()").extract()
        # Not all entries have a company
        if site.select("span[@class='company']/text()").extract() == []:
            item['company'] = [u'']
        else:
            item['company'] = site.select(
                "span[@class='company']/text()").extract()
        item['summary'] = site.select(
            "//table/tr/td/span[@class='summary']").extract()
        item['source'] = site.select(
            "table/tr/td/span[@class='source']/text()").extract()
        item['found_date'] = site.select(
            "table/tr/td/span[@class='date']/text()").extract()
        #item['source_url'] = self.get_source(link_url)
        if len(item['link_url']):
            request = Request("http://www.indeed.com" + item['link_url'][0],
                              callback=self.parse_next_site)
            request.meta['item'] = item
            yield request
    return
def parse(self, response):
    """Yield a single item holding every non-blank company text node."""
    raw_texts = response.xpath('//*[@class="company"]//text()').extract()
    cleaned = []
    for text in raw_texts:
        stripped = text.strip()
        # Keep only nodes that contain visible characters.
        if stripped:
            cleaned.append(stripped)
    item = IndeedItem()
    item['company'] = cleaned
    yield item
def parse_individual_job(self, response):
    """Parse a job-detail page into an IndeedItem.

    Bug fixes: indexing empty ``extract()`` results
    (``company_name``, ``location``) and calling ``.replace()`` on a
    missing ``day_posted`` node crashed the whole callback; every such
    field is now guarded against absent markup.
    """
    job_title = response.xpath(
        '//h3[@class="icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"]/text()'
    ).extract()
    company_name = response.xpath(
        '//div[@class="icl-u-lg-mr--sm icl-u-xs-mr--xs"]//text()'
    ).extract_first()
    location_nodes = response.xpath(
        '//div[@class="jobsearch-InlineCompanyRating icl-u-xs-mt--xs jobsearch-DesktopStickyContainer-companyrating"]/div'
    ).extract()
    # Last <div> carries the location; strip the wrapping tags.
    location = (location_nodes[-1].replace('<div>', '').replace('</div>', '')
                if location_nodes else None)
    salary = response.xpath(
        '//span[@class="icl-u-xs-mr--xs"]/text()').extract_first()
    job_type = response.xpath(
        '//span[@class="jobsearch-JobMetadataHeader-item icl-u-xs-mt--xs"]/text()'
    ).extract_first()
    job_summary = response.xpath(
        '//div[@class="jobsearch-jobDescriptionText"]//text()').extract()
    job_headings_text = response.xpath(
        '//div[@class="jobsearch-jobDescriptionText"]/p/b//text()'
    ).extract()
    job_bullets_text = response.xpath(
        '//div[@class="jobsearch-jobDescriptionText"]/ul/li/text()'
    ).extract()
    day_posted = response.xpath(
        '//div[@class="jobsearch-JobMetadataFooter"]/text()'
    ).extract_first()
    day_posted = day_posted.replace(' - ', '') if day_posted else None
    # Ratings <meta> tags: index 0 is the average, index 1 the count.
    ratings = response.xpath(
        '//div[@class="icl-Ratings icl-Ratings--gold icl-Ratings--sm"]/meta/@content'
    ).extract()
    try:
        num_ratings = int(ratings[1])
    except (IndexError, ValueError):
        num_ratings = 0
    try:
        rating_out_of_5 = float(ratings[0])
    except (IndexError, ValueError):
        rating_out_of_5 = None
    item = IndeedItem()
    item['job_title'] = job_title
    item['company_name'] = company_name
    item['location'] = location
    item['salary'] = salary
    item['job_type'] = job_type
    item['job_summary'] = job_summary
    item['job_headings_text'] = job_headings_text
    item['job_bullets_text'] = job_bullets_text
    item['num_ratings'] = num_ratings
    item['rating_out_of_5'] = rating_out_of_5
    item['day_posted'] = day_posted
    yield item
def parse_data(self, response):
    """Yield one item per (title, link, company) triple found on the page."""
    title_list = response.xpath('//h2/a/@title').extract()
    link_list = response.xpath('//h2/a/@href').extract()
    company_list = response.xpath('//span[@class="company"]/span').extract()
    for job_title, job_link, company_html in zip(title_list, link_list,
                                                 company_list):
        entry = IndeedItem()
        entry['title'] = job_title
        entry['link'] = job_link
        # The company arrives as an HTML fragment; strip the markup.
        entry['company'] = bs(company_html).get_text().strip()
        yield entry
def store_item(self, data_dict):
    """Build an IndeedItem from raw scraped fields plus derived fields.

    Copies the raw fields verbatim, then derives: search location (from
    the ``l`` query parameter), the indeed job key (``jk``), star/review
    counts, a salary range, and the posting date.

    Bug fix: the ``l`` parameter was dereferenced unguarded
    (``.get('l')[0]`` raises TypeError when absent); it now gets the
    same defensive handling as the ``jk`` job key.
    """
    item = IndeedItem()
    # Raw scraped information
    item['search_page_url'] = data_dict['search_page_url']
    item['indeed_url'] = data_dict['indeed_url']
    item['job_title'] = data_dict['job_title']
    item['company_name'] = data_dict['company_name']
    item['company_url'] = data_dict['company_url']
    item['company_reviews'] = data_dict['company_reviews']
    item['job_location'] = data_dict['job_location']
    item['job_description'] = data_dict['job_description']
    item['original_url'] = data_dict['original_url']
    item['posted_when'] = data_dict['posted_when']
    item['salary'] = data_dict['salary']
    # Calculated information
    parsed = urlparse(data_dict['search_page_url'])
    try:
        item['search_location'] = unquote(parse_qs(parsed.query).get('l')[0])
    except (TypeError, KeyError):
        logging.error(f'Problem with {data_dict["search_page_url"]}')
    parsed = urlparse(data_dict['indeed_url'])
    try:
        item['indeed_job_key'] = parse_qs(parsed.query).get('jk')[0]
    except (TypeError, KeyError):
        logging.error(f'Problem with {data_dict["indeed_url"]}')
    if data_dict['company_reviews']:
        # e.g. "4.1 out of 5 from 1,234 employee rating..."
        num_stars, _, num_reviews = re.findall(
            r'^([\d.]+) out of (\d) from ([\d,]+) employee rating',
            data_dict['company_reviews'])[0]
        item['num_stars'] = float(num_stars)
        item['num_reviews'] = int(num_reviews.replace(',', ''))
    if data_dict['salary']:
        # First $-figure is the low bound, last the high bound
        # (identical when a single figure is quoted).
        salary_range = re.findall(r'\$([\d,]+)', data_dict['salary'])
        job_salary_low = salary_range[0]
        job_salary_high = salary_range[-1]
        item['job_salary_low'] = int(job_salary_low.replace(',', ''))
        item['job_salary_high'] = int(job_salary_high.replace(',', ''))
    if data_dict['posted_when']:
        if data_dict['posted_when'] in ['Just posted', 'Today']:
            days_ago = 0
        else:
            days_ago = int(
                re.findall(r'(\d+)', data_dict['posted_when'])[0])
        post_date = datetime.datetime.now() - datetime.timedelta(
            days=days_ago)
        item['post_date'] = post_date.date()
    return item
def parse_data(self, response):
    """Schedule a detail-page request for every job row on the page."""
    all_titles = response.xpath('//h2/a/@title').extract()
    all_links = response.xpath('//h2/a/@href').extract()
    all_companies = response.xpath('//span[@class="company"]/span').extract()
    for job_title, rel_link, company_html in zip(all_titles, all_links,
                                                 all_companies):
        entry = IndeedItem()
        entry['title'] = job_title
        # Links come back relative; resolve against the current page.
        absolute = response.urljoin(rel_link)
        entry['url'] = absolute
        entry['company'] = bs(company_html).get_text().strip()
        detail_request = scrapy.Request(absolute, callback=self.parse_job)
        detail_request.meta['item'] = entry
        yield detail_request
def parse(self, response):
    """Drive a Selenium browser through up to 19 result pages, collect
    the job links, then load each job page and yield an IndeedItem.

    Bug fix: ``print href`` was Python-2 print-statement syntax (a
    SyntaxError under Python 3, which the rest of this file targets);
    now a ``print()`` call.

    NOTE(review): the inner selectors use absolute '//' XPaths, so each
    job page is assumed to contain exactly one matching block — verify
    against live pages.
    """
    self.driver.get(response.url)
    urls = []
    for i in range(1, 20):
        # self.driver.get(response.url)
        # Wrap the rendered DOM so Scrapy selectors work on it.
        response = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        self.driver.implicitly_wait(10)
        for j in range(1, 31):
            result = response.xpath('//*[@class="col-md-9"]/div[1]/div[' +
                                    str(j) + ']/h3/a/@href')
            urls.extend(result)
        next_page = self.driver.find_element_by_xpath(
            '//*[@title="Go to next page"]')
        next_page.click()
    for href in urls:
        print(href)
        url = href.extract()
        self.driver.get(url)
        response = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        item = IndeedItem()
        for sel in response.xpath('//div[@class="col-md-5 col-lg-6"]'):
            item['job_title'] = sel.xpath(
                '//div[@class="col-md-5 col-lg-6"]/h1/text()').extract()
            item['location'] = sel.xpath(
                '//div[@class="col-md-5 col-lg-6"]/ul/li[2]/text()'
            ).extract()
            item['company_name'] = sel.xpath(
                '//div[@class="col-md-5 col-lg-6"]/ul/li[1]/a/text()'
            ).extract()
        for sel_1 in response.xpath('//*[@id="bd"]/div/div[1]'):
            item['job_type'] = sel_1.xpath(
                '//div[2]/div/div[2]/span/text()').extract()
            item['job_salary'] = sel_1.xpath(
                '//div[3]/div/div[2]/span/text()').extract()
        yield item
    self.driver.close()
def parse_item(self, response):
    """Parse every result-row variant and return the scraped items.

    Bug fixes: the Python-2 ``except KeyError, e:`` syntax (a
    SyntaxError under Python 3) is modernized, and the populated items
    were previously never returned or yielded — they are now collected
    and returned.
    """
    print(response)
    print('\n Crawling %s\n' % response.url)
    hxs = Selector(response)
    sites = hxs.select(
        "//div[@class=' row result'] | //div[@class='row result'] | "
        "//div[@class='lastRow row result'] | //div[@class='row sjlast result']")
    print(len(sites))
    items = []
    #--------------------------------------------------------------------------------------------#
    for site in sites:
        item = IndeedItem(company='none')
        #print(site.select("descendant::a[@data-tn-element='jobTitle']/@href").extract())
        item['job_title'] = site.select(
            "descendant::a[@data-tn-element='jobTitle']/text()").extract()
        link_url = site.select(
            "descendant::a[@data-tn-element='jobTitle']/@href").extract()
        item['link_url'] = link_url[0]
        item['crawl_url'] = response.url
        # Not all entries have a company
        if site.select("descendant::span[@class='company']/span/text()").extract() == []:
            if site.select("descendant::span[@class='company']/span/a/text()").extract() == []:
                if len(site.select("descendant::span[@class='company']/a/text()").extract()) == 1:
                    item['company'] = site.select(
                        "descendant::span[@class='company']/a/text()").extract()
                else:
                    item['company'] = site.select(
                        "descendant::span[@class='company']/text()").extract()
            else:
                item['company'] = site.select(
                    "descendant::span[@class='company']/a/text()").extract()
            item['salary'] = site.select(
                "descendant::div[@class='sjcl']/div/text()").extract()
            item['location'] = site.select(
                "descendant::span[@class='location']/text()").extract()
        else:
            if len(site.select("descendant::span[@class='company']/span/a/text()").extract()) == 1:
                item['company'] = site.select(
                    "descendant::span[@class='company']/span/a/text()").extract()
            else:
                item['company'] = site.select(
                    "descendant::span[@class='company']/span/text()").extract()
            item['salary'] = site.select(
                "descendant::td[@class='snip']/nobr/text()").extract()
            item['location'] = site.select(
                "descendant::span[@class='location']/span/text()").extract()
        # The result-link-bar <script> holds a JSON blob after 'var ... =';
        # split off the prefix and drop the trailing ';' before parsing.
        tempSource = str(site.select(
            "descendant::div[@class='result-link-bar']/script/text()"
        ).extract()[0]).split('=', 1)
        tempSource = tempSource[1]
        tempSource = json.loads(tempSource[:-1])
        try:
            item['source'] = tempSource['source']
        except KeyError:
            item['source'] = []
        try:
            item['sponsored'] = tempSource['sponsored']
        except KeyError:
            item['sponsored'] = []
        items.append(item)
    return items
def parse_item(self, response):
    """Parse organic result rows (skipping sponsored ads) and follow
    each job's detail page.

    Bug fix: the comment promised to skip the TOP two sponsored ads but
    ``sites[:-2]`` dropped the LAST two rows; the slice is now
    ``sites[2:]``.
    """
    # import pdb
    # pdb.set_trace()
    self.log('\n Crawling %s\n' % response.url)
    hxs = HtmlXPathSelector(response)
    sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
    #sites = hxs.select("//div[@class='row ']")
    #Skip top two sponsored ads
    for site in sites[2:]:
        item = IndeedItem(company='none')
        item['job_title'] = site.select('h2/a/@title').extract()
        link_url = site.select('h2/a/@href').extract()
        item['link_url'] = link_url
        item['crawl_url'] = response.url
        item['location'] = site.select(
            "span[@itemprop='jobLocation']/span[@class='location']/span[@itemprop='addressLocality']/text()").extract()
        # Not all entries have a company
        company_name = site.select(
            "span[@class='company']/span[@itemprop='name']/text()").extract()
        if company_name == []:
            item['company'] = [u'']
        else:
            item['company'] = company_name
        item['summary'] = site.select(
            "table/tr/td/div/span[@class='summary']/text()").extract()
        #item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
        item['found_date'] = site.select(
            "table/tr/td/span[@class='date']/text()").extract()
        #item['source_url'] = self.get_source(link_url)
        if len(item['link_url']):
            request = Request("http://www.indeed.com" + item['link_url'][0],
                              callback=self.parse_next_site)
            request.meta['item'] = item
            yield request
    return
def parse_jobs(self, response):
    """Collect job/city/company/description tuples from each result row.

    Bug fixes: ``items`` was re-created inside the row loop, so only the
    last row's items were ever returned — it is now hoisted out.  The
    absolute '//' XPaths selected from the WHOLE document instead of the
    current row (every row yielded identical first-match text) — now
    relative './/'.
    """
    items = []
    for sel in response.xpath("//div[contains(@class, 'row')]"):
        jobs = sel.xpath(
            'normalize-space(.//a[contains(@data-tn-element, "jobTitle")])'
        ).extract()
        city = sel.xpath(
            'normalize-space(.//span[@class="location"])').extract()
        company = sel.xpath(
            'normalize-space(.//span[@class="company"]|.//span[@itemprop = "hiringOrganization"])'
        ).extract()
        description = sel.xpath(
            'normalize-space(.//span[@class="summary"])').extract()
        for j, c, co, d in zip(jobs, city, company, description):
            position = IndeedItem()
            position['jobs'] = j.strip()
            position['city'] = c.strip()
            position['company'] = co.strip()
            position['description'] = d.strip()
            items.append(position)
    return items
def parse(self, response):
    """Diff newly scraped jobs against items.csv and e-mail the new ones.

    Bug fix: ``csv.DictReader`` already consumes the header row to build
    its fieldnames, so the extra ``next(f_csv)`` silently skipped the
    first DATA row and re-reported that job on every run.

    Bug fix: ``extract()[0]`` crashed on rows missing the title/href
    node; such rows are now skipped.
    """
    jobs = Selector(response).xpath(
        '//*[@id="resultsCol"]/div[@class=" row result"]')
    file_path = os.path.join(os.getcwd(), 'items.csv')
    job_data = []
    mail_msg = ''
    with open(file_path, 'r+') as data_file:
        f_csv = csv.DictReader(data_file)
        for row in f_csv:
            job_data.append(row['url'])
        for job in jobs:
            item = IndeedItem()
            title = job.xpath(
                'h2/a[@class="turnstileLink"]/text()').extract_first()
            href = job.xpath(
                'h2/a[@class="turnstileLink"]/@href').extract_first()
            if title is None or href is None:
                continue
            item['title'] = title
            item['url'] = 'http://www.indeed.com.mx' + href
            if item['url'] not in job_data:
                # Reading exhausted the file, so writes append at EOF.
                data_file.write(item['url'] + ',' + item['title'] + '\n')
                mail_msg += '%s - %s \n\n' % (item['title'], item['url'])
    if mail_msg != '':
        send_email(mail_msg)
def parse_item(self, response):
    """Parse all result-row variants and follow each job's detail page.

    Bug fixes: removed the unreachable double ``return item`` after the
    loop (a generator's return value never reaches Scrapy) and the
    write-only ``items`` list; ``link_url[0]`` is now guarded against
    rows without a link.
    """
    self.log('\n Crawling %s\n' % response.url)
    hxs = Selector(response)
    sites = hxs.xpath(
        "//div[@class=' row result' or @class='lastRow row result' or @class='row sjlast result' or @class='row result']"
    )
    for site in sites:
        item = IndeedItem(company='none')
        item['job_title'] = site.xpath('h2/a/@title').extract()
        link_url = site.xpath('h2/a/@href').extract()
        item['link_url'] = link_url
        item['crawl_url'] = response.url
        item['location'] = site.xpath(
            "span[@itemprop='jobLocation']//text()").extract()
        # not all entries have a company name
        if site.xpath("span[@class='company']//text()").extract() == []:
            item['company'] = [u'']
        else:
            item['company'] = site.xpath(
                "span[@class='company']//text()").extract()
        item['summary'] = site.xpath(
            "table//span[@class='summary']/text()").extract()
        item['found_date'] = site.xpath(
            "table/tr/td//span[@class='date']/text()").extract()
        if link_url:
            request = Request("http://www.indeed.com" + link_url[0],
                              callback=self.parse_next_site)
            request.meta['item'] = item
            yield request
def parse_single(self, response):
    """Parse one company's review page: yield an IndeedItem per review,
    then follow the next-page link back into this callback.

    NOTE(review): every selector is tied to Indeed's ``cmp-*`` class
    names and to positional ``text()`` indices inside the author line —
    presumably fragile against markup changes; verify against live pages.
    """
    reviews = response.xpath('//div[@class = "cmp-Review"]')
    # The company name is shared by every review on the page.
    company = response.xpath('//div[@class = "cmp-CompactHeaderLayout-nameContainer"]//text()').extract_first()
    for review in reviews:
        item = IndeedItem()
        item['company'] = company
        item['rating'] = review.xpath(".//div[@class = 'cmp-ReviewRating-text']/text()").extract_first()
        # The last text node of the author line holds the date.
        item['date'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()]").extract_first()
        item['content'] = review.xpath(".//span[@itemprop = 'reviewBody']//span[@class = 'cmp-NewLineToBr-text']/text()").extract()
        item['position'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']//meta[@itemprop='name']/@content").extract_first()
        # Title is either a plain div or a link, depending on the review.
        if len(review.xpath(".//div[@class = 'cmp-Review-title']/text()")):
            item['title'] = review.xpath(".//div[@class = 'cmp-Review-title']/text()").extract_first()
        else:
            item['title'] = review.xpath(".//a[@class = 'cmp-Review-titleLink']/text()").extract_first()
        # Location/status layout depends on how many author links exist:
        # 2 links -> second link is the location; 1 link -> decide by the
        # ' - ' separator position; 0 links -> fixed text() offsets.
        if len(review.xpath(".//a[@class = 'cmp-ReviewAuthor-link']")) == 2 :
            item['location'] = review.xpath(".//a[@class = 'cmp-ReviewAuthor-link'][2]/text()").extract_first()
            item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[2]").extract_first()
        elif len(review.xpath(".//a[@class = 'cmp-ReviewAuthor-link']")) == 1:
            if review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-2]").extract_first() != ' - ':
                item['location'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-2]").extract_first()
                item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-4]").extract_first()
            else:
                item['location'] = review.xpath(".//a[@class = 'cmp-ReviewAuthor-link'][1]/text()").extract_first()
                item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[3]").extract_first()
        else:
            item['location'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[5]").extract_first()
            item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[3]").extract_first()
        # NOTE(review): assumes exactly five sub-rating bars are present;
        # subrating[4] would raise IndexError otherwise — TODO confirm.
        subrating = review.xpath(".//div[@class = 'cmp-SubRating']//div[@class = 'cmp-RatingStars-starsFilled']/@style").extract()
        item['work_life_rating'] = subrating[0]
        item['benefits_rating'] = subrating[1]
        item['security_rating'] = subrating[2]
        item['management_rating'] = subrating[3]
        item['culture_rating'] = subrating[4]
        # Star-bar pixel widths map to ratings:
        # 3px=0
        # 15px=1
        # 27px=2
        # 39px=3
        # 51px=4
        # 63px=5
        if len(review.xpath(".//div[@class = 'cmp-ReviewProsCons-prosText']//span[@class = 'cmp-NewLineToBr-text']/text()")):
            item['Pros'] = review.xpath(".//div[@class = 'cmp-ReviewProsCons-prosText']//span[@class = 'cmp-NewLineToBr-text']/text()").extract_first()
        else:
            item['Pros'] = 'NaN'
        if len(review.xpath(".//div[@class = 'cmp-ReviewProsCons-consText']//span[@class = 'cmp-NewLineToBr-text']/text()")):
            item['Cons'] = review.xpath(".//div[@class = 'cmp-ReviewProsCons-consText']//span[@class = 'cmp-NewLineToBr-text']/text()").extract_first()
        else:
            item['Cons'] = 'NaN'
        # Helpful / not-helpful feedback counts default to 0 when absent.
        if len(review.xpath(".//span[text()='Yes']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()")):
            item['helpful'] = review.xpath(".//span[text()='Yes']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()").extract_first()
        else:
            item['helpful'] = 0
        if len(review.xpath(".//span[text()='No']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()")):
            item['helpless'] = review.xpath(".//span[text()='No']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()").extract_first()
        else:
            item['helpless'] = 0
        yield item
    # Paginate: keep following the next-page link into this same callback.
    if len(response.xpath("//a[@data-tn-element = 'next-page']/@href")):
        next_url = response.xpath("//a[@data-tn-element = 'next-page']/@href").extract_first()
        yield scrapy.Request("https://www.indeed.com" + next_url ,callback = self.parse_single)
def parse(self, response):
    """Scrape the first ten result rows of an au.indeed.com page, assign
    each a persistent sequence number, and chase the job's original URL.

    NOTE(review): assumes at least ten result rows; the unguarded
    ``[x]`` indexing would raise IndexError on a shorter page — TODO
    confirm.  The ``//div[str(x+4)]`` selectors presume the company
    block for row x sits at document position x+4 — fragile, verify
    against live markup.
    """
    for x in range (0, 10):
        # Reset all per-row fields so failures leave None, not stale data.
        range_lower = None
        range_upper = None
        job_title = None
        job_description = None
        job_location = None
        job_company = None
        job_date = None
        salary_description = None
        job_money_unchanged = None
        job_title = response.xpath('//h2/a/text()')[x].extract()
        job_description = response.css('span.summary::text')[x].extract()
        job_location = response.css('span.location span::text')[x].extract()
        job_company = None
        job_date = response.css('span.date::text')[x].extract()
        try:
            # Salary text, e.g. "$50,000 - $60,000 a year".  Split off the
            # period suffix and, when present, the low/high range bounds.
            job_money = response.css('td.snip nobr::text')[x].extract()
            job_money = str(job_money)
            job_money_unchanged = job_money
            if re.search(r' a year', job_money):
                job_money = job_money.split(' a year')[0]
                salary_description = 'a year'
            if re.search(r' an hour', job_money):
                job_money = job_money.split(' an hour')[0]
                salary_description = 'an hour'
            if re.search(r'-', job_money):
                range_lower = job_money.split(" - ")[0]
                range_upper = job_money.split(" - ")[1]
        except:
            # NOTE(review): bare except — any failure (not just a missing
            # salary cell) silently yields job_money = None.
            job_money = None
        try:
            # Company name: linked form first, plain text as fallback.
            job_company = response.xpath('//div['+ str(x+4) +']/span[1]/span/a/text()').extract_first()
            if job_company == None:
                job_company = response.xpath('//div['+ str(x+4) +']/span[1]/span/text()').extract_first()
            if job_company == None:
                job_company = "Nothing"
        except:
            job_company = None
        try:
            image_link = response.xpath('//div['+ str(x+4) +']/span[1]/span/a/@href').extract_first()
        except:
            image_link = None
        half_link = response.xpath('//h2/a').xpath("@href")[x].extract()
        full_link = "http://au.indeed.com" + half_link
        item = IndeedItem()
        # Persist a monotonically increasing job number across runs via
        # a module-level counter mirrored to ./static/counter.
        global main_counter
        main_counter = main_counter + 1
        with open('./static/counter', 'w') as f:
            f.write(str(main_counter))
        item['jobNumber'] = main_counter
        item['job_title'] = job_title
        item['job_description'] = job_description
        item['job_location'] = job_location
        item['job_company'] = job_company
        # loaded_date is set elsewhere at module level — TODO confirm.
        global loaded_date
        item['job_date'] = loaded_date
        item['job_money'] = job_money
        item['range_upper'] = range_upper
        item['job_money_unchanged'] = job_money_unchanged
        item['range_lower'] = range_lower
        item['salary_description'] = salary_description
        item['image_link'] = image_link
        # Hand the partially-filled item to the detail-page callback.
        request = scrapy.Request(full_link, callback=self.parse_original_url)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield a single item carrying the crawled URL (smoke-test spider)."""
    crawled_url = str(response.url)
    item = IndeedItem(test=crawled_url)
    yield item
def parse_item(self, response):
    """Parse all four Indeed row variants into items with cleaned text,
    absolute URLs, extracted keywords and (city, state) split out.

    The four copy-pasted variant loops are collapsed into one
    table-driven loop.  Bug fix: ``extract()[0]`` raised IndexError
    whenever a row was missing a node — replaced with
    ``extract_first(default='')``.
    """
    # (row xpath, title/url link prefix, company xpath, company fallback, city xpath)
    variants = [
        ('//div[@class="row result"]', 'a',
         'div[@class="sjcl"]/span[@class="company"]/text()',
         'div[@class="sjcl"]/span[@class="company"]/a/text()',
         'div[@class="sjcl"]/span[@class="location"]/text()'),
        ('//div[@class=" row result"]', 'h2/a',
         'span[@class="company"]/span/text()',
         'span[@class="company"]/span/a/text()',
         'span[@itemprop="jobLocation"]/span/span/text()'),
        ('//div[@class="row sjlast result"]', 'a',
         'div[@class="sjcl"]/span/text()',
         'div[@class="sjcl"]/span/a/text()',
         'div[@class="sjcl"]/span[@class="location"]/text()'),
        ('//div[@class="lastRow row result"]', 'h2/a',
         'span[@class="company"]/span/text()',
         'span[@class="company"]/span/a/text()',
         'span[@itemprop="jobLocation"]/span/span/text()'),
    ]
    for row_xpath, link_prefix, company_xpath, company_fallback, city_xpath in variants:
        for question in Selector(response).xpath(row_xpath):
            item = IndeedItem()
            item['title'] = question.xpath(
                link_prefix + '/@title').extract_first(default='')
            item['url'] = 'http://indeed.com' + question.xpath(
                link_prefix + '/@href').extract_first(default='')
            # Collapse runs of whitespace in the company name.
            item['company'] = " ".join(question.xpath(
                company_xpath).extract_first(default='').split())
            if not item['company']:
                # Some rows wrap the company name in an <a> tag instead.
                item['company'] = " ".join(question.xpath(
                    company_fallback).extract_first(default='').split())
            item['words'] = text_cleaner(item['url'])
            item['city'] = question.xpath(city_xpath).extract_first(default='')
            item['city'], item['state'] = state_extract(item['city'])
            yield item