class GoogleJobsSpider(scrapy.Spider):
    name = "googleGetItems"
    allowed_domains = ["google.com"]
    start_urls = (
        "https://www.google.com/about/careers/search#t=sq&q=j&li=10&st=0&",
    )

    # Override settings
    # http://stackoverflow.com/questions/25353650/scrapy-how-to-import-the-settings-to-override-it
    def set_crawler(self, crawler):
        super(GoogleJobsSpider, self).set_crawler(crawler)

        # Get the BEBEE configuration parameters from the .ini file.
        # The second-level configuration file takes precedence over settings.py.
        config = ConfigParser.ConfigParser()
        if config.read('./crawling/spiders/' + self.name + '.ini'):
            for name, value in config.items('DEFAULT'):
                crawler.settings.set(name.upper(), value)
        else:
            # No .ini configuration file: fall back to the defaults below.
            print "WARNING: no %s.ini config, using default values" % self.name

        # BEBEE configuration parameters
        self.page_index = crawler.settings.getint('BEBEE_SPIDER_FIRST_PAGE', 1)
        self.stop_index = crawler.settings.getint('BEBEE_SPIDER_LAST_PAGE', 1)
        self.max_jobs = crawler.settings.getint('BEBEE_SPIDER_MAX_ITEMS', 3)
        self.delay_crawl_page = crawler.settings.getint(
            'BEBEE_SPIDER_CRAWL_DELAY_PAGE', 5)
        self.delay_crawl_job = crawler.settings.getint(
            'BEBEE_SPIDER_CRAWL_DELAY_ITEM', 1)
        self.max_execution_time = crawler.settings.getint(
            'BEBEE_SPIDER_MAX_EXECUTION_TIME', 1800)
        self.account_id = crawler.settings.get('BEBEE_SPIDER_ACCOUNT_ID', '0')
        self.company_id = crawler.settings.get('BEBEE_SPIDER_COMPANY_ID', '')

        # Logger start. This code needs account_id.
        self.beBeeLogger = BebeeLogger(account_id=self.account_id,
                                       botName=self.name)
        self.beBeeLogger.init()

    def __init__(self):
        # Signal for the closing method.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        # Selenium driver
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1024, 768)

        # Mapper for geoname_id and country_code
        self.geoCache = MapperGeoCache()

        # Set of unique (unmapped) categories
        self.uniqueCategoriesSet = set()

        # Erase old missing categories
        fset = open('crawling/spiders/googleCategoriesMissing.json', 'w')
        json.dump(list(self.uniqueCategoriesSet), fset)
        fset.close()

        # Load the dict for the category mapper.
        # Change this filename in each spider class.
        with open('crawling/spiders/googleCategoriesMap.json') as data_file:
            self.categories = json.load(data_file)

        # For counting elapsed time
        self.start_time = time()

        # Jobs date
        self.dt = datetime.now()
        self.stop_by_max_jobs = False

    def spider_closed(self):
        # Close the Selenium driver to avoid leaving PhantomJS processes running.
        self.driver.close()

        # Save the unique set of missing categories.
        # Change this filename in each spider class.
        fset = open('crawling/spiders/googleCategoriesMissing.json', 'w')
        json.dump(list(self.uniqueCategoriesSet), fset)
        fset.close()

        # Log end
        self.beBeeLogger.end()

    def parse(self, response):
        self.driver.get(response.url)

        # URLs list
        links = []
        # Jobs counter
        totalJobs = 0

        # Filter configuration: wait for the country input to be visible.
        WebDriverWait(self.driver, 10).until(
            EC.visibility_of_element_located((
                By.XPATH,
                '//div[@class="primary-filters"]//input[@class="mini ghost-text"]'
            )))
        # Take the country input.
        inputElement = self.driver.find_element_by_xpath(
            '//div[@class="primary-filters"]//input[@class="mini ghost-text"]')

        # Page loop
        while True:
            print "Waiting for link to page %s" % (str(self.page_index))
            try:
                # Wait up to 10 seconds for the pagination button.
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located(
                        (By.LINK_TEXT, str(self.page_index))))
            except Exception:
                print "Page %s discarded" % (str(self.page_index))
                print "Page %s does not exist" % (str(self.page_index))
                break

            # Next page
            print "-> Click page %s" % (str(self.page_index))
            # Wait between pages
            sleep(self.delay_crawl_page)
            self.driver.find_element_by_link_text(str(self.page_index)).click()

            # Keep only the URLs we want.
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//a[@itemprop="url"]')))
            except Exception:
                print "Page %s discarded" % (str(self.page_index))
                continue

            idsSel = self.driver.find_elements_by_xpath('//a[@itemprop="url"]')

            # Links count
            print "--> %s links" % (len(idsSel))
            print

            for id in idsSel:
                links.append(str(id.get_attribute('href')))
                totalJobs += 1
                if (self.max_jobs <= totalJobs):
                    self.stop_by_max_jobs = True
                    print "-> Max jobs reached"
                    break

            # Loop exit condition
            if (self.page_index == self.stop_index) or self.stop_by_max_jobs or (
                    self.max_execution_time <= (time() - self.start_time)):
                print "-> Stop process"
                break

            # Next page
            self.page_index += 1

        print "--> Links: %s items" % (totalJobs)

        # Links loop
        for i, link in enumerate(links):
            item = BebeeItem()
            item['url'] = link
            item['account_id'] = str(self.account_id)
            item['company_id'] = str(self.company_id)

            if ((i % 100) == 0):
                print "Progress: " + str(i)
                self.beBeeLogger.progress()

            # Delay between jobs, then crawl the detail page.
            sleep(self.delay_crawl_job)
            request = scrapy.Request(item['url'], callback=self.parse_item)
            request.meta['item'] = item
            yield request

    # Item parser
    def parse_item(self, response):
        error_location = False
        error_category = False

        item = response.meta['item']
        item['title'] = response.xpath(
            '//a[@class="heading detail-title"]/@title').extract()[0]
        item['offer_id'] = response.xpath(
            '//div[@itemtype="http://schema.org/JobPosting"]/@id').extract()[0]
        item['lang_code'] = 'en-US'
        item['date'] = self.dt.strftime('%Y%m%d')
        item['description'] = remove_tags(
            response.xpath('//div[@itemprop="description"]').extract()[0])
        item['location_name'] = response.xpath(
            '//span[@itemprop="name"]/text()').extract()[0]
        item['category_name'] = response.xpath(
            '//span[@itemprop="occupationalCategory"]/text()').extract()[0]

        # GEONAME MANAGEMENT
        try:
            item['geoname_id'] = self.geoCache.getGeonameId(
                item['location_name'])
            item['country_code'] = self.geoCache.getCountryCode(
                item['location_name'])
        except Exception:
            error_message = "%s location not found in GeoName" % str(
                item['location_name'])
            print error_message
            error_location = True
            self.beBeeLogger.failure(item['offer_id'], error_message)

        # CATEGORY MANAGEMENT
        category_id = self.categoryMapper(item['category_name'])
        if category_id:
            item['category_id'] = category_id
        else:
            error_message = "category not found: %s" % str(
                item['category_name'])
            print error_message
            error_category = True
            self.beBeeLogger.failure(item['offer_id'], error_message)

        if not (error_location or error_category):
            self.beBeeLogger.success(item['offer_id'])

        return item

    # category_id mapper function
    def categoryMapper(self, category_name):
        if category_name in self.categories:
            category_id = self.categories[category_name]
            return category_id
        else:
            # Save the unique set of missing categories.
            self.uniqueCategoriesSet.add(category_name)
            fset = open('crawling/spiders/googleCategoriesMissing.json', 'w')
            json.dump(list(self.uniqueCategoriesSet), fset)
            fset.close()
            return None
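# ---------------------------------------------------------------------------
# Hedged sketch, not part of the original spiders: the Uber, AB InBev and
# Nestle classes below each repeat the same langid -> bebee locale mapping as
# an if/elif chain. A dict-backed helper such as this hypothetical
# map_lang_code() could replace those chains. It assumes the langid module
# already imported for the spiders below, whose classify() returns a
# (language, score) tuple.
def map_lang_code(text):
    """Classify `text` with langid and map it to a bebee locale code."""
    locales = {'en': 'en-US', 'es': 'es-ES', 'pt': 'pt-BR'}
    language = langid.classify(text)[0]
    # Fall back to the bare ISO 639-1 code when no locale mapping exists.
    return locales.get(language, language)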
class UberJobsSpider(Spider):
    name = "uberGetItems"
    allowed_domains = ["uber.com"]
    start_urls = ('https://www.uber.com/jobs/list', )

    # Override settings
    # http://stackoverflow.com/questions/25353650/scrapy-how-to-import-the-settings-to-override-it
    def set_crawler(self, crawler):
        super(UberJobsSpider, self).set_crawler(crawler)

        # Get the BEBEE configuration parameters from the .ini file.
        # The second-level configuration file takes precedence over settings.py.
        config = ConfigParser.ConfigParser()
        if config.read('./crawling/spiders/' + self.name + '.ini'):
            for name, value in config.items('DEFAULT'):
                crawler.settings.set(name.upper(), value)
        else:
            # No .ini configuration file: fall back to the defaults below.
            print "WARNING: no %s.ini config, using default values" % self.name

        # BEBEE configuration parameters
        self.page_index = crawler.settings.getint('BEBEE_SPIDER_FIRST_PAGE', 1)
        self.stop_index = crawler.settings.getint('BEBEE_SPIDER_LAST_PAGE', 1)
        self.max_jobs = crawler.settings.getint('BEBEE_SPIDER_MAX_ITEMS', 3)
        self.delay_crawl_page = crawler.settings.getint(
            'BEBEE_SPIDER_CRAWL_DELAY_PAGE', 5)
        self.delay_crawl_job = crawler.settings.getint(
            'BEBEE_SPIDER_CRAWL_DELAY_ITEM', 1)
        self.max_execution_time = crawler.settings.getint(
            'BEBEE_SPIDER_MAX_EXECUTION_TIME', 1800)
        self.account_id = crawler.settings.get('BEBEE_SPIDER_ACCOUNT_ID', '0')
        self.company_id = crawler.settings.get('BEBEE_SPIDER_COMPANY_ID', '')

        # Logger start. This code needs account_id.
        self.beBeeLogger = BebeeLogger(account_id=self.account_id,
                                       botName=self.name)
        self.beBeeLogger.init()

    def __init__(self):
        # Signal for the closing method.
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

        # Selenium driver
        self.driver = webdriver.PhantomJS()  # or Firefox()
        self.driver.set_window_size(1024, 768)

        # Mapper for geoname_id and country_code
        self.geoCache = MapperGeoCache()

        # Set of unique (unmapped) categories
        self.uniqueCategoriesSet = set()

        # Erase old missing categories
        fset = open('crawling/spiders/uberCategoriesMissing.json', 'w')
        json.dump(list(self.uniqueCategoriesSet), fset)
        fset.close()

        # Load the dict for the category mapper.
        # Change this filename in each spider class.
        with open('crawling/spiders/uberCategoriesMap.json') as data_file:
            try:
                self.categories = json.load(data_file)
            except ValueError:
                # Fall back to an empty mapping if the JSON file is invalid,
                # so categoryMapper() still works.
                self.categories = {}

        # For counting elapsed time
        self.start_time = time()

    def spider_closed(self):
        # Close the Selenium driver to avoid leaving PhantomJS processes running.
        self.driver.close()

        # Save the unique set of missing categories.
        # Change this filename in each spider class.
        fset = open('crawling/spiders/uberCategoriesMissing.json', 'w')
        json.dump(list(self.uniqueCategoriesSet), fset)
        fset.close()

        # Log end
        self.beBeeLogger.end()

    def parse(self, response):
        # Storage for scraped data
        links = []
        titles = []
        locations = []
        categories = []

        # Total job links crawled
        totalJobs = 0

        # ------------- DRIVER TO CRAWL RESULTS AND GET LINKS -------------
        print response.url
        self.driver.get(response.url)

        # Results loop
        while True:
            print "-----------------------------------------"
            print "------------- PAGE: %s --------------" % (self.page_index)

            # Get links and data.
            print " Getting links in page %s" % (str(self.page_index))
            WebDriverWait(self.driver, 50).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    "html/body/div[@class='web global-container js-global-container']/section/section[@class='jobs-listings']//div/ul/li/div[1]/a"
                )))
            linkElems = self.driver.find_elements_by_xpath(
                "//section[@class='jobs-listings']/div/ul[@class='listing']/li/div[1]/a"
            )
            categoryElems = self.driver.find_elements_by_xpath(
                "//section[@class='jobs-listings']/div/ul[@class='listing']/li/div[2]/a"
            )
            locationElems = self.driver.find_elements_by_xpath(
                "//section[@class='jobs-listings']/div/ul[@class='listing']/li/div[3]/a"
            )

            print " Save results data and links in arrays"
            for i, link in enumerate(linkElems):
                links.append(link.get_attribute("href"))
                titles.append(link.text)
                categories.append(categoryElems[i].get_attribute("innerHTML"))
                locations.append(locationElems[i].get_attribute("innerHTML"))

                # Stop when max_jobs is reached.
                totalJobs += 1
                if (self.max_jobs <= totalJobs):
                    break
                # Stop after self.max_execution_time seconds.
                if (self.max_execution_time <= (time() - self.start_time)):
                    break

            print "--> %s links possible" % (len(linkElems))
            print "--> %s links obtained" % (totalJobs)

            # Results loop stop condition
            if (self.page_index == self.stop_index) or (
                    self.max_jobs <= totalJobs) or (
                    self.max_execution_time <= (time() - self.start_time)):
                break

            # Next results page
            self.page_index += 1

        # ------------------------ Download job data -----------------------
        # Job date (taken now because the site does not publish it).
        dt = datetime.now()

        # Data loop (for each link)
        for i, link in enumerate(links):
            error_location = False
            error_category = False

            # Object to build the XML item
            item = BebeeItem()
            item['title'] = titles[i]
            item['offer_id'] = re.search(
                "^https://www.uber.com/jobs/([0-9]*)/.*", link).group(1)
            #item['lang_code'] = 'en-US'
            item['url'] = link
            #item['date'] = dt.strftime('%Y%m%d')
            item['account_id'] = str(self.account_id)
            item['company_id'] = str(self.company_id)
            item['location_name'] = locations[i]
            item['category_name'] = categories[i]

            # GEONAME MANAGEMENT
            try:
                item['geoname_id'] = self.geoCache.getGeonameId(locations[i])
                item['country_code'] = self.geoCache.getCountryCode(
                    locations[i])
            except Exception:
                error_message = "%s location not found in GeoName" % locations[i]
                print error_message
                error_location = True
                self.beBeeLogger.failure(item['offer_id'], error_message)

            # CATEGORY MANAGEMENT
            category_id = self.categoryMapper(item['category_name'])
            if category_id:
                item['category_id'] = category_id
            else:
                error_message = "category not found: %s" % item['category_name']
                print error_message
                error_category = True
                self.beBeeLogger.failure(item['offer_id'], error_message)

            # Count successful jobs.
            if not (error_location or error_category):
                self.beBeeLogger.success(item['offer_id'])

            # Print progress.
            if ((i % 100) == 0):
                print "-------------------"
                print "Jobs crawled: " + str(i)
                self.beBeeLogger.progress()

            # Crawl the job description.
            request = scrapy.Request(item['url'],
                                     callback=self.parse_description)
            request.meta['item'] = item
            yield request

            # Delay between jobs.
            print "%s secs delayed" % str(self.delay_crawl_job)
            sleep(self.delay_crawl_job)

    # Get the job description.
    def parse_description(self, response):
        item = response.meta['item']

        description = ""
        descriptionDivList = response.xpath(
            "//span[@itemprop='description']").re(r'(?s)Description(.*)')
        for desc in descriptionDivList:
            description += unicode.strip(remove_tags(desc))
        item['description'] = description

        language = langid.classify(description)[0]
        if (language == 'en'):
            item['lang_code'] = 'en-US'
        elif (language == 'es'):
            item['lang_code'] = 'es-ES'
        elif (language == 'pt'):
            item['lang_code'] = 'pt-BR'
        else:
            item['lang_code'] = language

        item['date'] = response.xpath(
            "//div[@class='jobDisplay']/p[@class='jobDate']/span/text()"
        ).extract()[0].strip()

        return item

    # category_id mapper function
    def categoryMapper(self, category_name):
        if category_name in self.categories:
            category_id = self.categories[category_name]
            return category_id
        else:
            # Save the unique set of missing categories.
            self.uniqueCategoriesSet.add(category_name)
            fset = open('crawling/spiders/uberCategoriesMissing.json', 'w')
            json.dump(list(self.uniqueCategoriesSet), fset)
            fset.close()
            return None
class AB_INBEV_SPIDER(BaseSpider):
    name = 'AB_INBEV'
    # Scrapy expects bare domain names here, not URLs.
    allowed_domains = [
        'ab-inbev.com', 'eu.abinbevcareers.com', 'gmodelo.bumeran.com.mx',
        'abinbev.taleo.net'
    ]
    start_urls = ['http://www.ab-inbev.com/careers.html']

    # Override settings
    # http://stackoverflow.com/questions/25353650/scrapy-how-to-import-the-settings-to-override-it
    def set_crawler(self, crawler):
        super(AB_INBEV_SPIDER, self).set_crawler(crawler)

        # Get the BEBEE configuration parameters from the .ini file.
        # The second-level configuration file takes precedence over settings.py.
        config = ConfigParser.ConfigParser()
        if config.read('./crawling/spiders/' + self.name + '.ini'):
            for name, value in config.items('DEFAULT'):
                crawler.settings.set(name.upper(), value)
        else:
            # No .ini configuration file: fall back to the defaults below.
            print "WARNING: no %s.ini config, using default values" % self.name

        # BEBEE configuration parameters
        self.page_index = crawler.settings.getint('BEBEE_SPIDER_FIRST_PAGE', 1)
        self.stop_index = crawler.settings.getint('BEBEE_SPIDER_LAST_PAGE', 1)
        self.max_jobs = crawler.settings.getint('BEBEE_SPIDER_MAX_ITEMS', 3)
        self.delay_crawl_page = crawler.settings.getint(
            'BEBEE_SPIDER_CRAWL_DELAY_PAGE', 5)
        self.delay_crawl_job = crawler.settings.getint(
            'BEBEE_SPIDER_CRAWL_DELAY_ITEM', 1)
        self.max_execution_time = crawler.settings.getint(
            'BEBEE_SPIDER_MAX_EXECUTION_TIME', 1800)
        self.account_id = crawler.settings.get('BEBEE_SPIDER_ACCOUNT_ID', '0')
        self.company_id = crawler.settings.get('BEBEE_SPIDER_COMPANY_ID', '')

        # Logger start. This code needs account_id.
        self.beBeeLogger = BebeeLogger(account_id=self.account_id,
                                       botName=self.name)
        self.beBeeLogger.init()

    def __init__(self):
        # Signal for the closing method (without it, spider_closed never runs
        # and the PhantomJS process is left behind).
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        # Selenium driver
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1024, 768)

        # Mapper for geoname_id and country_code
        self.geoCache = MapperGeoCache()

        # Set of unique (unmapped) categories
        self.uniqueCategoriesSet = set()

        # Erase old missing categories
        fset = open('crawling/spiders/ab_inbevCategoriesMissing.json', 'w')
        json.dump(list(self.uniqueCategoriesSet), fset)
        fset.close()

        # Load the dict for the category mapper.
        # Change this filename in each spider class.
        with open('crawling/spiders/ab_inbevCategoriesMap.json') as data_file:
            self.categories = json.load(data_file)

        # For counting elapsed time
        self.start_time = time()

    def spider_closed(self):
        # Close the Selenium driver to avoid leaving PhantomJS processes running.
        self.driver.close()

        # Save the unique set of missing categories.
        # Change this filename in each spider class.
        fset = open('crawling/spiders/ab_inbevCategoriesMissing.json', 'w')
        json.dump(list(self.uniqueCategoriesSet), fset)
        fset.close()

        # Log end
        self.beBeeLogger.end()

    def myHREF(self, htmlcode):
        # Naive href extractor: scans raw HTML for href="..." attributes and
        # returns every double-quoted value that is not a javascript: link.
        hrefs = []
        htmlcode = htmlcode.encode('ascii', 'ignore')  # convert to str
        item = htmlcode
        htmllen = len(htmlcode)
        i = 0
        while i < htmllen - 4:
            if item[i:i + 5] == 'href=':
                i = i + 6  # skip past 'href="'
                url = ''
                while i < htmllen and item[i] != '"':
                    url += item[i]
                    i = i + 1
                if 'javascript' not in url:
                    hrefs.append(url)
            i = i + 1
        return hrefs

    def parse(self, response):
        # Storage for scraped data
        links = []
        titles = []
        locations = []
        categories = []
        dates = []
        offerids = []
        descriptions = []
        langs = []

        # Python 2 hack so mixed unicode/str concatenation does not blow up.
        reload(sys)
        sys.setdefaultencoding("utf-8")

        #dcap = dict(DesiredCapabilities.PHANTOMJS)
        #dcap["phantomjs.page.settings.userAgent"] = (
        #    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 (KHTML, like Gecko) Chrome/15.0.87")

        # NOTE: this method drives its own Firefox instance; the PhantomJS
        # driver created in __init__ is not used here.
        browser = webdriver.Firefox()
        #browser = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
        browser.set_window_size(1024, 768)

        # Manual pause before the crawl starts.
        raw_input('Release the SPIDER')

        # ------------------------------ USA ------------------------------
        new_url = 'https://abinbev.taleo.net/careersection/27/jobsearch.ftl?lang=en'
        browser.get(new_url)
        new_response = browser.page_source

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((
                By.XPATH,
                '//*[@id="requisitionListInterface.reqTitleLinkAction.row1"]')))
        # Open the first requisition; the detail pages are then walked with
        # the "Next" pager.
        button = browser.find_element_by_xpath(
            '//*[@id="requisitionListInterface.reqTitleLinkAction.row1"]')
        button.click()
        sleep(5)

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((
                By.XPATH,
                '//*[@id="requisitionDescriptionInterface.pagerDivID862.Next"]')))
        div_str = (browser.find_element_by_xpath(
            '//*[@id="requisitionDescriptionInterface.pagerDivID862.Next"]').text)

        while "Next" in div_str:
            sleep(3)
            # TODO: what about the URL in such cases? (the Taleo pager does
            # not change the address bar)
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.reqTitleLinkAction.row1"]')))
            title = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.reqTitleLinkAction.row1"]'
            ).text

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.ID1676.row1"]')))
            location = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.ID1676.row1"]').text
            #category = browser.find_element_by_xpath('//*[@id="requisitionDescriptionInterface.ID1626.row1"]').text

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.reqPostingDate.row1"]')))
            date = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.reqPostingDate.row1"]'
            ).text

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.reqContestNumberValue.row1"]')))
            offerid = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.reqContestNumberValue.row1"]'
            ).text

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.ID3560.row.row1"]')))
            description = (browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.ID3560.row.row1"]').
                           text).split('Job Description')[1]

            # NOTE: these fields are only kept in local variables; nothing is
            # appended to the storage lists yet, so the assembly loop at the
            # end of this method receives no USA jobs.

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.pagerDivID862.Next"]')))
            new_button = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.pagerDivID862.Next"]')
            new_button.click()
            sleep(5)
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.pagerDivID862.Next"]')))
            div_str = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.pagerDivID862.Next"]'
            ).text
            sleep(5)
        sleep(5)

        # ---------------------------- Mexico -----------------------------
        new_url = 'http://gmodelo.bumeran.com.mx/listadoofertas.bum'
        browser.get(new_url)
        new_response = browser.page_source
        sleep(5)

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="listado"]')))
        hrefss = self.myHREF(
            browser.find_element_by_xpath(
                '//div[@class="listado"]').get_attribute('outerHTML'))

        # First collect the job URLs for every country; only afterwards are
        # the detail pages categorised and visited.
        for item in hrefss:
            if 'detal' in item:
                links.append((new_url.split('/lis')[0]) + '/' + item)

        div_str = browser.find_elements_by_xpath(
            '//div[@class="paginador"]')[0].text
        i = 2
        while "Siguiente" in div_str:
            browser.get(new_url + '?page=' + str(i))
            i = i + 1
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="paginador"]')))
            div_str = browser.find_elements_by_xpath(
                '//div[@class="paginador"]')[0].text
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="listado"]')))
            hrefss = self.myHREF(
                browser.find_element_by_xpath(
                    '//div[@class="listado"]').get_attribute('outerHTML'))
            for item in hrefss:
                if 'detal' in item:
                    links.append((new_url.split('/lis')[0]) + '/' + item)

        for joblinkpage in links:
            browser.get(joblinkpage)
            sleep(5)
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//span[@class="ciudad"]')))
            location = browser.find_elements_by_xpath(
                '//span[@class="ciudad"]')[0].text
            print location

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="titulo"]')))
            title = browser.find_elements_by_xpath(
                '//div[@class="titulo"]')[0].text
            print title.split(location)[0]

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="detalle"]//dev[@class="value"]')))
            category = browser.find_elements_by_xpath(
                '//div[@class="detalle"]//dev[@class="value"]')[0].text
            print category

            date = browser.find_elements_by_xpath(
                '//div[@class="detalle"]//dev[@class="value"]')[5].text
            print date

            description = browser.find_elements_by_xpath(
                '//div[@class="detalle"]//dev[@class="value"]')[6].text
            print description

            offerid = joblinkpage.split('=')[1]
            print offerid

            # NOTE: as in the USA block, the scraped fields are printed but
            # never appended to the storage lists.
        sleep(5)

        # ------------------------- South America -------------------------
        url = 'https://abinbev.taleo.net/careersection/2/jobsearch.ftl?lang=pt-BR'
        browser.get(url)
        sleep(5)

        # Job links: extracted from the result list (resultListPanel).
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="jobs"]//tbody/tr[1]/th')))
        hrefss = self.myHREF(
            browser.find_element_by_xpath(
                '//*[@id="jobs"]//tbody/tr[1]/th').get_attribute('outerHTML'))
        for href in hrefss:
            links.append(href)

        # The location could be appended here as well.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="jobs"]//tbody//tr[1]/td[2]')))
        location = browser.find_element_by_xpath(
            '//*[@id="jobs"]//tbody//tr[1]/td[2]').text

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="next"]')))
        div_str = (browser.find_element_by_xpath(
            '//*[@id="next"]').get_attribute('outerHTML')).encode(
                'ascii', 'ignore')

        while "-disabled" not in div_str:
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="next"]')))
            browser.find_element_by_xpath('//*[@id="next"]').click()
            sleep(5)
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="jobs"]//tbody/tr[1]/th')))
            hrefss = self.myHREF(
                browser.find_element_by_xpath(
                    '//*[@id="jobs"]//tbody/tr[1]/th').get_attribute(
                        'outerHTML'))
            for href in hrefss:
                links.append(href)

            # The location could be appended here as well.
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="jobs"]//tbody//tr[1]/td[2]')))
            location = browser.find_element_by_xpath(
                '//*[@id="jobs"]//tbody//tr[1]/td[2]').text

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="next"]')))
            div_str = (browser.find_element_by_xpath(
                '//*[@id="next"]').get_attribute('outerHTML')).encode(
                    'ascii', 'ignore')

        # NOTE: `links` still holds the Mexico URLs collected above, so this
        # loop revisits them with Taleo XPaths as well.
        for joblinkpage in links:
            browser.get(joblinkpage)
            sleep(5)
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.reqTitleLinkAction.row1"]')))
            title = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.reqTitleLinkAction.row1"]'
            ).text
            print title

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.reqContestNumberValue.row1"]')))
            offerid = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.reqContestNumberValue.row1"]'
            ).text
            print offerid

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.ID1493.row1"]')))
            description = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.ID1493.row1"]').text
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.ID1553.row1"]')))
            description += browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.ID1553.row1"]').text
            print description
        sleep(5)

        # ----------------------------- Europe -----------------------------
        url = 'http://eu.abinbevcareers.com/en/job-search.aspx?country=Any&keyword='
        browser.get(url)
        sleep(5)

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="lumesse-search-results"]')))
        div_str = browser.find_elements_by_xpath(
            '//div[@class="lumesse-search-results"]')[0].text
        # The results header carries the total number of jobs; ten jobs are
        # shown per page.
        num_pages = int(div_str.split(' ')[3])
        if num_pages % 10 != 0:
            num_pages = num_pages / 10 + 1
        else:
            num_pages = num_pages / 10
        sleep(5)

        # Job links
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="lumesse-search-results"]')))
        hrefss = self.myHREF(
            browser.find_element_by_xpath(
                '//div[@class="lumesse-search-results"]').get_attribute(
                    'outerHTML'))
        for item in hrefss:
            links.append('http://eu.abinbevcareers.com' + item)

        # Dates
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="job-listing-container-right"]')))
        for item in browser.find_elements_by_xpath(
                '//div[@class="job-listing-container-right"]'):
            print item.text.split(':')[1]

        # Titles
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((
                By.XPATH,
                '//div[@class="job-listing-container-left"]//div[@class="job-listing-title"]')))
        for item in browser.find_elements_by_xpath(
                '//div[@class="job-listing-container-left"]//div[@class="job-listing-title"]'
        ):
            print item.text

        # Locations
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((
                By.XPATH,
                '//div[@class="job-listing-container-left"]//div[@class="job-listing-key-details"]')))
        for item in (browser.find_elements_by_xpath(
                '//div[@class="job-listing-container-left"]//div[@class="job-listing-key-details"]'
        )):
            print item.text

        # Description: taken on the job-link page from
        # '//div[@class="lumesse-advert-row"]', split on 'About Us'.

        # Manual pause before paging through the remaining results.
        raw_input('Press Enter to continue')

        start = 2
        while start <= num_pages:
            myxpath = '//*[@id="btnPage' + str(start) + '"]'
            start += 1
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.XPATH, myxpath)))
            browser.find_element_by_xpath(myxpath).click()

            # Job links
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="lumesse-search-results"]')))
            hrefss = self.myHREF(
                browser.find_element_by_xpath(
                    '//div[@class="lumesse-search-results"]').get_attribute(
                        'outerHTML'))
            for item in hrefss:
                links.append('http://eu.abinbevcareers.com' + item)

            # Dates
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="job-listing-container-right"]')))
            for item in browser.find_elements_by_xpath(
                    '//div[@class="job-listing-container-right"]'):
                print item.text.split(':')[1]

            # Titles
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//div[@class="job-listing-container-left"]//div[@class="job-listing-title"]')))
            for item in browser.find_elements_by_xpath(
                    '//div[@class="job-listing-container-left"]//div[@class="job-listing-title"]'
            ):
                print item.text

            # Locations
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//div[@class="job-listing-container-left"]//div[@class="job-listing-key-details"]')))
            for item in (browser.find_elements_by_xpath(
                    '//div[@class="job-listing-container-left"]//div[@class="job-listing-key-details"]'
            )):
                print item.text

        # NOTE: `links` now also holds the Mexico and Brazil URLs, so this
        # loop revisits those pages too.
        for joblinkpage in links:
            browser.get(joblinkpage)
            sleep(5)
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="lumesse-advert-row"]')))
            description = (browser.find_elements_by_xpath(
                '//div[@class="lumesse-advert-row"]')[0].text
                           ).split('About Us')[0]
        sleep(5)

        # ------------------------------ Asia ------------------------------
        url = 'https://abinbev.taleo.net/careersection/15/jobsearch.ftl?lang=zh_CN'
        browser.get(url)
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((
                By.XPATH,
                '//*[@id="requisitionListInterface.reqTitleLinkAction.row1"]')))
        button = browser.find_element_by_xpath(
            '//*[@id="requisitionListInterface.reqTitleLinkAction.row1"]')
        button.click()
        sleep(5)

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((
                By.XPATH,
                '//*[@id="requisitionDescriptionInterface.pagerDivID835.Next"]')))
        div_str = (browser.find_element_by_xpath(
            '//*[@id="requisitionDescriptionInterface.pagerDivID835.Next"]').
                   get_attribute('outerHTML')).encode('ascii', 'ignore')

        while "pagerlinkoff" not in div_str:
            sleep(5)
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.reqPostingDate.row1"]')))
            date = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.reqPostingDate.row1"]'
            ).text

            # The location field
            # ('//*[@id="requisitionDescriptionInterface.ID1815.row1"]') is
            # not scraped; every job in this section is assumed to be in China.
            locations = 'China'

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.reqContestNumberValue.row1"]')))
            offerid = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.reqContestNumberValue.row1"]'
            ).text

            # The first paragraph of the description is used to guess a title.
            res = ''
            try:
                WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        '//*[@id="requisitionDescriptionInterface.ID1629.row1"]/div[1]')))
                res = (browser.find_element_by_xpath(
                    '//*[@id="requisitionDescriptionInterface.ID1629.row1"]/div[1]'
                ).text)
            except Exception:
                try:
                    WebDriverWait(browser, 10).until(
                        EC.presence_of_element_located((
                            By.XPATH,
                            '//*[@id="requisitionDescriptionInterface.ID1629.row1"]/p[1]')))
                    res = (browser.find_element_by_xpath(
                        '//*[@id="requisitionDescriptionInterface.ID1629.row1"]/p[1]'
                    ).text)
                except Exception:
                    pass

            try:
                title = ''
                if res == '':
                    title = 'NAN'
                if 'seeking ' in res:
                    title = (res.split('seeking ')[1]).split('to ')[0]
                elif 'seeking ' in (browser.find_element_by_xpath(
                        '//*[@id="requisitionDescriptionInterface.ID1629.row1"]/p[1]'
                ).text):
                    title = ((browser.find_element_by_xpath(
                        '//*[@id="requisitionDescriptionInterface.ID1629.row1"]/p[1]'
                    ).text).split('seeking ')[1]).split('to ')[0]
                else:
                    title = 'NAN'
            except Exception:
                pass

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.ID1629.row1"]')))
            description = browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.ID1629.row1"]').text

            # NOTE: as above, nothing is appended to the storage lists here.

            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.pagerDivID835.Next"]')))
            browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.pagerDivID835.Next"]'
            ).click()
            sleep(5)
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="requisitionDescriptionInterface.pagerDivID835.Next"]')))
            div_str = (browser.find_element_by_xpath(
                '//*[@id="requisitionDescriptionInterface.pagerDivID835.Next"]'
            ).get_attribute('outerHTML')).encode('ascii', 'ignore')

        # ------------------------- Assemble items -------------------------
        # NOTE: as written, the region blocks above never append to titles,
        # locations, categories, dates, offerids or descriptions, so this
        # loop runs zero times; each block must store its results before any
        # item can be yielded.
        for i in range(len(titles)):
            error_location = False
            error_category = False

            # Object to build the XML item
            item = BebeeItem()
            item['title'] = titles[i]
            item['offer_id'] = offerids[i]
            item['url'] = links[i]
            #item['date'] = dates[i]
            item['account_id'] = str(self.account_id)
            item['company_id'] = str(self.company_id)
            item['location_name'] = locations[i]
            item['category_name'] = categories[i]
            item['description'] = descriptions[i]

            language = langid.classify(item['description'])[0]
            if (language == 'en'):
                item['lang_code'] = 'en-US'
            elif (language == 'es'):
                item['lang_code'] = 'es-ES'
            elif (language == 'pt'):
                item['lang_code'] = 'pt-BR'
            else:
                item['lang_code'] = language

            # GEONAME MANAGEMENT
            try:
                item['geoname_id'] = self.geoCache.getGeonameId(locations[i])
                item['country_code'] = self.geoCache.getCountryCode(
                    locations[i])
            except Exception:
                error_message = "%s location not found in GeoName" % str(
                    locations[i])
                print error_message
                error_location = True
                self.beBeeLogger.failure(item['offer_id'], error_message)

            # CATEGORY MANAGEMENT
            category_id = self.categoryMapper(item['category_name'])
            if category_id:
                item['category_id'] = category_id
            else:
                error_message = "category not found: %s" % str(
                    item['category_name'])
                print error_message
                error_category = True
                self.beBeeLogger.failure(item['offer_id'], error_message)

            # Count successful jobs.
            if not (error_location or error_category):
                self.beBeeLogger.success(item['offer_id'])

            # Print progress.
            if ((i % 100) == 0):
                print "-------------------"
                print "Jobs crawled: " + str(i)
                self.beBeeLogger.progress()

            yield item

    # category_id mapper function
    def categoryMapper(self, category_name):
        if category_name in self.categories:
            category_id = self.categories[category_name]
            return category_id
        else:
            # Save the unique set of missing categories.
            self.uniqueCategoriesSet.add(category_name)
            fset = open('crawling/spiders/ab_inbevCategoriesMissing.json', 'w')
            json.dump(list(self.uniqueCategoriesSet), fset)
            fset.close()
            return None
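# ---------------------------------------------------------------------------
# Hedged sketch, offered as an alternative to the character-by-character scan
# in AB_INBEV_SPIDER.myHREF() above: Scrapy already depends on lxml, so the
# same extraction can be done with a real HTML parser, which also copes with
# single-quoted or unquoted attributes. extract_hrefs() is a hypothetical
# helper, not something the spider calls.
from lxml import html as lxml_html

def extract_hrefs(htmlcode):
    """Return all non-javascript href values found in an HTML fragment."""
    tree = lxml_html.fromstring(htmlcode)
    return [href for href in tree.xpath('//@href')
            if 'javascript' not in href]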
class NestleJobsSpider(scrapy.Spider):
    name = "nestleGetItems"
    allowed_domains = ["taleo.net"]
    start_urls = ('https://nestle.taleo.net/careersection/3/jobsearch.ftl?', )

    # Override settings
    # http://stackoverflow.com/questions/25353650/scrapy-how-to-import-the-settings-to-override-it
    def set_crawler(self, crawler):
        super(NestleJobsSpider, self).set_crawler(crawler)

        # Get the BEBEE configuration parameters from the .ini file.
        # The second-level configuration file takes precedence over settings.py.
        config = ConfigParser.ConfigParser()
        if config.read('./crawling/spiders/' + self.name + '.ini'):
            for name, value in config.items('DEFAULT'):
                crawler.settings.set(name.upper(), value)
        else:
            # No .ini configuration file: fall back to the defaults below.
            print "WARNING: no %s.ini config, using default values" % self.name

        # BEBEE configuration parameters
        self.page_index = crawler.settings.getint('BEBEE_SPIDER_FIRST_PAGE', 1)
        self.stop_index = crawler.settings.getint('BEBEE_SPIDER_LAST_PAGE', 1)
        self.max_jobs = crawler.settings.getint('BEBEE_SPIDER_MAX_ITEMS', 3)
        self.delay_crawl_page = crawler.settings.getint(
            'BEBEE_SPIDER_CRAWL_DELAY_PAGE', 5)
        self.delay_crawl_job = crawler.settings.getint(
            'BEBEE_SPIDER_CRAWL_DELAY_ITEM', 1)
        self.max_execution_time = crawler.settings.getint(
            'BEBEE_SPIDER_MAX_EXECUTION_TIME', 1800)
        self.account_id = crawler.settings.get('BEBEE_SPIDER_ACCOUNT_ID', '0')
        self.company_id = crawler.settings.get('BEBEE_SPIDER_COMPANY_ID', '')

        # Logger start. This code needs account_id.
        self.beBeeLogger = BebeeLogger(account_id=self.account_id,
                                       botName=self.name)
        self.beBeeLogger.init()

    def __init__(self):
        # Signal for the closing method.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        # Selenium driver
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1024, 768)

        # Mapper for geoname_id and country_code
        self.geoCache = MapperGeoCache()

        # Set of unique (unmapped) categories
        self.uniqueCategoriesSet = set()

        # Erase old missing categories
        fset = open('crawling/spiders/nestleCategoriesMissing.json', 'w')
        json.dump(list(self.uniqueCategoriesSet), fset)
        fset.close()

        # Load the dict for the category mapper.
        # Change this filename in each spider class.
        with open('crawling/spiders/nestleCategoriesMap.json') as data_file:
            self.categories = json.load(data_file)

        # For counting elapsed time
        self.start_time = time()

    def spider_closed(self):
        # Close the Selenium driver to avoid leaving PhantomJS processes running.
        self.driver.close()

        # Save the unique set of missing categories.
        # Change this filename in each spider class.
        fset = open('crawling/spiders/nestleCategoriesMissing.json', 'w')
        json.dump(list(self.uniqueCategoriesSet), fset)
        fset.close()

        # Log end
        self.beBeeLogger.end()

    def parse(self, response):
        # Storage for scraped data
        links = []
        titles = []
        locations = []
        categories = []

        # Total job links crawled
        totalJobs = 0

        print response.url

        # ------------- DRIVER TO CRAWL RESULTS AND GET LINKS -------------
        self.driver.get(response.url)

        # Advance to the first requested page: page_index - 1 clicks on
        # "next". For page_index == 1 this loop does not run.
        for i in range(1, self.page_index):
            try:
                nextButton = self.driver.find_elements_by_xpath(
                    "//a[@id='next']")[0]
                print "On page: " + str(i)
                nextButton.click()
                WebDriverWait(self.driver, 50).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//a[@id='next']")))
            except Exception:
                print "------------------------------------------"
                print "No pages found. Program will end in error."
                print "------------------------------------------"

        # Wait for the page to load (also covers page_index == 1, where the
        # loop above did not run). Check for the presence of the first job;
        # XPath indexing is 1-based.
        print "Load first page"
        WebDriverWait(self.driver, 50).until(
            EC.presence_of_element_located((
                By.XPATH,
                "(//ul[@id='jobList']/li/div[@class='multiline-data-container']/div/div/a)[1]"
            )))

        # Results loop
        while True:
            print "-----------------------------------------"
            print "------------- NEW PAGE: %s --------------" % (
                self.page_index)

            # Get links and data.
            linkElems = self.driver.find_elements_by_xpath(
                "//ul[@id='jobList']/li/div[@class='multiline-data-container']/div/div/a"
            )
            print "{} links obtained".format(len(linkElems))

            for i in range(len(linkElems)):
                print "job {}/{}".format(i + 1, len(linkElems))
                error_location = False
                error_category = False

                # Re-locate the link: the DOM is reloaded after each back().
                link = self.driver.find_elements_by_xpath(
                    "//ul[@id='jobList']/li/div[@class='multiline-data-container']/div/div/a"
                )[i]

                item = BebeeItem()
                item['url'] = link.get_attribute("href")
                item['offer_id'] = re.search("job=([0-9]+w*)",
                                             item['url']).group(1)
                item['account_id'] = str(self.account_id)
                item['company_id'] = str(self.company_id)

                # Click the link to get the data on its page.
                link.click()

                # Wait until the page loads.
                print "Wait for description to load"
                WebDriverWait(self.driver, 50).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        "(//span[@id='requisitionDescriptionInterface.reqTitleLinkAction.row1'])[1]"
                    )))
                item['title'] = self.driver.find_elements_by_xpath(
                    "//span[@id='requisitionDescriptionInterface.reqTitleLinkAction.row1']"
                )[0].text

                WebDriverWait(self.driver, 50).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        "(//span[@id='requisitionDescriptionInterface.ID1644.row1'])[1]"
                    )))
                item['description'] = self.driver.find_elements_by_xpath(
                    "//span[@id='requisitionDescriptionInterface.ID1644.row1']"
                )[0].text

                language = langid.classify(item['description'])[0]
                if (language == 'en'):
                    item['lang_code'] = 'en-US'
                elif (language == 'es'):
                    item['lang_code'] = 'es-ES'
                elif (language == 'pt'):
                    item['lang_code'] = 'pt-BR'
                else:
                    item['lang_code'] = language

                WebDriverWait(self.driver, 50).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        "(//span[@id='requisitionDescriptionInterface.ID1712.row1'])[1]"
                    )))
                item['location_name'] = self.driver.find_elements_by_xpath(
                    "//span[@id='requisitionDescriptionInterface.ID1712.row1']"
                )[0].text

                WebDriverWait(self.driver, 50).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        "(//span[@id='requisitionDescriptionInterface.ID1762.row1'])[1]"
                    )))
                item['category_name'] = self.driver.find_elements_by_xpath(
                    "//span[@id='requisitionDescriptionInterface.ID1762.row1']"
                )[0].text

                WebDriverWait(self.driver, 50).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        "(//span[@id='requisitionDescriptionInterface.reqPostingDate.row1'])[1]"
                    )))
                item['date'] = self.driver.find_elements_by_xpath(
                    "//span[@id='requisitionDescriptionInterface.reqPostingDate.row1']"
                )[0].text

                # GEONAME MANAGEMENT
                try:
                    item['geoname_id'] = self.geoCache.getGeonameId(
                        item['location_name'])
                    item['country_code'] = self.geoCache.getCountryCode(
                        item['location_name'])
                except Exception:
                    error_message = "%s location not found in GeoName" % item[
                        'location_name']
                    print error_message
                    error_location = True
                    self.beBeeLogger.failure(item['offer_id'], error_message)

                # CATEGORY MANAGEMENT
                category_id = self.categoryMapper(item['category_name'])
                if category_id:
                    item['category_id'] = category_id
                else:
                    error_message = "category not found: %s" % item[
                        'category_name']
                    print error_message
                    error_category = True
                    self.beBeeLogger.failure(item['offer_id'], error_message)

                # Count successful jobs.
                if not (error_location or error_category):
                    self.beBeeLogger.success(item['offer_id'])

                # Print progress.
                if ((i % 100) == 0):
                    print "-------------------"
                    print "Jobs crawled: " + str(i)
                    self.beBeeLogger.progress()

                yield item

                totalJobs += 1
                if (self.max_jobs <= totalJobs):
                    break
                # Stop after self.max_execution_time seconds.
                if (self.max_execution_time <= (time() - self.start_time)):
                    break

                self.driver.back()
                print "Wait to go back to joblist"
                WebDriverWait(self.driver, 50).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        "(//ul[@id='jobList']/li/div[@class='multiline-data-container']/div/div/a)["
                        + str(i + 1) + "]")))

            print "--> %s links possible" % (len(linkElems))
            print "--> %s links obtained" % (totalJobs)

            # Results loop stop condition
            if (self.page_index == self.stop_index) or (
                    self.max_jobs <= totalJobs) or (
                    self.max_execution_time <= (time() - self.start_time)):
                break

            # Go to the next results page.
            nextButton = self.driver.find_elements_by_xpath(
                "//a[@id='next']")[0]
            nextButton.click()
            print "Wait to go to next page"
            WebDriverWait(self.driver, 50).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    "(//ul[@id='jobList']/li/div[@class='multiline-data-container']/div/div/a)[1]"
                )))
            self.page_index += 1

    # category_id mapper function
    def categoryMapper(self, category_name):
        if category_name in self.categories:
            category_id = self.categories[category_name]
            return category_id
        else:
            # Save the unique set of missing categories.
            self.uniqueCategoriesSet.add(category_name)
            fset = open('crawling/spiders/nestleCategoriesMissing.json', 'w')
            json.dump(list(self.uniqueCategoriesSet), fset)
            fset.close()
            return None
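# ---------------------------------------------------------------------------
# Usage note (a sketch, assuming the standard Scrapy CLI and the .ini layout
# that set_crawler() reads above): each spider runs by name, and per-spider
# limits come from ./crawling/spiders/<name>.ini, whose [DEFAULT] keys are
# upper-cased into Scrapy settings. The file contents below are illustrative
# only.
#
#   scrapy crawl nestleGetItems
#
# with crawling/spiders/nestleGetItems.ini containing, for example:
#
#   [DEFAULT]
#   bebee_spider_first_page = 1
#   bebee_spider_last_page = 5
#   bebee_spider_max_items = 100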