def test_run():
    # Change other logger levels
    logging.getLogger('urllib3').setLevel(logging.WARN)
    logging.getLogger('selenium').setLevel(logging.WARN)

    scraper = LinkedinScraper(
        chrome_executable_path=None,
        chrome_options=None,
        headless=True,
        max_workers=1,
        slow_mo=1,
    )

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.INVALID_SESSION, on_invalid_session)
    scraper.on(Events.END, on_end)

    queries = [
        Query(),
        Query(
            query='c#',
            options=QueryOptions(
                locations=['Finland'],
                optimize=False,
                apply_link=True,
                limit=33,
                filters=QueryFilters(
                    time=TimeFilters.WEEK,
                    experience=ExperienceLevelFilters.MID_SENIOR,
                )
            )
        ),
        Query(
            query='Engineer',
            options=QueryOptions(
                locations=['United States'],
                optimize=False,
                limit=27,
                filters=QueryFilters(
                    company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP, TypeFilters.CONTRACT]
                )
            )
        ),
    ]

    scraper.run(
        queries=queries,
        # Global options
        options=QueryOptions(
            locations=['United Kingdom'],
            limit=10,
            optimize=True,
        )
    )
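# The snippet above assumes the event handlers are defined elsewhere. A minimal
# sketch, with field names taken from the EventData usage in the handlers
# further down this section:
def on_data(data: EventData):
    print('[ON_DATA]', data.title, data.company, data.place, data.date, data.link)

def on_error(error):
    print('[ON_ERROR]', error)

def on_invalid_session():
    print('[ON_INVALID_SESSION]')

def on_end():
    print('[ON_END]')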
def scrape(query, numresults):
    scraper = LinkedinScraper(
        chrome_executable_path=None,  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_options=None,  # Custom Chrome options here
        headless=False,  # Overrides headless mode only if chrome_options is None
        max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=1,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        Query(
            query=query,
            options=QueryOptions(
                locations=['Montreal'],
                optimize=True,  # Blocks requests for resources like images and stylesheets
                limit=numresults  # Limit the number of jobs to scrape
            )
        ),
        # Query(
        #     query='database',
        #     options=QueryOptions(
        #         locations=['United States'],
        #         optimize=False,
        #         limit=5,
        #         filters=QueryFilters(
        #             # company_jobs_url='https://www.linkedin.com/jobs/search/?geoId=101174742&keywords=amazon&location=Canada',  # Filter by companies
        #             relevance=RelevanceFilters.RECENT,
        #             time=TimeFilters.MONTH,
        #             type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
        #             experience=None,
        #         )
        #     )
        # ),
    ]

    scraper.run(queries)
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data Engineer',
        options=QueryOptions(
            locations=['indonesia', 'vietnam', 'china', 'singapore', 'south korea', 'japan', 'Hong Kong SAR', 'taiwan'],
            optimize=False,
            limit=940,
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.DAY,
            )
        )
    ),
    Query(
        query='Data Scientist',
        options=QueryOptions(
            locations=['indonesia', 'vietnam', 'china', 'singapore', 'south korea', 'japan', 'Hong Kong SAR', 'taiwan'],
            optimize=False,
            limit=940,
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.DAY,
            )
        )
    ),
]

scraper.run(queries)
    chrome_options=None,  # You can pass your custom Chrome options here
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=1.2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(options=QueryOptions(
        optimize=True,  # Blocks requests for resources like images and stylesheets
        limit=0  # Limit the number of jobs to scrape
    )),
    Query(query=search, options=QueryOptions(
        locations=['Toronto, Ontario, Canada'],
        optimize=True,
        limit=400,
        filters=QueryFilters(
            relevance=RelevanceFilters.RELEVANT,
            time=TimeFilters.WEEK,
            type=[TypeFilters.FULL_TIME],
        ))),
]

scraper.run(queries)
def scrapeLinkedinJobs(industries):
    scraper = LinkedinScraper(
        # chrome_executable_path='D:/chromedriver.exe',  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_executable_path='C:/Users/iyeng/Desktop/NTU/NTU Sem 4/CZ2006/JobsUpply/JobsUpply/chromedriver.exe',
        chrome_options=None,  # Custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=len(industries),  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    queries = []
    for industry in industries:
        paramQ = Query(
            query=industry,
            options=QueryOptions(
                locations=['Singapore'],
                optimize=True,
                limit=6,
                filters=QueryFilters(
                    company_jobs_url=None,  # Filter by companies
                    relevance=RelevanceFilters.RECENT,
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME],
                    experience=None,
                )))
        queries.append(paramQ)

    JobList = {}

    def on_data(data: EventData):
        jobData = {}
        jobData["title"] = data.title
        jobData["company"] = data.company
        jobData["place"] = data.place
        jobData["description"] = data.description
        jobData["linkedinUrl"] = data.link
        jobData["descriptionHTML"] = data.description_html
        jobData["employmentType"] = data.employment_type
        jobData["applyUrl"] = data.apply_link
        jobData["date"] = data.date
        jobData["seniority"] = data.seniority_level
        jobData["jobFunction"] = data.job_function
        jobData["industries"] = data.industries
        jobData["skills"] = json.loads(extract_skills_from_document(data.description))
        # Group results by query text
        JobList.setdefault(data.query, []).append(jobData)

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)
    scraper.run(queries)

    JobList = [{"queryText": q, "jobList": JobList[q]} for q in JobList.keys()]
    return JobList
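# A minimal usage sketch for the function above; the industry names and the
# output path are illustrative assumptions, not from the original source:
if __name__ == '__main__':
    results = scrapeLinkedinJobs(['Banking', 'Software Development'])  # hypothetical industries
    with open('jobs.json', 'w') as f:  # hypothetical output file
        json.dump(results, f, indent=2)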
scraper = LinkedinScraper(
    chrome_options=None,  # You can pass your custom Chrome options here
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=1,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        options=QueryOptions(
            optimize=True,  # Blocks requests for resources like images and stylesheets
            limit=27  # Limit the number of jobs to scrape
        )
    ),
    Query(
        query='Engineer',
        options=QueryOptions(
            locations=['United States'],
            optimize=False,
            limit=5,
            filters=QueryFilters(
                company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                experience=None,
            )
        )
    ),
]

scraper.run(queries)
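# Every snippet in this section passes chrome_options=None. A sketch of
# supplying custom options instead, using selenium's standard Options class;
# the specific flags below are illustrative assumptions:
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')        # set headless here, since the scraper's own flag only applies when chrome_options is None
chrome_options.add_argument('--no-sandbox')      # often needed when running inside containers
chrome_options.add_argument('--window-size=1920,1080')

scraper = LinkedinScraper(
    chrome_options=chrome_options,
    max_workers=1,
    slow_mo=1,
)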
scraper = LinkedinScraper(
    chrome_options=None,  # You can pass your custom Chrome options here
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=1.5,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        options=QueryOptions(
            optimize=True,  # Blocks requests for resources like images and stylesheets
            limit=0  # Limit the number of jobs to scrape
        )
    ),
    Query(
        query=search,
        options=QueryOptions(
            locations=['Canada'],
            optimize=True,
            limit=50,  # full run: 500
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.WEEK,
                type=[TypeFilters.FULL_TIME],
            )
        )
    ),
]

scraper.run(queries)
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data',
        options=QueryOptions(
            locations=['Belgium'],
            optimize=True,
            limit=10,
            filters=QueryFilters(
                company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1508%2C6754%2C3880216%2C631981%2C166278%2C963211%2C3625182%2C256009%2C157326%2C282760%2C3627928%2C1519%2C281207%2C18735883%2C10070%2C98774%2C15245937%2C3683364%2C251838%2C2642837&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME, TypeFilters.TEMPORARY],
                experience=[ExperienceLevelFilters.ENTRY_LEVEL, ExperienceLevelFilters.MID_SENIOR],
            )
        )
    ),
]

scraper.run(queries)

## Currently not working (i.e. no data export)
results = scraper.run(queries)
import csv
with open("jobs.csv", "a") as csvfile:
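# The export attempt above likely fails because the scraper delivers results
# through the DATA event rather than as a return value of scraper.run(). A
# sketch of a working export under that assumption: collect rows in the
# on_data handler and write them once scraping ends.
import csv

rows = []

def on_data(data: EventData):
    rows.append([data.title, data.company, data.place, data.date, data.link])

def on_end():
    with open('jobs.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(rows)

scraper.on(Events.DATA, on_data)
scraper.on(Events.END, on_end)
scraper.run(queries)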
def linkedinsc():
    with open('C:/Projects/itjobseeker/public/jsondata/linkedin.json', 'r') as readfile:
        try:
            jsondata = json.load(readfile)
            stored_links = []
            for single_data in jsondata:
                stored_links.append(single_data['Page_URL'])
        except Exception:
            jsondata = []
            stored_links = []

    # Change root logger level (default is WARN)
    logging.basicConfig(level=logging.INFO)

    def on_data(data: EventData):
        # print('[ON_DATA]', data.title, data.company, data.date, data.link, data.seniority_level, data.employment_type)
        link = data.link
        link = link.split('?', 1)[0]
        if link not in stored_links:
            stored_links.append(link)
            print("NEW JOB FOUND !!!", link)
            source = requests.get(data.link).text
            soup = BeautifulSoup(source, 'lxml')
            desct = soup.find('main', class_='main').get_text(strip=True)
            jsondata.append({
                'name': data.title,
                'company': data.company,
                'address': data.place,
                'deadline': data.date,
                'time': data.employment_type,
                'Page_URL': link,
                'desct': desct,
                'websitename': 'np.linkedin.com'
            })

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper = LinkedinScraper(
        chrome_options=None,  # You can pass your custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=1.5,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        # Query(
        #     options=QueryOptions(
        #         optimize=True,  # Blocks requests for resources like images and stylesheets
        #         limit=50  # Limit the number of jobs to scrape
        #     )
        # ),
        Query(
            query='it',
            options=QueryOptions(
                locations=['Nepal'],
                optimize=True,
                limit=70,
                # filters=QueryFilters(
                #     company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies
                #     relevance=RelevanceFilters.RECENT,
                #     time=TimeFilters.MONTH,
                #     type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                #     experience=None,
                # )
            ))
    ]

    scraper.run(queries)

    with open('C:/Projects/itjobseeker/public/jsondata/linkedin.json', 'w') as outfile:
        json.dump(jsondata, outfile)
    print("linkedin done")
    chrome_options=None,  # You can pass your custom Chrome options here
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=0.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(options=QueryOptions(
        optimize=True,  # Blocks requests for resources like images and stylesheets
        limit=0  # Limit the number of jobs to scrape
    )),
    Query(query='Engineer', options=QueryOptions(
        locations=['Toronto, Ontario, Canada'],
        optimize=False,
        limit=5,
        filters=QueryFilters(
            relevance=RelevanceFilters.RECENT,
            time=TimeFilters.MONTH,
            type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
            experience=None,
        ))),
]

scraper.run(queries)
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=5.0,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    # slow_mo=0.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Machine Learning Engineer',
        options=QueryOptions(
            locations=['Bengaluru, Karnataka, India'],
            optimize=True,
            limit=1000,
            filters=QueryFilters(
                # company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.MONTH,
                # type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                experience=None,
            )
        )
    ),
]

scraper.run(queries)
    chrome_options=None,  # You can pass your custom Chrome options here
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=0.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
# Event fired for each job posting detected
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        # The category you are searching for
        query='Engineer',
        options=QueryOptions(
            # The country to search in
            locations=['United States'],
            # Whether to block resources such as images or CSS
            optimize=False,
            # Sets the number of results
            limit=5)),
]

scraper.run(queries)
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

limit = 9000

queries = [
    Query(query='Cloud Engineer',
          options=QueryOptions(locations=['Canada', 'United States'],
                               optimize=False,
                               limit=limit,
                               filters=QueryFilters(
                                   relevance=RelevanceFilters.RECENT,
                                   time=TimeFilters.MONTH,
                                   experience=[
                                       ExperienceLevelFilters.INTERNSHIP,
                                       ExperienceLevelFilters.ASSOCIATE,
                                       ExperienceLevelFilters.ENTRY_LEVEL
                                   ]))),
    Query(query='Cloud architect',
          options=QueryOptions(locations=['Canada', 'United States'],
                               optimize=False,
                               limit=limit,
                               filters=QueryFilters(
                                   relevance=RelevanceFilters.RECENT,
                                   time=TimeFilters.MONTH,
                                   experience=[
                                       ExperienceLevelFilters.INTERNSHIP,
                                       ExperienceLevelFilters.ASSOCIATE,
                                       ExperienceLevelFilters.ENTRY_LEVEL
                                   ]))),
]

scraper.run(queries)
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='human resources',
        options=QueryOptions(
            locations=['Sharnbrook, England, United Kingdom'],
            optimize=False,
            limit=1000,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.DAY,
                type=[TypeFilters.FULL_TIME, TypeFilters.CONTRACT, TypeFilters.TEMPORARY],
                experience=ExperienceLevelFilters.ENTRY_LEVEL,
            )
        )
    ),
    Query(
        query='human resources',
        options=QueryOptions(
            locations=['United Kingdom'],
            optimize=False,
            limit=1000,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.DAY,
            )
        )
    ),
]

scraper.run(queries)
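# The two queries above overlap (Sharnbrook is in the United Kingdom), so the
# same posting can arrive twice. A sketch of deduplicating by job link inside
# the DATA handler, following the same pattern as the linkedinsc() snippet
# earlier in this section:
seen_links = set()

def on_data(data: EventData):
    link = data.link.split('?', 1)[0]  # strip tracking query parameters
    if link in seen_links:
        return
    seen_links.add(link)
    print('[ON_DATA]', data.title, data.company, link)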