from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters


def scrape(query, numresults):
    scraper = LinkedinScraper(
        chrome_executable_path=None,  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_options=None,  # Custom Chrome options here
        headless=False,  # Overrides headless mode only if chrome_options is None
        max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=1,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        Query(
            query=query,
            options=QueryOptions(
                locations=['Montreal'],
                optimize=True,  # Blocks requests for resources like images and stylesheets
                limit=numresults,  # Limit the number of jobs to scrape
            )
        ),
        # Query(
        #     query='database',
        #     options=QueryOptions(
        #         locations=['United States'],
        #         optimize=False,
        #         limit=5,
        #         filters=QueryFilters(
        #             # company_jobs_url='https://www.linkedin.com/jobs/search/?geoId=101174742&keywords=amazon&location=Canada',  # Filter by companies
        #             relevance=RelevanceFilters.RECENT,
        #             time=TimeFilters.MONTH,
        #             type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
        #             experience=None,
        #         )
        #     )
        # ),
    ]

    scraper.run(queries)
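# scrape() registers on_data/on_error/on_end without defining them in this fragment.
# A minimal sketch of those handlers (the printed fields come from the library's EventData;
# the exact formatting is an assumption), plus an example call with arbitrary arguments:

def on_data(data: EventData):
    # Print a short summary of each scraped job posting
    print('[ON_DATA]', data.title, data.company, data.place, data.date, data.link)


def on_error(error):
    print('[ON_ERROR]', error)


def on_end():
    print('[ON_END]')


# Example invocation (query string and result count are arbitrary):
# scrape('Data Analyst', 25)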
def on_data(data: EventData):
    global df
    dictTemp = {
        "JobID": [data.job_id],
        "Company": [data.company],
        "Title": [data.title],
        "Place": [data.place],
        "Date": [data.date],
        "SeniorityLevel": [data.seniority_level],
        "JobFunction": [data.job_function],
        "EmployementType": [data.employment_type],
        "Industries": [data.industries],
        "description": [data.description],
    }
    dfTemp = pd.DataFrame(dictTemp)
    # DataFrame.append() was removed in pandas 2.0; concat is the supported equivalent
    df = pd.concat([df, dfTemp], ignore_index=True)


def on_error(error):
    print('[ON_ERROR]', error)


def on_end():
    print('[ON_END]')


scraper = LinkedinScraper(
    chrome_executable_path=r'C:\chromedriver_win32\chromedriver.exe',  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
    chrome_options=None,  # Custom Chrome options here
    headless=False,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=1.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data Engineer',
        options=QueryOptions(
            locations=['indonesia', 'vietnam ', 'china', 'singapore', 'korea selatan', 'japan', 'Hong Kong SAR', 'taiwan'],
        )
    ),
]

scraper.run(queries)
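# The on_data handler above appends rows to a module-level DataFrame that is never created in
# this fragment; it would need to exist before scraper.run() fires the first DATA event.
# A minimal initialization (column names copied from dictTemp; placement is an assumption):

import pandas as pd

df = pd.DataFrame(columns=[
    "JobID", "Company", "Title", "Place", "Date",
    "SeniorityLevel", "JobFunction", "EmployementType", "Industries", "description",
])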
for counter in range(2):
    if counter == 0:
        for search in searches:
            title = []
            company = []
            date = []
            link = []
            industry = []
            occupation = []
            jobType = []

            scraper = LinkedinScraper(
                chrome_options=None,  # You can pass your custom Chrome options here
                max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
                slow_mo=1.2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
            )

            # Add event listeners
            scraper.on(Events.DATA, on_data)
            scraper.on(Events.ERROR, on_error)
            scraper.on(Events.END, on_end)

            queries = [
                Query(options=QueryOptions(
                    optimize=True,  # Blocks requests for resources like images and stylesheets
                    limit=0  # Limit the number of jobs to scrape
                )),
            ]

            scraper.run(queries)
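# The per-search lists declared above (title, company, date, ...) are presumably filled by the
# on_data callback registered on the scraper, which is outside this fragment. A rough sketch of
# such a callback (the field-to-list mapping is an assumption, not taken from the original):

def on_data(data: EventData):
    title.append(data.title)
    company.append(data.company)
    date.append(data.date)
    link.append(data.link)
    industry.append(data.industries)
    occupation.append(data.job_function)
    jobType.append(data.employment_type)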
def test_run():
    # Change other logger levels
    logging.getLogger('urllib3').setLevel(logging.WARN)
    logging.getLogger('selenium').setLevel(logging.WARN)

    scraper = LinkedinScraper(
        chrome_executable_path=None,
        chrome_options=None,
        headless=True,
        max_workers=1,
        slow_mo=1,
    )

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.INVALID_SESSION, on_invalid_session)
    scraper.on(Events.END, on_end)

    queries = [
        Query(),
        Query(
            query='c#',
            options=QueryOptions(
                locations=['Finland'],
                optimize=False,
                limit=33,
                filters=QueryFilters(
                    time=TimeFilters.WEEK,
                    experience=ExperienceLevelFilters.MID_SENIOR,
                ))),
        Query(
            query='Engineer',
            options=QueryOptions(
                locations=['United States'],
                optimize=False,
                limit=27,
                filters=QueryFilters(
                    company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',
                    time=TimeFilters.MONTH,
                    type=[
                        TypeFilters.FULL_TIME,
                        TypeFilters.INTERNSHIP,
                        TypeFilters.CONTRACT
                    ]))),
    ]

    scraper.run(
        queries=queries,
        # Global options
        options=QueryOptions(
            locations=['United Kingdom'],
            limit=10,
            optimize=False,
        ))
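# test_run() assumes on_data, on_error, on_invalid_session and on_end exist elsewhere in the
# test module. A minimal set of stubs; handler signatures are assumptions except where the
# same names appear in the neighbouring fragments:

def on_data(data: EventData):
    print('[ON_DATA]', data.query, data.title, data.company, data.date, data.link)


def on_invalid_session():
    print('[ON_INVALID_SESSION]')


def on_error(error):
    print('[ON_ERROR]', error)


def on_end():
    print('[ON_END]')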
def scrapeLinkedinJobs(industries):
    scraper = LinkedinScraper(
        # chrome_executable_path='D:/chromedriver.exe',  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_executable_path='C:/Users/iyeng/Desktop/NTU/NTU Sem 4/CZ2006/JobsUpply/JobsUpply/chromedriver.exe',
        chrome_options=None,  # Custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=len(industries),  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    queries = []
    for i in range(len(industries)):
        paramQ = Query(
            query=industries[i],
            options=QueryOptions(
                locations=['Singapore'],
                optimize=True,
                limit=6,
                filters=QueryFilters(
                    company_jobs_url=None,  # Filter by companies
                    relevance=RelevanceFilters.RECENT,
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME],
                    experience=None,
                )))
        queries.append(paramQ)

    JobList = {}

    def on_data(data: EventData):
        jobData = {}
        jobData["title"] = data.title
        jobData["company"] = data.company
        jobData["place"] = data.place
        jobData["description"] = data.description
        jobData["linkedinUrl"] = data.link
        jobData["descriptionHTML"] = data.description_html
        jobData["employmentType"] = data.employment_type
        jobData["applyUrl"] = data.apply_link
        jobData["date"] = data.date
        jobData["seniority"] = data.seniority_level
        jobData["jobFunction"] = data.job_function
        jobData["industries"] = data.industries
        jobData["skills"] = json.loads(extract_skills_from_document(data.description))
        if data.query not in JobList.keys():
            JobList[data.query] = []
            JobList[data.query].append(jobData)
        else:
            JobList[data.query].append(jobData)
        del data
        del jobData

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    scraper.run(queries)

    JobList = [{"queryText": q, "jobList": JobList[q]} for q in JobList.keys()]
    return JobList
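# extract_skills_from_document() is not defined in this fragment; scrapeLinkedinJobs() only
# requires it to return a JSON-encoded list of skills. A placeholder keeps the snippet
# runnable (the keyword list and the matching logic are purely illustrative assumptions):

import json


def extract_skills_from_document(description: str) -> str:
    # Hypothetical stand-in: naive keyword matching over the job description text
    keywords = ['python', 'sql', 'java', 'aws', 'docker']
    found = [kw for kw in keywords if kw in description.lower()]
    return json.dumps(found)


# Example call (industry names are arbitrary):
# jobs = scrapeLinkedinJobs(['Information Technology', 'Banking'])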
def on_data(data: EventData):
    print('[ON_DATA]', data.title, data.company, data.date, data.link, len(data.description))


def on_error(error):
    print('[ON_ERROR]', error)


def on_end():
    print('[ON_END]')


scraper = LinkedinScraper(
    chrome_executable_path=None,
    chrome_options=None,  # Custom Chrome options here
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,
    slow_mo=1,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data',
        options=QueryOptions(
            locations=['Belgium'],
            optimize=True,
        )
    ),
]

scraper.run(queries)
import json
import logging

import requests
from bs4 import BeautifulSoup


def linkedinsc():
    with open('C:/Projects/itjobseeker/public/jsondata/linkedin.json', 'r') as readfile:
        try:
            jsondata = json.load(readfile)
            stored_links = []
            for single_data in jsondata:
                stored_links.append(single_data['Page_URL'])
        except:
            jsondata = []
            stored_links = []

    # Change root logger level (default is WARN)
    logging.basicConfig(level=logging.INFO)

    def on_data(data: EventData):
        # print('[ON_DATA]', data.title, data.company, data.date, data.link, data.seniority_level, data.employment_type)
        link = data.link
        link = link.split('?', 1)[0]
        if link not in stored_links:
            stored_links.append(link)
            print("NEW JOB FOUND !!!", link)
            source = requests.get(data.link).text
            soup = BeautifulSoup(source, 'lxml')
            desct = soup.find('main', class_='main').get_text(strip=True)
            jsondata.append({
                'name': data.title,
                'company': data.company,
                'address': data.place,
                'deadline': data.date,
                'time': data.employment_type,
                'Page_URL': link,
                'desct': desct,
                'websitename': 'np.linkedin.com'
            })

    def on_error(error):
        print('[ON_ERROR]', error)

    def on_end():
        print('[ON_END]')

    scraper = LinkedinScraper(
        chrome_options=None,  # You can pass your custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
        slow_mo=1.5,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        # Query(
        #     options=QueryOptions(
        #         optimize=True,  # Blocks requests for resources like images and stylesheets
        #         limit=50  # Limit the number of jobs to scrape
        #     )
        # ),
        Query(
            query='it',
            options=QueryOptions(
                locations=['Nepal'],
                optimize=True,
                limit=70,
                # filters=QueryFilters(
                #     company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies
                #     relevance=RelevanceFilters.RECENT,
                #     time=TimeFilters.MONTH,
                #     type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                #     experience=None,
                # )
            ))
    ]

    scraper.run(queries)

    with open('C:/Projects/itjobseeker/public/jsondata/linkedin.json', 'w') as outfile:
        json.dump(jsondata, outfile)

    print("linkedin done")