def test_run():
    """Run three sample queries through a headless, single-worker scraper.

    Lowers the verbosity of the ``urllib3`` and ``selenium`` loggers, wires
    the module-level ``on_data`` / ``on_error`` / ``on_invalid_session`` /
    ``on_end`` handlers to their events, and then runs an empty query plus
    two filtered ones together with a set of options passed to ``run()``.
    """
    # Quieten the third-party loggers.
    for noisy_logger in ('urllib3', 'selenium'):
        logging.getLogger(noisy_logger).setLevel(logging.WARN)

    linkedin = LinkedinScraper(
        chrome_executable_path=None,
        chrome_options=None,
        headless=True,
        max_workers=1,
        slow_mo=1,
    )

    # Attach every handler in one pass; registration order is unchanged.
    handlers = (
        (Events.DATA, on_data),
        (Events.ERROR, on_error),
        (Events.INVALID_SESSION, on_invalid_session),
        (Events.END, on_end),
    )
    for event, handler in handlers:
        linkedin.on(event, handler)

    # Query with no arguments of its own.
    bare_query = Query()

    csharp_filters = QueryFilters(
        time=TimeFilters.WEEK,
        experience=ExperienceLevelFilters.MID_SENIOR,
    )
    csharp_query = Query(
        query='c#',
        options=QueryOptions(
            locations=['Finland'],
            optimize=False,
            apply_link=True,
            limit=33,
            filters=csharp_filters,
        ),
    )

    engineer_filters = QueryFilters(
        company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',
        time=TimeFilters.MONTH,
        type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP, TypeFilters.CONTRACT],
    )
    engineer_query = Query(
        query='Engineer',
        options=QueryOptions(
            locations=['United States'],
            optimize=False,
            limit=27,
            filters=engineer_filters,
        ),
    )

    # Global options
    global_options = QueryOptions(
        locations=['United Kingdom'],
        limit=10,
        optimize=True,
    )

    linkedin.run(
        queries=[bare_query, csharp_query, engineer_query],
        options=global_options,
    )
Exemple #2
0
def scrape(query, numresults):
    """Run a single Montreal job search and dispatch results to the handlers.

    Builds a non-headless, single-worker ``LinkedinScraper``, registers the
    module-level ``on_data`` / ``on_error`` / ``on_end`` callbacks and runs
    one ``Query``. Nothing is returned; results reach the caller only
    through the registered callbacks.

    Args:
        query: Search text handed to ``Query(query=...)``.
        numresults: Value used as the query's ``limit`` (number of jobs to
            scrape).
    """
    browser = LinkedinScraper(
        # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
        chrome_executable_path=None,
        # Custom Chrome options here
        chrome_options=None,
        # Overrides headless mode only if chrome_options is None
        headless=False,
        # How many threads will be spawned to run queries concurrently
        # (one Chrome driver for each thread)
        max_workers=1,
        # Slow down the scraper to avoid 'Too many requests (429)' errors
        slow_mo=1,
    )

    # Add event listeners
    for event, callback in (
        (Events.DATA, on_data),
        (Events.ERROR, on_error),
        (Events.END, on_end),
    ):
        browser.on(event, callback)

    montreal_options = QueryOptions(
        locations=['Montreal'],
        # Blocks requests for resources like images and stylesheet
        optimize=True,
        # Limit the number of jobs to scrape
        limit=numresults,
    )

    # Only one query is run; further Query entries (for instance with a
    # QueryFilters carrying relevance/time/type filters) could be added to
    # this list.
    search_queries = [Query(query=query, options=montreal_options)]

    browser.run(search_queries)
)

# Add event listeners

scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='Data Engineer',
        options=QueryOptions(
            locations=['indonesia','vietnam ','china','singapore','korea selatan','japan','Hong Kong SAR','taiwan'],
            optimize=False,
            limit=940,
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.DAY, 

            )
        )
    ),
     Query(
        query='Data Scientist',
        options=QueryOptions(
            locations=['indonesia','vietnam ','china','singapore','korea selatan','japan','Hong Kong SAR','taiwan'],
            optimize=False,
            limit=940,
            filters=QueryFilters(
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.DAY,                
Exemple #4
0
                None,  # You can pass your custom Chrome options here
                max_workers=
                1,  # How many threads will be spawn to run queries concurrently (one Chrome driver for each thread)
                slow_mo=
                1.2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
            )

            # Add event listeners
            scraper.on(Events.DATA, on_data)
            scraper.on(Events.ERROR, on_error)
            scraper.on(Events.END, on_end)

            queries = [
                Query(options=QueryOptions(
                    optimize=
                    True,  # Blocks requests for resources like images and stylesheet
                    limit=0  # Limit the number of jobs to scrape
                )),
                Query(query=search,
                      options=QueryOptions(
                          locations=['Toronto, Ontario, Canada'],
                          optimize=True,
                          limit=400,
                          filters=QueryFilters(
                              relevance=RelevanceFilters.RELEVANT,
                              time=TimeFilters.WEEK,
                              type=[TypeFilters.FULL_TIME],
                          ))),
            ]

            scraper.run(queries)
Exemple #5
0
def scrapeLinkedinJobs(
    industries,
    chrome_executable_path='C:/Users/iyeng/Desktop/NTU/NTU Sem 4/CZ2006/JobsUpply/JobsUpply/chromedriver.exe',
):
    """Scrape recent full-time Singapore jobs for each entry in ``industries``.

    One ``Query`` is built per industry (limit 6, posted within the last
    month, most recent first) and all queries are run with one worker per
    industry. Every job delivered to the data handler is converted to a
    plain dict and grouped by ``data.query``.

    Args:
        industries: Iterable of search strings, one query per element.
            ``None`` or an empty iterable yields an empty result.
        chrome_executable_path: Path to the chromedriver binary. Defaults to
            the path that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        A list of ``{"queryText": <data.query>, "jobList": [<job dict>, ...]}``
        entries, one per distinct ``data.query`` value, in the order each
        value was first seen.
    """
    industries = list(industries or ())

    # max_workers is len(industries); with nothing to search there is no
    # query to run, so return the empty result instead of building a scraper
    # with zero workers.
    if not industries:
        return []

    scraper = LinkedinScraper(
        chrome_executable_path=chrome_executable_path,
        chrome_options=None,  # Custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        # How many threads will be spawned to run queries concurrently
        # (one Chrome driver for each thread)
        max_workers=len(industries),
        slow_mo=2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    queries = [
        Query(
            query=industry,
            options=QueryOptions(
                locations=['Singapore'],
                optimize=True,
                limit=6,
                filters=QueryFilters(
                    company_jobs_url=None,  # Filter by companies
                    relevance=RelevanceFilters.RECENT,
                    time=TimeFilters.MONTH,
                    type=[TypeFilters.FULL_TIME],
                    experience=None,
                ),
            ),
        )
        for industry in industries
    ]

    # Jobs collected so far, keyed by the query value reported on each event.
    jobs_by_query = {}

    def on_data(data: EventData):
        """Convert one scraped job into a dict and file it under its query."""
        job = {
            "title": data.title,
            "company": data.company,
            "place": data.place,
            "description": data.description,
            "linkedinUrl": data.link,
            "descriptionHTML": data.description_html,
            "employmentType": data.employment_type,
            "applyUrl": data.apply_link,
            "date": data.date,
            "seniority": data.seniority_level,
            "jobFunction": data.job_function,
            "industries": data.industries,
            # extract_skills_from_document is defined elsewhere; its output
            # is parsed as JSON here.
            "skills": json.loads(
                extract_skills_from_document(data.description)),
        }
        jobs_by_query.setdefault(data.query, []).append(job)

    def on_error(error):
        """Print any error reported by the scraper."""
        print('[ON_ERROR]', error)

    def on_end():
        """Print a marker when the scraper signals the end event."""
        print('[ON_END]')

    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    scraper.run(queries)

    return [
        {"queryText": query_text, "jobList": jobs}
        for query_text, jobs in jobs_by_query.items()
    ]
scraper = LinkedinScraper(
    chrome_options=None,  # You can pass your custom Chrome options here
    max_workers=1,  # How many threads will be spawn to run queries concurrently (one Chrome driver for each thread)
    slow_mo=1,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        options=QueryOptions(
            optimize=True,  # Blocks requests for resources like images and stylesheet
            limit=27  # Limit the number of jobs to scrape
        )
    ),
    Query(
        query='Engineer',
        options=QueryOptions(
            locations=['United States'],
            optimize=False,
            limit=5,
            filters=QueryFilters(
                company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                experience=None,
            )
            scraper = LinkedinScraper(
                chrome_options=None,  # You can pass your custom Chrome options here
                max_workers=1,  # How many threads will be spawn to run queries concurrently (one Chrome driver for each thread)
                slow_mo=1.5,  # Slow down the scraper to avoid 'Too many requests (429)' errors
            )

            # Add event listeners
            scraper.on(Events.DATA, on_data)
            scraper.on(Events.ERROR, on_error)
            scraper.on(Events.END, on_end)

            queries = [
                Query(
                    options=QueryOptions(
                        optimize=True,  # Blocks requests for resources like images and stylesheet
                        limit=0  # Limit the number of jobs to scrape
                    )
                ),
                Query(
                    query= search ,
                    options=QueryOptions(
                        locations=['Canada'],
                        optimize=True,
                        limit=50, #full on 500
                        filters=QueryFilters(
                            relevance=RelevanceFilters.RELEVANT,
                            time=TimeFilters.WEEK,
                            type=[TypeFilters.FULL_TIME],
                        )
                    )
                ),
)

# Add event listeners
# (scraper and the on_data / on_error / on_end handlers are defined outside
# this excerpt.)
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

# A single query: search text 'Data' in Belgium, at most 10 jobs, narrowed
# by the filters below.
queries = [
    Query(
        query='Data',
        options=QueryOptions(
            locations=['Belgium'],
            optimize=True,
            limit=10,
            filters=QueryFilters(
                company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1508%2C6754%2C3880216%2C631981%2C166278%2C963211%2C3625182%2C256009%2C157326%2C282760%2C3627928%2C1519%2C281207%2C18735883%2C10070%2C98774%2C15245937%2C3683364%2C251838%2C2642837&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME, TypeFilters.TEMPORARY],
                experience=[ExperienceLevelFilters.ENTRY_LEVEL, ExperienceLevelFilters.MID_SENIOR],
            )
        )
    ),
]

scraper.run(queries)

# Currently not working (i.e. no data is exported).
# NOTE(review): this runs the same queries a second time and keeps whatever
# run() returns. Presumably results are only delivered through the DATA
# handler rather than the return value - TODO confirm against the library.

results = scraper.run(queries)
import csv
def linkedinsc():
    """Scrape 'it' jobs located in Nepal and merge new ones into linkedin.json.

    Previously stored jobs are loaded from the JSON file; each scraped job
    whose link (query string stripped) is not already stored has its page
    fetched, the text of its ``<main class="main">`` element extracted, and a
    record appended. The combined list is written back to the same file
    after the scraper has finished.

    A missing file, invalid JSON, or entries without a ``'Page_URL'`` key
    all cause the function to start from an empty list.
    """
    json_path = 'C:/Projects/itjobseeker/public/jsondata/linkedin.json'

    try:
        with open(json_path, 'r') as readfile:
            jsondata = json.load(readfile)
        # A set gives constant-time duplicate checks for every scraped job.
        stored_links = {single_data['Page_URL'] for single_data in jsondata}
    except (FileNotFoundError, ValueError, KeyError, TypeError):
        # FileNotFoundError: first run, nothing stored yet.
        # ValueError: the file is not valid JSON (JSONDecodeError subclasses it).
        # KeyError / TypeError: the content is not a list of job dicts.
        jsondata = []
        stored_links = set()

    # Change root logger level (default is WARN)
    logging.basicConfig(level=logging.INFO)

    def on_data(data: EventData):
        """Store the job carried by ``data`` unless its link is already known."""
        # Drop the query string so the same posting always maps to one link.
        link = data.link.split('?', 1)[0]
        if link in stored_links:
            return

        stored_links.add(link)
        print("NEW JOB FOUND !!!", link)
        # NOTE(review): no timeout is passed, and soup.find() returning None
        # would raise AttributeError on get_text() - confirm whether either
        # needs handling here.
        source = requests.get(data.link).text
        soup = BeautifulSoup(source, 'lxml')
        desct = soup.find('main', class_='main').get_text(strip=True)
        jsondata.append({
            'name': data.title,
            'company': data.company,
            'address': data.place,
            'deadline': data.date,
            'time': data.employment_type,
            'Page_URL': link,
            'desct': desct,
            'websitename': 'np.linkedin.com'
        })

    def on_error(error):
        """Print any error reported by the scraper."""
        print('[ON_ERROR]', error)

    def on_end():
        """Print a marker when the scraper signals the end event."""
        print('[ON_END]')

    scraper = LinkedinScraper(
        chrome_options=None,  # You can pass your custom Chrome options here
        headless=True,  # Overrides headless mode only if chrome_options is None
        # How many threads will be spawned to run queries concurrently
        # (one Chrome driver for each thread)
        max_workers=1,
        slow_mo=1.5,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    )

    # Add event listeners
    scraper.on(Events.DATA, on_data)
    scraper.on(Events.ERROR, on_error)
    scraper.on(Events.END, on_end)

    queries = [
        Query(
            query='it',
            options=QueryOptions(
                locations=['Nepal'],
                optimize=True,
                limit=70,
            ))
    ]

    scraper.run(queries)

    with open(json_path, 'w') as outfile:
        json.dump(jsondata, outfile)
    print("linkedin done")
    chrome_options=None,  # You can pass your custom Chrome options here
    max_workers=
    1,  # How many threads will be spawn to run queries concurrently (one Chrome driver for each thread)
    slow_mo=
    0.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
# (scraper and the on_data / on_error / on_end handlers are defined outside
# this excerpt.)
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

# Two queries: one with only options (no search text or location), and one
# searching 'Engineer' in Toronto with relevance/time/type filters.
queries = [
    Query(options=QueryOptions(
        optimize=
        True,  # Blocks requests for resources like images and stylesheet
        limit=0  # Limit the number of jobs to scrape
    )),
    Query(query='Engineer',
          options=QueryOptions(
              locations=['Toronto, Ontario, Canada'],
              optimize=False,
              limit=5,
              filters=QueryFilters(
                  relevance=RelevanceFilters.RECENT,
                  time=TimeFilters.MONTH,
                  type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                  experience=None,
              ))),
]
Exemple #11
0
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=5.0,  # Slow down the scraper to avoid 'Too many requests (429)' errors
    #slow_mo=0.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
# (scraper and the on_data / on_error / on_end handlers are defined outside
# this excerpt.)
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

# A single query: 'Machine Learning Engineer' in Bengaluru, limit 1000,
# with relevance and time filters. The company and job-type filters are
# left commented out below.
queries = [
    Query(
        query='Machine Learning Engineer',
        options=QueryOptions(
            locations=['Bengaluru, Karnataka, India'],
            optimize=True,
            limit=1000,
            filters=QueryFilters(
                #company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies
                relevance=RelevanceFilters.RELEVANT,
                time=TimeFilters.MONTH,
                #type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                experience=None,
            )
        )
    ),
]

scraper.run(queries)
Exemple #12
0
    chrome_options=None,  # You can pass your custom Chrome options here
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=
    1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=
    0.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
# (scraper and the on_data / on_error / on_end handlers are defined outside
# this excerpt.)

# Event used to detect each job offer
scraper.on(Events.DATA, on_data)

scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        # Category (search text) you are looking for
        query='Engineer',
        options=QueryOptions(
            # Country to search in
            locations=['United States'],
            # Applies to resources such as images or CSS
            optimize=False,
            # Sets the number of jobs
            limit=5)),
]

scraper.run(queries)
Exemple #13
0
    0.6,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

limit = 9000
queries = [
    Query(query='Cloud Engineer',
          options=QueryOptions(locations=['Canada', "United-States"],
                               optimize=False,
                               limit=limit,
                               filters=QueryFilters(
                                   relevance=RelevanceFilters.RECENT,
                                   time=TimeFilters.MONTH,
                                   experience=[
                                       ExperienceLevelFilters.INTERNSHIP,
                                       ExperienceLevelFilters.ASSOCIATE,
                                       ExperienceLevelFilters.ENTRY_LEVEL
                                   ]))),
    Query(query='Cloud architect',
          options=QueryOptions(locations=['Canada', 'United-States'],
                               optimize=False,
                               limit=limit,
                               filters=QueryFilters(
                                   relevance=RelevanceFilters.RECENT,
                                   time=TimeFilters.MONTH,
                                   experience=[
                                       ExperienceLevelFilters.INTERNSHIP,
                                       ExperienceLevelFilters.ASSOCIATE,
Exemple #14
0
    slow_mo=2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        query='human resources',
        options=QueryOptions(
            locations=['Sharnbrook, England, United Kingdom'],
            optimize=False,
            limit=1000,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.DAY,
                type=[TypeFilters.FULL_TIME, TypeFilters.CONTRACT, TypeFilters.TEMPORARY],
                experience=ExperienceLevelFilters.ENTRY_LEVEL,                
            )
        )
    ),
    Query(
        query='human resources',
        options=QueryOptions(
            locations=['United Kingdom'],
            optimize=False,
            limit=1000,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.DAY,