Example #1
from typing import Any, Dict, List, Optional, Set, Tuple

from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def scrape_followers(
        driver: WebDriver,
        username: str,
        cookies: Optional[List[Dict[str, Any]]] = None
) -> Tuple[str, str, Set[str], Set[str]]:
    # CSS Selector for followers and following lists
    list_css: str = "div[role='dialog'] a.notranslate"

    if cookies:
        # Load any page before setting cookies
        driver.get("https://www.instagram.com/data/manifest.json")
        for cookie in cookies:
            driver.add_cookie(cookie)

    # Load account page
    driver.get(f"https://www.instagram.com/{username}/")

    num_followers: str = driver.find_element(
        By.CSS_SELECTOR, "a[href*='followers'] span").text
    num_following: str = driver.find_element(
        By.CSS_SELECTOR, "a[href*='following'] span").text

    # Click the 'Followers' link
    driver.find_element_by_partial_link_text("followers").click()
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, list_css)))
    # TODO: Scrolling Magic here
    _followers: List = driver.find_elements(By.CSS_SELECTOR, list_css)
    followers: Set[str] = {i.text for i in _followers}

    # Close the followers dialog before opening the following list
    driver.find_element(
        By.CSS_SELECTOR,
        "div[role='dialog'] button span[aria-label='Close']").click()

    # Click the 'Following' link
    driver.find_element_by_partial_link_text("following").click()
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, list_css)))
    # TODO: Scrolling Magic here
    _following: List = driver.find_elements(By.CSS_SELECTOR, list_css)
    following: Set[str] = {i.text for i in _following}

    return (num_followers, num_following, followers, following)
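
A minimal usage sketch, not part of the original example: the Chrome driver setup and the cookie list below are assumptions; export real session cookies from a logged-in browser.

from selenium import webdriver

driver = webdriver.Chrome()
try:
    # Hypothetical cookie list; replace with cookies exported from a
    # logged-in Instagram session.
    session_cookies = [{"name": "sessionid", "value": "<your-session-id>"}]
    n_followers, n_following, followers, following = scrape_followers(
        driver, "instagram", cookies=session_cookies)
    print(n_followers, n_following)
    print(following - followers)  # accounts that don't follow you back
finally:
    driver.quit()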
Example #2
    def run(self, driver: webdriver, search_url: str, query: Query, location: str) -> None:
        """
        Run strategy
        :param driver: webdriver
        :param search_url: str
        :param query: Query
        :param location: str
        :return: None
        """

        tag = f'[{query.query}][{location}]'
        processed = 0
        pagination_index = 1

        # Open main page first to verify/set the session
        debug(tag, f'Opening {HOME_URL}')
        driver.get(HOME_URL)
        sleep(self.scraper.slow_mo)

        if not AuthenticatedStrategy.__is_authenticated_session(driver):
            info(tag, 'Setting authentication cookie')

            try:
                driver.add_cookie({
                    'name': 'li_at',
                    'value': Config.LI_AT_COOKIE,
                    'domain': '.www.linkedin.com'
                })
            except BaseException as e:
                error(tag, e)
                error(tag, traceback.format_exc())
                return

        # Open search url
        info(tag, f'Opening {search_url}')
        driver.get(search_url)
        sleep(self.scraper.slow_mo)

        # Verify session
        if not AuthenticatedStrategy.__is_authenticated_session(driver):
            message = 'The provided session cookie is invalid. ' \
                      'Check the documentation on how to obtain a valid session cookie.'
            raise InvalidCookieException(message)

        # Wait container
        try:
            WebDriverWait(driver, 5).until(ec.presence_of_element_located((By.CSS_SELECTOR, Selectors.container)))
        except BaseException as e:
            warn(tag, 'No jobs found, skip')
            return

        # Pagination loop
        while processed < query.options.limit:
            # Verify session in loop
            if not AuthenticatedStrategy.__is_authenticated_session(driver):
                warn(tag, 'Session is no longer valid, this may cause the scraper to fail')
                self.scraper.emit(Events.INVALID_SESSION)
            else:
                info(tag, 'Session is valid')

            AuthenticatedStrategy.__accept_cookies(driver, tag)
            AuthenticatedStrategy.__close_chat_panel(driver, tag)

            job_index = 0

            job_tot = driver.execute_script('return document.querySelectorAll(arguments[0]).length;', Selectors.jobs)

            if job_tot == 0:
                info(tag, 'No jobs found, skip')
                break

            info(tag, f'Found {job_tot} jobs')

            # Jobs loop
            while job_index < job_tot and processed < query.options.limit:
                sleep(self.scraper.slow_mo)
                tag = f'[{query.query}][{location}][{processed + 1}]'

                # Extract job main fields
                debug(tag, 'Evaluating selectors', [
                    Selectors.jobs,
                    Selectors.links,
                    Selectors.title,
                    Selectors.companies,
                    Selectors.places,
                    Selectors.dates])

                try:
                    job_id, job_link, job_title, job_company, job_place, job_date = driver.execute_script(
                        '''
                            const index = arguments[0];
                            const job = document.querySelectorAll(arguments[1])[index];
                            const link = job.querySelector(arguments[2]);
                            
                            // Click job link and scroll
                            link.scrollIntoView();
                            link.click();
                            const linkUrl = link.getAttribute("href");
                        
                            const jobId = job.getAttribute("data-job-id");
                
                            const title = job.querySelector(arguments[3]) ?
                                job.querySelector(arguments[3]).innerText : "";

                            const company = job.querySelector(arguments[4]) ?
                                job.querySelector(arguments[4]).innerText : "";

                            const place = job.querySelector(arguments[5]) ?
                                job.querySelector(arguments[5]).innerText : "";

                            const date = job.querySelector(arguments[6]) ?
                                job.querySelector(arguments[6]).getAttribute('datetime') : "";

                            return [
                                jobId,
                                linkUrl,
                                title,
                                company,
                                place,
                                date
                            ];                                                    
                        ''',
                        job_index,
                        Selectors.jobs,
                        Selectors.links,
                        Selectors.title,
                        Selectors.companies,
                        Selectors.places,
                        Selectors.dates)

                    job_title = normalize_spaces(job_title)
                    job_company = normalize_spaces(job_company)
                    job_place = normalize_spaces(job_place)

                    # Join with base location if link is relative
                    job_link = urljoin(get_location(driver.current_url), job_link)

                    sleep(self.scraper.slow_mo)

                    # Wait for job details to load
                    debug(tag, f'Loading details job {job_id}')
                    load_result = AuthenticatedStrategy.__load_job_details(driver, job_id)

                    if not load_result['success']:
                        error(tag, load_result['error'])
                        job_index += 1
                        continue

                    # Extract
                    debug(tag, 'Evaluating selectors', [Selectors.description])

                    job_description, job_description_html = driver.execute_script(
                        '''
                            const el = document.querySelector(arguments[0]);

                            return [
                                el.innerText,
                                el.outerHTML    
                            ];
                        ''',
                        Selectors.description)

                    company_size = driver.execute_script(
                        '''
                            const panel = document.querySelector(arguments[0]);

                            const company = panel.querySelector(arguments[1]);
                                
                            const company_size = company.querySelector(arguments[2]).innerText;
                            
                            return company_size;
                        ''',
                        Selectors.detailsPanel,
                        '.jobs-details-job-summary__section--center',
                        '.jobs-details-job-summary__text--ellipsis')

                    # TODO how to extract apply link?

                    # Extract criteria
                    debug(tag, 'Evaluating selectors', [Selectors.criteria])

                    job_seniority_level, job_employment_type, job_industries, job_function = driver.execute_script(
                        r'''
                            const nodes = document.querySelectorAll(arguments[0]);

                            const criteria = [
                                "Seniority Level",
                                "Employment Type",
                                "Industry",
                                "Job Functions",
                            ];

                            return criteria.map(c => {
                                const el = Array.from(nodes).find(node => node.innerText.trim() === c);

                                if (el && el.nextElementSibling) {
                                    const sibling = el.nextElementSibling;
                                    return sibling.innerText
                                        .replace(/[\s]{2,}/g, ", ")
                                        .replace(/[\n\r]+/g, " ")
                                        .trim();
                                }
                                else {
                                    return "";
                                }
                            });
                        ''',
                        Selectors.criteria)

                except BaseException as e:
                    # Verify session on error
                    if not AuthenticatedStrategy.__is_authenticated_session(driver):
                        warn(tag, 'Session is no longer valid, this may cause the scraper to fail')
                        self.scraper.emit(Events.INVALID_SESSION)

                    error(tag, e, traceback.format_exc())
                    self.scraper.emit(Events.ERROR, str(e) + '\n' + traceback.format_exc())
                    job_index += 1
                    continue

                data = EventData(
                    query=query.query,
                    location=location,
                    job_id=job_id,
                    job_index=job_index,
                    title=job_title,
                    company=job_company,
                    place=job_place,
                    date=job_date,
                    link=job_link,
                    apply_link='',
                    description=job_description,
                    description_html=job_description_html,
                    seniority_level=job_seniority_level,
                    job_function=job_function,
                    employment_type=job_employment_type,
                    industries=job_industries,
                    company_size=company_size)

                info(tag, 'Processed')

                job_index += 1
                processed += 1

                self.scraper.emit(Events.DATA, data)

                # Try fetching more jobs
                if processed < query.options.limit and job_index == job_tot:
                    job_tot = driver.execute_script('return document.querySelectorAll(arguments[0]).length;',
                                                    Selectors.jobs)

            # Check if we reached the limit of jobs to process
            if processed == query.options.limit:
                break

            # Try to paginate
            pagination_index += 1
            info(tag, f'Pagination requested ({pagination_index})')
            paginate_result = AuthenticatedStrategy.__paginate(driver, pagination_index)

            if not paginate_result['success']:
                info(tag, "Couldn't find more jobs for the running query")
                return
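
The private helper __is_authenticated_session is not shown in this excerpt. Below is a minimal sketch of one plausible implementation; the li_at-cookie heuristic is an assumption, not necessarily the project's actual check.

    @staticmethod
    def __is_authenticated_session(driver) -> bool:
        # Sketch only (assumption): treat the session as authenticated when
        # the LinkedIn "li_at" session cookie is present in the driver. The
        # real helper may instead detect a redirect to the login/authwall page.
        return driver.get_cookie('li_at') is not None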
Example #3
    def run(self, driver: webdriver, cdp: CDP, search_url: str, query: Query,
            location: str, apply_link: bool) -> None:
        """
        Run strategy
        :param driver: webdriver
        :param cdp: CDP
        :param search_url: str
        :param query: Query
        :param location: str
        :param apply_link: bool
        :return: None
        """

        tag = f'[{query.query}][{location}]'

        metrics = EventMetrics()

        pagination_index = 0
        pagination_size = 25

        # Open main page first to verify/set the session
        debug(tag, f'Opening {HOME_URL}')
        driver.get(HOME_URL)
        sleep(self.scraper.slow_mo)

        if not AuthenticatedStrategy.__is_authenticated_session(driver):
            info(tag, 'Setting authentication cookie')

            try:
                driver.add_cookie({
                    'name': 'li_at',
                    'value': Config.LI_AT_COOKIE,
                    'domain': '.www.linkedin.com'
                })
            except BaseException as e:
                error(tag, e)
                error(tag, traceback.format_exc())
                return

        # Open search url
        info(tag, f'Opening {search_url}')
        driver.get(search_url)
        sleep(self.scraper.slow_mo)

        # Verify session
        if not AuthenticatedStrategy.__is_authenticated_session(driver):
            message = 'The provided session cookie is invalid. ' \
                      'Check the documentation on how to obtain a valid session cookie.'
            raise InvalidCookieException(message)

        # Wait container
        try:
            WebDriverWait(driver, 5).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, Selectors.container)))
        except BaseException as e:
            warn(tag, 'No jobs found, skip')
            return

        # Pagination loop
        while metrics.processed < query.options.limit:
            # Verify session in loop
            if not AuthenticatedStrategy.__is_authenticated_session(driver):
                warn(
                    tag,
                    'Session is no longer valid, this may cause the scraper to fail'
                )
                self.scraper.emit(Events.INVALID_SESSION)
            else:
                info(tag, 'Session is valid')

            AuthenticatedStrategy.__accept_cookies(driver, tag)
            AuthenticatedStrategy.__close_chat_panel(driver, tag)
            AuthenticatedStrategy.__accept_privacy(driver, tag)

            job_index = 0

            job_tot = driver.execute_script(
                'return document.querySelectorAll(arguments[0]).length;',
                Selectors.jobs)

            if job_tot == 0:
                info(tag, 'No jobs found, skip')
                break

            # Jobs loop
            while job_index < job_tot and metrics.processed < query.options.limit:
                sleep(self.scraper.slow_mo)
                tag = f'[{query.query}][{location}][{pagination_index * pagination_size + job_index + 1}]'

                # Try to recover focus to main page in case of unwanted tabs still open
                # (generally caused by apply link click).
                if len(driver.window_handles) > 1:
                    debug(tag, 'Try closing unwanted targets')
                    try:
                        targets_result = cdp.get_targets()

                        # try to close other unwanted tabs (targets)
                        if targets_result['success']:
                            for target in targets_result['result'].targets:
                                if 'linkedin.com/jobs' not in target.url:
                                    debug(tag, f'Closing target {target.url}')
                                    cdp.close_target(target.targetId)
                    finally:
                        debug(tag, 'Switched to main handle')
                        driver.switch_to.window(driver.window_handles[0])

                try:
                    # Extract job main fields
                    debug(tag, 'Evaluating selectors', [
                        Selectors.jobs, Selectors.link, Selectors.title,
                        Selectors.company_link, Selectors.place, Selectors.date
                    ])

                    job_id, job_link, job_title, job_company, job_company_link, \
                    job_company_img_link, job_place, job_date = \
                        driver.execute_script(
                            '''
                                const index = arguments[0];
                                const job = document.querySelectorAll(arguments[1])[index];
                                const link = job.querySelector(arguments[2]);
                                
                                // Click job link and scroll
                                link.scrollIntoView();
                                link.click();
                                const linkUrl = link.getAttribute("href");
                            
                                const jobId = job.getAttribute("data-job-id");
                    
                                const title = job.querySelector(arguments[3]) ?
                                    job.querySelector(arguments[3]).innerText : "";
                                    
                                let company = "";
                                let companyLink = "";
                                const companyElem = job.querySelector(arguments[4]); 
                                
                                if (companyElem) {                                    
                                    company = companyElem.innerText;
                                    const protocol = window.location.protocol + '//';
                                    const host = window.location.host;
                                    companyLink = `${protocol}${host}${companyElem.getAttribute('href')}`;
                                }
                                
                                const companyImgLink = job.querySelector("img") ? 
                                    job.querySelector("img").getAttribute("src") : "";                                                            
    
                                const place = job.querySelector(arguments[5]) ?
                                    job.querySelector(arguments[5]).innerText : "";
    
                                const date = job.querySelector(arguments[6]) ?
                                    job.querySelector(arguments[6]).getAttribute('datetime') : "";
    
                                return [
                                    jobId,
                                    linkUrl,
                                    title,
                                    company,
                                    companyLink,
                                    companyImgLink,
                                    place,
                                    date,
                                ];                                                    
                            ''',
                            job_index,
                            Selectors.jobs,
                            Selectors.link,
                            Selectors.title,
                            Selectors.company_link,
                            Selectors.place,
                            Selectors.date)

                    job_title = normalize_spaces(job_title)
                    job_company = normalize_spaces(job_company)
                    job_place = normalize_spaces(job_place)

                    # Join with base location if link is relative
                    job_link = urljoin(get_location(driver.current_url),
                                       job_link)

                    sleep(self.scraper.slow_mo)

                    # Wait for job details to load
                    debug(tag, f'Loading details job {job_id}')
                    load_result = AuthenticatedStrategy.__load_job_details(
                        driver, job_id)

                    if not load_result['success']:
                        error(tag, load_result['error'], exc_info=False)
                        info(tag, 'Skipped')
                        job_index += 1
                        metrics.failed += 1
                        continue

                    # Extract
                    debug(tag, 'Evaluating selectors', [Selectors.description])

                    job_description, job_description_html = driver.execute_script(
                        '''
                            const el = document.querySelector(arguments[0]);

                            return [
                                el.innerText,
                                el.outerHTML    
                            ];
                        ''', Selectors.description)

                    # Extract insights
                    debug(tag, 'Evaluating selectors', [Selectors.insights])

                    job_insights = driver.execute_script(
                        r'''
                            const nodes = document.querySelectorAll(arguments[0]);
                            return Array.from(nodes).map(e => e.textContent.replace(/[\n\r\t ]+/g, ' ').trim());                            
                        ''', Selectors.insights)

                    # Apply link
                    job_apply_link = ''

                    if apply_link:
                        try:
                            debug(tag, 'Evaluating selectors',
                                  [Selectors.applyBtn])

                            driver.execute_script(
                                r'''
                                    const applyBtn = document.querySelector(arguments[0]);

                                    if (applyBtn) {
                                        applyBtn.click();
                                        return true;
                                    }

                                    return false;
                                ''', Selectors.applyBtn)

                            if len(driver.window_handles) > 1:
                                debug(tag, 'Try extracting apply link')

                                targets_result = cdp.get_targets()

                                if targets_result['success']:
                                    # The first not attached target should be the apply page
                                    apply_target = next(
                                        (e for e in targets_result['result'].
                                         targets if not e.attached), '')

                                    if apply_target:
                                        job_apply_link = apply_target.url
                                        cdp.close_target(apply_target.targetId)
                                else:
                                    warn(tag, 'Failed to extract apply link',
                                         targets_result['error'])

                        except BaseException as e:
                            warn(tag, 'Failed to extract apply link', e)

                    data = EventData(query=query.query,
                                     location=location,
                                     job_id=job_id,
                                     job_index=job_index,
                                     title=job_title,
                                     company=job_company,
                                     company_link=job_company_link,
                                     company_img_link=job_company_img_link,
                                     place=job_place,
                                     date=job_date,
                                     link=job_link,
                                     apply_link=job_apply_link,
                                     description=job_description,
                                     description_html=job_description_html,
                                     insights=job_insights)

                    info(tag, 'Processed')

                    job_index += 1
                    metrics.processed += 1

                    self.scraper.emit(Events.DATA, data)

                    # Try fetching more jobs
                    if metrics.processed < query.options.limit and job_index == job_tot < pagination_size:
                        load_jobs_result = AuthenticatedStrategy.__load_jobs(
                            driver, job_tot)

                        if load_jobs_result['success']:
                            job_tot = load_jobs_result['count']

                    if job_index == job_tot:
                        break

                except BaseException as e:
                    try:
                        # Verify session on error
                        if not AuthenticatedStrategy.__is_authenticated_session(
                                driver):
                            warn(
                                tag,
                                'Session is no longer valid, this may cause the scraper to fail'
                            )
                            self.scraper.emit(Events.INVALID_SESSION)

                        error(tag, e, traceback.format_exc())
                        self.scraper.emit(
                            Events.ERROR,
                            str(e) + '\n' + traceback.format_exc())
                    finally:
                        info(tag, 'Skipped')
                        job_index += 1
                        metrics.failed += 1

                    continue

            tag = f'[{query.query}][{location}]'

            info(tag, 'No more jobs to process in this page')

            # Check if we reached the limit of jobs to process
            if metrics.processed == query.options.limit:
                info(tag, 'Query limit reached!')
                info(tag, 'Metrics:', str(metrics))
                self.scraper.emit(Events.METRICS, metrics)
                break
            else:
                metrics.missed += pagination_size - job_index
                info(tag, 'Metrics:', str(metrics))
                self.scraper.emit(Events.METRICS, metrics)

            # Try to paginate
            pagination_index += 1
            info(tag, f'Pagination requested [{pagination_index}]')
            offset = pagination_index * pagination_size
            paginate_result = AuthenticatedStrategy.__paginate(
                driver, search_url, tag, offset)

            if not paginate_result['success']:
                info(tag, "Couldn't find more jobs for the running query")
                return
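
The __paginate helper is likewise private and not included in this excerpt. Since the caller passes offset = pagination_index * pagination_size, a plausible sketch is to reload the search URL with an updated "start" query parameter; the parameter name and the shape of the returned dict are assumptions inferred from how the helper is called above.

    @staticmethod
    def __paginate(driver, search_url: str, tag: str, offset: int) -> dict:
        from urllib.parse import urlencode, urlparse, parse_qs, urlunparse

        # Sketch only: LinkedIn job-search result pages are commonly offset
        # by a "start" query parameter. Rebuild the search URL with the new
        # offset and report success once at least one job card is present.
        parts = urlparse(search_url)
        params = parse_qs(parts.query)
        params['start'] = [str(offset)]
        url = urlunparse(parts._replace(query=urlencode(params, doseq=True)))

        debug(tag, f'Opening {url}')
        driver.get(url)

        try:
            WebDriverWait(driver, 5).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, Selectors.jobs)))
            return {'success': True}
        except BaseException as e:
            return {'success': False, 'error': str(e)}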