Code Example #1
File: random.py Project: Jhingun1/linkedin-1
 def wait_page_completion(self, driver):
     """
     Abstract function, used to customize how the specific spider has to wait for page completion.
     Blank by default
     :param driver:
     :return:
     """
     # wait until links to other users are shown, so the crawl can continue
     get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)
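Every snippet on this page calls a get_by_xpath_or_none helper that is never shown here. Below is a minimal sketch of what it presumably does, assuming it wraps Selenium's WebDriverWait and returns None instead of raising on a timeout (the logs parameter matches the keyword used in Code Example #5):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait


def get_by_xpath_or_none(driver, xpath, wait_timeout=10, logs=True):
    """Sketch: wait for the element matching xpath; return it, or None on timeout."""
    try:
        return WebDriverWait(driver, wait_timeout).until(
            ec.presence_of_element_located((By.XPATH, xpath)))
    except TimeoutException:
        if logs:
            print(f'Element not found for XPath: {xpath}')
        return None

Since expected_conditions only needs an object exposing find_element, the same helper also works when a WebElement (such as result in the snippets below) is passed in place of the driver.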
Code Example #2
def extracts_linkedin_users(driver, api_client):
    """
    Extracts all the users from a page containing a list of users.
    For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22]
    :param driver: The webdriver, logged in and located on the page which lists the users.
    :param api_client: The API client passed on to extract_contact_info.
    :return: Iterator over LinkedinUser.
    """

    for i in range(1, 11):
        print(f'loading user {i}')

        last_result_xpath = f'//li[{i}]/*/div[@class="entity-result__item"]'

        result = get_by_xpath_or_none(driver, last_result_xpath)

        if result is not None:
            link_elem = get_by_xpath_or_none(result,
                                             './/*[@class="app-aware-link"]')
            link = link_elem.get_attribute(
                'href') if link_elem is not None else None
            print(link)
            name_elem = get_by_xpath_or_none(result,
                                             './/*[@class="visually-hidden"]')
            name = name_elem.text if name_elem is not None else None
            print(name)
            title_elem = get_by_xpath_or_none(
                result,
                './/*[@class="entity-result__primary-subtitle t-14 t-black t-normal"]'
            )
            title = title_elem.text if title_elem is not None else None
            print(title)

            # extract_profile_id_from_url: strip the query string, then take
            # the last path segment of the profile URL
            profile_id = link.split('?')[0].rstrip('/').split('/')[-1] if link else None
            print(profile_id)

            user = extract_contact_info(api_client, profile_id)
            print(user)

            yield user

            if link_elem is not None:
                driver.execute_script("arguments[0].scrollIntoView();",
                                      link_elem)
            elif name_elem is not None:
                driver.execute_script("arguments[0].scrollIntoView();",
                                      name_elem)
            elif title_elem is not None:
                driver.execute_script("arguments[0].scrollIntoView();",
                                      title_elem)
            else:
                print("Was not possible to scroll")

        time.sleep(random.uniform(0.7, 1.5))
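To illustrate the profile-id extraction above on a hypothetical link (the URL is made up):

# Hypothetical profile link, to show what the id extraction yields.
link = 'https://www.linkedin.com/in/jane-doe/?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AABC'
profile_id = link.split('?')[0].rstrip('/').split('/')[-1]
assert profile_id == 'jane-doe'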
Code Example #3
File: random.py Project: 5l1v3r1/linkedin-1
 def wait_page_completion(self, driver):
     """
     Abstract function, used to customize how the specific spider has to wait for page completion.
     Blank by default
     :param driver:
     :return:
     """
     # wait until links to other users are shown, so the crawl can continue
     get_by_xpath_or_none(driver,
                          "//*/div[@class='pv-deferred-area ember-view']",
                          wait_timeout=3)
Code Example #4
 def wait_page_completion(self, driver):
     """
     Abstract function, used to customize how the specific spider must wait for search page completion.
     Blank by default
     :param driver:
     :return:
     """
     profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
     get_by_xpath_or_none(driver, profile_xpath)
     time.sleep(random.uniform(3, 7))
Code Example #5
File: companies.py Project: NFTercel/linkedin
    def parser_search_results_page(self, response):
        print('Now parsing search result page')

        no_result_found_xpath = '//*[text()="No results found."]'

        no_result_response = get_by_xpath_or_none(driver=self.driver,
                                                  xpath=no_result_found_xpath,
                                                  wait_timeout=NO_RESULT_WAIT_TIMEOUT,
                                                  logs=False)

        if no_result_response is not None:
            print('"No results" message shown, stop crawling this company')
            return
        else:
            company = extract_company(self.driver)
            print(f'Company: {company}')

            users = extracts_linkedin_users(self.driver, company=company, api_client=self.api_client)
            for user in users:
                yield user

            # incrementing the index at the end of the url
            url = response.request.url
            next_url_split = url.split('=')
            index = int(next_url_split[-1])
            next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)

            yield Request(url=next_url,
                          callback=self.parser_search_results_page,
                          meta={'company': company},
                          dont_filter=True,
                          )
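The pagination above assumes the page index is the last '='-separated token of the request URL. On a hypothetical URL the increment behaves like this:

# Hypothetical search URL, to show the '='-split increment above.
url = 'https://www.linkedin.com/search/results/people/?keywords=foo&page=3'
next_url_split = url.split('=')             # [..., 'foo&page', '3']
index = int(next_url_split[-1])             # 3
next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)
assert next_url.endswith('page=4')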
Code Example #6
File: search.py Project: 5l1v3r1/linkedin-1
def extracts_linkedin_users(driver, api_client):
    """
    Extracts all the users from a page containing a list of users.
    For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22]
    :param driver: The webdriver, logged in and located on the page which lists the users.
    :param api_client: The API client passed on to extract_contact_info.
    :return: Iterator over LinkedinUser.
    """

    for i in range(1, 11):
        print(f'loading user {i}')

        last_result_xpath = f'//li[{i}]/*/div[@class="search-result__wrapper"]'

        result = get_by_xpath_or_none(driver, last_result_xpath)
        if result is not None:
            link_elem = get_by_xpath_or_none(
                result, './/*[@class="search-result__result-link ember-view"]')
            link = link_elem.get_attribute(
                'href') if link_elem is not None else None

            name_elem = get_by_xpath_or_none(result,
                                             './/*[@class="name actor-name"]')
            name = name_elem.text if name_elem is not None else None

            title_elem = get_by_xpath_or_none(result, './/p')
            title = title_elem.text if title_elem is not None else None

            # extract_profile_id_from_url: take the second-to-last path segment
            profile_id = link.split('/')[-2] if link else None
            user = extract_contact_info(api_client, profile_id)

            yield user

            if link_elem is not None:
                driver.execute_script("arguments[0].scrollIntoView();",
                                      link_elem)
            elif name_elem is not None:
                driver.execute_script("arguments[0].scrollIntoView();",
                                      name_elem)
            elif title_elem is not None:
                driver.execute_script("arguments[0].scrollIntoView();",
                                      title_elem)
            else:
                print("Was not possible to scroll")

        time.sleep(0.7)
Code Example #7
def extract_company(driver):
    """
    Extract company name from a search result page.
    :param driver: The selenium webdriver.
    :return: The company name string, or None if something went wrong.
    """
    company_xpath = '//li[@class="search-s-facet search-s-facet--facetCurrentCompany inline-block ' \
                    'search-s-facet--is-closed ember-view"]/form/button/div/div/h3'
    company_elem = get_by_xpath_or_none(driver, company_xpath)
    return company_elem.text if company_elem is not None else None
Code Example #8
    def parser_search_results_page(self, response):
        # getting the optional callback arguments:
        driver = response.meta.pop('driver')

        # maximum page number for pagination
        max_page = response.meta.get('max_page', None)

        # stop_criteria: returns True if the search must stop
        stop_criteria = response.meta.get('stop_criteria', None)
        stop_criteria_args = response.meta.get('stop_criteria_args', None)

        # Now parsing search result page
        # no_result_found_xpath = '//*[text()="No results found."]'
        #no_result_response = get_by_xpath_or_none(driver=driver, xpath=no_result_found_xpath, wait_timeout=NO_RESULT_WAIT_TIMEOUT)

        result_count_found_xpath = './/*[@class="pb2 t-black--light t-14"]'
        result_count_response = get_by_xpath_or_none(
            driver=driver,
            xpath=result_count_found_xpath,
            wait_timeout=NO_RESULT_WAIT_TIMEOUT)

        if result_count_response is None:
            # no results message shown: stop crawling this company
            driver.close()
            return
        else:
            users = extracts_linkedin_users(driver, api_client=self.api_client)
            for user in users:
                if stop_criteria is not None:
                    if stop_criteria(user, stop_criteria_args):
                        # if the stop criteria is matched, stop the crawl, including the next pages
                        driver.close()
                        return
                yield user

            # incrementing the index at the end of the url
            url = response.request.url
            next_url_split = url.split('=')
            index = int(next_url_split[-1])
            next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)

            if max_page is not None:
                if index >= max_page:
                    driver.close()
                    return

            driver.close()
            yield Request(
                url=next_url,
                callback=self.parser_search_results_page,
                meta=copy.deepcopy(response.meta),
                dont_filter=True,
            )
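The stop_criteria callable is constrained only by its call site above: it receives the yielded user plus the stop_criteria_args payload and returns True when the crawl should end. A hypothetical example (the profile_id attribute and the known_ids key are assumptions, not shown on this page):

def stop_when_user_known(user, stop_criteria_args):
    # Hypothetical criterion: stop once an already-known profile shows up.
    # `profile_id` and `known_ids` are assumed names, not from the snippets.
    known_ids = stop_criteria_args.get('known_ids', set())
    return getattr(user, 'profile_id', None) in known_ids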
Code Example #9
File: middlewares.py Project: hijm/linkedin
    def process_request(self, request, spider):
        driver = spider.driver

        print('SeleniumMiddleware - getting the page')
        driver.get(request.url)

        # request.meta['driver'] = self.driver  # to access driver from response

        print('waiting for page loading')
        profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
        get_by_xpath(driver, profile_xpath)

        # wait until links to other users are shown, so the crawl can continue
        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes

        return HtmlResponse(driver.current_url,
                            body=body,
                            encoding='utf-8',
                            request=request)
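For this downloader middleware to run, it has to be registered in the project's Scrapy settings. A sketch, where the dotted path is an assumption based on the middlewares.py file name above:

# settings.py (sketch): the dotted path below is assumed; adjust it to the
# actual module layout of the project.
DOWNLOADER_MIDDLEWARES = {
    'linkedin.middlewares.SeleniumMiddleware': 543,
}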
Code Example #10
    def extract_profiles_from_url(self, response):
        # getting the optional callback arguments:
        driver = response.meta.pop('driver')

        # maximum page number for pagination
        max_page = response.meta.get('max_page', None)

        # stop_criteria: returns True if the search must stop
        stop_criteria = response.meta.get('stop_criteria', None)
        stop_criteria_args = response.meta.get('stop_criteria_args', None)

        # Now parsing search result page
        # no_result_found_xpath = '//*[text()="No results found."]'
        #no_result_response = get_by_xpath_or_none(driver=driver, xpath=no_result_found_xpath, wait_timeout=NO_RESULT_WAIT_TIMEOUT)

        result_count_found_xpath = './/*[@class="pb2 t-black--light t-14"]'
        result_count_response = get_by_xpath_or_none(
            driver=driver,
            xpath=result_count_found_xpath,
            wait_timeout=NO_RESULT_WAIT_TIMEOUT)

        if result_count_response is None:
            # no results message shown: stop crawling this company
            driver.close()
            return
        else:
            for i in range(1, 11):
                print(f'loading user {i}')
                last_result_xpath = f'//li[{i}]/*/div[@class="entity-result__item"]'
                result = get_by_xpath_or_none(driver, last_result_xpath)
                if result is not None:
                    link_elem = get_by_xpath_or_none(
                        result, './/*[@class="app-aware-link"]')
                    link = link_elem.get_attribute(
                        'href') if link_elem is not None else None
                    print(link)

                    # extract_profile_id_from_url: strip the query string,
                    # then take the last path segment
                    profile_id = link.split('?')[0].rstrip('/').split('/')[-1] if link else None
                    print(profile_id)

                    image_elem = get_by_xpath_or_none(
                        result,
                        './/*[@class="ivm-view-attr__img--centered EntityPhoto-circle-3  lazy-image ember-view"]'
                    )
                    image_link = image_elem.get_attribute(
                        'src') if image_elem is not None else None

                    if image_link is not None:
                        print(image_link)
                        get_by_xpath(result,
                                     './/*[@class="app-aware-link"]').click()
                        time.sleep(random.uniform(8, 14))
                        driver.back()
                        #yield Request(url=link,
                        #        callback=self.extract_profiles_from_url,
                        #        dont_filter=True,
                        #        )

                    time.sleep(random.uniform(6, 9))

            # incrementing the index at the end of the url
            url = response.request.url
            next_url_split = url.split('=')
            index = int(next_url_split[-1])
            next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)
            print(next_url)

            if max_page is not None:
                if index >= max_page:
                    driver.close()
                    return

            driver.close()
            yield Request(
                url=next_url,
                callback=self.extract_profiles_from_url,
                meta=copy.deepcopy(response.meta),
                dont_filter=True,
            )
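Finally, the callbacks in Code Examples #8 and #10 read max_page, stop_criteria, and stop_criteria_args from response.meta, while driver is expected to be injected elsewhere (see the commented-out request.meta['driver'] line in Code Example #9). A hypothetical seed request for such a spider, with placeholder URL and values:

    def start_requests(self):
        # Hypothetical seed request; the 'driver' key is added later by the
        # Selenium middleware, not here.
        yield Request(
            url='https://www.linkedin.com/search/results/people/?page=1',
            callback=self.extract_profiles_from_url,
            meta={'max_page': 10,
                  'stop_criteria': None,
                  'stop_criteria_args': None},
            dont_filter=True,
        )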