Code example #1
    def get_company_data(self, url):
        #print(url)
        no_industry = False
        # Expect URLs of the form https://www.linkedin.com/company/<name>/;
        # guard against short or malformed URLs such as "N/A"
        parts = url.split("/")
        if len(parts) <= 3 or parts[3] != "company":
            print("no company page")
            return {
                'industry': 'N/A',
                'companyname': 'N/A',
                'location': Location('N/A', 'N/A', 'N/A')
            }

        if url not in self.industries_dict:
            try:
                self.browser.execute_script("window.open('');")
                self.browser.switch_to.window(self.browser.window_handles[1])
                self.browser.get(url)
            except:
                print("error opening company page")
                return {
                    'industry': 'N/A',
                    'companyname': 'N/A',
                    'location': Location('N/A', 'N/A', 'N/A')
                }
            try:
                card_summary_divs = self.browser\
                    .find_element_by_class_name('org-top-card-summary-info-list')\
                    .find_elements_by_class_name('org-top-card-summary-info-list__info-item')
                inline_divs = self.browser\
                    .find_element_by_class_name('org-top-card-summary-info-list')\
                    .find_element_by_class_name('inline-block')\
                    .find_elements_by_class_name('org-top-card-summary-info-list__info-item')
                # Heuristic: when the counts match, the card has no separate
                # industry item (the first info item is then the location)
                if len(card_summary_divs) == len(inline_divs):
                    no_industry = True
                #print("card_summary_divs {}, inline_divs {}".format(len(card_summary_divs),
                #                                                    len(inline_divs)))
            except:
                print("error reading company summary card")
            # industry
            try:
                if no_industry:
                    self.industries_dict[url] = "N/A"
                else:
                    self.industries_dict[url] = self.browser.execute_script(
                        "return document.getElementsByClassName("
                        "'org-top-card-summary-info-list__info-item')["
                        "0].innerText")
            except:
                # industry wasn't scraped
                self.industries_dict[url] = 'N/A'
            # company name
            try:
                self.companies_dict[url] = self.browser.execute_script(
                    "return document.getElementsByClassName("
                    "'org-top-card-summary__title')["
                    "0].title")
            except:
                print("company name wasn't scraped")
                self.companies_dict[url] = 'N/A'
            # locations
            try:
                if no_industry:
                    self.locations_dict[url] = self.browser.execute_script(
                        "return document.getElementsByClassName("
                        "'org-top-card-summary-info-list__info-item')["
                        "0].innerText")
                else:
                    self.locations_dict[url] = self.browser.execute_script(
                        "return document.getElementsByClassName("
                        "'org-top-card-summary-info-list__info-item')["
                        "1].innerText")
            except:
                print("location wasn't scraped")
                self.locations_dict[url] = 'N/A'

            try:
                self.browser.close()
                self.browser.switch_to.window(self.browser.window_handles[0])
            except:
                print("tab did not close")

        industry = self.industries_dict[url]
        companyname = self.companies_dict[url]
        location = Location()
        location.parse_string(self.locations_dict[url])

        return {
            'industry': industry,
            'companyname': companyname,
            'location': location
        }
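
Note: the find_element_by_* / find_elements_by_* helpers used above were removed in Selenium 4. A minimal sketch of the same card-summary lookup in the Selenium 4 locator style, assuming the same LinkedIn class names (which can change at any time):

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()
    browser.get("https://www.linkedin.com/company/example/")  # hypothetical URL

    # Same class names the example above relies on
    summary = browser.find_element(By.CLASS_NAME, "org-top-card-summary-info-list")
    items = summary.find_elements(By.CLASS_NAME,
                                  "org-top-card-summary-info-list__info-item")
    industry = items[0].text if items else "N/A"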
Code example #2
    def scrap_profile(self, profile_linkedin_url,
                      profile_known_graduation_date):

        if not is_url_valid(profile_linkedin_url):
            return ScrapingResult('BadFormattedLink')

        # Scraping may fail if LinkedIn forces a human-verification check
        try:

            # Delay (seconds) for operations that must wait for the page to finish loading
            loading_pause_time = 2
            loading_scroll_time = 1

            # Opening of the profile page
            self.browser.get(profile_linkedin_url)

            if str(self.browser.current_url).strip() \
                    != profile_linkedin_url.strip():
                if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                    return ScrapingResult('ProfileUnavailable')
                else:
                    raise HumanCheckException

            # Scraping the Email Address from Contact Info (email)

            # > click on 'Contact info' link on the page
            self.browser.execute_script(
                "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
                "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
            )
            time.sleep(loading_pause_time)

            # > gets email from the 'Contact info' popup
            try:
                email = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-email')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                email = None

            # Scraping the Phone from Contact Info (phone)
            try:
                phone = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-phone')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                phone = None

            # Scraping the Birthday from Contact Info (birthday)
            try:
                birthday = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-birthday')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                birthday = None

            # Scraping the Date Connected from Contact Info (connected date)
            try:
                connectedDate = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-connected')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )

                self.browser.execute_script(
                    "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
                )
            except:
                connectedDate = None

            # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
            window_height = self.browser.execute_script(
                "return window.innerHeight")
            scrolls = 1
            while scrolls * window_height < self.browser.execute_script(
                    "return document.body.offsetHeight"):
                self.browser.execute_script(
                    f"window.scrollTo(0, {window_height * scrolls});")
                time.sleep(loading_scroll_time)
                scrolls += 1

            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            # Get all the job positions
            try:
                job_positions = self.browser.find_element_by_id(
                    'experience-section').find_elements_by_tag_name('li')
            except:
                job_positions = []

            # Get all the education entries
            try:
                educations = self.browser.find_element_by_id(
                    'education-section').find_elements_by_tag_name('li')
            except:
                educations = []

            # Parsing of the page html structure
            soup = BeautifulSoup(self.browser.page_source, 'lxml')

            # Scraping the Name (using soup)
            try:
                name_div = soup.find('div', {'class': 'flex-1 mr5'})
                name_loc = name_div.find_all('ul')
                headline = name_div.find_all('h2')
                headline = headline[0].get_text().strip()
                profile_name = name_loc[0].find('li').get_text().strip()
                locationNConnection = name_loc[1].find_all('li')
                location = locationNConnection[0].get_text().strip()
                try:
                    connection = locationNConnection[1].find('a').find(
                        'span').get_text().strip()
                except:
                    connection = locationNConnection[1].find(
                        'span').get_text().strip()
            except:
                return ScrapingResult('ERROR IN SCRAPING NAME')

            # Scraping the profile description (expand truncated text first)
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            try:
                if (self.browser.execute_script(
                        "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
                )):
                    profile_desc = self.browser.execute_script(
                        "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                    )

                else:
                    profile_desc = self.browser.execute_script(
                        "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                    )

            except:
                profile_desc = []

            # print(profile_desc)

            # Parsing skills
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            try:
                skills = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
                )
            except:
                skills = []

            education_list = []
            # Parsing the education entries
            if len(educations) > 0:
                x = 1
                for education in educations:
                    try:
                        # Locate this entry in the parsed education section
                        exp_section = soup.find('section',
                                                {'id': 'education-section'})
                        exp_section = exp_section.find('ul')
                        div_tags = exp_section.contents[x].find('div')
                        a_tags = div_tags.find('a')
                        x += 1

                        # Scraping of the education entry - school, degree, major, years
                        try:
                            education_name = a_tags.find(
                                'h3').get_text().strip()

                        except:
                            education_name = None

                        try:
                            education_degree_name = a_tags.find_all(
                                'p')[0].get_text().strip()
                        except:
                            education_degree_name = None

                        try:
                            education_major = a_tags.find_all(
                                'p')[1].get_text().strip()
                        except:
                            education_major = None

                        try:
                            education_year = a_tags.find_all(
                                'p')[2].get_text().strip()
                        except:
                            education_year = None

                        education_list.append(
                            Education(education_name=education_name,
                                      degree_name=education_degree_name,
                                      major=education_major,
                                      year=education_year))

                    except:
                        pass

            # Pad education_list to three entries
            for x in range(3 - len(educations)):
                education_list.append(
                    Education(education_name=None,
                              degree_name=None,
                              major=None,
                              year=None))

            last_job = []
            # Parsing the job positions
            if len(job_positions) > 0:
                # Parse job positions to extract the relative date ranges
                job_positions_data_ranges = []
                x = 1
                for job_position in job_positions:
                    # Get the date range of the job position
                    try:
                        date_range_element = job_position.find_element_by_class_name(
                            'pv-entity__date-range')
                        date_range_spans = date_range_element.find_elements_by_tag_name(
                            'span')
                        date_range = date_range_spans[1].text

                        job_positions_data_ranges.append(date_range)

                        # Locate this entry in the parsed experience section
                        exp_section = soup.find('section',
                                                {'id': 'experience-section'})
                        exp_section = exp_section.find('ul')
                        div_tags = exp_section.contents[x].find('div')
                        a_tags = div_tags.find('a')
                        x += 1

                        # Scraping of the job - company_name, job_title
                        try:
                            last_job_company_name = a_tags.find_all(
                                'p')[1].get_text().strip()
                            last_job_title = a_tags.find(
                                'h3').get_text().strip()

                            spans = a_tags.find_all('span')
                        except:
                            last_job_company_name = a_tags.find_all(
                                'span')[1].get_text().strip()
                            last_job_title = exp_section.find('ul').find(
                                'li').find_all('span')[2].get_text().strip()
                            spans = exp_section.find('ul').find('li').find_all(
                                'span')

                        last_job_company_name = last_job_company_name.replace(
                            'Full-time', '').replace('Part-time', '').strip()

                        # Scraping of the job - location
                        last_job_location = Location()
                        next_span_is_location = False
                        for span in spans:
                            if next_span_is_location:
                                last_job_location.parse_string(
                                    span.get_text().strip())
                                break
                            if span.get_text().strip() == 'Location':
                                next_span_is_location = True

                        # # Scraping of Industry related to last Job
                        # last_job_company_url = a_tags.get('href')
                        # if last_job_company_url not in self.industries_dict:
                        #     try:
                        #         self.browser.get('https://www.linkedin.com' + last_job_company_url)
                        #         self.industries_dict[last_job_company_url] = self.browser.execute_script(
                        #             "return document.getElementsByClassName("
                        #             "'org-top-card-summary-info-list__info-item')["
                        #             "0].innerText")
                        #     except:
                        #         self.industries_dict[last_job_company_url] = 'N/A'

                        # last_job_company_industry = self.industries_dict[last_job_company_url]

                        last_job.append(
                            Job(
                                position=last_job_title,
                                company=Company(
                                    name=last_job_company_name,
                                    #industry=last_job_company_industry
                                ),
                                location=last_job_location))

                    except:
                        last_job.append(
                            Job(
                                position=None,
                                company=Company(
                                    name=None,
                                    #industry=last_job_company_industry
                                ),
                                location=None))

                # Pad last_job to four entries
                for x in range(4 - len(job_positions)):
                    last_job.append(
                        Job(
                            position=None,
                            company=Company(name=None,
                                            #industry=last_job_company_industry
                                            ),
                            location=None))

                print(
                    "profile_name {}\n headline {}\n location {}\n connection {}\n"
                    " profile_desc {}\n email {}\n phone {}\n birthday {}\n"
                    " connectedDate {}\n skills {}\n last_job[0] {}\n"
                    " last_job[1] {}\n last_job[2] {}\n last_job[3] {}\n"
                    " education {}\n".format(
                        profile_name, headline, location, connection,
                        profile_desc, email, phone, birthday,
                        connectedDate, skills, last_job[0], last_job[1],
                        last_job[2], last_job[3], education_list[0]))

                return ScrapingResult(
                    Profile(
                        profile_name, headline, location, connection,
                        connectedDate, phone, birthday, profile_desc, email,
                        skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        education_list))

            else:
                # NOTE: this call passes fewer positional fields to Profile
                # than the branch above
                return ScrapingResult(Profile(profile_name, email, skills))

        except HumanCheckException:

            if self.headless_option:
                raise CannotProceedScrapingException

            linkedin_logout(self.browser)

            linkedin_login(self.browser,
                           self.config.get('linkedin', 'username'),
                           self.config.get('linkedin', 'password'))

            while self.browser.current_url != 'https://www.linkedin.com/feed/':
                message_to_user('Please execute manual check', self.config)
                time.sleep(30)

            return self.scrap_profile(profile_linkedin_url,
                                      profile_known_graduation_date)
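
The profile scraper above paces itself with fixed time.sleep() pauses. An explicit wait is usually more reliable; a minimal sketch using Selenium's WebDriverWait (written against the Selenium 4 API, with the same element id the example assumes):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # Wait up to 10 seconds for the experience section instead of sleeping
    wait = WebDriverWait(browser, 10)
    experience = wait.until(
        EC.presence_of_element_located((By.ID, 'experience-section')))
    job_positions = experience.find_elements(By.TAG_NAME, 'li')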
Code example #3
    def parsing_jobs(self, job_positions):
        job_positions_data_ranges = []
        # array of Job objects
        Jobs_array = []

        for job_position in job_positions:
            #print('job_pos.text: {0}\n--'.format(job_position.text))
            try:
                # Get the date range of the job position
                try:
                    date_range_element = job_position.find_element_by_class_name(
                        'pv-entity__date-range')
                    date_range_spans = date_range_element.find_elements_by_tag_name(
                        'span')
                    date_range = date_range_spans[1].text
                    # print('date_range: {0}'.format(date_range))
                except NoSuchElementException:
                    date_range = "N/A"

                try:
                    # get the title
                    title_range_element = job_position.find_element_by_tag_name(
                        'h3')
                    title = title_range_element.text
                    # print('title: {0}'.format(title))
                except NoSuchElementException:
                    title = "N/A"

                try:
                    # get the company name
                    companyname_range_element = job_position.find_element_by_class_name(
                        'pv-entity__secondary-title')
                    companyname = companyname_range_element.text.replace(
                        'Full-time', '').replace('Part-time', '').strip()
                    # print('companyname: {0}'.format(companyname))
                except NoSuchElementException:
                    companyname = "N/A"

                try:
                    # get the company page URL from the entry's first link
                    company_url_link = job_position.find_element_by_tag_name(
                        'a').get_attribute('href')
                except NoSuchElementException:
                    company_url_link = "N/A"

                try:
                    companylocation_range_element = job_position.find_element_by_class_name(
                        'pv-entity__location')
                    companylocation_spans = companylocation_range_element.find_elements_by_tag_name(
                        'span')
                    companylocation = companylocation_spans[1].text
                except NoSuchElementException:
                    companylocation = "N/A"
                # print('companylocation: {0}'.format(companylocation))

                job_positions_data_ranges.append(date_range)
                info_company = self.get_company_data(company_url_link)
                try:
                    if info_company['companyname'] == "N/A":
                        info_company['companyname'] = companyname
                    if info_company['location'].full_string == "N/A":
                        loc = Location()
                        loc.parse_string(companylocation)
                        info_company['location'] = loc
                except:
                    print("Oops!", sys.exc_info()[0], "occured.")
                    print(info_company['industry'])
                    print(info_company['companyname'])
                    print(info_company['location'])

                trabajo_oo = Job(
                    position=title.strip(),
                    company=Company(name=info_company['companyname'].strip(),
                                    industry=info_company['industry'].strip()),
                    location=info_company['location'],
                    daterange=date_range.strip())
                Jobs_array.append(trabajo_oo)
                # print(trabajo_oo)

            except:
                print("Oops!, \n{}\n{}\n{}\noccurred.".format(
                    sys.exc_info()[0],
                    sys.exc_info()[1],
                    sys.exc_info()[2]))
                print("Job unpacking error")

        return {
            'Jobs_array': Jobs_array,
            "job_positions_data_ranges": job_positions_data_ranges
        }
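
A hypothetical usage sketch for parsing_jobs, assuming scraper is an instance of the class these methods belong to and that Job exposes its constructor arguments as attributes:

    # Collect the <li> entries of the experience section, then parse them
    job_positions = scraper.browser.find_element_by_id(
        'experience-section').find_elements_by_tag_name('li')

    result = scraper.parsing_jobs(job_positions)
    for job in result['Jobs_array']:
        # position, company, and daterange are assumed attribute names
        print(job.position, job.company.name, job.daterange)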