def get_company_data(self, url):
    # Company pages live under /company/...; anything else has no data to scrape
    no_industry = False
    if url.split("/")[3] != "company":
        print("no company page")
        return {
            'industry': 'N/A',
            'companyname': 'N/A',
            'location': Location('N/A', 'N/A', 'N/A')
        }

    # Results are cached per URL so each company page is only visited once
    if url not in self.industries_dict:
        try:
            # Open the company page in a second tab
            self.browser.execute_script("window.open('');")
            self.browser.switch_to.window(self.browser.window_handles[1])
            self.browser.get(url)
        except Exception:
            print("error opening company page")
            return {
                'industry': 'N/A',
                'companyname': 'N/A',
                'location': Location('N/A', 'N/A', 'N/A')
            }

        try:
            # If the summary card has exactly as many info items as its inline
            # block, the page shows no separate industry entry
            card_summary_divs = self.browser \
                .find_element_by_class_name('org-top-card-summary-info-list') \
                .find_elements_by_class_name('org-top-card-summary-info-list__info-item')
            inline_divs = self.browser \
                .find_element_by_class_name('org-top-card-summary-info-list') \
                .find_element_by_class_name('inline-block') \
                .find_elements_by_class_name('org-top-card-summary-info-list__info-item')
            if len(card_summary_divs) == len(inline_divs):
                no_industry = True
        except Exception:
            print("error reading the company summary card")

        # Industry (first info item, unless the page has none)
        try:
            if no_industry:
                self.industries_dict[url] = 'N/A'
            else:
                self.industries_dict[url] = self.browser.execute_script(
                    "return document.getElementsByClassName("
                    "'org-top-card-summary-info-list__info-item')[0].innerText")
        except Exception:
            self.industries_dict[url] = 'N/A'

        # Company name
        try:
            self.companies_dict[url] = self.browser.execute_script(
                "return document.getElementsByClassName("
                "'org-top-card-summary__title')[0].title")
        except Exception:
            print("company name wasn't scraped")
            self.companies_dict[url] = 'N/A'

        # Location (shifts to the first info item when there is no industry)
        try:
            item_index = 0 if no_industry else 1
            self.locations_dict[url] = self.browser.execute_script(
                "return document.getElementsByClassName("
                "'org-top-card-summary-info-list__info-item')[{}].innerText".format(item_index))
        except Exception:
            print("location wasn't scraped")
            self.locations_dict[url] = 'N/A'

        # Close the extra tab and return to the main window
        try:
            self.browser.close()
            self.browser.switch_to.window(self.browser.window_handles[0])
        except Exception:
            print("tab did not close")

    industry = self.industries_dict[url]
    companyname = self.companies_dict[url]
    location = Location()
    location.parse_string(self.locations_dict[url])
    return {
        'industry': industry,
        'companyname': companyname,
        'location': location
    }
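# Usage sketch (illustrative, not part of the class): assuming an instance
# whose `self.browser` is an authenticated Selenium driver and whose caching
# dicts (industries_dict, companies_dict, locations_dict) are initialised.
# The class name `LinkedInScraper` is a hypothetical stand-in for whatever
# this file actually defines:
#
#     scraper = LinkedInScraper(config)
#     data = scraper.get_company_data('https://www.linkedin.com/company/example/')
#     print(data['industry'], data['companyname'], data['location'])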
def scrap_profile(self, profile_linkedin_url, profile_known_graduation_date):
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Scraping of the profile may fail due to a human check forced by LinkedIn
    try:
        # Delays (seconds) between operations that must wait for page loading to end
        loading_pause_time = 2
        loading_scroll_time = 1

        # Open the profile page
        self.browser.get(profile_linkedin_url)

        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                return ScrapingResult('ProfileUnavailable')
            else:
                raise HumanCheckException

        # Default the contact-info fields so a failed lookup below never
        # leaves them unbound when they are printed and returned later
        email = phone = birthday = connectedDate = None

        # Scrape the email address from 'Contact info'
        # > click on the 'Contact info' link on the page
        self.browser.execute_script(
            "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
            "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
        )
        time.sleep(loading_pause_time)

        # > get the email from the 'Contact info' popup
        try:
            email = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = document.getElementsByClassName('pv-contact-info__contact-type')[i]; "
                "if(el.className.includes('ci-email')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scrape the phone from 'Contact info'
        try:
            phone = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = document.getElementsByClassName('pv-contact-info__contact-type')[i]; "
                "if(el.className.includes('ci-phone')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scrape the birthday from 'Contact info'
        try:
            birthday = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = document.getElementsByClassName('pv-contact-info__contact-type')[i]; "
                "if(el.className.includes('ci-birthday')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except Exception:
            pass

        # Scrape the date connected from 'Contact info', then dismiss the popup
        try:
            connectedDate = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = document.getElementsByClassName('pv-contact-info__contact-type')[i]; "
                "if(el.className.includes('ci-connected')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
            self.browser.execute_script(
                "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
            )
        except Exception:
            pass

        # Load the entire page (LinkedIn loads content asynchronously as you scroll)
        window_height = self.browser.execute_script("return window.innerHeight")
        scrolls = 1
        while scrolls * window_height < self.browser.execute_script(
                "return document.body.offsetHeight"):
            self.browser.execute_script(
                f"window.scrollTo(0, {window_height * scrolls});")
            time.sleep(loading_scroll_time)
            scrolls += 1

        # Expand any collapsed 'see more' section
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        # Get all the job positions
        try:
            job_positions = self.browser.find_element_by_id(
                'experience-section').find_elements_by_tag_name('li')
        except Exception:
            job_positions = []

        # Get all the educations
        try:
            educations = self.browser.find_element_by_id(
                'education-section').find_elements_by_tag_name('li')
        except Exception:
            educations = []

        # Parse the page's HTML structure
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        # Scrape the name, headline, location and connection count (using soup)
        try:
            name_div = soup.find('div', {'class': 'flex-1 mr5'})
            name_loc = name_div.find_all('ul')
            headline = name_div.find_all('h2')
            headline = headline[0].get_text().strip()
            profile_name = name_loc[0].find('li').get_text().strip()
            locationNConnection = name_loc[1].find_all('li')
            location = locationNConnection[0].get_text().strip()
            try:
                connection = locationNConnection[1].find('a').find(
                    'span').get_text().strip()
            except Exception:
                connection = locationNConnection[1].find(
                    'span').get_text().strip()
        except Exception:
            return ScrapingResult('ERROR IN SCRAPING NAME')

        # Scrape the description (using soup); expand the clamped text first
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        try:
            if (self.browser.execute_script(
                    "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
            )):
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
            else:
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
        except Exception:
            profile_desc = []

        # Parse skills (expand the additional-skills section first)
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
            )
            time.sleep(loading_pause_time)
        except Exception:
            pass

        try:
            skills = self.browser.execute_script(
                "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
            )
        except Exception:
            skills = []

        education_list = []

        # Parse the educations
        if len(educations) > 0:
            x = 1
            for education in educations:
                try:
                    # The education entries live under the education-section <ul>
                    exp_section = soup.find('section', {'id': 'education-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # School name, degree, major and year
                    try:
                        education_name = a_tags.find('h3').get_text().strip()
                    except Exception:
                        education_name = None
                    try:
                        education_degree_name = a_tags.find_all(
                            'p')[0].get_text().strip()
                    except Exception:
                        education_degree_name = None
                    try:
                        education_major = a_tags.find_all(
                            'p')[1].get_text().strip()
                    except Exception:
                        education_major = None
                    try:
                        education_year = a_tags.find_all(
                            'p')[2].get_text().strip()
                    except Exception:
                        education_year = None

                    education_list.append(
                        Education(education_name=education_name,
                                  degree_name=education_degree_name,
                                  major=education_major,
                                  year=education_year))
                except Exception:
                    pass

        # Pad to a fixed number of education entries
        for x in range(3 - len(educations)):
            education_list.append(
                Education(education_name=None,
                          degree_name=None,
                          major=None,
                          year=None))

        last_job = []

        # Parse the job positions
        if len(job_positions) > 0:
            # Collect the date ranges of the job positions
            job_positions_data_ranges = []
            x = 1
            for job_position in job_positions:
                try:
                    # Get the date range of the job position
                    date_range_element = job_position.find_element_by_class_name(
                        'pv-entity__date-range')
                    date_range_spans = date_range_element.find_elements_by_tag_name(
                        'span')
                    date_range = date_range_spans[1].text
                    job_positions_data_ranges.append(date_range)

                    # Scrape the job entry from the experience-section <ul>
                    exp_section = soup.find('section', {'id': 'experience-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # Company name and job title (two layouts are possible)
                    try:
                        last_job_company_name = a_tags.find_all(
                            'p')[1].get_text().strip()
                        last_job_title = a_tags.find('h3').get_text().strip()
                        spans = a_tags.find_all('span')
                    except Exception:
                        last_job_company_name = a_tags.find_all(
                            'span')[1].get_text().strip()
                        last_job_title = exp_section.find('ul').find(
                            'li').find_all('span')[2].get_text().strip()
                        spans = exp_section.find('ul').find('li').find_all('span')

                    last_job_company_name = last_job_company_name.replace(
                        'Full-time', '').replace('Part-time', '').strip()

                    # Job location: the span following the 'Location' label
                    last_job_location = Location()
                    next_span_is_location = False
                    for span in spans:
                        if next_span_is_location:
                            last_job_location.parse_string(
                                span.get_text().strip())
                            break
                        if span.get_text().strip() == 'Location':
                            next_span_is_location = True

                    # NOTE: the industry lookup for the job's company (via
                    # 'org-top-card-summary-info-list__info-item') is handled
                    # by get_company_data rather than inline here.
                    last_job.append(
                        Job(position=last_job_title,
                            company=Company(name=last_job_company_name),
                            location=last_job_location))
                except Exception:
                    last_job.append(
                        Job(position=None,
                            company=Company(name=None),
                            location=None))

            # Pad to a fixed number of job entries
            for x in range(4 - len(job_positions)):
                last_job.append(
                    Job(position=None,
                        company=Company(name=None),
                        location=None))
            print("profile_name {}\nheadline {}\nlocation {}\nconnection {}\n"
                  "profile_desc {}\nemail {}\nphone {}\nbirthday {}\n"
                  "connectedDate {}\nskills {}\nlast_job[0] {}\nlast_job[1] {}\n"
                  "last_job[2] {}\nlast_job[3] {}\neducation {}\n".format(
                      profile_name, headline, location, connection,
                      profile_desc, email, phone, birthday, connectedDate,
                      skills, last_job[0], last_job[1], last_job[2],
                      last_job[3], education_list[0]))

            return ScrapingResult(
                Profile(profile_name, headline, location, connection,
                        connectedDate, phone, birthday, profile_desc, email,
                        skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        education_list))
        else:
            return ScrapingResult(Profile(profile_name, email, skills))

    except HumanCheckException:
        # LinkedIn forced a human check: log out, log back in, and wait for
        # the user to solve the check manually before retrying this profile
        if self.headless_option:
            raise CannotProceedScrapingException

        linkedin_logout(self.browser)
        linkedin_login(self.browser,
                       self.config.get('linkedin', 'username'),
                       self.config.get('linkedin', 'password'))

        while self.browser.current_url != 'https://www.linkedin.com/feed/':
            message_to_user('Please execute manual check', self.config)
            time.sleep(30)

        return self.scrap_profile(profile_linkedin_url,
                                  profile_known_graduation_date)
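# Usage sketch (illustrative): a single profile could be scraped like this,
# assuming the surrounding class wires up `self.browser`, `self.config` and
# the login helpers. The URL and graduation date are made-up examples, and
# how a ScrapingResult reports errors depends on its definition elsewhere in
# this repo:
#
#     result = scraper.scrap_profile(
#         'https://www.linkedin.com/in/someone/', '2018')
#     print(result)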
def parsing_jobs(self, job_positions):
    job_positions_data_ranges = []
    # Array of parsed Job objects
    Jobs_array = []

    for job_position in job_positions:
        try:
            # Get the date range of the job position
            try:
                date_range_element = job_position.find_element_by_class_name(
                    'pv-entity__date-range')
                date_range_spans = date_range_element.find_elements_by_tag_name(
                    'span')
                date_range = date_range_spans[1].text
            except NoSuchElementException:
                date_range = "N/A"

            # Get the job title
            try:
                title_range_element = job_position.find_element_by_tag_name('h3')
                title = title_range_element.text
            except NoSuchElementException:
                title = "N/A"

            # Get the company name
            try:
                companyname_range_element = job_position.find_element_by_class_name(
                    'pv-entity__secondary-title')
                companyname = companyname_range_element.text.replace(
                    'Full-time', '').replace('Part-time', '').strip()
            except NoSuchElementException:
                companyname = "N/A"

            # Get the link to the company page
            try:
                company_url_link = job_position.find_element_by_tag_name(
                    'a').get_attribute('href')
            except NoSuchElementException:
                company_url_link = "N/A"

            # Get the company location
            try:
                companylocation_range_element = job_position.find_element_by_class_name(
                    'pv-entity__location')
                companylocation_spans = companylocation_range_element.find_elements_by_tag_name(
                    'span')
                companylocation = companylocation_spans[1].text
            except NoSuchElementException:
                companylocation = "N/A"

            job_positions_data_ranges.append(date_range)

            # Scrape the company page; fall back to the values read from the
            # job entry itself when the page yields nothing
            info_company = self.get_company_data(company_url_link)
            try:
                if info_company['companyname'] == "N/A":
                    info_company['companyname'] = companyname
                if info_company['location'].full_string == "N/A":
                    loc = Location()
                    loc.parse_string(companylocation)
                    info_company['location'] = loc
            except Exception:
                print("Oops!", sys.exc_info()[0], "occurred.")

            print(info_company['industry'])
            print(info_company['companyname'])
            print(info_company['location'])

            trabajo_oo = Job(
                position=title.strip(),
                company=Company(name=info_company['companyname'].strip(),
                                industry=info_company['industry'].strip()),
                location=info_company['location'],
                daterange=date_range.strip())
            Jobs_array.append(trabajo_oo)
        except Exception:
            print("Oops!, \n{}\n{}\n{}\noccurred.".format(
                sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2]))
            print("Job parsing error")

    return {
        'Jobs_array': Jobs_array,
        'job_positions_data_ranges': job_positions_data_ranges
    }
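# Usage sketch (illustrative): `job_positions` is the list of <li> elements
# that scrap_profile reads from the experience section, so the two methods
# compose roughly as follows:
#
#     positions = scraper.browser.find_element_by_id(
#         'experience-section').find_elements_by_tag_name('li')
#     parsed = scraper.parsing_jobs(positions)
#     for job in parsed['Jobs_array']:
#         print(job)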