            data.append(None)
            data.append(None)
        else:
            data.append(current_job.location.city)
            data.append(current_job.location.country)
        x += 1

    x = 0
    for education in p.education:
        if x < 3:
            data.append(education.education_name)
            data.append(education.degree_name)
            data.append(education.major)
            data.append(education.year)
            x += 1

    for h in range(len(headers)):
        worksheet.write(0, h, headers[h])

    for j in range(len(data)):
        worksheet.write(i + 1, j, data[j])

workbook.close()

if any(scraper.interrupted for scraper in scrapers):
    message_to_user(
        "The scraping didn't end correctly due to Human Check. The excel file was generated but it will "
        "contain some entries reporting an error string.", config)
else:
    message_to_user('Scraping successfully ended.', config)
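# --- Hedged sketch: the worksheet.write(row, col, value) / workbook.close()
# --- calls above match the xlsxwriter API. A minimal, self-contained example
# --- of the setup this fragment assumes; the file name and sample values are
# --- illustrative, not taken from the original source.
import xlsxwriter

workbook = xlsxwriter.Workbook('profiles.xlsx')  # hypothetical output file
worksheet = workbook.add_worksheet()

worksheet.write(0, 0, 'profile_name')  # header row at row 0
worksheet.write(1, 0, 'Jane Doe')      # one data cell per (row, col) pair
workbook.close()                       # flushes everything to disk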
def scrap_profile(self, profile_linkedin_url, profile_known_graduation_date):
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Scraping of the profile may fail due to a human check forced by LinkedIn
    try:
        # Delay (seconds) between operations that must wait for the page to finish loading
        loading_pause_time = 2
        loading_scroll_time = 1

        # Opening of the profile page
        self.browser.get(profile_linkedin_url)

        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                return ScrapingResult('ProfileUnavailable')
            else:
                raise HumanCheckException

        # Scraping the Email Address from Contact Info
        # > click on 'Contact info' link on the page
        # (a hedged helper that factors the four lookups below into one
        # function is sketched after this method)
        self.browser.execute_script(
            "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
            "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
        )
        time.sleep(loading_pause_time)

        # > get the email from the 'Contact info' popup
        try:
            email = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-email')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except:
            email = 'N/A'  # default, so the variable is always defined

        # Scraping the Phone from Contact Info
        try:
            phone = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-phone')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except:
            phone = 'N/A'

        # Scraping the Birthday from Contact Info
        try:
            birthday = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-birthday')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except:
            birthday = 'N/A'

        # Scraping the Date Connected from Contact Info
        try:
            connectedDate = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-connected')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except:
            connectedDate = 'N/A'

        # > dismiss the 'Contact info' popup
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
            )
        except:
            pass

        # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
        window_height = self.browser.execute_script("return window.innerHeight")
        scrolls = 1
        while scrolls * window_height < self.browser.execute_script(
                "return document.body.offsetHeight"):
            self.browser.execute_script(
                f"window.scrollTo(0, {window_height * scrolls});")
            time.sleep(loading_scroll_time)
            scrolls += 1

        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        # Get all the job positions
        try:
            job_positions = self.browser.find_element_by_id(
                'experience-section').find_elements_by_tag_name('li')
        except:
            job_positions = []

        # Get all the educations
        try:
            educations = self.browser.find_element_by_id(
                'education-section').find_elements_by_tag_name('li')
        except:
            educations = []

        # Parsing of the page html structure
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        # Scraping the Name, headline, location and connections (using soup)
        try:
            name_div = soup.find('div', {'class': 'flex-1 mr5'})
            name_loc = name_div.find_all('ul')
            headline = name_div.find_all('h2')
            headline = headline[0].get_text().strip()
            profile_name = name_loc[0].find('li').get_text().strip()
            locationNConnection = name_loc[1].find_all('li')
            location = locationNConnection[0].get_text().strip()
            try:
                connection = locationNConnection[1].find('a').find(
                    'span').get_text().strip()
            except:
                connection = locationNConnection[1].find(
                    'span').get_text().strip()
        except:
            return ScrapingResult('ERROR IN SCRAPING NAME')

        # Scraping the Description (using soup)
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        try:
            if (self.browser.execute_script(
                    "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
            )):
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
            else:
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
        except:
            profile_desc = []
        # print(profile_desc)

        # Parsing skills
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        try:
            skills = self.browser.execute_script(
                "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
            )
        except:
            skills = []

        education_list = []

        # Parsing the educations
        if len(educations) > 0:
            x = 1
            for education in educations:
                try:
                    # Scraping of one education entry
                    exp_section = soup.find('section', {'id': 'education-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # Scraping of education_name, degree_name, major, year
                    try:
                        education_name = a_tags.find('h3').get_text().strip()
                    except:
                        education_name = None  # fixed typo: was 'eudcation_name'
                    try:
                        education_degree_name = a_tags.find_all(
                            'p')[0].get_text().strip()
                    except:
                        education_degree_name = None
                    try:
                        education_major = a_tags.find_all(
                            'p')[1].get_text().strip()
                    except:
                        education_major = None
                    try:
                        education_year = a_tags.find_all(
                            'p')[2].get_text().strip()
                    except:
                        education_year = None

                    # Leftover commented-out code copied from the job-position parsing:
                    # last_job_company_name = a_tags.find_all('span')[1].get_text().strip()
                    # last_job_title = exp_section.find('ul').find('li').find_all('span')[2].get_text().strip()
                    # spans = exp_section.find('ul').find('li').find_all('span')
                    # last_job_company_name = last_job_company_name.replace('Full-time', '').replace('Part-time', '').strip()

                    # Scraping of last Job - location
                    # last_job_location = Location()
                    # next_span_is_location = False
                    # for span in spans:
                    #     if next_span_is_location:
                    #         last_job_location.parse_string(span.get_text().strip())
                    #         break
                    #     if span.get_text().strip() == 'Location':
                    #         next_span_is_location = True

                    # Scraping of Industry related to last Job
                    # last_job_company_url = a_tags.get('href')
                    # if last_job_company_url not in self.industries_dict:
                    #     try:
                    #         self.browser.get('https://www.linkedin.com' + last_job_company_url)
                    #         self.industries_dict[last_job_company_url] = self.browser.execute_script(
                    #             "return document.getElementsByClassName("
                    #             "'org-top-card-summary-info-list__info-item')["
                    #             "0].innerText")
                    #     except:
                    #         self.industries_dict[last_job_company_url] = 'N/A'
                    # last_job_company_industry = self.industries_dict[last_job_company_url]

                    education_list.append(
                        Education(education_name=education_name,
                                  degree_name=education_degree_name,
                                  major=education_major,
                                  year=education_year))
                except:
                    pass

        # Pad the education list so it always holds at least three entries
        for x in range(3 - len(educations)):
            education_list.append(
                Education(education_name=None,
                          degree_name=None,
                          major=None,
                          year=None))

        last_job = []

        # Parsing the job positions
        if len(job_positions) > 0:
            # Parse job positions to extract the relative date ranges
            job_positions_data_ranges = []
            x = 1
            for job_position in job_positions:
                # Get the date range of the job position
                try:
                    date_range_element = job_position.find_element_by_class_name(
                        'pv-entity__date-range')
                    date_range_spans = date_range_element.find_elements_by_tag_name(
                        'span')
                    date_range = date_range_spans[1].text
                    job_positions_data_ranges.append(date_range)

                    # Scraping of the last (hopefully current) Job
                    exp_section = soup.find('section', {'id': 'experience-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # Scraping of the last Job - company_name, job_title
                    try:
                        last_job_company_name = a_tags.find_all(
                            'p')[1].get_text().strip()
                        last_job_title = a_tags.find('h3').get_text().strip()
                        spans = a_tags.find_all('span')
                    except:
                        last_job_company_name = a_tags.find_all(
                            'span')[1].get_text().strip()
                        last_job_title = exp_section.find('ul').find(
                            'li').find_all('span')[2].get_text().strip()
                        spans = exp_section.find('ul').find('li').find_all('span')

                    last_job_company_name = last_job_company_name.replace(
                        'Full-time', '').replace('Part-time', '').strip()

                    # Scraping of last Job - location
                    last_job_location = Location()
                    next_span_is_location = False
                    for span in spans:
                        if next_span_is_location:
                            last_job_location.parse_string(span.get_text().strip())
                            break
                        if span.get_text().strip() == 'Location':
                            next_span_is_location = True

                    # Scraping of Industry related to last Job
                    # last_job_company_url = a_tags.get('href')
                    # if last_job_company_url not in self.industries_dict:
                    #     try:
                    #         self.browser.get('https://www.linkedin.com' + last_job_company_url)
                    #         self.industries_dict[last_job_company_url] = self.browser.execute_script(
                    #             "return document.getElementsByClassName("
                    #             "'org-top-card-summary-info-list__info-item')["
                    #             "0].innerText")
                    #     except:
                    #         self.industries_dict[last_job_company_url] = 'N/A'
                    # last_job_company_industry = self.industries_dict[last_job_company_url]

                    last_job.append(
                        Job(position=last_job_title,
                            company=Company(
                                name=last_job_company_name,
                                # industry=last_job_company_industry
                            ),
                            location=last_job_location))
                except:
                    last_job.append(
                        Job(position=None,
                            company=Company(
                                name=None,
                                # industry=last_job_company_industry
                            ),
                            location=None))

            # Pad the job list so it always holds at least four entries
            for x in range(4 - len(job_positions)):
                last_job.append(
                    Job(position=None,
                        company=Company(
                            name=None,
                            # industry=last_job_company_industry
                        ),
                        location=None))
            print(
                "profile_name {}\n headline {}\n location {}\n connection {}\n "
                "profile_desc {}\n email {}\n phone {}\n birthday {}\n "
                "connectedDate {}\n skills {}\n last_job {}\n last_job {}\n "
                "last_job {}\n last_job {}\n education {}\n".format(
                    profile_name, headline, location, connection, profile_desc,
                    email, phone, birthday, connectedDate, skills, last_job[0],
                    last_job[1], last_job[2], last_job[3], education_list[0]))

            return ScrapingResult(
                Profile(
                    profile_name, headline, location, connection,
                    connectedDate, phone, birthday, profile_desc, email,
                    skills, last_job,
                    JobHistorySummary(profile_known_graduation_date,
                                      job_positions_data_ranges),
                    education_list))

        else:
            return ScrapingResult(Profile(profile_name, email, skills))

    except HumanCheckException:
        # LinkedIn forced a human check: in headless mode there is no way to solve it manually
        if self.headless_option:
            raise CannotProceedScrapingException

        linkedin_logout(self.browser)
        linkedin_login(self.browser, self.config.get('linkedin', 'username'),
                       self.config.get('linkedin', 'password'))

        while self.browser.current_url != 'https://www.linkedin.com/feed/':
            message_to_user('Please execute manual check', self.config)
            time.sleep(30)

        return self.scrap_profile(profile_linkedin_url,
                                  profile_known_graduation_date)
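# --- Hedged sketch: the four Contact Info lookups in scrap_profile run the
# --- same injected JS with a different 'ci-*' class each time. A hypothetical
# --- helper (not in the original source) could factor that pattern out:
def get_contact_info_field(browser, ci_class):
    """Return the Contact Info entry whose class contains ci_class
    ('ci-email', 'ci-phone', 'ci-birthday', 'ci-connected'), or 'N/A'."""
    js = (
        "return (function(){try{"
        "for (i in document.getElementsByClassName('pv-contact-info__contact-type')){"
        " let el = document.getElementsByClassName('pv-contact-info__contact-type')[i];"
        " if(el.className.includes('" + ci_class + "')){"
        " return el.children[2].children[0].innerText; } }}"
        " catch(e){return '';}})()"
    )
    try:
        return browser.execute_script(js) or 'N/A'
    except Exception:
        return 'N/A'

# Usage (hypothetical):
#   email = get_contact_info_field(self.browser, 'ci-email')
#   phone = get_contact_info_field(self.browser, 'ci-phone')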
config = ConfigParser()
config.read('config.ini')

if config.get('system', 'os') == 'linux':
    display = Display(visible=0, size=(800, 800))
    display.start()

headless_option = len(sys.argv) >= 2 and sys.argv[1] == 'HEADLESS'

# Creation of a new instance of Chrome
browser = webdriver.Chrome(executable_path=config.get('system', 'driver'),
                           options=get_browser_options(headless_option, config))

# Doing login on LinkedIn
linkedin_login(browser, config.get('linkedin', 'username'),
               config.get('linkedin', 'password'))

message_to_user('Starting to read LinkedIn profiles', config)

results = []
cont = 0
for query in open(config.get('profiles_data_by_name', 'input_file_name'), "r"):
    cont += 1
    print(f"Scraping {cont}")
    query = query.split(config.get('profiles_data_by_name', 'delimiter'))
    first_name = query[0]
    last_name = query[1]
    try:
        university = query[2].split(',')
    except:
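# --- Hedged sketch: the config keys read above imply a config.ini shaped
# --- roughly like this; the values are placeholders, not from the source:
#
#   [system]
#   os = linux
#   driver = /path/to/chromedriver
#
#   [linkedin]
#   username = your_login_email
#   password = your_password
#
#   [profiles_data_by_name]
#   input_file_name = input.txt
#   delimiter = ;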
def scrap_profile(self, profile_linkedin_url, profile_known_graduation_date):
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Scraping of the profile may fail due to a human check forced by LinkedIn
    try:
        # Delay (seconds) between operations that must wait for the page to finish loading
        loading_pause_time = 2
        loading_scroll_time = 1

        # Opening of the profile page
        self.browser.get(profile_linkedin_url)

        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                return ScrapingResult('ProfileUnavailable')
            else:
                raise HumanCheckException

        # Scraping the Email Address from Contact Info
        # > click on 'Contact info' link on the page
        self.browser.execute_script(
            "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
            "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
        )
        time.sleep(loading_pause_time)

        # > get the email from the 'Contact info' popup, then dismiss the popup
        try:
            email = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-email')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
            self.browser.execute_script(
                "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
            )
        except:
            email = 'N/A'

        # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
        window_height = self.browser.execute_script("return window.innerHeight")
        scrolls = 1
        while scrolls * window_height < self.browser.execute_script(
                "return document.body.offsetHeight"):
            self.browser.execute_script(
                f"window.scrollTo(0, {window_height * scrolls});")
            time.sleep(loading_scroll_time)
            scrolls += 1

        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        # Get all the job positions
        try:
            job_positions = self.browser\
                .find_element_by_id('experience-section')\
                .find_elements_by_tag_name('li')
        except NoSuchElementException:
            print("job_positions is null")
            job_positions = []

        # Get all the education positions
        try:
            education_positions = self.browser\
                .find_element_by_id('education-section')\
                .find_elements_by_tag_name('li')
        except NoSuchElementException:
            print("education_positions is null")  # fixed copy-paste: message said "job_positions"
            education_positions = []

        # Parsing of the page html structure
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        # Scraping the Name (using soup)
        try:
            name_div = soup.find('div', {'class': 'flex-1 mr5'})
            name_loc = name_div.find_all('ul')
            profile_name = name_loc[0].find('li').get_text().strip()
        except:
            return ScrapingResult('ERROR IN SCRAPING NAME')

        # Parsing skills
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        try:
            skills = self.browser.execute_script(
                "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
            )
        except:
            skills = []

        # Parsing the job positions
        if len(job_positions) > 0:
            # Delegate extraction of the date ranges and the Job objects
            js = self.parsing_jobs(job_positions)
            job_positions_data_ranges = js['job_positions_data_ranges']
            Jobs_array = js['Jobs_array']
            last_job = Jobs_array[0]

            if len(education_positions) > 0:
                eds = self.parsing_educations(education_positions)
                return ScrapingResult(
                    Profile(
                        profile_name, email, skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        Jobs_array, eds))
            else:
                return ScrapingResult(
                    Profile(
                        profile_name, email, skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        Jobs_array))
        else:
            return ScrapingResult(Profile(profile_name, email, skills))

    except HumanCheckException:
        # LinkedIn forced a human check: in headless mode there is no way to solve it manually
        if self.headless_option:
            raise CannotProceedScrapingException

        linkedin_logout(self.browser)
        linkedin_login(self.browser, self.config.get('linkedin', 'username'),
                       self.config.get('linkedin', 'password'))

        while self.browser.current_url != 'https://www.linkedin.com/feed/':
            message_to_user('Please execute manual check', self.config)
            time.sleep(30)

        return self.scrap_profile(profile_linkedin_url,
                                  profile_known_graduation_date)
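# --- Hedged sketch: parsing_jobs and parsing_educations are not shown in this
# --- excerpt. From the call sites above, parsing_jobs must return a dict with
# --- the keys 'job_positions_data_ranges' and 'Jobs_array'. A minimal skeleton
# --- consistent with those call sites; the body is an assumption, with the
# --- detailed per-position field extraction elided:
def parsing_jobs(self, job_positions):
    job_positions_data_ranges = []
    jobs_array = []
    for job_position in job_positions:
        try:
            # Date range of the position, as in the original inline version
            date_range = job_position \
                .find_element_by_class_name('pv-entity__date-range') \
                .find_elements_by_tag_name('span')[1].text
            job_positions_data_ranges.append(date_range)
            # Placeholder Job: the real field extraction would go here
            jobs_array.append(Job(position=None,
                                  company=Company(name=None),
                                  location=None))
        except Exception:
            jobs_array.append(Job(position=None,
                                  company=Company(name=None),
                                  location=None))
    return {'job_positions_data_ranges': job_positions_data_ranges,
            'Jobs_array': jobs_array}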