def __scrape_profile(self, profile_linkedin_url):
    if not is_url_valid(profile_linkedin_url):
        raise ScrapingException

    self.browser.get(profile_linkedin_url)

    # Check correct loading of profile and eventual Human Check
    if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
        if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
            raise ScrapingException
        else:
            raise HumanCheckException

    self.load_full_page()

    # SCRAPING
    profile_name = self.scrape_profile_name()
    email = self.scrape_email()
    skills = self.scrape_skills()
    jobs = self.scrape_jobs()  # keep as last scraping

    return Profile(
        name=profile_name,
        email=email,
        skills=skills,
        jobs=jobs
    )
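# Every snippet in this file calls an is_url_valid() helper that is never
# defined here. A minimal sketch of what such a helper might look like, using
# only the standard library; the exact validation rules of the real helper
# are an assumption:
from urllib.parse import urlparse

def is_url_valid(url):
    """Return True if url is a string with an http/https scheme and a host."""
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:  # e.g. malformed IPv6 literal
        return False
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)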
def main(
        folder=("d", "", "Folder with transcription files"),
        page=("p", "", "Output github wiki page (in markdown)"),
        url_prefix=("u", "", "URL prefix of the filenames")
):
    if (len(folder) == 0) or (not os.path.isdir(folder)):
        print("Folder should be specified")
        sys.exit(1)
    if len(page) == 0:
        print("Github page should be specified")
        sys.exit(1)
    if not utils.is_url_valid(url_prefix):
        print("URL prefix isn't a valid url")
        sys.exit(1)
    with open(page, "w") as wiki_file:
        for tr in os.listdir(folder):
            if not fnmatch.fnmatch(tr, "*.md"):
                continue
            pretty_name, ext = os.path.splitext(os.path.basename(tr.replace("_", " ")))
            item_string = "* [{link}]({prefix}{name})\n".format(
                link=pretty_name,
                prefix=url_prefix,
                name=tr
            )
            wiki_file.write(item_string)
def add(user=None):
    if 'url' not in request.form:
        return jsonify(errors=['Dude, what\'s wrong with you ?', 'You are missing Bookmark Url']), 400
    if not is_url_valid(request.form['url']):
        return jsonify(errors=['Dude, what\'s wrong with you ?', 'Invalid Bookmark Url']), 400
    bookmark = Bookmark(url=request.form['url'], user=user)
    if bookmark.save():
        return jsonify(bookmark=bookmark.to_dict()), 200
    return jsonify(errors=[bookmark.to_dict()]), 400
def __scrape_profile(self, profile_linkedin_url):
    if not is_url_valid(profile_linkedin_url):
        raise ScrapingException

    self.browser.get(profile_linkedin_url)

    # Check correct loading of profile and eventual Human Check
    if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
        if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
            raise ScrapingException
        else:
            raise HumanCheckException

    self.load_full_page()

    # SCRAPING
    profile_name = self.scrape_profile_name()
    profile_title = self.scrape_title()
    profile_image = self.scrape_image()
    email = self.scrape_email()
    about_profile = self.scrape_about_profile()
    education_profile = self.scrape_education()
    edu1 = self.scrape_edu1()
    edu2 = self.scrape_edu2()
    edu3 = self.scrape_edu3()
    profile_recommendations = self.scrape_recommendations()
    skills = self.scrape_skills()
    jobs = self.scrape_jobs()  # keep as last scraping

    return Profile(
        name=profile_name,
        title=profile_title,
        image=profile_image,
        email=email,
        about=about_profile,
        education=education_profile,
        edu1=edu1,
        edu2=edu2,
        edu3=edu3,
        recommendations=profile_recommendations,
        skills=skills,
        jobs=jobs
    )
def add(user=None):
    if 'url' not in request.form:
        return jsonify(errors=[
            'Dude, what\'s wrong with you ?',
            'You are missing Bookmark Url'
        ]), 400
    if not is_url_valid(request.form['url']):
        return jsonify(errors=[
            'Dude, what\'s wrong with you ?',
            'Invalid Bookmark Url'
        ]), 400
    bookmark = Bookmark(url=request.form['url'], user=user)
    if bookmark.save():
        return jsonify(bookmark=bookmark.to_dict()), 200
    return jsonify(errors=[bookmark.to_dict()]), 400
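# A quick way to exercise the add() endpoint above is Flask's built-in test
# client. The app object name ('app') and the route ('/bookmarks') are
# assumptions; adjust them to match the actual app and blueprint registration.
def test_add_rejects_invalid_url():
    with app.test_client() as client:
        # Missing 'url' field -> 400 with an error list
        resp = client.post('/bookmarks', data={})
        assert resp.status_code == 400
        # Malformed url -> 400 as well
        resp = client.post('/bookmarks', data={'url': 'not-a-url'})
        assert resp.status_code == 400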
def parse_args():
    parser = argparse.ArgumentParser(description='A simple python web crawler')
    parser.add_argument('url', metavar='url', type=str, help='Url to crawl.')
    parser.add_argument('--max_page_depth', dest='max_page_depth', type=int, default=5,
                        help='Maximum depth to crawl on a given url.')
    parser.add_argument('--max_external_sites_page_depth', dest='max_external_sites_page_depth',
                        type=int, default=4,
                        help='Maximum external sites depth to crawl on a given url.')
    parser.add_argument('--request_rate_limit', dest='request_rate_limit', type=int, default=4,
                        help='Maximum requests at once.')
    parsed = parser.parse_args()
    if not is_url_valid(parsed.url):
        print('Please enter a url in the following format: http://example.com/optional_query')
    else:
        return parsed
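# parse_args() implicitly returns None when the positional url fails
# validation, so the caller has to guard against that. A hedged sketch of how
# an entry point might consume it; crawl() is a hypothetical function, not
# defined anywhere in this file:
if __name__ == '__main__':
    args = parse_args()
    if args is not None:
        crawl(args.url,
              max_page_depth=args.max_page_depth,
              max_external_sites_page_depth=args.max_external_sites_page_depth,
              request_rate_limit=args.request_rate_limit)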
def check_url_validity(item, errors):
    """
    Checks url for validity
    """
    url = item.get("url")
    item_id = item.get("id")
    if url is None:
        return
    for number, single_url in enumerate(url):
        if not utils.is_url_valid(single_url, item):
            errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
                single_url=single_url,
                number=number
            ))
        if not utils.is_url_self_served(single_url, item):
            continue
        match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
        if not match:
            errors.add("Self served url [{single_url}] doesn't match SELF_SERVED_URL_REGEXP".format(
                single_url=single_url
            ))
            continue
        if match.group("item_id") != item_id:
            errors.add("Wrong item_id specified in self-served url")
            continue
        single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
        metadata = utils.extract_metadata_from_file(single_filename)
        # guard: metadata may lack an owner field entirely
        owner_field = metadata.get("owner")
        owners = owner_field.split("+") if owner_field else []
        if not owners:
            errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
                number=number,
                url=single_url,
                filename=single_filename
            ))
            continue
        for owner in owners:
            owner_fullname = config.parser.bookkeepers.get(owner)
            if owner_fullname:
                annotation = item.get("annotation")
                if (not annotation) or (owner_fullname not in annotation):
                    errors.add("Owner fullname ({owner_fullname}) should be present in annotation".format(
                        owner_fullname=owner_fullname
                    ))
def check_url_validity(item, errors):
    """
    Checks url for validity
    """
    url = item.get("url")
    item_id = item.get("id")
    booktype = item.get("booktype")
    if url is None:
        return
    for number, single_url in enumerate(url):
        if not utils.is_url_valid(single_url, item):
            errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
                single_url=single_url,
                number=number
            ))
        match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
        if not match:
            continue
        # inproceedings can have self-served url pointing
        # to entire full proceedings book
        # TODO: invent something like PARTIAL_BOOKTYPES
        if booktype == "inproceedings":
            continue
        if match.group("item_id") != item_id:
            errors.add("Wrong item_id specified in self-served url")
            continue
        single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
        metadata = utils.extract_metadata_from_file(single_filename)
        owner = metadata.get("owner")
        if owner is None:
            errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
                number=number,
                url=single_url,
                filename=single_filename
            ))
            continue
        owner_fullname = config.parser.bookkeepers.get(owner)
        if owner_fullname:
            annotation = item.get("annotation")
            if (not annotation) or (owner_fullname not in annotation):
                errors.add("Owner fullname ({owner_fullname}) should be present in annotation".format(
                    owner_fullname=owner_fullname
                ))
def validate_url_validity(item, errors):
    """
    Checks url for validity
    """
    url = item.get("url")
    item_id = item.get("id")
    if url is None:
        return
    for number, single_url in enumerate(url):
        if not utils.is_url_valid(single_url, item):
            errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
                single_url=single_url,
                number=number
            ))
        if not utils.is_url_self_served(single_url, item):
            continue
        match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
        if not match:
            errors.add("Self served url [{single_url}] doesn't match SELF_SERVED_URL_REGEXP".format(
                single_url=single_url
            ))
            continue
        if match.group("item_id") != item_id:
            errors.add("Wrong item_id specified in self-served url")
            continue
        single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
        metadata = utils.extract_metadata_from_file(single_filename)
        # guard: metadata may lack an owner field entirely
        owner_field = metadata.get("owner")
        owners = owner_field.split("+") if owner_field else []
        if not owners:
            errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
                number=number,
                url=single_url,
                filename=single_filename
            ))
            continue
        for owner in owners:
            owner_fullname = config.parser.bookkeepers.get(owner)
            if owner_fullname:
                note = item.get("note")
                if (not note) or (owner_fullname not in note):
                    errors.add("Owner fullname ({owner_fullname}) should be present in note".format(
                        owner_fullname=owner_fullname
                    ))
def check_url_validity(item, errors):
    """
    Checks url for validity
    """
    url = item.get("url")
    item_id = item.get("id")
    booktype = item.get("booktype")
    if url is None:
        return
    for number, single_url in enumerate(url):
        if not utils.is_url_valid(single_url, item):
            errors.add(
                "Field url with value [{single_url}] and number {number} is wrong"
                .format(single_url=single_url, number=number))
        match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
        if not match:
            continue
        # inproceedings can have self-served url pointing
        # to entire full proceedings book
        # TODO: invent something like PARTIAL_BOOKTYPES
        if booktype == "inproceedings":
            continue
        if match.group("item_id") != item_id:
            errors.add("Wrong item_id specified in self-served url")
            continue
        single_filename, single_filesize = utils.get_file_info_from_url(
            single_url, item)
        metadata = utils.extract_metadata_from_file(single_filename)
        owner = metadata.get("owner")
        if owner is None:
            errors.add(
                "Owner specification expected for self-served url #{number} (url={url}, filename={filename})"
                .format(number=number, url=single_url, filename=single_filename))
            continue
        owner_fullname = config.parser.bookkeepers.get(owner)
        if owner_fullname:
            annotation = item.get("annotation")
            if (not annotation) or (owner_fullname not in annotation):
                errors.add(
                    "Owner fullname ({owner_fullname}) should be present in annotation"
                    .format(owner_fullname=owner_fullname))
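# The four validators above match self-served urls against
# utils.SELF_SERVED_URL_REGEXP, which is not shown in this file. A purely
# hypothetical pattern with the one named group the code actually relies on
# (item_id); the real host and path layout are assumptions:
import re

SELF_SERVED_URL_REGEXP = re.compile(
    r"https://example\.org/books/(?P<item_id>[\w\-]+)"
)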
def get_last_date(driver):
    logger.debug('Trying to find last article on page')
    home_page = bs(get_html(driver), 'html.parser')
    articles_tags = home_page.find('ul', {
        'class': 'posts-listing__list'
    }).find_all('article')
    last_index = -1
    while not is_url_valid(URL + articles_tags[last_index].header.a['href']):
        last_index -= 1
    url = URL + articles_tags[last_index].header.a['href']
    logger.debug('Found last article element')
    logger.debug('Fetching last article data from url: [{}]'.format(url))
    article_div = bs(get_content(url), 'html.parser').body.main.find(
        'div', {'class': 'post post-page__article'})
    date = article_div.find('div', {
        'class': 'post-meta__publish-date'
    }).time['datetime']
    logger.debug('Extracted date from last article [{}]'.format(str(date)))
    return date
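# get_last_date() above depends on two fetch helpers that are not shown. A
# minimal sketch of plausible implementations, assuming a Selenium driver for
# get_html() and plain requests for get_content() (both are assumptions about
# the original code):
import requests

def get_html(driver):
    """Return the current page source from a Selenium driver."""
    return driver.page_source

def get_content(url):
    """Fetch a url and return the response body, raising on HTTP errors."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text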
def scrap_profile(self, profile_linkedin_url, profile_known_graduation_date):
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Scraping of the profile may fail due to human check forced by LinkedIn
    try:
        # Setting of the delay (seconds) between operations that need to be sure loading of page is ended
        loading_pause_time = 2
        loading_scroll_time = 1

        # Opening of the profile page
        self.browser.get(profile_linkedin_url)

        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                return ScrapingResult('ProfileUnavailable')
            else:
                raise HumanCheckException

        # Defaults so the fields exist even if the Contact info scraping below
        # fails (the bare except/pass blocks would otherwise leave them unbound)
        email = phone = birthday = connectedDate = 'N/A'

        # Scraping the Email Address from Contact Info (email)
        # > click on 'Contact info' link on the page
        self.browser.execute_script(
            "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
            "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
        )
        time.sleep(loading_pause_time)

        # > gets email from the 'Contact info' popup
        try:
            email = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-email')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except:
            pass

        # Scraping the Phone from Contact Info
        try:
            phone = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-phone')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except:
            pass

        # Scraping the Birthday from Contact Info
        try:
            birthday = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-birthday')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
        except:
            pass

        # Scraping the Date Connected from Contact Info
        try:
            connectedDate = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-connected')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
            self.browser.execute_script(
                "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
            )
        except:
            pass

        # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
        window_height = self.browser.execute_script("return window.innerHeight")
        scrolls = 1
        while scrolls * window_height < self.browser.execute_script(
                "return document.body.offsetHeight"):
            self.browser.execute_script(f"window.scrollTo(0, {window_height * scrolls});")
            time.sleep(loading_scroll_time)
            scrolls += 1

        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        # Get all the job positions
        try:
            job_positions = self.browser.find_element_by_id(
                'experience-section').find_elements_by_tag_name('li')
        except:
            job_positions = []

        # Get all the educations
        try:
            educations = self.browser.find_element_by_id(
                'education-section').find_elements_by_tag_name('li')
        except:
            educations = []

        # Parsing of the page html structure
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        # Scraping the Name (using soup)
        try:
            name_div = soup.find('div', {'class': 'flex-1 mr5'})
            name_loc = name_div.find_all('ul')
            headline = name_div.find_all('h2')
            headline = headline[0].get_text().strip()
            profile_name = name_loc[0].find('li').get_text().strip()
            locationNConnection = name_loc[1].find_all('li')
            location = locationNConnection[0].get_text().strip()
            try:
                connection = locationNConnection[1].find('a').find('span').get_text().strip()
            except:
                connection = locationNConnection[1].find('span').get_text().strip()
        except:
            return ScrapingResult('ERROR IN SCRAPING NAME')

        # Scraping the Desc (using soup)
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        try:
            if self.browser.execute_script(
                    "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
            ):
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
            else:
                profile_desc = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                )
        except:
            profile_desc = []
        # print(profile_desc)

        # Parsing skills
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        try:
            skills = self.browser.execute_script(
                "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
            )
        except:
            skills = []

        education_list = []

        # Parsing the educations
        if len(educations) > 0:
            x = 1
            for education in educations:
                try:
                    # Scraping of each education entry
                    exp_section = soup.find('section', {'id': 'education-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # Scraping of the entry - school, degree, major, year
                    try:
                        education_name = a_tags.find('h3').get_text().strip()
                    except:
                        education_name = None  # fixed typo: was 'eudcation_name'
                    try:
                        education_degree_name = a_tags.find_all('p')[0].get_text().strip()
                    except:
                        education_degree_name = None
                    try:
                        education_major = a_tags.find_all('p')[1].get_text().strip()
                    except:
                        education_major = None
                    try:
                        education_year = a_tags.find_all('p')[2].get_text().strip()
                    except:
                        education_year = None

                    education_list.append(
                        Education(education_name=education_name,
                                  degree_name=education_degree_name,
                                  major=education_major,
                                  year=education_year))
                except:
                    pass

        # Pad to three entries so education_list[0] is always safe below
        for x in range(3 - len(educations)):
            education_list.append(
                Education(education_name=None, degree_name=None, major=None, year=None))

        last_job = []

        # Parsing the job positions
        if len(job_positions) > 0:
            # Parse job positions to extract the relative date ranges
            job_positions_data_ranges = []
            x = 1
            for job_position in job_positions:
                # Get the date range of the job position
                try:
                    date_range_element = job_position.find_element_by_class_name(
                        'pv-entity__date-range')
                    date_range_spans = date_range_element.find_elements_by_tag_name('span')
                    date_range = date_range_spans[1].text
                    job_positions_data_ranges.append(date_range)

                    # Scraping of the last (hopefully current) Job
                    exp_section = soup.find('section', {'id': 'experience-section'})
                    exp_section = exp_section.find('ul')
                    div_tags = exp_section.contents[x].find('div')
                    a_tags = div_tags.find('a')
                    x += 1

                    # Scraping of the last Job - company_name, job_title
                    try:
                        last_job_company_name = a_tags.find_all('p')[1].get_text().strip()
                        last_job_title = a_tags.find('h3').get_text().strip()
                        spans = a_tags.find_all('span')
                    except:
                        last_job_company_name = a_tags.find_all('span')[1].get_text().strip()
                        last_job_title = exp_section.find('ul').find('li').find_all(
                            'span')[2].get_text().strip()
                        spans = exp_section.find('ul').find('li').find_all('span')

                    last_job_company_name = last_job_company_name.replace(
                        'Full-time', '').replace('Part-time', '').strip()

                    # Scraping of last Job - location
                    last_job_location = Location()
                    next_span_is_location = False
                    for span in spans:
                        if next_span_is_location:
                            last_job_location.parse_string(span.get_text().strip())
                            break
                        if span.get_text().strip() == 'Location':
                            next_span_is_location = True

                    # # Scraping of Industry related to last Job
                    # last_job_company_url = a_tags.get('href')
                    # if last_job_company_url not in self.industries_dict:
                    #     try:
                    #         self.browser.get('https://www.linkedin.com' + last_job_company_url)
                    #         self.industries_dict[last_job_company_url] = self.browser.execute_script(
                    #             "return document.getElementsByClassName("
                    #             "'org-top-card-summary-info-list__info-item')["
                    #             "0].innerText")
                    #     except:
                    #         self.industries_dict[last_job_company_url] = 'N/A'
                    # last_job_company_industry = self.industries_dict[last_job_company_url]

                    last_job.append(
                        Job(position=last_job_title,
                            company=Company(
                                name=last_job_company_name,
                                # industry=last_job_company_industry
                            ),
                            location=last_job_location))
                except:
                    last_job.append(
                        Job(position=None,
                            company=Company(
                                name=None,
                                # industry=last_job_company_industry
                            ),
                            location=None))

            # Pad to four entries so last_job[0..3] are always safe below
            for x in range(4 - len(job_positions)):
                last_job.append(
                    Job(position=None,
                        company=Company(
                            name=None,
                            # industry=last_job_company_industry
                        ),
                        location=None))

            print(
                "profile_name {} \n headline {} \n location {} \n connection {} \n "
                "profile_desc {} \n email {} \n phone {} \n birthday {} \n "
                "connectedDate {} \n skills {} \n last_job {} \n last_job {} \n "
                "last_job {} \n last_job {} \n education {} \n"
                .format(profile_name, headline, location, connection, profile_desc,
                        email, phone, birthday, connectedDate, skills,
                        last_job[0], last_job[1], last_job[2], last_job[3],
                        education_list[0]))

            return ScrapingResult(
                Profile(profile_name, headline, location, connection, connectedDate,
                        phone, birthday, profile_desc, email, skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        education_list))
        else:
            return ScrapingResult(Profile(profile_name, email, skills))

    except HumanCheckException:
        if self.headless_option:
            raise CannotProceedScrapingException

        linkedin_logout(self.browser)
        linkedin_login(self.browser,
                       self.config.get('linkedin', 'username'),
                       self.config.get('linkedin', 'password'))

        while self.browser.current_url != 'https://www.linkedin.com/feed/':
            message_to_user('Please execute manual check', self.config)
            time.sleep(30)

        return self.scrap_profile(profile_linkedin_url, profile_known_graduation_date)
def get_profile_data(profile_data_line):
    global industries_dict

    # this function supports data as:
    #
    # https://www.linkedin.com/in/federicohaag ==> parse name, email, last job
    #
    # https://www.linkedin.com/in/federicohaag:::01/01/1730 ==> parse name, email, last job
    # and also produces a "job history summary" returning if the person was working while studying,
    # and how fast she/he got a job after the graduation.
    # As graduation date is used the one passed as parameter, NOT the date it could be on LinkedIn

    # Setting of the delay (seconds) between operations that need to be sure loading of page is ended
    loading_pause_time = 2
    loading_scroll_time = 1

    # Get known graduation date
    known_graduation_date = None
    if len(profile_data_line) == 2:
        known_graduation_date = time.strptime(
            '/'.join(profile_data_line[1].strip().split("/")[1:]), '%m/%y')

    # Get the url of LinkedIn profile
    profile_linkedin_url = profile_data_line[0]
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Opening of the profile page
    browser.get(profile_linkedin_url)
    if browser.current_url != profile_linkedin_url:
        if browser.current_url == 'https://www.linkedin.com/in/unavailable/':
            return ScrapingResult('ProfileUnavailable')
        else:
            raise HumanCheckException

    # Scraping the Email Address from Contact Info (email)
    # > click on 'Contact info' link on the page
    browser.execute_script(
        "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
        "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
    )
    time.sleep(loading_pause_time)

    # > gets email from the 'Contact info' popup
    try:
        email = browser.execute_script(
            "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
            "el = "
            "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
            "'ci-email')){ "
            "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
        )
        browser.execute_script(
            "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
        )
    except:
        email = 'N/A'

    # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
    window_height = browser.execute_script("return window.innerHeight")
    scrolls = 1
    while scrolls * window_height < browser.execute_script(
            "return document.body.offsetHeight"):
        browser.execute_script(f"window.scrollTo(0, {window_height * scrolls});")
        time.sleep(loading_scroll_time)
        scrolls += 1

    try:
        browser.execute_script(
            "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
        )
        time.sleep(loading_pause_time)
    except:
        pass

    # Get all the job positions
    try:
        list_of_job_positions = browser.find_element_by_id(
            'experience-section').find_elements_by_tag_name('li')
    except:
        list_of_job_positions = []

    # Get job experiences (two different positions in Company X is considered one job experience)
    try:
        job_experiences = browser.find_elements_by_class_name(
            'pv-profile-section__card-item-v2')
    except:
        job_experiences = []

    # Parsing of the page html structure
    soup = BeautifulSoup(browser.page_source, 'lxml')

    # Scraping the Name (using soup)
    try:
        name_div = soup.find('div', {'class': 'flex-1 mr5'})
        name_loc = name_div.find_all('ul')
        profile_name = name_loc[0].find('li').get_text().strip()
    except:
        return ScrapingResult('ERROR IN SCRAPING NAME')

    # Parsing the job positions
    if len(list_of_job_positions) > 0:
        # Parse job positions to extract the relative date ranges
        job_positions_data_ranges = []
        for job_position in list_of_job_positions:
            # Get the date range of the job position
            try:
                date_range_element = job_position.find_element_by_class_name(
                    'pv-entity__date-range')
                date_range_spans = date_range_element.find_elements_by_tag_name('span')
                date_range = date_range_spans[1].text
                job_positions_data_ranges.append(date_range)
            except:
                pass

        # Compute the 'job history' of the profile if the graduation date is provided in profiles_data.txt file
        job_history_summary = compute_job_history_summary(
            known_graduation_date, job_positions_data_ranges, job_experiences)

        # Scraping of the last (hopefully current) Job
        exp_section = soup.find('section', {'id': 'experience-section'})
        exp_section = exp_section.find('ul')
        div_tags = exp_section.find('div')
        a_tags = div_tags.find('a')

        # Scraping of the last (hopefully current) Job - company_name, job_title
        try:
            current_job_company_name = a_tags.find_all('p')[1].get_text().strip()
            current_job_title = a_tags.find('h3').get_text().strip()
            spans = a_tags.find_all('span')
        except:
            current_job_company_name = a_tags.find_all('span')[1].get_text().strip()
            current_job_title = exp_section.find('ul').find('li').find_all(
                'span')[2].get_text().strip()
            spans = exp_section.find('ul').find('li').find_all('span')

        current_job_company_name = current_job_company_name.replace(
            'Full-time', '').replace('Part-time', '').strip()

        # Scraping of last (hopefully current) Job - location
        location = Location()
        next_span_is_location = False
        for span in spans:
            if next_span_is_location:
                location.parse_string(span.get_text().strip())
                break
            if span.get_text().strip() == 'Location':
                next_span_is_location = True

        # Scraping of Industry related to last (hopefully current) Job
        company_url = a_tags.get('href')
        if company_url not in industries_dict:
            try:
                browser.get('https://www.linkedin.com' + company_url)
                industries_dict[company_url] = browser.execute_script(
                    "return document.getElementsByClassName("
                    "'org-top-card-summary-info-list__info-item')[0].innerText"
                )
            except:
                industries_dict[company_url] = 'N/A'
        current_job_company_industry = industries_dict[company_url]

        company = Company(name=current_job_company_name,
                          industry=current_job_company_industry)
        current_job = Job(position=current_job_title, company=company, location=location)
        profile = Profile(profile_name, email, current_job, job_history_summary)
    else:
        profile = Profile(profile_name, email)

    return ScrapingResult(profile)
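# get_profile_data() expects a pre-split line from the profiles_data.txt file
# mentioned in its comments, with ':::' separating the url from the optional
# graduation date. A hedged usage sketch based on those comments:
with open('profiles_data.txt') as profiles_file:
    for line in profiles_file:
        result = get_profile_data(line.strip().split(':::'))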
def main(
        root=("r", "", "E-library root"),
        check_head=("", False, "Perform HTTP HEAD request to url values")
):
    """
    Validates bibliography over a bunch of rules
    """
    if (len(root) == 0) or (not os.path.isdir(root)):
        print("Root folder is inaccessible")
        sys.exit(1)
    root = os.path.abspath(root)

    print("Going to process {0} items".format(len(items)))

    SOURCE_REGEXP = re.compile(r"(?P<basename>[_\-\w\.]+).bib:\d+")
    MULTILANG_FILES = {"proceedings-spb", "proceedings-rothenfelser", "_missing", "_problems"}

    VALID_BOOKTYPES = {
        "book", "mvbook", "inproceedings", "proceedings", "reference",
        "mvreference", "periodical", "unpublished", "thesis", "article"
    }
    NON_MULTIVOLUME_BOOKTYPES = {"article", "periodical"}
    MULTIVOLUME_BOOKTYPES = {"mvbook", "mvreference"}
    # don't validate filename for the given entrytypes
    MULTIENTRY_BOOKTYPES = {"proceedings", "inproceedings"}
    SHORTHAND_LIMIT = 25

    # magic constant
    LAST_ORIGINAL_YEAR = 1937
    NON_ORIGINAL_KEYWORDS = {"reissue", "research"}
    RESEARCH_BOOKTYPES = {"book", "mvbook"}
    UNPUBLISHED_NOTE_PREFIX = "Unpublished manuscript"

    erroneous_entries = 0
    errors_count = 0
    for item in items:
        errors = []

        # datamodel validation
        author = item.get("author")
        booktype = item.get("booktype")
        booktitle = item.get("booktitle")
        commentator = item.get("commentator")
        edition = item.get("edition")
        filename = item.get("filename")
        id = item.get("id")
        isbn = item.get("isbn")
        institution = item.get("institution")
        journaltitle = item.get("journaltitle")
        keywords = set(item.get("keywords")) if item.get("keywords") else None
        langid = item.get("langid")
        location = item.get("location")
        note = item.get("note")
        number = item.get("number")
        origlanguage = item.get("origlanguage")
        publisher = item.get("publisher")
        series = item.get("series")
        shorthand = item.get("shorthand")
        source = item.get("source")
        title = item.get("title")
        translator = item.get("translator")
        type = item.get("type")
        url = item.get("url")
        volume = item.get("volume")
        volumes = item.get("volumes")
        year = item.get("year")
        year_from = item.get("year_from")
        year_to = item.get("year_to")
        year_circa = item.get("year_circa")
        added_on = item.get("added_on")

        match = SOURCE_REGEXP.match(source)
        if not match:
            raise RuntimeError("Failed to parse 'source' for item ({id})".format(
                id=id
            ))
        source_basename = match.group("basename")

        parser_obligatory = [id, booktype, source, year_from, year_to, year_circa]
        none_checker = lambda obj: obj is not None
        if not all(map(none_checker, parser_obligatory)):
            raise RuntimeError("Parser hasn't generated all required auxiliary fields {fields}".format(
                fields=parser_obligatory
            ))

        general_obligatory = [langid, year, title, added_on]
        if not all(general_obligatory):
            errors.append("Item doesn't define one of [langid, year, title, added_on]")

        translation_obligatory = [origlanguage, translator]
        if not utils.all_or_none(translation_obligatory):
            errors.append("All of [origlanguage, translator] must be present for translations")

        series_obligatory = [series, number]
        if not utils.all_or_none(series_obligatory):
            errors.append("All of [series, number] must be present for serial books")

        if not any([author, shorthand]):
            errors.append("'author' or 'shorthand' must be present")

        if (publisher is not None) and (location is None):
            errors.append("If publisher present, location must be present")

        # booktype validation
        booktype = booktype.lower()
        if booktype not in VALID_BOOKTYPES:
            errors.append("Invalid booktype ({booktype})".format(
                booktype=booktype
            ))

        if booktype not in NON_MULTIVOLUME_BOOKTYPES:
            if (volume is not None) and (volumes is None):
                errors.append("If volume present, volumes must be present")

        if booktype in MULTIVOLUME_BOOKTYPES:
            if volumes is None:
                errors.append("volumes must be present for @{0}".format(booktype))

        if booktype == "article":
            if journaltitle is None:
                errors.append("journaltitle must be present for @article")

        if booktype == "inproceedings":
            if booktitle is None:
                errors.append("booktitle must be present for @inproceedings")

        if booktype == "thesis":
            if url is None:
                errors.append("url must be present for @thesis")
            if type is None:
                errors.append("type must be present for @thesis")
            if institution is None:
                errors.append("institution must be present for @thesis")

        # data validation
        # author validation empty
        # booktitle validation empty

        # commentator validation
        if commentator is not None:
            if (keywords is None) or ("commentary" not in keywords):
                errors.append("Keywords should contain 'commentary' when commentator specified")

        # edition validation
        if edition is not None:
            # edition should be greater than 1
            if edition <= 1:
                errors.append("Wrong edition {edition}".format(
                    edition=edition
                ))

        if volume is not None:
            # volume should be positive integer
            if volume <= 0:
                errors.append("Wrong volume {volume}".format(
                    volume=volume
                ))
            # guard: comparing only when both volume and volumes are set
            if (volumes is not None) and (volume > volumes):
                errors.append("Volume ({volume}) can't be greater than volumes ({volumes})".format(
                    volume=volume,
                    volumes=volumes
                ))

        # filename validation
        if (filename is not None) and (booktype not in MULTIENTRY_BOOKTYPES):
            for filename_ in filename:
                # filename starts with "/" which will mix os.path.join up
                abspath = os.path.join(root, filename_[1:])
                # each filename should be accessible
                if not os.path.isfile(abspath):
                    errors.append("File {filename_} is not accessible".format(
                        filename_=filename_
                    ))

                # item should be searchable by its filename metadata
                metadata = utils.extract_metadata_from_file(filename_)

                # validating optional author, edition, tome
                # in case when item specifies value, but filename doesn't
                if not utils.all_or_none([metadata.get("author", None), author]):
                    errors.append("File {filename_} and entry have different author specifications".format(
                        filename_=filename_
                    ))
                if not utils.all_or_none([metadata.get("edition", None), edition]):
                    errors.append("File {filename_} and entry have different edition specifications".format(
                        filename_=filename_
                    ))
                if not utils.all_or_none([metadata.get("tome", None), any([volume, volumes])]):
                    errors.append("File {filename_} and entry have different volume specifications".format(
                        filename_=filename_
                    ))

                meta_keywords = metadata.get("keywords", None)
                if meta_keywords is not None:
                    if ("incomplete" not in meta_keywords) and (source_basename == "_problems"):
                        errors.append("Incomplete books should be stored in _problems.bib")
                    meta_keywords.discard("incomplete")
                    if len(meta_keywords) > 0:
                        if keywords is None:
                            errors.append("No keywords specified (should be {meta_keywords})".format(
                                meta_keywords=meta_keywords
                            ))
                        elif not keywords >= meta_keywords:
                            errors.append("Item keywords {keywords} do not match metadata keywords {meta_keywords}".format(
                                keywords=keywords,
                                meta_keywords=meta_keywords
                            ))

                search_ = utils.create_search_from_metadata(metadata)
                if not search_(item):
                    errors.append("File {filename_} is not searchable by extracted params".format(
                        filename_=filename_
                    ))

        # id validation
        if len(item_index["id"][id]) != 1:
            errors.append("Id is not unique")

        # isbn validation
        if isbn is not None:
            for isbn_ in isbn:
                correct, msg = utils.is_isbn_valid(isbn_)
                if not correct:
                    errors.append("ISBN {isbn_} isn't valid: {msg}".format(
                        isbn_=isbn_,
                        msg=msg
                    ))

        # institution validation empty
        # journaltitle validation empty

        # keywords validation
        # if item was issued after LAST_ORIGINAL_YEAR, it should define keywords
        if (year_from > LAST_ORIGINAL_YEAR) and (booktype in RESEARCH_BOOKTYPES):
            if (keywords is None) or (len(keywords & NON_ORIGINAL_KEYWORDS) == 0):
                errors.append("Item was issued after {last_year}, but keywords don't define any of {keywords}".format(
                    last_year=LAST_ORIGINAL_YEAR,
                    keywords=NON_ORIGINAL_KEYWORDS
                ))
        if keywords is not None:
            if ("translation" in keywords) and not all([translator, origlanguage]):
                errors.append("When 'translation' keyword specified, translator and origlanguage should be present")
            if ("commentary" in keywords) and not commentator:
                errors.append("When 'commentary' keyword specified, commentator should be present")

        # langid validation
        if source_basename not in MULTILANG_FILES:
            source_lang = const.LONG_LANG_MAP[source_basename]
            # item language should match source language
            if langid != source_lang:
                errors.append("Source language ({source_lang}) doesn't match item language ({langid})".format(
                    source_lang=source_lang,
                    langid=langid
                ))

        # location validation empty

        # note validation
        note_unpublished = (note is not None) and (note.startswith(UNPUBLISHED_NOTE_PREFIX))
        booktype_unpublished = (booktype == "unpublished")
        if not utils.all_or_none([note_unpublished, booktype_unpublished]):
            errors.append("For unpublished books, note should begin with [{note_prefix}] and booktype should be {booktype}".format(
                booktype="unpublished",
                note_prefix=UNPUBLISHED_NOTE_PREFIX
            ))

        # number validation empty
        # origlanguage validation empty
        # publisher validation empty
        # series validation empty

        # shorthand validation
        if shorthand is not None:
            length = len(shorthand)
            if length > SHORTHAND_LIMIT:
                errors.append("The length of shorthand ({length}) should not exceed limit ({limit})".format(
                    length=length,
                    limit=SHORTHAND_LIMIT
                ))
            if (author is None) and (not title.startswith(shorthand)):
                errors.append("Title ({title}) should begin with shorthand ({shorthand})".format(
                    title=title,
                    shorthand=shorthand
                ))

        # source validation empty

        # title validation
        if title is not None:
            if "  " in title:
                errors.append("Consecutive spaces in title")
            if "\t" in title:
                errors.append("Tabs in title")
            if title.startswith(" ") or title.endswith(" "):
                errors.append("Title isn't stripped")

        # translator validation
        if translator is not None:
            if (keywords is None) or ("translation" not in keywords):
                errors.append("Keywords should contain 'translation' when 'translator' field specified")

        # type validation empty

        # url validation
        if url is not None:
            correct, msg = utils.is_url_valid(url, check_head)
            if not correct:
                errors.append("URL {url} isn't valid: {msg}".format(
                    url=url,
                    msg=msg
                ))

        # volume validation empty
        # volumes validation empty
        # year validation empty

        # printing errors
        if len(errors) > 0:
            erroneous_entries += 1
            errors_count += len(errors)
            print("Errors for {id} ({source})".format(
                id=id,
                source=source
            ))
            for error in errors:
                print(" " + error)

    print("Found {entries} erroneous entries ({errors} errors)".format(
        entries=erroneous_entries,
        errors=errors_count
    ))
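# The validator above leans on utils.all_or_none() to express "these fields
# must appear together or not at all". A minimal sketch of that helper, under
# the assumption that it treats falsy values (None, '', False) as absent:
def all_or_none(values):
    """Return True if every value is truthy or every value is falsy."""
    present = [bool(v) for v in values]
    return all(present) or not any(present)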
def scrap_profile(self, profile_linkedin_url, profile_known_graduation_date):
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Scraping of the profile may fail due to human check forced by LinkedIn
    try:
        # Setting of the delay (seconds) between operations that need to be sure loading of page is ended
        loading_pause_time = 2
        loading_scroll_time = 1

        # Opening of the profile page
        self.browser.get(profile_linkedin_url)

        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                return ScrapingResult('ProfileUnavailable')
            else:
                raise HumanCheckException

        # Scraping the Email Address from Contact Info (email)
        # > click on 'Contact info' link on the page
        self.browser.execute_script(
            "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
            "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
        )
        time.sleep(loading_pause_time)

        # > gets email from the 'Contact info' popup
        try:
            email = self.browser.execute_script(
                "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                "el = "
                "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                "'ci-email')){ "
                "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
            )
            self.browser.execute_script(
                "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
            )
        except:
            email = 'N/A'

        # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
        window_height = self.browser.execute_script("return window.innerHeight")
        scrolls = 1
        while scrolls * window_height < self.browser.execute_script(
                "return document.body.offsetHeight"):
            self.browser.execute_script(f"window.scrollTo(0, {window_height * scrolls});")
            time.sleep(loading_scroll_time)
            scrolls += 1

        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        # Get all the job positions
        try:
            job_positions = self.browser\
                .find_element_by_id('experience-section')\
                .find_elements_by_tag_name('li')
        except NoSuchElementException:
            print("job_positions is null")
            job_positions = []

        # Get all the education positions
        try:
            education_positions = self.browser\
                .find_element_by_id('education-section')\
                .find_elements_by_tag_name('li')
        except NoSuchElementException:
            print("education_positions is null")  # fixed copy-paste: was "job_positions is null"
            education_positions = []

        # Parsing of the page html structure
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        # Scraping the Name (using soup)
        try:
            name_div = soup.find('div', {'class': 'flex-1 mr5'})
            name_loc = name_div.find_all('ul')
            profile_name = name_loc[0].find('li').get_text().strip()
        except:
            return ScrapingResult('ERROR IN SCRAPING NAME')

        # Parsing skills
        try:
            self.browser.execute_script(
                "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
            )
            time.sleep(loading_pause_time)
        except:
            pass

        try:
            skills = self.browser.execute_script(
                "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
            )
        except:
            skills = []

        # Parsing the job positions
        if len(job_positions) > 0:
            # Parse job positions to extract the relative date ranges
            js = self.parsing_jobs(job_positions)
            job_positions_data_ranges = js['job_positions_data_ranges']
            Jobs_array = js['Jobs_array']
            last_job = Jobs_array[0]

            if len(education_positions) > 0:
                eds = self.parsing_educations(education_positions)
                return ScrapingResult(
                    Profile(profile_name, email, skills, last_job,
                            JobHistorySummary(profile_known_graduation_date,
                                              job_positions_data_ranges),
                            Jobs_array, eds))
            else:
                return ScrapingResult(
                    Profile(profile_name, email, skills, last_job,
                            JobHistorySummary(profile_known_graduation_date,
                                              job_positions_data_ranges),
                            Jobs_array))
        else:
            return ScrapingResult(Profile(profile_name, email, skills))

    except HumanCheckException:
        if self.headless_option:
            raise CannotProceedScrapingException

        linkedin_logout(self.browser)
        linkedin_login(self.browser,
                       self.config.get('linkedin', 'username'),
                       self.config.get('linkedin', 'password'))

        while self.browser.current_url != 'https://www.linkedin.com/feed/':
            message_to_user('Please execute manual check', self.config)
            time.sleep(30)

        return self.scrap_profile(profile_linkedin_url, profile_known_graduation_date)
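# The scrapers above return ScrapingResult objects and signal LinkedIn's
# anti-bot page with HumanCheckException; neither type is defined in this
# file. A minimal sketch of the shapes the calling code appears to assume:
class HumanCheckException(Exception):
    """Raised when LinkedIn redirects to a human-verification page."""

class CannotProceedScrapingException(Exception):
    """Raised when a human check occurs but the browser runs headless."""

class ScrapingResult:
    """Wraps either a scraped Profile or an error tag string
    such as 'BadFormattedLink' or 'ProfileUnavailable'."""
    def __init__(self, payload):
        self.payload = payload

    def is_error(self):
        return isinstance(self.payload, str)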