Example #1
    def __scrape_profile(self, profile_linkedin_url):

        if not is_url_valid(profile_linkedin_url):
            raise ScrapingException

        self.browser.get(profile_linkedin_url)

        # Check correct loading of profile and eventual Human Check
        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                raise ScrapingException
            else:
                raise HumanCheckException

        self.load_full_page()

        # SCRAPING

        profile_name = self.scrape_profile_name()

        email = self.scrape_email()

        skills = self.scrape_skills()

        jobs = self.scrape_jobs()  # keep as last scraping

        return Profile(
            name=profile_name,
            email=email,
            skills=skills,
            jobs=jobs
        )
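Note: every example in this listing calls an is_url_valid helper whose definition is not shown, and its signature varies between examples. For the one-argument boolean form used above, a minimal sketch based on urllib.parse might look like the following; this is an assumption for illustration, not the helper these projects actually use.

from urllib.parse import urlparse

def is_url_valid(url):
    # Illustrative validator: accept only absolute http(s) URLs with a host.
    try:
        parts = urlparse(url)
    except (TypeError, ValueError):
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)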
Example #2
def main(
	folder=("d", "", "Folder with transcription files"),
	page=("p", "", "Output github wiki page (in markdown)"),
	url_prefix=("u", "", "URL prefix of the filenames")
):
	if (len(folder) == 0) or (not os.path.isdir(folder)):
		print("Folder should be specified")
		sys.exit(1)
	
	if len(page) == 0:
		print("Github page should be specified")
		sys.exit(1)
		
	if not utils.is_url_valid(url_prefix):
		print("URL prefix isn't avalid url")
		sys.exit(1)
	
	with open(page, "w") as wiki_file:
		for tr in os.listdir(folder):
			if not fnmatch.fnmatch(tr, "*.md"):
				continue
			
			folder_basename = os.path.basename(folder)
			
			pretty_name, _ = os.path.splitext(os.path.basename(tr.replace("_", " ")))
			item_string = "* [{link}]({prefix}{name})\n".format(
				link = pretty_name,
				prefix = url_prefix,
				name = tr)
			wiki_file.write(item_string)	
Example #3
def add(user=None):
    if 'url' not in request.form:
        return jsonify(errors=['Dude, what\'s wrong with you ?', 'You are missing Bookmark Url']), 400
    if not is_url_valid(request.form['url']):
        return jsonify(errors=['Dude, what\'s wrong with you ?', 'Invalid Bookmark Url']), 400
    bookmark = Bookmark(url=request.form['url'], user=user)
    if bookmark.save():
        return jsonify(bookmark=bookmark.to_dict()), 200
    return jsonify(errors=[bookmark.to_dict()]), 400
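One way to exercise this endpoint is Flask's test client. The wiring below is hypothetical: the import path, the /bookmarks route and the POST-only registration are assumptions, and Bookmark / is_url_valid are expected to come from the module that defines add above.

from flask import Flask

from bookmarks.views import add  # hypothetical import path for the view above

app = Flask(__name__)
app.add_url_rule('/bookmarks', view_func=add, methods=['POST'])

with app.test_client() as client:
    resp = client.post('/bookmarks', data={'url': 'https://example.com'})
    print(resp.status_code, resp.get_json())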
Example #4
    def __scrape_profile(self, profile_linkedin_url):

        if not is_url_valid(profile_linkedin_url):
            raise ScrapingException

        self.browser.get(profile_linkedin_url)

        # Check correct loading of profile and eventual Human Check
        if not str(self.browser.current_url).strip() == profile_linkedin_url.strip():
            if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                raise ScrapingException
            else:
                raise HumanCheckException

        self.load_full_page()

        # SCRAPING

        profile_name = self.scrape_profile_name()

        profile_title=self.scrape_title()

        profile_image= self.scrape_image()

        email = self.scrape_email()

        about_profile=self.scrape_about_profile()

        education_profile=self.scrape_education()

        edu1=self.scrape_edu1()
        edu2=self.scrape_edu2()
        edu3=self.scrape_edu3()


        profile_recommendations=self.scrape_recommendations()

        skills = self.scrape_skills()        

        jobs = self.scrape_jobs()  # keep as last scraping

        
        return Profile(
            name=profile_name,
            title=profile_title,
            image=profile_image,
            email=email,
            about=about_profile,
            education=education_profile,
            edu1=edu1,
            edu2=edu2,
            edu3=edu3,
            recommendations= profile_recommendations,
            skills=skills,            
            jobs=jobs
           )
Example #5
def add(user=None):
    if 'url' not in request.form:
        return jsonify(errors=[
            'Dude, what\'s wrong with you ?', 'You are missing Bookmark Url'
        ]), 400
    if not is_url_valid(request.form['url']):
        return jsonify(
            errors=['Dude, what\'s wrong with you ?', 'Invalid Bookmark Url'
                    ]), 400
    bookmark = Bookmark(url=request.form['url'], user=user)
    if bookmark.save():
        return jsonify(bookmark=bookmark.to_dict()), 200
    return jsonify(errors=[bookmark.to_dict()]), 400
Example #6
def parse_args():
    parser = argparse.ArgumentParser(description='A simple python web crawler')
    parser.add_argument('url', metavar='url', type=str, help='Url to crawl.')
    parser.add_argument('--max_page_depth', dest='max_page_depth', type=int, default=5,
                        help='Maximum depth to crawl on a given url.')
    parser.add_argument('--max_external_sites_page_depth', dest='max_external_sites_page_depth', type=int, default=4,
                        help='Maximum external sites depth to crawl on a given url.')
    parser.add_argument('--request_rate_limit', dest='request_rate_limit', type=int, default=4,
                        help='Maximum requests at once.')
    parsed = parser.parse_args()
    if not is_url_valid(parsed.url):
        print('Please enter a url in the following format: http://example.com/optional_query')
    else:
        return parsed
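parse_args returns None when the url argument fails validation, so callers have to handle that case before starting the crawl. A minimal entry point might look like this (the crawler itself is not shown, so the sketch just prints the validated arguments):

import sys

if __name__ == '__main__':
    args = parse_args()
    if args is None:
        sys.exit(1)
    # hand the validated arguments to the crawler entry point (not shown here)
    print(args.url, args.max_page_depth,
          args.max_external_sites_page_depth, args.request_rate_limit)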
Example #7
def check_url_validity(item, errors):
	"""
	Checks url for validity
	"""
	url = item.get("url")
	item_id = item.get("id")
	if url is None:
		return
	for number, single_url in enumerate(url):
		if not utils.is_url_valid(single_url, item):
			errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
				single_url=single_url,
				number=number
			))

		if not utils.is_url_self_served(single_url, item):
			continue

		match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
		if not match:
			errors.add("Self served url [{single_url}] doesn't match SELF_SERVED_URL_REGEXP".format(
				single_url=single_url
			))
			continue
		if (match.group("item_id") != item_id):
			errors.add("Wrong item_id specified in self-served url")
			continue

		single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
		metadata = utils.extract_metadata_from_file(single_filename)
		owner_spec = metadata.get("owner")
		owners = owner_spec.split("+") if owner_spec else []
		if not owners:
			errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
				number=number,
				url=single_url,
				filename=single_filename
			))
			continue
		for owner in owners:
			owner_fullname = config.parser.bookkeepers.get(owner)
			if owner_fullname:
				annotation = item.get("annotation")
				if (
					(not annotation) or
					(owner_fullname not in annotation)
				):
					errors.add("Owner fullname ({owner_fullname}) should be present in annotation".format(
						owner_fullname=owner_fullname
					))
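check_url_validity expects an item mapping and an errors collection that supports .add(), i.e. a set. A minimal driver over a list of items might look like this; the item contents and the report format are purely illustrative:

items = [
    {"id": "some_id", "url": ["https://example.com/some_id.pdf"]},
]
for item in items:
    errors = set()
    check_url_validity(item, errors)
    for error in sorted(errors):
        print("{id}: {error}".format(id=item.get("id"), error=error))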
Example #8
def check_url_validity(item, errors):
	"""
	Checks url for validity
	"""
	url = item.get("url")
	item_id = item.get("id")
	booktype = item.get("booktype")
	if url is None:
		return
	for number, single_url in enumerate(url):
		if not utils.is_url_valid(single_url, item):
			errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
				single_url=single_url,
				number=number
			))

		match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
		if not match:
			continue
		#inproceedings can have self-served url pointing 
		#to entire full proceedings book
		#TODO: invent something like PARTIAL_BOOKTYPES
		if (booktype == "inproceedings"):
			continue
			
		if (match.group("item_id") != item_id):
			errors.add("Wrong item_id specified in self-served url")
			continue
			
		single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
		metadata = utils.extract_metadata_from_file(single_filename)
		owner = metadata.get("owner")
		if owner is None:
			errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
				number=number,
				url=single_url,
				filename=single_filename
			))
			continue
		owner_fullname = config.parser.bookkeepers.get(owner)
		if owner_fullname:
			annotation = item.get("annotation")
			if (
				(not annotation) or 
				(owner_fullname not in annotation)
			):
				errors.add("Owner fullname ({owner_fullname}) should be present in annotation".format(
					owner_fullname=owner_fullname
				))
def validate_url_validity(item, errors):
	"""
	Checks url for validity
	"""
	url = item.get("url")
	item_id = item.get("id")
	if url is None:
		return
	for number, single_url in enumerate(url):
		if not utils.is_url_valid(single_url, item):
			errors.add("Field url with value [{single_url}] and number {number} is wrong".format(
				single_url=single_url,
				number=number
			))

		if not utils.is_url_self_served(single_url, item):
			continue

		match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
		if not match:
			errors.add("Self served url [{single_url}] doesn't match SELF_SERVED_URL_REGEXP".format(
				single_url=single_url
			))
			continue
		if (match.group("item_id") != item_id):
			errors.add("Wrong item_id specified in self-served url")
			continue

		single_filename, single_filesize = utils.get_file_info_from_url(single_url, item)
		metadata = utils.extract_metadata_from_file(single_filename)
		owner_spec = metadata.get("owner")
		owners = owner_spec.split("+") if owner_spec else []
		if not owners:
			errors.add("Owner specification expected for self-served url #{number} (url={url}, filename={filename})".format(
				number=number,
				url=single_url,
				filename=single_filename
			))
			continue
		for owner in owners:
			owner_fullname = config.parser.bookkeepers.get(owner)
			if owner_fullname:
				note = item.get("note")
				if (
					(not note) or
					(owner_fullname not in note)
				):
					errors.add("Owner fullname ({owner_fullname}) should be present in note".format(
						owner_fullname=owner_fullname
					))
Example #10
def check_url_validity(item, errors):
    """
	Checks url for validity
	"""
    url = item.get("url")
    item_id = item.get("id")
    booktype = item.get("booktype")
    if url is None:
        return
    for number, single_url in enumerate(url):
        if not utils.is_url_valid(single_url, item):
            errors.add(
                "Field url with value [{single_url}] and number {number} is wrong"
                .format(single_url=single_url, number=number))

        match = utils.SELF_SERVED_URL_REGEXP.match(single_url)
        if not match:
            continue
        #inproceedings can have self-served url pointing
        #to entire full proceedings book
        #TODO: invent something like PARTIAL_BOOKTYPES
        if (booktype == "inproceedings"):
            continue

        if (match.group("item_id") != item_id):
            errors.add("Wrong item_id specified in self-served url")
            continue

        single_filename, single_filesize = utils.get_file_info_from_url(
            single_url, item)
        metadata = utils.extract_metadata_from_file(single_filename)
        owner = metadata.get("owner")
        if owner is None:
            errors.add(
                "Owner specification expected for self-served url #{number} (url={url}, filename={filename})"
                .format(number=number,
                        url=single_url,
                        filename=single_filename))
            continue
        owner_fullname = config.parser.bookkeepers.get(owner)
        if owner_fullname:
            annotation = item.get("annotation")
            if ((not annotation) or (owner_fullname not in annotation)):
                errors.add(
                    "Owner fullname ({owner_fullname}) should be present in annotation"
                    .format(owner_fullname=owner_fullname))
Example #11
def get_last_date(driver):
    logger.debug('Trying to find last article on page')

    home_page = bs(get_html(driver), 'html.parser')
    articles_tags = home_page.find('ul', {
        'class': 'posts-listing__list'
    }).find_all('article')

    last_index = -1
    while not is_url_valid(URL + articles_tags[last_index].header.a['href']):
        last_index -= 1

    url = URL + articles_tags[last_index].header.a['href']
    logger.debug('Found last article element')

    logger.debug('Fetching last article data from url: [{}]'.format(url))
    article_div = bs(get_content(url), 'html.parser').body.main.find(
        'div', {'class': 'post post-page__article'})
    date = article_div.find('div', {
        'class': 'post-meta__publish-date'
    }).time['datetime']
    logger.debug('Extracted date from last article [{}]'.format(str(date)))

    return date
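get_last_date walks backwards through the article list until it finds a valid URL; if no article on the page has one, that loop eventually raises an IndexError. A bounded variant of the same walk, sketched with the names used in the function above:

# assumes articles_tags, URL and is_url_valid as in get_last_date above
last_index = -1
while -last_index <= len(articles_tags):
    href = URL + articles_tags[last_index].header.a['href']
    if is_url_valid(href):
        break
    last_index -= 1
else:
    raise ValueError('no article with a valid URL found on the page')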
Example #12
    def scrap_profile(self, profile_linkedin_url,
                      profile_known_graduation_date):

        if not is_url_valid(profile_linkedin_url):
            return ScrapingResult('BadFormattedLink')

        # Scraping of the profile may fail due to human check forced by LinkedIn
        try:

            # Delay (seconds) between operations that must wait for the page to finish loading
            loading_pause_time = 2
            loading_scroll_time = 1

            # Opening of the profile page
            self.browser.get(profile_linkedin_url)

            if not str(self.browser.current_url).strip(
            ) == profile_linkedin_url.strip():
                if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                    return ScrapingResult('ProfileUnavailable')
                else:
                    raise HumanCheckException

            # Scraping the Email Address from Contact Info (email)

            # > click on 'Contact info' link on the page
            self.browser.execute_script(
                "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
                "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
            )
            time.sleep(loading_pause_time)

            # > gets email from the 'Contact info' popup
            try:
                email = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-email')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                email = 'N/A'

            # Scraping the Phone from Contact Info (phone)
            try:
                phone = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-phone')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                phone = 'N/A'

            # Scraping the Birthday from Contact Info (birthday)
            try:
                birthday = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-birthday')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )
            except:
                birthday = 'N/A'

            # Scraping the Date Connected from Contact Info (connected date)
            try:
                connectedDate = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-connected')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )

                self.browser.execute_script(
                    "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
                )
            except:
                connectedDate = 'N/A'

            # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
            window_height = self.browser.execute_script(
                "return window.innerHeight")
            scrolls = 1
            while scrolls * window_height < self.browser.execute_script(
                    "return document.body.offsetHeight"):
                self.browser.execute_script(
                    f"window.scrollTo(0, {window_height * scrolls});")
                time.sleep(loading_scroll_time)
                scrolls += 1

            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            # Get all the job positions
            try:
                job_positions = self.browser.find_element_by_id(
                    'experience-section').find_elements_by_tag_name('li')
            except:
                job_positions = []

            #Get all the educations
            try:
                educations = self.browser.find_element_by_id(
                    'education-section').find_elements_by_tag_name('li')
            except:
                educations = []

            # Parsing of the page html structure
            soup = BeautifulSoup(self.browser.page_source, 'lxml')

            # Scraping the Name (using soup)
            try:
                name_div = soup.find('div', {'class': 'flex-1 mr5'})
                name_loc = name_div.find_all('ul')
                headline = name_div.find_all('h2')
                headline = headline[0].get_text().strip()
                profile_name = name_loc[0].find('li').get_text().strip()
                locationNConnection = name_loc[1].find_all('li')
                location = locationNConnection[0].get_text().strip()
                try:
                    connection = locationNConnection[1].find('a').find(
                        'span').get_text().strip()
                except:
                    connection = locationNConnection[1].find(
                        'span').get_text().strip()
            except:
                return ScrapingResult('ERROR IN SCRAPING NAME')

            # Scraping the Desc (using soup)
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('lt-line-clamp__more')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            try:
                if (self.browser.execute_script(
                        "return (els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line').length)"
                )):
                    profile_desc = self.browser.execute_script(
                        "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                    )

                else:
                    profile_desc = self.browser.execute_script(
                        "return (function(){els = document.getElementsByClassName('pv-oc')[0].getElementsByClassName('lt-line-clamp__raw-line');results = [];for (var i=0; i < els.length; i++){results.push(els[i].innerText);}return results;})()"
                    )

            except:
                profile_desc = []

            # print(profile_desc)

            # Parsing skills
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            try:
                skills = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
                )
            except:
                skills = []

            education_list = []
            # Parsing the educations
            if len(educations) > 0:
                # Parse educations to extract the relevant date ranges
                educations_data_ranges = []
                x = 1
                for education in educations:
                    try:
                        # Scraping of the last (hopefully current) Job
                        exp_section = soup.find('section',
                                                {'id': 'education-section'})
                        exp_section = exp_section.find('ul')
                        div_tags = exp_section.contents[x].find('div')
                        a_tags = div_tags.find('a')
                        x += 1

                        # Scraping of the last Job - company_name, job_title
                        try:
                            education_name = a_tags.find(
                                'h3').get_text().strip()

                        except:
                            education_name = None

                        try:
                            education_degree_name = a_tags.find_all(
                                'p')[0].get_text().strip()
                        except:
                            education_degree_name = None

                        try:
                            education_major = a_tags.find_all(
                                'p')[1].get_text().strip()
                        except:
                            education_major = None

                        try:
                            education_year = a_tags.find_all(
                                'p')[2].get_text().strip()
                        except:
                            education_year = None

                            # last_job_company_name = a_tags.find_all('span')[1].get_text().strip()
                            # last_job_title = exp_section.find('ul').find('li').find_all('span')[2].get_text().strip()

                            # spans = exp_section.find('ul').find('li').find_all('span')

                        #last_job_company_name = last_job_company_name.replace('Full-time', '').replace('Part-time', '').strip()

                        # Scraping of last Job - location
                        # last_job_location = Location()
                        # next_span_is_location = False
                        # for span in spans:
                        #     if next_span_is_location:
                        #         last_job_location.parse_string(span.get_text().strip())
                        #         break
                        #     if span.get_text().strip() == 'Location':
                        #         next_span_is_location = True

                        # # Scraping of Industry related to last Job
                        # last_job_company_url = a_tags.get('href')
                        # if last_job_company_url not in self.industries_dict:
                        #     try:
                        #         self.browser.get('https://www.linkedin.com' + last_job_company_url)
                        #         self.industries_dict[last_job_company_url] = self.browser.execute_script(
                        #             "return document.getElementsByClassName("
                        #             "'org-top-card-summary-info-list__info-item')["
                        #             "0].innerText")
                        #     except:
                        #         self.industries_dict[last_job_company_url] = 'N/A'

                        # last_job_company_industry = self.industries_dict[last_job_company_url]
                        education_list.append(
                            Education(education_name=education_name,
                                      degree_name=education_degree_name,
                                      major=education_major,
                                      year=education_year))

                    except:
                        pass

            for x in range(3 - len(educations)):
                education_list.append(
                    Education(education_name=None,
                              degree_name=None,
                              major=None,
                              year=None))

            last_job = []
            # Parsing the job positions
            if len(job_positions) > 0:
                # Parse job positions to extract the relevant date ranges
                job_positions_data_ranges = []
                x = 1
                for job_position in job_positions:
                    # Get the date range of the job position
                    try:
                        date_range_element = job_position.find_element_by_class_name(
                            'pv-entity__date-range')
                        date_range_spans = date_range_element.find_elements_by_tag_name(
                            'span')
                        date_range = date_range_spans[1].text

                        job_positions_data_ranges.append(date_range)

                        # Scraping of the last (hopefully current) Job
                        exp_section = soup.find('section',
                                                {'id': 'experience-section'})
                        exp_section = exp_section.find('ul')
                        div_tags = exp_section.contents[x].find('div')
                        a_tags = div_tags.find('a')
                        x += 1

                        # Scraping of the last Job - company_name, job_title
                        try:
                            last_job_company_name = a_tags.find_all(
                                'p')[1].get_text().strip()
                            last_job_title = a_tags.find(
                                'h3').get_text().strip()

                            spans = a_tags.find_all('span')
                        except:
                            last_job_company_name = a_tags.find_all(
                                'span')[1].get_text().strip()
                            last_job_title = exp_section.find('ul').find(
                                'li').find_all('span')[2].get_text().strip()
                            spans = exp_section.find('ul').find('li').find_all(
                                'span')

                        last_job_company_name = last_job_company_name.replace(
                            'Full-time', '').replace('Part-time', '').strip()

                        # Scraping of last Job - location
                        last_job_location = Location()
                        next_span_is_location = False
                        for span in spans:
                            if next_span_is_location:
                                last_job_location.parse_string(
                                    span.get_text().strip())
                                break
                            if span.get_text().strip() == 'Location':
                                next_span_is_location = True

                        # # Scraping of Industry related to last Job
                        # last_job_company_url = a_tags.get('href')
                        # if last_job_company_url not in self.industries_dict:
                        #     try:
                        #         self.browser.get('https://www.linkedin.com' + last_job_company_url)
                        #         self.industries_dict[last_job_company_url] = self.browser.execute_script(
                        #             "return document.getElementsByClassName("
                        #             "'org-top-card-summary-info-list__info-item')["
                        #             "0].innerText")
                        #     except:
                        #         self.industries_dict[last_job_company_url] = 'N/A'

                        # last_job_company_industry = self.industries_dict[last_job_company_url]

                        last_job.append(
                            Job(
                                position=last_job_title,
                                company=Company(
                                    name=last_job_company_name,
                                    #industry=last_job_company_industry
                                ),
                                location=last_job_location))

                    except:
                        last_job.append(
                            Job(
                                position=None,
                                company=Company(
                                    name=None,
                                    #industry=last_job_company_industry
                                ),
                                location=None))

                for x in range(4 - len(job_positions)):
                    last_job.append(
                        Job(
                            position=None,
                            company=Company(name=None,
                                            #industry=last_job_company_industry
                                            ),
                            location=None))

                print(
                    "profile_name {} \n headline {} \n location {} \n connection {} \n profile_desc {} \n email {} \n phone {} \n birthday {} \n connectedDate {} \n skills {} \n last_job {} \n last_job {} \n last_job {} \n last_job {} \n  education {} \n"
                    .format(profile_name, headline, location, connection,
                            profile_desc, email, phone, birthday,
                            connectedDate, skills, last_job[0], last_job[1],
                            last_job[2], last_job[3], education_list[0]))

                return ScrapingResult(
                    Profile(
                        profile_name, headline, location, connection,
                        connectedDate, phone, birthday, profile_desc, email,
                        skills, last_job,
                        JobHistorySummary(profile_known_graduation_date,
                                          job_positions_data_ranges),
                        education_list))

            else:
                return ScrapingResult(Profile(profile_name, email, skills))

        except HumanCheckException:

            if self.headless_option:
                raise CannotProceedScrapingException

            linkedin_logout(self.browser)

            linkedin_login(self.browser,
                           self.config.get('linkedin', 'username'),
                           self.config.get('linkedin', 'password'))

            while self.browser.current_url != 'https://www.linkedin.com/feed/':
                message_to_user('Please execute manual check', self.config)
                time.sleep(30)

            return self.scrap_profile(profile_linkedin_url,
                                      profile_known_graduation_date)
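The scroll loop in the middle of scrap_profile is the same logic behind the self.load_full_page() call in Examples #1 and #4. Extracted as a standalone helper it might look like this sketch, where browser is any Selenium WebDriver and the pause length is illustrative:

import time

def load_full_page(browser, scroll_pause=1):
    # Scroll one viewport at a time until the whole document has been loaded
    # (LinkedIn loads profile sections asynchronously as you scroll).
    window_height = browser.execute_script("return window.innerHeight")
    scrolls = 1
    while scrolls * window_height < browser.execute_script("return document.body.offsetHeight"):
        browser.execute_script(f"window.scrollTo(0, {window_height * scrolls});")
        time.sleep(scroll_pause)
        scrolls += 1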
Example #13
def get_profile_data(profile_data_line):
    global industries_dict
    # this function supports data as:
    #
    #   https://www.linkedin.com/in/federicohaag ==> parse name, email, last job
    #
    #   https://www.linkedin.com/in/federicohaag:::01/01/1730 ==> parse name, email, last job
    #   and also produces a "job history summary" reporting whether the person was working while studying,
    #   and how quickly she/he got a job after graduation.
    #   The graduation date used is the one passed as a parameter, NOT the one that may appear on LinkedIn

    # Delay (seconds) between operations that must wait for the page to finish loading
    loading_pause_time = 2
    loading_scroll_time = 1

    # Get known graduation date
    known_graduation_date = None
    if len(profile_data_line) == 2:
        known_graduation_date = time.strptime(
            '/'.join(profile_data_line[1].strip().split("/")[1:]), '%m/%y')

    # Get the url of LinkedIn profile
    profile_linkedin_url = profile_data_line[0]
    if not is_url_valid(profile_linkedin_url):
        return ScrapingResult('BadFormattedLink')

    # Opening of the profile page
    browser.get(profile_linkedin_url)

    if browser.current_url != profile_linkedin_url:
        if browser.current_url == 'https://www.linkedin.com/in/unavailable/':
            return ScrapingResult('ProfileUnavailable')
        else:
            raise HumanCheckException

    # Scraping the Email Address from Contact Info (email)

    # > click on 'Contact info' link on the page
    browser.execute_script(
        "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
        "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
    )
    time.sleep(loading_pause_time)

    # > gets email from the 'Contact info' popup
    try:
        email = browser.execute_script(
            "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
            "el = "
            "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
            "'ci-email')){ "
            "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
        )

        browser.execute_script(
            "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
        )
    except:
        email = 'N/A'

    # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
    window_height = browser.execute_script("return window.innerHeight")
    scrolls = 1
    while scrolls * window_height < browser.execute_script(
            "return document.body.offsetHeight"):
        browser.execute_script(
            f"window.scrollTo(0, {window_height * scrolls});")
        time.sleep(loading_scroll_time)
        scrolls += 1

    try:
        browser.execute_script(
            "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
        )
        time.sleep(loading_pause_time)
    except:
        pass

    # Get all the job positions
    try:
        list_of_job_positions = browser.find_element_by_id(
            'experience-section').find_elements_by_tag_name('li')
    except:
        list_of_job_positions = []

    # Get job experiences (two different positions in Company X are considered one job experience)
    try:
        job_experiences = browser.find_elements_by_class_name(
            'pv-profile-section__card-item-v2')
    except:
        job_experiences = []

    # Parsing of the page html structure
    soup = BeautifulSoup(browser.page_source, 'lxml')

    # Scraping the Name (using soup)
    try:
        name_div = soup.find('div', {'class': 'flex-1 mr5'})
        name_loc = name_div.find_all('ul')
        profile_name = name_loc[0].find('li').get_text().strip()
    except:
        return ScrapingResult('ERROR IN SCRAPING NAME')

    # Parsing the job positions
    if len(list_of_job_positions) > 0:

        # Parse job positions to extract the relevant date ranges
        job_positions_data_ranges = []
        for job_position in list_of_job_positions:

            # Get the date range of the job position
            try:
                date_range_element = job_position.find_element_by_class_name(
                    'pv-entity__date-range')
                date_range_spans = date_range_element.find_elements_by_tag_name(
                    'span')
                date_range = date_range_spans[1].text

                job_positions_data_ranges.append(date_range)
            except:
                pass

        # Compute the 'job history' of the profile if the graduation date is provided in profiles_data.txt file
        job_history_summary = compute_job_history_summary(
            known_graduation_date, job_positions_data_ranges, job_experiences)

        # Scraping of the last (hopefully current) Job
        exp_section = soup.find('section', {'id': 'experience-section'})
        exp_section = exp_section.find('ul')
        div_tags = exp_section.find('div')
        a_tags = div_tags.find('a')

        # Scraping of the last (hopefully current) Job - company_name, job_title
        try:
            current_job_company_name = a_tags.find_all(
                'p')[1].get_text().strip()
            current_job_title = a_tags.find('h3').get_text().strip()

            spans = a_tags.find_all('span')
        except:
            current_job_company_name = a_tags.find_all(
                'span')[1].get_text().strip()
            current_job_title = exp_section.find('ul').find('li').find_all(
                'span')[2].get_text().strip()

            spans = exp_section.find('ul').find('li').find_all('span')

        current_job_company_name = current_job_company_name.replace(
            'Full-time', '').replace('Part-time', '').strip()

        # Scraping of last (hopefully current) Job - location
        location = Location()
        next_span_is_location = False
        for span in spans:
            if next_span_is_location:
                location.parse_string(span.get_text().strip())
                break
            if span.get_text().strip() == 'Location':
                next_span_is_location = True

        # Scraping of Industry related to last (hopefully current) Job
        company_url = a_tags.get('href')
        if company_url not in industries_dict:
            try:
                browser.get('https://www.linkedin.com' + company_url)
                industries_dict[company_url] = browser.execute_script(
                    "return document.getElementsByClassName("
                    "'org-top-card-summary-info-list__info-item')[0].innerText"
                )
            except:
                industries_dict[company_url] = 'N/A'

        current_job_company_industry = industries_dict[company_url]

        company = Company(name=current_job_company_name,
                          industry=current_job_company_industry)
        current_job = Job(position=current_job_title,
                          company=company,
                          location=location)
        profile = Profile(profile_name, email, current_job,
                          job_history_summary)

    else:
        profile = Profile(profile_name, email)

    return ScrapingResult(profile)
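The comment at the top of get_profile_data describes the two accepted inputs: a bare profile URL, or a URL followed by ':::' and a graduation date. A sketch of how one line of such an input file could be split into the profile_data_line list and the known graduation date (the ':::' separator comes from that comment; the exact file format is otherwise an assumption):

import time

def parse_profile_line(line):
    profile_data_line = line.strip().split(':::')
    known_graduation_date = None
    if len(profile_data_line) == 2:
        # mirrors get_profile_data: drop the day, keep month/year
        known_graduation_date = time.strptime(
            '/'.join(profile_data_line[1].strip().split("/")[1:]), '%m/%y')
    return profile_data_line, known_graduation_date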
Example #14
def main(
	root=("r", "", "E-library root"),
	check_head=("", False, "Perform HTTP HEAD request to url values")
):
	"""
	Validates bibliography over a bunch of rules
	"""	
	if (len(root) == 0) or (not os.path.isdir(root)):
		print("Root folder is inaccessible")
		sys.exit(1)
		
	root = os.path.abspath(root)
	print("Going to process {0} items".format(len(items)))

	SOURCE_REGEXP = re.compile(r"(?P<basename>[_\-\w\.]+)\.bib:\d+")
	MULTILANG_FILES = {"proceedings-spb", "proceedings-rothenfelser", "_missing", "_problems"}
	VALID_BOOKTYPES = {
		"book",
		"mvbook",
		"inproceedings",
		"proceedings",
		"reference",
		"mvreference",
		"periodical",
		"unpublished",
		"thesis",
		"article"
	}
	NON_MULTIVOLUME_BOOKTYPES = {"article", "periodical"}
	MULTIVOLUME_BOOKTYPES = {"mvbook", "mvreference"}
	
	#don't validate filename for the given entrytypes
	MULTIENTRY_BOOKTYPES = {"proceedings", "inproceedings"}
	SHORTHAND_LIMIT = 25

	#magic constant
	LAST_ORIGINAL_YEAR = 1937
	NON_ORIGINAL_KEYWORDS = {"reissue", "research"}
	RESEARCH_BOOKTYPES = {"book", "mvbook"}
	
	UNPUBLISHED_NOTE_PREFIX = "Unpublished manuscript"

	erroneous_entries = 0
	errors_count = 0
	for item in items:
		errors = []
		#datamodel validation
		author = item.get("author")
		booktype = item.get("booktype")
		booktitle = item.get("booktitle")
		commentator = item.get("commentator")
		edition = item.get("edition")
		filename = item.get("filename")
		id = item.get("id")
		isbn = item.get("isbn")
		institution = item.get("institution")
		journaltitle = item.get("journaltitle")
		keywords = set(item.get("keywords")) if item.get("keywords") else None
		langid = item.get("langid")
		location = item.get("location")
		note = item.get("note")
		number = item.get("number")
		origlanguage = item.get("origlanguage")
		publisher = item.get("publisher")
		series = item.get("series")
		shorthand = item.get("shorthand")
		source = item.get("source")
		title = item.get("title")
		translator = item.get("translator")
		type = item.get("type")
		url = item.get("url")
		volume = item.get("volume")
		volumes = item.get("volumes")
		year = item.get("year")
		year_from = item.get("year_from")
		year_to = item.get("year_to")
		year_circa = item.get("year_circa")
		added_on = item.get("added_on")
		
		match = SOURCE_REGEXP.match(source)
		if not match:
			raise RuntimeError("Failed to parse 'source' for item ({id})".format(
				id=id
			))
		source_basename = match.group("basename")
		
		parser_obligatory = [id, booktype, source, year_from, year_to, year_circa]
		none_checker = lambda obj: obj is not None
		if not all(map(none_checker, parser_obligatory)):
			raise RuntimeError("Parser hasn't generated all required auxiliary fields {fields}".format(
				fields=parser_obligatory
			))
		
		general_obligatory = [langid, year, title, added_on]
		if not all(general_obligatory):
			errors.append("Item doesn't define one of [langid, year, title]")
		
		translation_obligatory = [origlanguage, translator]
		if not utils.all_or_none(translation_obligatory):
			errors.append("All of [origlanguage, translator] must be present for translations")
		
		series_obligatory = [series, number]
		if not utils.all_or_none(series_obligatory):
			errors.append("All of [series, number] must be present for serial books")
		
		if not any([author, shorthand]):
			errors.append("'author' or 'shorthand' must be present")
		
		if (publisher is not None) and (location is None):
			errors.append("If publisher present, location must be present")
		
		#booktype validation
		booktype = booktype.lower()
		if booktype not in VALID_BOOKTYPES:
			errors.append("Invalid booktype ({booktype})".format(
				booktype=booktype
			))
		
		if (booktype not in NON_MULTIVOLUME_BOOKTYPES):
			if (volume is not None) and (volumes is None):
				errors.append("If volume present, volumes must be present")
		
		if (booktype in MULTIVOLUME_BOOKTYPES):
			if volumes is None:
				errors.append("volumes must be present for @{0}".format(booktype))
		
		if (booktype == "article"):
			if journaltitle is None:
				errors.append("journaltitle must be present for @article")
		
		if (booktype == "inproceedings"):
			if booktitle is None:
				errors.append("bootitle must be present for @inprocessing")
		
		if (booktype == "thesis"):
			if url is None:
				errors.append("url must be present for @thesis")
			if type is None:
				errors.append("type must be present for @thesis")
			if institution is None:
				errors.append("institution must be present for @thesis")
		
		#data validation
		#author validation empty
		
		#booktitle validation empty
		
		#commentator
		if commentator is not None:
			if (keywords is None) or ("commentary" not in keywords):
				errors.append("Keywords should contain 'commentary' when commentator specified")
		
		#filename validation
		if edition is not None:
			#edition should be greater than 1
			if edition <= 1:
				errors.append("Wrong edition {edition}".format(
					edition=edition
				))
		
		if volume is not None:
			#volume should be positive integer
			if volume <= 0:
				errors.append("Wrong volume {volume}".format(
					volume=volume
				))
			if volumes is not None:
				if volume > volumes:
					errors.append("Volume ({volume}) can't be greater than volumes ({volumes})".format(
						volume=volume,
						volumes=volumes
					))
		
		#filename validation
		if (filename is not None) and (booktype not in MULTIENTRY_BOOKTYPES):
			for filename_ in filename:
				#filename starts with "/", which would confuse os.path.join
				abspath = os.path.join(root, filename_[1:])
				#each filename should be accessible
				if not os.path.isfile(abspath):
					errors.append("File {filename_} is not accessible".format(
						filename_=filename_
					))
					
				#item should be searchable by its filename metadata
				metadata = utils.extract_metadata_from_file(filename_)
				
				#validating optional author, edition, tome
				#in case when item specifies value, but filename doesn't
				if not utils.all_or_none([metadata.get("author", None), author]):
					errors.append("File {filename_} and entry have different author specifications".format(
						filename_=filename_
					))
					
				if not utils.all_or_none([metadata.get("edition", None), edition]):
					errors.append("File {filename_} and entry have different edition specifications".format(
						filename_=filename_
					))
					
				if not utils.all_or_none([metadata.get("tome", None), any([volume, volumes])]):
					errors.append("File {filename_} and entry have different volume specifications".format(
						filename_=filename_
					))
				
				meta_keywords = metadata.get("keywords", None)
				if meta_keywords is not None:
					if ("incomplete" not in meta_keywords) and (source_basename == "_problems"):
						errors.append("Incomplete books should be stored in _problems.bib")
					meta_keywords.discard("incomplete")
					
					if len(meta_keywords) > 0:
						if keywords is None:
							errors.append("No keywords specified (should be {meta_keywords}".format(
								meta_keywords=meta_keywords
							))
						elif not keywords >= meta_keywords:
							errors.append("Item keywords {keywords} do not match metadata keywords {meta_keywords}".format(
								keywords=keywords,
								meta_keywords=meta_keywords
							))
				
				search_ = utils.create_search_from_metadata(metadata)
				if not search_(item):
					errors.append("File {filename_} is not searchable by extracted params".format(
						filename_=filename_
					))
		
		#id validation
		if len(item_index["id"][id]) != 1:
			errors.append("Id is not unique")
			
		#isbn validation
		if isbn is not None:
			for isbn_ in isbn:
				correct, msg = utils.is_isbn_valid(isbn_)
				if not correct:
					errors.append("ISBN {isbn_} isn't valid: {msg}".format(
						isbn_=isbn_,
						msg=msg
					))
		
		#institution validation empty
		
		#journaltitle validation empty
		
		#keywords validation
		#if item was issued after LAST_ORIGINAL_YEAR, it should define keywords
		if True:
			if (year_from > LAST_ORIGINAL_YEAR) and (booktype in RESEARCH_BOOKTYPES):
				if (keywords is None) or (len(keywords & NON_ORIGINAL_KEYWORDS) == 0):
					errors.append("Item was issued after {last_year}, but keywords don't define any of {keywords}".format(
						last_year=LAST_ORIGINAL_YEAR,
						keywords=NON_ORIGINAL_KEYWORDS
					))
			if (keywords is not None):
				if ("translation" in keywords) and not all([translator, origlanguage]):
					errors.append("When 'translation' keyword specified, translator and origlanguage should be present")
				if ("commentary" in keywords) and not commentator:
					errors.append("When 'commentary' keyword specified, commentator should be present")
				
		#langid validation
		if source_basename not in MULTILANG_FILES:
			source_lang = const.LONG_LANG_MAP[source_basename]
			#item language should match source language
			if langid != source_lang:
				errors.append("Source language ({source_lang}) doesn't match item language ({langid})".format(
					source_lang=source_lang,
					langid=langid
				))
		#location validation empty
		
		#note validation
		note_unpublished = (note is not None) and (note.startswith(UNPUBLISHED_NOTE_PREFIX))
		booktype_unpublished = (booktype == "unpublished")
		if not utils.all_or_none([note_unpublished, booktype_unpublished]):
			errors.append("For unpublished books, note should begin with [{note_prefix}] and booktype should be {booktype}".format(
				booktype="unpublished",
				note_prefix=UNPUBLISHED_NOTE_PREFIX
			))
		
		
		#number validation empty
		
		#origlanguage validation empty
		
		#publisher validation empty
		
		#series validation empty
		
		#shorthand validation
		if shorthand is not None:
			length = len(shorthand)
			if length > SHORTHAND_LIMIT:
				errors.append("The length of shorthand ({length}) should not exceed limit ({limit})".format(
					length=length,
					limit=SHORTHAND_LIMIT
				))
			if (author is None) and (not title.startswith(shorthand)):
				errors.append("Title ({title}) should begin with from shorthand ({shorthand})".format(
					title=title,
					shorthand=shorthand
				))
		
		#source validation empty
		
		#title validation
		if title is not None:
			if ("  " in title):
				errors.append("Consecutive spaces in title")
			if ("\t" in title):
				errors.append("Tabs in title")
			if title.startswith(" ") or title.endswith(" "):
				errors.append("Title isn't stripped")
		
		#translator validation
		if translator is not None:
			if (keywords is None) or ("translation" not in keywords):
				errors.append("Keywords should contain 'translation' when 'translator' field specified")
				
		#type validation empty
		
		#url validation
		if url is not None:
			correct, msg = utils.is_url_valid(url, check_head)
			if not correct:
				errors.append("URL {url} isn't valid: {msg}".format(
					url=url,
					msg=msg
				))
			
		#volume validation empty
		
		#volumes validation empty
		
		#year validation empty
		
		#printing errors
		if len(errors) > 0:
			erroneous_entries += 1
			errors_count += len(errors)
			print("Errors for {id} ({source})".format(
				id=id,
				source=source
			))
			for error in errors:
				print("    " + error)
		
	print("Found {entries} erroneous entries ({errors} errors)".format(
		entries=erroneous_entries,
		errors=errors_count
	))
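main() leans on a utils.all_or_none helper in several of the checks above. Judging only from how it is used (all-or-nothing groups of fields), a minimal sketch of such a helper might be:

def all_or_none(values):
    # True when the values are either all truthy or all falsy,
    # e.g. origlanguage and translator both present, or both absent.
    return all(values) or not any(values)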
Example #15
    def scrap_profile(self, profile_linkedin_url,
                      profile_known_graduation_date):

        if not is_url_valid(profile_linkedin_url):
            return ScrapingResult('BadFormattedLink')

        # Scraping of the profile may fail due to human check forced by LinkedIn
        try:

            # Delay (seconds) between operations that must wait for the page to finish loading
            loading_pause_time = 2
            loading_scroll_time = 1

            # Opening of the profile page
            self.browser.get(profile_linkedin_url)

            if not str(self.browser.current_url).strip(
            ) == profile_linkedin_url.strip():
                if self.browser.current_url == 'https://www.linkedin.com/in/unavailable/':
                    return ScrapingResult('ProfileUnavailable')
                else:
                    raise HumanCheckException

            # Scraping the Email Address from Contact Info (email)

            # > click on 'Contact info' link on the page
            self.browser.execute_script(
                "(function(){try{for(i in document.getElementsByTagName('a')){let el = document.getElementsByTagName('a')[i]; "
                "if(el.innerHTML.includes('Contact info')){el.click();}}}catch(e){}})()"
            )
            time.sleep(loading_pause_time)

            # > gets email from the 'Contact info' popup
            try:
                email = self.browser.execute_script(
                    "return (function(){try{for (i in document.getElementsByClassName('pv-contact-info__contact-type')){ let "
                    "el = "
                    "document.getElementsByClassName('pv-contact-info__contact-type')[i]; if(el.className.includes("
                    "'ci-email')){ "
                    "return el.children[2].children[0].innerText; } }} catch(e){return '';}})()"
                )

                self.browser.execute_script(
                    "document.getElementsByClassName('artdeco-modal__dismiss')[0].click()"
                )
            except:
                email = 'N/A'

            # Loading the entire page (LinkedIn loads content asynchronously based on your scrolling)
            window_height = self.browser.execute_script(
                "return window.innerHeight")
            scrolls = 1
            while scrolls * window_height < self.browser.execute_script(
                    "return document.body.offsetHeight"):
                self.browser.execute_script(
                    f"window.scrollTo(0, {window_height * scrolls});")
                time.sleep(loading_scroll_time)
                scrolls += 1

            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-profile-section__see-more-inline')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            # Get all the job positions
            try:
                job_positions = self.browser\
                    .find_element_by_id('experience-section')\
                    .find_elements_by_tag_name('li')
            except NoSuchElementException:
                print("job_positions is null")
                job_positions = []

            # Get all the education positions
            try:
                education_positions = self.browser\
                    .find_element_by_id('education-section')\
                    .find_elements_by_tag_name('li')
            except NoSuchElementException:
                print("job_positions is null")
                education_positions = []

            # Parsing of the page html structure
            soup = BeautifulSoup(self.browser.page_source, 'lxml')

            # Scraping the Name (using soup)
            try:
                name_div = soup.find('div', {'class': 'flex-1 mr5'})
                name_loc = name_div.find_all('ul')
                profile_name = name_loc[0].find('li').get_text().strip()
            except:
                return ScrapingResult('ERROR IN SCRAPING NAME')

            # Parsing skills
            try:
                self.browser.execute_script(
                    "document.getElementsByClassName('pv-skills-section__additional-skills')[0].click()"
                )
                time.sleep(loading_pause_time)
            except:
                pass

            try:
                skills = self.browser.execute_script(
                    "return (function(){els = document.getElementsByClassName('pv-skill-category-entity');results = [];for (var i=0; i < els.length; i++){results.push(els[i].getElementsByClassName('pv-skill-category-entity__name-text')[0].innerText);}return results;})()"
                )
            except:
                skills = []

            # Parsing the job positions

            if len(job_positions) > 0:
                # Parse job positions to extract the relevant date ranges
                js = self.parsing_jobs(job_positions)
                job_positions_data_ranges = js['job_positions_data_ranges']
                Jobs_array = js['Jobs_array']
                last_job = Jobs_array[0]

                if len(education_positions) > 0:
                    eds = self.parsing_educations(education_positions)

                    return ScrapingResult(
                        Profile(
                            profile_name, email, skills, last_job,
                            JobHistorySummary(profile_known_graduation_date,
                                              job_positions_data_ranges),
                            Jobs_array, eds))

                else:
                    return ScrapingResult(
                        Profile(
                            profile_name, email, skills, last_job,
                            JobHistorySummary(profile_known_graduation_date,
                                              job_positions_data_ranges),
                            Jobs_array))

            else:
                return ScrapingResult(Profile(profile_name, email, skills))

        except HumanCheckException:

            if self.headless_option:
                raise CannotProceedScrapingException

            linkedin_logout(self.browser)

            linkedin_login(self.browser,
                           self.config.get('linkedin', 'username'),
                           self.config.get('linkedin', 'password'))

            while self.browser.current_url != 'https://www.linkedin.com/feed/':
                message_to_user('Please execute manual check', self.config)
                time.sleep(30)

            return self.scrap_profile(profile_linkedin_url,
                                      profile_known_graduation_date)