import re
import urllib
import urlparse
from datetime import datetime

import requests
from lxml import html


def scrape_image(name, url, csv, image_credit=''):
    url = url.strip()
    if not url:
        return
    print "scraping %s" % name

    parts = urlparse.urlparse(url)
    if not parts.scheme:
        # assume plain http when no scheme is given
        url = 'http://%s' % url
        parts = urlparse.urlparse(url)

    resp = requests.get(url)
    # strip any parameters (e.g. '; charset=...') from the content type
    mime_type = resp.headers['content-type'].split(';')[0].strip()
    if not mime_type.startswith('image/'):
        # not a direct image link: dispatch to a site-specific scraper
        scrape_func = VALID_ENDPOINTS.get(parts.netloc, None)
        if scrape_func is None:
            raise ScraperException("Cannot scrape image from %s" % parts.netloc)
        image_url = scrape_func(resp)
        url = make_abs_url(url, image_url)
    elif '.%s' % mime_type.split('/')[1] not in ACCEPTED_IMAGE_EXTENSIONS:
        raise ScraperException("Unsupported image format at %s" % url)

    csv.write('profileimages.csv', {
        'name': name,
        'image_url': url,
        'image_credit': image_credit,
    })

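# Note that the `csv` argument above is not the stdlib module: it is some
# writer-like object exposing write(filename, row_dict). A minimal sketch of
# such a sink, assuming utf-8 rows keyed by column name (CsvSink is
# hypothetical, not part of the original scraper):
import csv as csv_module


class CsvSink(object):
    def __init__(self):
        # one DictWriter per output file; files stay open for the process
        self._writers = {}

    def write(self, filename, row):
        if filename not in self._writers:
            f = open(filename, 'wb')
            writer = csv_module.DictWriter(f, fieldnames=sorted(row.keys()))
            writer.writeheader()
            self._writers[filename] = writer
        self._writers[filename].writerow(row)
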
def get_absolute_url(url):
    if not url.startswith(ENDPOINT_URL):
        parts = urlparse.urlparse(url)
        if parts.scheme == 'https':
            raise ScraperException("Who's Who does not accept https connections")
        elif parts.netloc:
            raise ScraperException("'%s' is not a Who's Who URL" % url)
        # re-assemble a relative link against the canonical host
        return urlparse.urlunparse(['http', 'whoswho.co.za'] + list(parts[2:]))
    return url

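# get_absolute_url() and scrape_image() lean on module-level constants
# defined elsewhere in the scraper; plausible values (assumptions, not the
# original definitions) would be:
#
#   ENDPOINT_URL = 'http://whoswho.co.za/'
#   ACCEPTED_IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']
#
# Under that ENDPOINT_URL, the behaviour is:
#
#   get_absolute_url('/south-africa/jane-doe-12345')
#   -> 'http://whoswho.co.za/south-africa/jane-doe-12345'
#   get_absolute_url('https://whoswho.co.za/...')
#   -> raises ScraperException (https is rejected)
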
def _scrape_from_parliament(resp):
    root = html.fromstring(resp.text)
    # selects the first image after the brownHeading el
    # NOTE: should we rather not scrape this at all?
    img_el = root.xpath("//*[@class='brownHeading']//following::*//img")
    if len(img_el) > 0:
        return img_el[0].get('src')
    raise ScraperException("Image not found at %s" % resp.url)

def _scrape_from_whoswho(resp):
    root = html.fromstring(resp.text)
    pic_el = root.get_element_by_id('profile-pic', None)
    if pic_el is not None:
        pic_el = pic_el.xpath('a[1]/img')
        if pic_el:
            return pic_el[0].get('src')
    raise ScraperException("Image not found at %s" % resp.url)

def _scrape_from_pa(resp):
    root = html.fromstring(resp.text)
    pic_el = root.find_class('profile-pic')
    if pic_el:
        pic_el = pic_el[0].xpath('img[1]')
        if pic_el:
            return pic_el[0].get('src')
    raise ScraperException("Image not found at %s" % resp.url)

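# The three scrapers above each expect a requests-style response whose
# `.text` matches one site's markup. A quick illustrative check for the
# People's Assembly shape (FakeResp and the HTML are invented for the
# example, not taken from the live site):
def _example_pa_scrape():
    class FakeResp(object):
        url = 'http://www.pa.org.za/person/example'
        text = '<div class="profile-pic"><img src="/media/example.jpg"/></div>'
    return _scrape_from_pa(FakeResp())  # -> '/media/example.jpg'
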
def _scrape_from_wikipedia(resp):
    # direct links to a file page carry the File: name in the URL itself
    match = re.search(r'(#mediaviewer|wiki)/(?P<filename>File:.*)$', resp.url)
    if match:
        filename = match.group('filename')
    else:
        # otherwise, pull the infobox (vcard) image link out of the article
        root = html.fromstring(resp.text)
        vcard_el = root.find_class('vcard')
        if not vcard_el:
            raise ScraperException("Image not found at %s" % resp.url)
        image_el = vcard_el[0].find_class('image')
        if not image_el:
            raise ScraperException("Image not found at %s" % resp.url)
        filename = image_el[0].get('href')
        filename = filename[filename.index('File:'):]

    # use the Wikimedia API to get the file url at a reasonable size
    params = WIKI_PARAMS.copy()
    params['titles'] = filename.replace('_', ' ')
    response = requests.get(WIKI_ENDPOINT_URL, params=params)
    data = response.json()['query']
    if 'pages' not in data or len(data['pages']) == 0:
        raise ScraperException("Image not found at %s" % resp.url)
    return data['pages'].values()[0]['imageinfo'][0]['thumburl']

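# WIKI_ENDPOINT_URL and WIKI_PARAMS are defined elsewhere; for the code
# above to get a `thumburl` back, they would need to look roughly like this
# (a plausible reconstruction of a Wikimedia imageinfo query, not the
# scraper's actual values; the thumbnail width is a guess):
#
#   WIKI_ENDPOINT_URL = 'http://en.wikipedia.org/w/api.php'
#   WIKI_PARAMS = {
#       'action': 'query',
#       'format': 'json',
#       'prop': 'imageinfo',
#       'iiprop': 'url',
#       'iiurlwidth': 250,  # thumburl is only returned when a width is set
#   }
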
def _scrape_from_google(resp):
    parts = urlparse.urlparse(resp.url)
    if parts.path == '/imgres':
        # old-style image result page: the target url is in the query string
        params = dict(urlparse.parse_qsl(parts.query, True))
        if 'imgurl' in params:
            return params['imgurl']
    elif parts.path == '/search':
        if parts.fragment.strip() != '':
            # image search keeps its state in the url fragment; the imgrc
            # value is a ';'-separated blob with the image url 4th from the end
            params = dict(urlparse.parse_qsl(parts.fragment, True))
            if 'imgrc' in params:
                url = urllib.unquote(params['imgrc']).split(';')[-4]
                return urllib.unquote(url)
    raise ScraperException("Image not found at %s" % resp.url)

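# scrape_image() dispatches on the netloc of the submitted URL via
# VALID_ENDPOINTS. The real mapping lives elsewhere in the scraper; a sketch
# consistent with the functions above (the hostnames are assumptions):
#
#   VALID_ENDPOINTS = {
#       'www.parliament.gov.za': _scrape_from_parliament,
#       'whoswho.co.za': _scrape_from_whoswho,
#       'www.pa.org.za': _scrape_from_pa,
#       'en.wikipedia.org': _scrape_from_wikipedia,
#       'www.google.com': _scrape_from_google,
#   }
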
def parse_content(content):
    # TODO: achievements sections
    data = {
        'related_profiles': [],
        'professional_details': [],
        'activities': [],
        'education': [],
    }
    root = html.fromstring(content)

    # basic info
    basic_el = root.xpath("//*[@itemtype='http://schema.org/Person'][1]")
    if len(basic_el) == 0:
        raise ScraperException("Content doesn't appear to be a person's profile")
    basic_el = basic_el[0]
    display_name = first_or_empty(basic_el.xpath("*[@itemprop='name'][1]/text()"))
    full_name = first_or_empty(basic_el.xpath(
        "*[@itemprop='name']/following-sibling::p[1]/em/text()"))
    job_title = first_or_empty(basic_el.xpath("*[@itemprop='jobTitle'][1]/text()"))
    bio = first_or_empty(basic_el.xpath(
        "*[@id='contact_info']/preceding-sibling::p[1]/text()"))
    data['basic_info'] = {
        'display_name': display_name,
        'full_name': full_name,
        'job_title': job_title,
        'bio': bio,
    }

    # date of birth
    birth_node = basic_el.xpath("p[contains(., 'Born')][1]")
    if birth_node:
        birth_node = birth_node[0]
        birth_date = birth_node.xpath('a[1]/text()')
        if birth_date:
            data['basic_info']['birth_date'] = datetime.strptime(
                birth_date[0], DATE_FORMAT)
        birth_town = birth_node.find_class('locality')
        if birth_town:
            if birth_town[0].xpath('a'):
                data['basic_info']['birth_town'] = \
                    birth_town[0].xpath('a[1]/text()')[0]
            elif birth_town[0].text:
                text = birth_town[0].text.strip()
                if text.startswith('in '):
                    text = text[3:]
                data['basic_info']['birth_town'] = text
        birth_country = birth_node.xpath("*[@itemprop='nationality'][1]/text()")
        if birth_country:
            data['basic_info']['country'] = birth_country[0]

    # professional info
    prof_el = root.get_element_by_id('professional-details', None)
    if prof_el is not None:
        # roles sit under two h2 headings: current roles first, then past ones
        current = None
        for el in prof_el:
            if el.tag == 'h2':
                if current is None:
                    current = True
                else:
                    current = False
            elif el.tag == 'div' and current is not None and \
                    not el.get('class', ''):
                role_parts = [
                    s.strip() for s in
                    el.xpath('h6/br/preceding-sibling::text()[1]')[0].split('|')
                    if s.strip() != '']
                date_parts = [
                    s.strip() for s in
                    el.xpath('h6/br/following-sibling::text()[1]')[0].split('|')
                    if s.strip() != '']
                role_data = {
                    'role_name': first_or_empty(role_parts),
                    'status': 'active' if current else 'inactive',
                }
                # get start and end year
                if date_parts:
                    date_parts = R_YEAR_RANGE.match(date_parts[-1])
                    if date_parts:
                        role_data['role_start_year'] = int(date_parts.group('start'))
                        if date_parts.group('current'):
                            assert current
                        elif date_parts.group('end'):
                            role_data['role_end_year'] = int(date_parts.group('end'))
                        elif not current:
                            # a single year was given for a past role
                            role_data['role_end_year'] = role_data['role_start_year']
                # get organization info
                org_el = el.xpath('h6/a[last()]')
                if len(org_el) > 0:
                    org_el = org_el[0]
                    role_data['organization_name'] = org_el.text
                    role_data['organization_url'] = org_el.get('href', None)
                    if role_data['organization_url']:
                        role_data['organization_url'] = '%s%s' % (
                            ENDPOINT_URL.rstrip('/'),
                            role_data['organization_url'])
                # the organization doesn't have a url;
                # use the 2nd-last piece of plain text
                elif len(role_parts) > 2:
                    role_data['organization_name'] = role_parts[-2]
                else:
                    continue
                data['professional_details'].append(role_data)

    # education info
    edu_el = root.get_element_by_id('education', None)
    if edu_el is not None:
        level = None
        for el in edu_el.xpath("h1[1]/following-sibling::node()"):
            if not isinstance(el, html.HtmlElement):
                continue
            if el.tag == 'h2':
                level = el.text.lower()
                continue
            elif el.tag != 'div' or el.get('class', None) == 'clear':
                continue

            # parse secondary education (single line)
            if level == 'secondary':
                org_parts = el.xpath('h6[1]/text()')[0]
                org_parts = [s.strip() for s in org_parts.split(',')]
                place = ', '.join(org_parts[1:])
                edu_data = {
                    'organization_name': org_parts[0],
                    'level': level,
                    'place': place,
                }
                match = re.match(r'.*(?P<year>\d{4})$', place)
                if match:
                    edu_data['year_awarded'] = int(match.group('year'))
                    edu_data['status'] = 'complete'
                    # drop the trailing year from the place string
                    edu_data['place'] = place[:-7]
            # parse tertiary education (complex tags)
            elif level == 'tertiary':
                edu_data = {'level': level}
                org_name = el.xpath('h6[1]/a')
                if org_name:
                    edu_data['organization_name'] = org_name[0].text
                else:
                    org_name = el.xpath('h6[1]/text()')
                    if org_name:
                        edu_data['organization_name'] = org_name[0]
                try:
                    date_parts = el.xpath('p[1]/text()')[0] \
                        .split('|')[-1] \
                        .strip()
                except IndexError:
                    continue
                if date_parts.startswith('Awarded in ') or \
                        date_parts.startswith('Completed '):
                    edu_data['year_awarded'] = int(date_parts[-4:])
                    if date_parts.startswith('Completed '):
                        edu_data['status'] = 'complete'
                    qualification = el.xpath('p[2]/text()')
                    if qualification:
                        edu_data['qualification'] = qualification[0]
                else:
                    date_parts = R_YEAR_RANGE.match(date_parts)
                    if date_parts:
                        qualification = el.xpath('p[2]/text()')
                        if qualification:
                            edu_data['qualification'] = qualification[0]
                        edu_data['start_year'] = int(date_parts.group('start'))
                        if date_parts.group('current'):
                            edu_data['status'] = 'in progress'
                        elif date_parts.group('end'):
                            edu_data['status'] = 'complete'
                            edu_data['year_awarded'] = int(date_parts.group('end'))
                    else:
                        edu_data['qualification'] = date_parts
            else:
                # unknown level: skip rather than appending stale edu_data
                continue
            data['education'].append(edu_data)

    # activities info
    activity_el = root.get_element_by_id('activities', None)
    if activity_el is not None:
        # only doing memberships
        for el in activity_el.xpath(
                "h2[.='Memberships']/following-sibling::node()"):
            if not isinstance(el, html.HtmlElement):
                continue
            if el.tag != 'div' or el.get('class', None) == 'clear':
                break
            org_name = el.xpath('h6[1]/text()')[0]
            role_data = {'organization_name': org_name}
            role_parts = el.xpath('p[1]/em')[0].text
            if role_parts:
                role_parts = role_parts.split(',')
                role_data['role_name'] = role_parts[0].strip()
                if len(role_parts) == 2:
                    date_parts = R_YEAR_RANGE.match(role_parts[1].strip())
                    if date_parts:
                        role_data['role_start_year'] = int(date_parts.group('start'))
                        if date_parts.group('current'):
                            role_data['status'] = 'active'
                        elif date_parts.group('end'):
                            role_data['status'] = 'inactive'
                            role_data['role_end_year'] = int(date_parts.group('end'))
            data['activities'].append(role_data)

    # related profile info
    related_el = root.get_element_by_id('related', None)
    if related_el is not None:
        for el in related_el.find_class('item'):
            a_el = el.xpath('a')[0]
            related_data = {'url': a_el.get('href')}
            img_el = a_el.xpath('img')
            if len(img_el) > 0:
                img_el = img_el[0]
                related_data['image_url'] = img_el.get('src')
                related_data['title'] = img_el.get('title')
            data['related_profiles'].append(related_data)

    return data

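# R_YEAR_RANGE (defined elsewhere) must expose named groups 'start',
# 'current' and 'end', matching strings like '2009', '2009 - 2014' or
# '2009 - Present'. A plausible reconstruction (an assumption, not the
# scraper's actual pattern):
#
#   R_YEAR_RANGE = re.compile(
#       r'^(?P<start>\d{4})'
#       r'(?:\s*-\s*(?:(?P<current>[Pp]resent)|(?P<end>\d{4})))?$')
#
# Typical use of parse_content is to fetch a profile page and feed it the
# raw HTML, e.g. (the profile path is invented):
#
#   resp = requests.get(get_absolute_url('/south-africa/jane-doe-12345'))
#   profile = parse_content(resp.text)
#   print profile['basic_info']['display_name']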