import json
import re
from os import environ
from traceback import print_exc
from urllib2 import HTTPError
from urlparse import urljoin

from bs4 import BeautifulSoup

# NOTE: project-level helpers and constants used below (parse_args,
# log_to_stderr, log, scrape, download_db, open_db, open_dt, show_tables,
# select_urls, create_table_if_not_exists, SOURCE_DBS, SKIP_TABLES,
# MAX_PROPORTION_FAILURES, etc.) are assumed to be importable from sibling
# modules in this repo; their imports are omitted here.


def main():
    opts = parse_args()
    log_to_stderr(verbose=opts.verbose, quiet=opts.quiet)

    if opts.urls:
        all_urls = opts.urls
    elif environ.get('MORPH_URLS'):
        all_urls = filter(None, environ['MORPH_URLS'].split())
    else:
        all_urls = set()
        for db_name in SOURCE_DBS:
            download_db(db_name)
            db = open_db(db_name)
            for table in show_tables(db):
                if table in SKIP_TABLES:
                    continue
                urls = select_urls(db, table)
                if urls:
                    log.info('read {} urls from {}.{}'.format(
                        len(urls), db_name, table))
                    all_urls.update(urls)

    create_table_if_not_exists('url', with_scraper_id=False)
    dt = open_dt()

    failures = []  # list of (url, exception) tuples

    for i, url in enumerate(sorted(all_urls)):
        log.info('scraping {} ({} of {})'.format(
            url, i + 1, len(all_urls)))

        try:
            html = scrape(url)
            soup = BeautifulSoup(html)

            row = dict(url=url, last_scraped=iso_now())
            row['twitter_handle'] = scrape_twitter_handle(
                soup, required=False)
            row['facebook_url'] = scrape_facebook_url(
                soup, required=False)

            log.debug('`url`: {}'.format(repr(row)))
            dt.upsert(row, 'url')
        except Exception as e:
            failures.append((url, e))
            print_exc()

    # show a summary of failures
    if failures:
        log.warn('Failed to scrape {} of {} URL{}:'.format(
            len(failures), len(all_urls),
            's' if len(failures) != 1 else ''))

        for url, e in failures:
            log.warn(u'  {}: {}'.format(url, repr(e)))

        if len(failures) > len(all_urls) * MAX_PROPORTION_FAILURES:
            raise Exception('too many failures')
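
# iso_now() isn't defined in this section; it's presumably a small shared
# timestamp helper along these lines (a sketch under that assumption, not
# necessarily the actual implementation):

def iso_now():
    """Current UTC time as an ISO 8601 string (assumed helper)."""
    from datetime import datetime
    return datetime.utcnow().isoformat()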
def scrape_industries():
    industry_list = scrape(INDUSTRIES_URL, headers={})
    match = JSON_CALLBACK_RE.search(industry_list)
    industry_json = json.loads(match.group(1))

    return {
        int(i['Industry']['id']): i['Industry']['name']
        for i in industry_json['Industries']
    }
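
# JSON_CALLBACK_RE isn't shown in this section. Since the industries endpoint
# evidently returns JSONP (JSON wrapped in a JavaScript callback), it is
# presumably something like this sketch, where group(1) captures the raw JSON
# payload inside callback( ... ):

JSON_CALLBACK_RE = re.compile(r'^\s*\w+\((.*)\)\s*;?\s*$', re.DOTALL)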
def do_corp(url, industry):
    biz_id = url.split('/')[-1]

    # whitelist of businesses
    if 'MORPH_B_CORP_BIZ_IDS' in environ:
        if biz_id not in environ['MORPH_B_CORP_BIZ_IDS'].split(','):
            return

    log.info('Business page: {}'.format(biz_id))

    try:
        html = scrape(url)
    except HTTPError as e:
        if 'infinite loop' in e.msg:
            log.warn('infinite loop when fetching {}'.format(url))
            return
        elif e.code == 403 and e.geturl() != url:
            log.warn('redirect to bad URL: {}'.format(url))
            return
        else:
            raise

    soup = BeautifulSoup(html)

    c = {}
    # just being in the directory gets you a good judgment
    r = {'judgment': 1, 'company': c, 'url': url}

    # scrape score anyway
    # some pages don't have a score (e.g.
    # http://www.bcorporation.net/community/farm-capital-services-llc-0)
    score_div = soup.find('div', class_='field-name-field-overall-b-score')
    if score_div:
        r['score'] = int(score_div.text)
        r['max_score'] = MAX_SCORE

    c['company'] = soup.select('h1#page-title')[0].text

    # use both industry and category on page (industry is more consistent)
    c['categories'] = [industry]

    # *almost* all businesses have their own category description, but not all
    category_h3s = soup.select('.company-desc-inner h3')
    if category_h3s:
        cat = category_h3s[0].text.strip()
        if cat:
            c['categories'].append(cat)

    # social media
    left_col = soup.select('.two-col.last')[0]
    c['twitter_handle'] = scrape_twitter_handle(left_col, required=False)
    c['facebook_url'] = scrape_facebook_url(left_col, required=False)

    homepage_as = soup.select('.company-desc-inner a')
    if homepage_as:
        c['url'] = homepage_as[0]['href']

    # logo not always available; e.g. on
    # http://www.bcorporation.net/community/atayne-llc
    logo_img = soup.find('img', class_='image-style-company-logo-full')
    if logo_img:
        c['logo_url'] = urljoin(url, logo_img['src'])

    # TODO: add store_url. This is in the lower-right box,
    # but not consistently formatted. Examples:
    # http://www.bcorporation.net/community/one-village-coffee-llc
    # http://www.bcorporation.net/community/feelgoodz-llc

    # turn Company Highlights into claims
    ch_section = soup.find(
        'section', class_='field-name-field-company-highlights')
    if ch_section:
        claims = []
        for strong in ch_section.select('strong'):
            if isinstance(strong.nextSibling, unicode):
                # the colon for the heading isn't inside <strong>
                claims.extend(strong.nextSibling.lstrip(':').split(';'))
            elif strong.nextSibling is None:
                claims.extend(strong.stripped_strings)

        for claim in claims:
            claim = claim.strip()
            if claim:
                yield 'claim', dict(
                    company=c['company'], claim=claim, judgment=1)

    yield 'rating', r
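
# scrape_twitter_handle() and scrape_facebook_url() are shared helpers (also
# used by main() above) whose implementations aren't in this section. A
# minimal sketch of the twitter one, assuming it just scans anchor tags for a
# twitter.com link; the real version likely handles more edge cases:

def scrape_twitter_handle(soup, required=True):
    """Find a twitter handle (e.g. u'@acme') in parsed HTML (sketch)."""
    for a in soup.select('a[href]'):
        m = re.search(r'twitter\.com/(\w+)', a['href'])
        if m:
            return u'@' + m.group(1)
    if required:
        raise ValueError('no twitter handle found')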
def scrape_soup_from_bad_html(url):
    """Discard a bad comment that's frustrating BeautifulSoup."""
    # the offending comment is the first three bytes of the page
    return BeautifulSoup(scrape(url)[3:])
def scrape_rating_page(rating_id):
    url = RATINGS_URL + str(rating_id)
    soup = BeautifulSoup(scrape(url, headers={}), from_encoding='utf-8')

    d = {}
    d['url'] = url

    # handle header field (brand)
    brand = soup.select('.rating-name')[0].text.strip()
    log.info('Rating {}: {}'.format(rating_id, brand))

    # get logo image
    logo_url = None
    brand_logo_img = soup.find('img', alt='brand logo')
    if brand_logo_img and 'src' in brand_logo_img.attrs:
        logo_url = brand_logo_img['src']

    for suffix in SUFFIXES:
        if brand.endswith(suffix):
            brand = brand[:-len(suffix)]
            d.update(SUFFIXES[suffix])
            break

    d['brand'] = brand

    h3_spans = {
        span.text.strip().lower(): span
        for span in soup.select('td h3 span')
    }

    scope_span = h3_spans['scope']
    scope_table = scope_span.find_parent('table')
    scope_tds = scope_table.select('tr td[colspan=3]')

    # handle "Rating applies to these products/ lines" field
    scope = scope_tds[0].text.strip()
    # fix dangling comma on "Woolworths manufactured apparel,"
    scope = scope.rstrip(',')
    if scope in SCOPE_CORRECTIONS:
        d.update(SCOPE_CORRECTIONS[scope])
    elif scope:
        d['scope'] = scope

    # handle "Rating based on assessment of" field
    company = scope_tds[1].text.strip()
    # fix e.g. "Clean Clothes, Inc.: Maggie's Organics"
    if company.endswith(': ' + brand):
        company = company[:-(2 + len(brand))]

    for prefix in COMPANY_PREFIXES:
        if company.startswith(prefix):
            company = company[len(prefix):].rstrip(')')
            d.update(COMPANY_PREFIXES[prefix])
            break

    for suffix in SUFFIXES:
        if company.endswith(suffix):
            company = company[:-len(suffix)]
            d.update(SUFFIXES[suffix])
            break

    # handle empty company field (e.g. Frontier)
    if not company:
        company = brand

    if company in COMPANY_CORRECTIONS:
        d.update(COMPANY_CORRECTIONS[company])
    else:
        d['company'] = company

    # handle "Industries" field
    #
    # in cases where a company is rated, this seems to be attached to
    # the company, not the specific brands, so it's okay to just
    # add this to the rating (whether it's a company or brand rating)
    categories = scope_tds[2].text.strip()
    if categories:
        d['categories'] = [c.strip() for c in categories.split(',')]

    # handle "Date Published" field
    date = to_iso_date(scope_tds[3].text.strip())
    # if no date, guess based on relevant report
    if not date and d.get('categories'):
        for category, year in REPORT_YEARS:
            if category in d['categories']:
                date = str(year)
                break
    if date is not None:
        d['date'] = date

    # handle grades
    gb_span = h3_spans['grade breakdown']
    gb_tr = gb_span.find_parent('tr').find_next_sibling('tr')

    area_to_grade = {}
    for grade_span in gb_tr.select('span.grade_circle'):
        area = grade_span.next_sibling
        # "Overall" is bolded, others are not
        if not isinstance(area, unicode):
            area = area.text
        area = area.lower().strip()
        grade = grade_span.text
        area_to_grade[area] = grade

    d['grade'] = area_to_grade['overall']
    # convert to judgment
    d['judgment'] = grade_to_judgment(d['grade'])

    # attach logo_url to brand or company as appropriate
    if logo_url:
        if 'brand' in d and 'rating_brands' not in d:
            yield 'brand', dict(
                company=d['company'], brand=d['brand'], logo_url=logo_url)
        else:
            yield 'company', dict(company=d['company'], logo_url=logo_url)

    # work out claims
    claims = []

    about_span = h3_spans.get('about this rating')
    if about_span:  # not all companies have this
        about_text = about_span.find_parent(
            'tbody').find_next_sibling('tbody').text

        # about_text looks like "POLICIES: stuff. TRANSPARENCY: more stuff ..."
        # need to convert this to area -> claim
        areas = []
        starts = []
        ends = []

        for m in CLAIM_AREA_RE.finditer(about_text):
            areas.append(m.group(1).lower())
            starts.append(m.start())
            ends.append(m.end())

        for area, start, end in zip(areas, ends, starts[1:] + [-1]):
            area_claim = about_text[start:end]
            for claim in extract_claims(area_claim):
                judgment = judge_claim(claim)
                claims.append(
                    dict(company=company, claim=claim, judgment=judgment))

    # rate company or brands as appropriate
    if 'rating_brands' in d:
        rating_brands = d.pop('rating_brands')
        for rating_brand in rating_brands:
            rating = d.copy()
            rating['brand'] = rating_brand
            yield 'rating', rating

            for claim in claims:
                claim = claim.copy()
                claim['brand'] = rating_brand
                yield 'claim', claim
    else:
        rating = d.copy()
        if 'brand' in rating:
            rating['brands'] = [rating.pop('brand')]
        yield 'rating', rating

        for claim in claims:
            yield 'claim', claim
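
# grade_to_judgment() isn't defined in this section. Free2Work grades are
# letter grades (A+ through F), so presumably it maps the leading letter onto
# this project's judgment scale (1 is good, per do_corp() above), roughly
# like this sketch; the real cutoffs may differ:

def grade_to_judgment(grade):
    """Map a letter grade like u'B+' to 1, 0, or -1 (assumed cutoffs)."""
    letter = grade[0].upper()
    if letter in 'AB':
        return 1
    elif letter == 'C':
        return 0
    else:
        return -1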
def scrape_org_page(org_id, org_name=''):
    company = {}
    rating = {}

    url = PROFILE_URL_FMT.format(org_id)
    rating['url'] = url

    html = scrape(url)
    # skip some HTML comments that confuse BeautifulSoup
    soup = BeautifulSoup(html[100:])

    sections = soup.select('div.legislation-box')

    # rating section
    rating_section = sections[1]

    company_h2 = rating_section.h2
    if company_h2.span.small.text != 'RATING':
        raise ValueError('company section not found')

    company['company'] = company_h2.text[:company_h2.text.index('[')].strip()
    if not company['company']:
        # Nestlé Purina had no name on org page as of 2015-04-30
        company['company'] = org_name

    score = company_h2.span.text.split()[-1]
    if score != 'RATING':  # OSI RESTAURANT PARTNERS has no rating (52300)
        rating['score'] = int(score)
        rating['judgment'] = STYLE_TO_JUDGMENT[company_h2.span['style']]

    website_label = rating_section.find('strong', text='Website:')
    if website_label:  # sometimes missing, like on Invesco (1109)
        url_a = website_label.findNextSibling()
        if url_a.name == 'a':
            company['url'] = fix_url(url_a['href'])

    # feedback section
    feedback_section = sections[2]
    if feedback_section.h2.text != 'Customer Feedback':
        raise ValueError('feedback section not found')

    feedback_url_a = feedback_section.find(
        'strong', text='Website:').findNextSibling()
    if feedback_url_a.name == 'a':
        company['feedback_url'] = fix_url(feedback_url_a['href'])

    feedback_dict = list_to_dict(list(feedback_section.p.stripped_strings))
    if feedback_dict['Phone:'] != 'N/A':
        company['phone'] = feedback_dict['Phone:']
    if feedback_dict['Email:'] != 'N/A':
        company['email'] = feedback_dict['Email:']

    # brands section
    brands_section = sections[3]
    if brands_section.h2.text != 'Brands & Products':
        raise ValueError('brands section not found')

    # when there are no brands, HRC helpfully puts stray template code
    # ('end While') in a second <p>
    company['brands'] = [
        b for b in brands_section.p.stripped_strings if b != 'end While']

    rating['company'] = company

    yield 'rating', rating
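
# list_to_dict() pairs up a flat list of alternating labels and values, as
# produced by stripped_strings on the feedback <p> above. Its implementation
# isn't in this section; presumably something like this sketch:

def list_to_dict(values):
    """Turn [u'Phone:', u'555-0100', u'Email:', ...] into a dict (sketch)."""
    return dict(zip(values[::2], values[1::2]))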