def scrape_brands():
    yield COMPANY

    start_soup = scrape_soup(START_URL)

    urls = [urljoin(START_URL, a['href'])
            for a in start_soup.select('#sNavigation a')
            if a.text.strip().lower() not in SKIP_LINKS]

    for url in urls:
        soup = scrape_soup(url)

        for a in soup.select('.brandCarousel a'):
            # weirdly, brand is only available in the URL fragment
            href = a['href']
            if href.startswith('#'):
                href = href[1:]
            if '|' in href:
                href = href[:href.index('|')]

            # stop at the (r)/(tm)
            for c in R_AND_TM:
                if c in href:
                    href = href[:href.index(c)]

            yield href
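# scrape_soup() is a shared helper used by every scraper in this file but not
# defined here. A minimal sketch of what it presumably does, assuming
# requests + BeautifulSoup; the optional headers parameter is used by
# scrape_rating_ids_for_industry() below:
import requests
from bs4 import BeautifulSoup

def scrape_soup(url, headers=None):
    """Fetch url and parse the response body into a BeautifulSoup tree."""
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()  # fail loudly on 4xx/5xx
    return BeautifulSoup(resp.text, 'html.parser')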
def scrape_brands():
    yield COMPANY

    start_soup = scrape_soup(START_URL)

    urls = [urljoin(START_URL, a['href'])
            for a in start_soup.select('#alphaPaginationContent a')]

    for url in urls:
        if url == START_URL + '#':
            soup = start_soup
        else:
            soup = scrape_soup(url)

        for a in soup.select('td.tableItalic a'):
            brand = a.text.strip()

            for prefix in SHORTEN_BRANDS:
                if brand.startswith(prefix):
                    brand = prefix

            if '/' in brand:
                for part in brand.split('/'):
                    yield part
            else:
                yield brand
def scrape_sectors(known_brands):
    log.info('scraping all sectors')
    soup = scrape_soup(SECTORS_URL)

    for a in soup.select('#sector a'):
        log.info(u'scraping sector: {}'.format(a.text.strip()))
        sector_url = urljoin(SECTORS_URL, a['href'])
        sector_soup = scrape_soup(sector_url)

        urls_seen = set()  # somehow getting same URLs twice

        for a in sector_soup.select('#sector div a'):
            # ignore http://i2.climatecounts.org links
            if not a['href'].startswith('/'):
                continue

            if a['href'] in urls_seen:
                continue
            urls_seen.add(a['href'])

            log.info(u'scraping company: {}'.format(strip_company(a.text)))
            company_url = urljoin(sector_url, a['href'])

            for record in scrape_company(company_url, known_brands):
                yield record
def scrape_brands():
    for brand in EXTRA_BRANDS:
        yield brand

    start_soup = scrape_soup(NOVARTIS_OTC_START_URL)

    urls = [urljoin(NOVARTIS_OTC_START_URL, a['href'])
            for a in start_soup.select('.tabs.statictabs a')]

    for url in urls:
        if url == NOVARTIS_OTC_START_URL:
            soup = start_soup
        else:
            soup = scrape_soup(url)

        for i in soup.select('.panes .text-container i'):
            yield i.text

    alcon_soup = scrape_soup(ALCON_PRODUCTS_URL)

    start_div = [div for div in alcon_soup.select('div.accordionButton')
                 if div.text.lower() == 'over-the-counter'][0]
    otc_div = start_div.find_next_sibling(
        'div', attrs={'class': 'accordionContent'})

    for h4 in otc_div.select('h4'):
        yield h4.text
def scrape_company():
    yield 'company', dict(company=COMPANY, url=COMPANY_URL)

    for brand in MORE_BRANDS:
        yield 'brand', dict(company=COMPANY, brand=brand)

    # get logo for brands
    brands_soup = scrape_soup(BRANDS_URL)

    sb_to_logo_url = {}  # map smunch(brand) to logo_url
    for img in brands_soup.select('#scroller img'):
        sb = smunch(img['alt'])
        sb = SMUNCHED_BRAND_CORRECTIONS.get(sb, sb)
        sb_to_logo_url[sb] = img['src']

    cat_soup = scrape_soup(CATEGORY_URL)

    for a in cat_soup.select('li.active ul li a'):
        cat = a.text
        url = a['href']

        # TODO: match brands with logos
        # treat "French's" as single brand
        # correct "Cillet Bang" -> "Cillit Bang"

        soup = scrape_soup(url)

        for h2 in soup.select('h2'):
            brand = h2.text.strip()

            if brand:
                # special case for French's
                brand_cat = cat  # don't redefine cat
                for kb in KNOWN_BRANDS:
                    if brand.startswith(kb + ' '):
                        sub_cat = brand[len(kb) + 1:]
                        yield 'subcategory', dict(
                            category=cat, subcategory=sub_cat)
                        brand = kb
                        brand_cat = sub_cat
                        break

                yield 'brand', dict(
                    company=COMPANY,
                    brand=brand,
                    category=brand_cat,
                    logo_url=sb_to_logo_url.get(smunch(brand)))
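# smunch() isn't shown here; from its use above, it normalizes a brand name so
# near-identical spellings (case, punctuation, (R)/(TM) marks) map to the same
# lookup key. A plausible sketch, assuming it lowercases and keeps only
# letters and digits:
import re

def smunch(s):
    """Smush a brand name into a canonical lookup key."""
    return re.sub(r'[^a-z0-9]', '', s.lower())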
def scrape_brands():
    yield COMPANY

    for brand in MORE_BRANDS:
        yield brand

    start_soup = scrape_soup(START_URL)

    urls = [a['href'] for a in start_soup.select('div.topmenu a')
            if a['title'].lower() not in SKIP_CATEGORIES]

    for url in urls:
        soup = scrape_soup(url)
        for a in soup.select('div#shopByBrand a'):
            yield a.text
def scrape_claims(url, company, brand, soup=None):
    """Scrape claims from the Sustainability report section of the
    brand page.

    You'll have to add company/brand yourself.
    """
    if soup is None:
        soup = scrape_soup(url)

    claim_url = url + '#detailed-report'

    for section in soup.select('div.brand-report-section'):
        area = section.h4.text.strip()
        if area.startswith('Questions about '):
            area = area[len('Questions about '):]

        for tr in section.select('tr'):
            question = tr.select('td.question')[0].text
            status_img_src = tr.select('td.status img')[0]['src']
            judgment = status_img_src_to_judgment(status_img_src)
            remark = tr.select('td.remark')[0].text

            for claim in extract_claims(remark, company, brand, question):
                yield dict(area=area, question=question, judgment=judgment,
                           claim=claim, company=company, brand=brand,
                           url=claim_url)
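# status_img_src_to_judgment() is defined elsewhere; it maps the status icon's
# src to a numeric judgment. A hypothetical sketch -- the icon filenames below
# are assumptions, not the site's actual ones:
def status_img_src_to_judgment(src):
    filename = src.split('/')[-1].lower()
    if 'tick' in filename:
        return 1  # claim upheld
    elif 'cross' in filename:
        return -1  # claim not upheld
    else:
        return 0  # mixed/unclear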
def scrape_campaign():
    soup = scrape_soup(DIRECTORY_URL)

    c = {
        'campaign': CAMPAIGN,
        'url': CAMPAIGN_URL,
        'goal': GOAL,
        'author': AUTHOR,
    }
    c['copyright'] = scrape_copyright(soup)
    c['facebook_url'] = scrape_facebook_url(soup)
    c['twitter_handle'] = scrape_twitter_handle(soup)

    yield 'campaign', c

    select = soup.find('select', id='edit-field-industry')
    for option in select.select('option'):
        industry = option.get('value')
        if industry:
            industry_url = '{}?{}={}'.format(
                DIRECTORY_URL, select['name'], quote_plus(industry))
            for record in scrape_industry(industry_url, industry):
                yield record
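# scrape_copyright(), scrape_facebook_url() and scrape_twitter_handle() are
# shared helpers used by several scrape_campaign() variants in this file.
# Minimal sketches, assuming conventional page markup; the real helpers
# likely handle more edge cases:
import re

def scrape_facebook_url(soup):
    a = soup.find('a', href=re.compile(r'facebook\.com'))
    return a['href'] if a else None

def scrape_twitter_handle(soup):
    a = soup.find('a', href=re.compile(r'twitter\.com'))
    if a:
        return '@' + a['href'].rstrip('/').split('/')[-1]

def scrape_copyright(soup):
    m = re.search(r'©.*?\d{4}', soup.get_text())
    return m.group(0).strip() if m else None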
def scrape_rating_ids_for_industry(industry_id):
    url = INDUSTRY_URL + str(industry_id)
    # sending "Accept: text/html" leads to a 406, so pass empty headers
    soup = scrape_soup(url, headers={})

    for a in soup.select('.score-card-button a'):
        yield int(a['href'].split('/')[-1])
def scrape_campaign(url=URL):
    log.info('Landing Page')
    soup = scrape_soup(url)

    c = {}  # campaign dict

    c['goal'], c['campaign'] = soup.title.text.split('|')[-2:]
    c['goal'] = c['goal'].capitalize()  # for consistency
    c['url'] = url
    # there isn't a copyright notice on the page!
    c['donate_url'] = urljoin(url, soup.find('a', text='Support us')['href'])
    c['facebook_url'] = scrape_facebook_url(soup)

    th = scrape_twitter_handle(soup)
    c['twitter_handle'] = TWITTER_CORRECTIONS.get(th.lower(), th)

    yield 'campaign', c

    for a in soup.select('ul.sectors a'):
        sector = a.text
        sector_url = urljoin(url, a['href'])
        for record in scrape_sector(sector_url, sector):
            yield record
def scrape_company():
    yield 'company', {'company': COMPANY, 'category': CATEGORY}

    soup = scrape_soup(URL)
    for i in soup.select('#CompanyTxt i'):
        for brand in i.text.split(', '):
            yield 'brand', {'company': COMPANY, 'brand': brand}
def scrape_brands():
    yield COMPANY

    soup = scrape_soup(PRODUCTS_URL)
    for a in soup.select('.ourBrands li a'):
        yield a.text
def scrape_sector(url, sector):
    log.info(u'Sector: {}'.format(sector))
    soup = scrape_soup(url)

    current_li = soup.find('li', class_='current')

    if current_li:
        subsector_as = current_li.select('ul li a')
        if subsector_as:
            for a in subsector_as:
                subsector = a.text
                subsector_url = urljoin(url, a['href'])
                for record in scrape_subsector(
                        subsector_url, [sector, subsector]):
                    yield record
        else:
            # no subsectors
            for record in scrape_subsector(url, [sector], soup=soup):
                yield record
    else:
        # the sector may contain a single brand, or none at all
        if soup.select('div.logobox'):
            # single brand in sector (e.g. T-Mobile in telecom)
            for record in scrape_brand(url, [sector], soup=soup):
                yield record
def scrape_brands():
    for b in EXTRA_BRANDS:
        yield b

    soup = scrape_soup(URL)
    for a in soup.select('#navleft-brand a'):
        yield a.text
def scrape_campaign():
    log.info('scraping Detox Catwalk page')
    soup = scrape_soup(URL)

    yield 'campaign', CAMPAIGN

    for page in soup.select('.page'):
        company = page.select('.headline2')[0].text

        # handle LVMH Group / Christian Dior Couture, which is two separate
        # but entangled companies. Greenpeace isn't wrong to treat them as a
        # single unit, but it makes the data messy.
        if ' / ' in company:
            companies = company.split(' / ')
        else:
            companies = [company]

        for company in companies:
            yield 'company', dict(company=company)
            yield 'category', dict(company=company, category=CATEGORY)

        for b in page.select('b'):
            # look for "Brands Owned"
            m = BRANDS_OWNED_RE.match(b.text)
            if not m:
                continue

            # for LVMH/Christian Dior, there's a separate brand list for
            # each company
            company = m.group('company') or companies[0]

            brands = b.next.next.strip().split(', ')
            for brand in brands:
                # strip irrelevant crud from brand
                brand = BRAND_RE.match(brand).group('brand')
                yield 'brand', dict(company=company, brand=brand)

        # would like to use the correct fragment for each rating
        # (the rest of the url is the same), but the logic for that is
        # buried deep in JS somewhere.

        # in theory, we'd get this from the class of the rating logo, but
        # that's set by JS
        ct = page.select('.ct-table')
        if ct:
            if ct[0].select('.negative'):
                judgment = 0
            else:
                judgment = 1
        else:
            judgment = -1

        yield 'rating', dict(
            company=company,
            judgment=judgment,
            description=JUDGMENT_TO_DESCRIPTION[judgment])
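# BRANDS_OWNED_RE and BRAND_RE are module-level regexes not shown here.
# Illustrative reconstructions, inferred only from how their named groups are
# used above -- the exact patterns are assumptions:
import re

# matches headings like "Brands Owned" or "Brands Owned (LVMH Group)"
BRANDS_OWNED_RE = re.compile(r'^Brands Owned(?:\s*\((?P<company>[^)]+)\))?')

# peels footnote markers and similar crud off the end of a brand name
BRAND_RE = re.compile(r'^(?P<brand>[^*(\[]+)')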
def scrape_brands():
    yield COMPANY

    for brand in EXTRA_BRANDS:
        yield brand

    start_soup = scrape_soup(START_URL)

    urls = [urljoin(START_URL, a['href'])
            for a in start_soup.select('#category-navigation a')
            if a.text.strip().startswith('Global')]

    for url in urls:
        soup = scrape_soup(url)
        for div in soup.select('.list-prods div.product'):
            brand = div.text
            if brand not in LICENSED_BRANDS:
                yield brand
def scrape_brands():
    yield COMPANY

    soup = scrape_soup(URL)
    for div in soup.select('div.gray-container'):
        category = div.h2.text
        for a in div.select('.views-field-title a'):
            yield {'brand': a.text, 'category': category}
def scrape_twitter_handle_from_nudge_url(url):
    soup = scrape_soup(url)
    twitter_p = soup.select('#email_tpl div p')[0]

    # a paragraph starting with "Unfortunately" means there's no handle
    if twitter_p.text.startswith('Unfortunately'):
        return

    for word in twitter_p.text.split():
        if word.startswith('@'):
            return word
def scrape_product_types():
    log.info('scraping product types')
    soup = scrape_soup(PRODUCT_TYPES_URL)

    for a in soup.select('#search_results_results a'):
        cat = a.text
        cat_url = urljoin(PRODUCT_TYPES_URL, a['href'])

        log.info(u'scraping category: {}'.format(cat))
        cat_soup = scrape_soup(cat_url)

        for company, brand, sector in scrape_brand_results(cat_soup):
            if '-' in sector:  # e.g. "Beer-Beverages"
                parent_sector, sector = sector.split('-', 1)
                yield 'subcategory', dict(
                    category=parent_sector, subcategory=sector)

            yield 'subcategory', dict(category=sector, subcategory=cat)
            yield 'category', dict(company=company, brand=brand, category=cat)
def scrape_brands():
    yield COMPANY

    soup = scrape_soup(URL)
    for a in soup.select('#hsb_shop_bl_container li ul li a'):
        brand = a.text
        if brand in LICENSED_BRANDS:
            yield dict(brand=brand, is_licensed=True)
        else:
            yield brand
def scrape_brands():
    start_soup = scrape_soup(START_URL)

    urls = [urljoin(START_URL, section_a['href'])
            for section_a in start_soup.select('#subnav a')
            if section_a.text.strip() not in SKIP_SECTIONS]

    for url in urls:
        soup = scrape_soup(url)

        for a in soup.select('.productrow h4 a'):
            brand = a.text.strip()
            if '/' in brand:
                for part in brand.split('/'):
                    yield part.strip()
            elif brand in DESCRIPTION_TO_BRANDS:
                # "Monsters" is a family of brands
                for real_brand in DESCRIPTION_TO_BRANDS[brand]:
                    yield real_brand
            else:
                yield brand
def scrape_brands():
    for lb in LICENSED_BRANDS:
        yield {"brand": lb, "is_licensed": True}

    for fb in FORMER_BRANDS:
        yield {"brand": fb, "is_former": True}

    soup = scrape_soup(URL)
    for a in soup.select("#contentTwo a"):
        yield {"brand": a.text,
               "url": a["href"],
               "categories": list(scrape_categories(a["href"]))}
def scrape_brands():
    yield COMPANY

    soup = scrape_soup(URL)
    for div in soup.select('div.brand'):
        yield {
            'brand': div.img['alt'],
            # "joint venture" brands don't belong to PepsiCo (e.g. Starbucks)
            'is_licensed': any(
                jv_text in div.p.text for jv_text in JOINT_VENTURES_TEXT),
        }
def scrape_landing_page():
    d = {}
    d['campaign'] = CAMPAIGN

    soup = scrape_soup(URL)

    d['cats'] = options_to_dict(
        soup.select('select[name=category] option'))
    d['orgs'] = options_to_dict(
        soup.select('select[name=orgid] option'))

    return d
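# options_to_dict() is assumed to turn a list of <option> tags into a mapping.
# A minimal sketch, guessing that it keys each option's value attribute to its
# visible label and skips the empty placeholder option:
def options_to_dict(options):
    return {opt['value']: opt.text.strip()
            for opt in options
            if opt.get('value')}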
def scrape_brands():
    yield COMPANY

    for brand in EXTRA_BRANDS:
        yield brand

    start_soup = scrape_soup(START_URL)

    urls = [urljoin(START_URL, a['href'])
            for a in start_soup.select('#nav_secondary a')
            if a.text.strip().lower() not in SKIP_CATEGORIES]

    for url in urls:
        soup = scrape_soup(url)

        for item in soup.select('.product-list-item'):
            brand = item.text

            # leave out generic names of drugs
            if ' (' in brand:
                brand = brand[:brand.index(' (')]

            if brand.strip().lower() in NON_BRANDS:
                continue

            yield brand
def scrape_campaign():
    log.info('Solutions page')
    solutions_soup = scrape_soup(SOLUTIONS_URL)

    scorecard_a = solutions_soup.find('a', text=HOW_SCORED_RE)
    campaign_url = urljoin(SOLUTIONS_URL, scorecard_a['href'])

    log.info('Campaign page')
    campaign_soup = scrape_soup(campaign_url)

    campaign = {'url': campaign_url}
    campaign.update(CAMPAIGN)
    yield 'campaign', campaign

    # you have to click twice to see how the companies scored
    scores_a = campaign_soup.find(
        'div', class_='right-column').find('a', text=HOW_SCORED_RE)
    scores_url = urljoin(campaign_url, scores_a['href'])

    log.info('Scores page')
    scores_soup = scrape_soup(scores_url)

    category_as = scores_soup.select('div.right-column a')
    if not category_as:
        raise ValueError("Can't find links to actual scores.")

    for category_a in category_as:
        m = SEE_SCORES_RE.match(category_a.text)
        if m:
            category = m.group(1)
            category_url = urljoin(scores_url, category_a['href'])
            for record in scrape_category(category_url, category):
                yield record
def scrape_landing_page():
    d = {}

    soup = scrape_soup(URL)

    d['signatories_url'] = soup.find('a', text='Signatories')['href']

    d['campaign'] = CAMPAIGN
    d['campaign']['copyright'] = scrape_copyright(soup)
    d['campaign']['twitter_handle'] = scrape_twitter_handle(soup)
    # doesn't accept donations; the whole point is that the garment
    # companies pay

    return d
def scrape_campaign():
    soup = scrape_soup(URL)

    # campaign record
    c = {'url': URL, 'goal': GOAL}
    c['campaign'], c['author'] = soup.title.text.split('|')
    # remove double spaces
    c['copyright'] = ' '.join(
        soup.select('#footer ul.privacy')[0].li.stripped_strings)
    c['twitter_handle'] = scrape_twitter_handle(soup)
    # TODO: make a method for scraping facebook URLs
    c['facebook_url'] = soup.select('a.facebook')[0]['href']
    c['donate_url'] = urljoin(URL, soup.select('a.donate')[0]['href'])

    yield 'campaign', c

    # rating records
    trs = soup.table.find_all('tr')
    num_ranked = len(trs)

    for tr in trs:
        header_match = HEADER_RE.match(tr.h2.text.strip())
        company_in_caps, score, max_score = header_match.groups()
        score = Decimal(score)
        max_score = int(max_score)
        judgment = score_to_judgment(score)

        rank = int(IMG_RE.match(tr.img['alt'].strip()).group(1))

        # get company name not in ALL CAPS
        company = REPORT_CARD_RE.match(tr.a.text.strip()).group(1)
        if company.upper() != company_in_caps.upper():
            raise ValueError(
                u"Non-matching company name: {}".format(company))

        yield 'rating', {
            'company': company,
            'score': score,
            'max_score': max_score,
            'rank': rank,
            'num_ranked': num_ranked,
            'judgment': judgment,
            'categories': [CATEGORY],
        }
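# score_to_judgment() is defined elsewhere; it buckets a score into the
# -1/0/1 judgment scale used throughout these scrapers. An illustrative
# sketch -- the cutoffs below are invented, not the campaign's actual
# thresholds:
def score_to_judgment(score):
    if score >= 70:
        return 1
    elif score >= 40:
        return 0
    else:
        return -1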
def scrape_category(url, category):
    log.info('{} page'.format(category))
    soup = scrape_soup(url)

    for tr in soup.select('div.main-column tbody tr'):
        score_td, company_td, country_td = tr.select('td')

        c = {'category': category}  # company
        r = {'company': c, 'max_score': MAX_SCORE}  # rating

        r['score'] = float(score_td.text)
        color = COLOR_RE.search(score_td['style']).group(0)
        r['judgment'] = color_to_judgment(color)

        company = company_td.text
        m = COMPANY_PARENS_RE.match(company)
        if m:
            # stuff in parentheses... it can mean so much!
            company, aside = m.groups()
            if aside.strip() == 'Subway':
                c['company'] = aside
                c['parent_company'] = company
            elif aside.startswith('prev.'):
                c['company'] = company
            elif company == 'Aldi':
                c['company'] = company + ' ' + aside
            elif aside.startswith('UK'):
                c['company'] = company
                r['scope'] = aside
            elif aside == 'Global':
                c['company'] = company
            elif ' of ' in aside:
                # e.g. "division of", "subsidiary of"
                c['company'] = company
                c['parent_company'] = aside[aside.index(' of ') + 4:]
            else:
                c['company'] = company
                c['parent_company'] = aside
        elif '/' in company:
            company, brand = company.split('/', 1)
            c['company'] = company
            c['brands'] = [brand]
        else:
            c['company'] = company

        c['hq_country'] = country_td.text

        yield 'rating', r
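# COLOR_RE and color_to_judgment() aren't defined here. COLOR_RE evidently
# pulls a CSS color out of the score cell's inline style, and
# color_to_judgment() maps the traffic-light color to a judgment. Sketches
# under those assumptions (the hex values are placeholders, not the table's
# actual colors):
import re

COLOR_RE = re.compile(r'#(?:[0-9a-fA-F]{6}|[0-9a-fA-F]{3})')

def color_to_judgment(color):
    return {
        '#00b050': 1,   # green
        '#ffc000': 0,   # amber
        '#ff0000': -1,  # red
    }.get(color.lower(), 0)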
def scrape_brands():
    soup = scrape_soup(URL)

    yield COMPANY

    for tr in soup.select('#primary table tbody tr'):
        td = tr.td  # pick first td
        if td:
            td_brands = SEPARATOR_RE.split(td.text)
            for brand in td_brands:
                if ' (' in brand:
                    brand = brand[:brand.index(' (')]

                if brand.strip().lower() in OTC_BRANDS:
                    yield brand
                else:
                    yield dict(brand=brand, is_prescription=True)