Python scrape_soup Examples, srs.scrape.scrape_soup Python Examples

Example #1

0

Show file

File: nestle.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, then one brand per brand-carousel link.

    Visits every ``#sNavigation`` link on START_URL whose stripped,
    lowercased text is not in SKIP_LINKS, and derives a brand from the
    ``href`` of each ``.brandCarousel a`` element found there.
    """
    yield COMPANY

    start_soup = scrape_soup(START_URL)

    section_urls = []
    for nav_link in start_soup.select('#sNavigation a'):
        if nav_link.text.strip().lower() in SKIP_LINKS:
            continue
        section_urls.append(urljoin(START_URL, nav_link['href']))

    for section_url in section_urls:
        section_soup = scrape_soup(section_url)

        for carousel_link in section_soup.select('.brandCarousel a'):
            # the brand name only appears in the URL fragment
            brand = carousel_link['href']
            if brand.startswith('#'):
                brand = brand[1:]

            # drop everything from the first '|' onward
            bar_at = brand.find('|')
            if bar_at != -1:
                brand = brand[:bar_at]

            # cut at each (r)/(tm) marker that is present
            for marker in R_AND_TM:
                marker_at = brand.find(marker)
                if marker_at != -1:
                    brand = brand[:marker_at]

            yield brand

Example #2

0

Show file

File: gsk.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, then brand names from each alphabetical page.

    Brand text that starts with an entry of SHORTEN_BRANDS is replaced
    by that entry; names containing '/' are yielded as separate parts.
    """
    yield COMPANY

    start_soup = scrape_soup(START_URL)

    page_urls = []
    for page_link in start_soup.select('#alphaPaginationContent a'):
        page_urls.append(urljoin(START_URL, page_link['href']))

    for page_url in page_urls:
        # the '#' link is the start page itself; reuse the parsed soup
        page_soup = (start_soup if page_url == START_URL + '#'
                     else scrape_soup(page_url))

        for brand_link in page_soup.select('td.tableItalic a'):
            name = brand_link.text.strip()
            for prefix in SHORTEN_BRANDS:
                if name.startswith(prefix):
                    name = prefix

            # split('/') returns [name] unchanged when there is no '/'
            for part in name.split('/'):
                yield part

Example #3

0

Show file

File: climate_counts.py Project: davidmarin/scrape-campaigns

def scrape_sectors(known_brands):
    """Yield every record scrape_company() produces, sector by sector.

    Walks the ``#sector a`` links on SECTORS_URL, then the
    ``#sector div a`` links on each sector page. Only hrefs starting
    with '/' are followed, and each href at most once per sector.
    ``known_brands`` is passed through to scrape_company() untouched.
    """
    log.info('scraping all sectors')
    sectors_soup = scrape_soup(SECTORS_URL)

    for sector_link in sectors_soup.select('#sector a'):
        log.info(u'scraping sector: {}'.format(sector_link.text.strip()))
        sector_url = urljoin(SECTORS_URL, sector_link['href'])
        sector_soup = scrape_soup(sector_url)

        seen_hrefs = set()  # the same URL can show up twice
        for company_link in sector_soup.select('#sector div a'):
            href = company_link['href']

            # skip absolute links (e.g. http://i2.climatecounts.org)
            # and anything already handled in this sector
            if not href.startswith('/') or href in seen_hrefs:
                continue
            seen_hrefs.add(href)

            log.info(u'scraping company: {}'.format(
                strip_company(company_link.text)))

            for record in scrape_company(urljoin(sector_url, href),
                                         known_brands):
                yield record

Example #4

0

Show file

File: novartis.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield EXTRA_BRANDS, then OTC brand names from two product sites.

    First the text of every ``.panes .text-container i`` element on each
    tab page linked from NOVARTIS_OTC_START_URL, then the text of every
    ``h4`` in the 'over-the-counter' accordion on ALCON_PRODUCTS_URL.

    Raises IndexError if no accordion button reads 'over-the-counter'.
    """
    for extra in EXTRA_BRANDS:
        yield extra

    otc_start_soup = scrape_soup(NOVARTIS_OTC_START_URL)

    tab_urls = []
    for tab_link in otc_start_soup.select('.tabs.statictabs a'):
        tab_urls.append(urljoin(NOVARTIS_OTC_START_URL, tab_link['href']))

    for tab_url in tab_urls:
        # no need to download the start page a second time
        if tab_url != NOVARTIS_OTC_START_URL:
            tab_soup = scrape_soup(tab_url)
        else:
            tab_soup = otc_start_soup

        for italic in tab_soup.select('.panes .text-container i'):
            yield italic.text

    alcon_soup = scrape_soup(ALCON_PRODUCTS_URL)

    otc_buttons = [button
                   for button in alcon_soup.select('div.accordionButton')
                   if button.text.lower() == 'over-the-counter']
    otc_button = otc_buttons[0]
    otc_content = otc_button.findNextSibling(
        'div', attrs={'class': 'accordionContent'})

    for heading in otc_content.select('h4'):
        yield heading.text

Example #5

0

Show file

File: reckitt_benckiser.py Project: spendright/scrape-companies

def scrape_company():
    """Yield ``(record_type, dict)`` pairs describing the company.

    Yields, in order:

    * one ``'company'`` record built from COMPANY and COMPANY_URL;
    * one ``'brand'`` record per entry of MORE_BRANDS;
    * for each category link (``li.active ul li a``) on CATEGORY_URL,
      one ``'brand'`` record per non-empty ``h2`` on the linked page,
      preceded by a ``'subcategory'`` record when the heading starts
      with a KNOWN_BRANDS entry followed by a space.

    Brand records carry ``logo_url``, looked up by ``smunch(brand)``
    among the ``#scroller img`` elements of BRANDS_URL (None when there
    is no match).
    """
    yield 'company', dict(company=COMPANY, url=COMPANY_URL)

    for brand in MORE_BRANDS:
        yield 'brand', dict(company=COMPANY, brand=brand)

    # get logo for brands
    brands_soup = scrape_soup(BRANDS_URL)

    sb_to_logo_url = {}  # map smunch(brand) to logo_url

    for img in brands_soup.select('#scroller img'):
        sb = smunch(img['alt'])
        sb = SMUNCHED_BRAND_CORRECTIONS.get(sb, sb)
        sb_to_logo_url[sb] = img['src']

    cat_soup = scrape_soup(CATEGORY_URL)

    for a in cat_soup.select('li.active ul li a'):
        cat = a.text
        url = a['href']

        # TODO: match brands with logos
        # treat "French's" as single brand
        # correct "Cillet Bang" -> "Cillit Bang"

        soup = scrape_soup(url)
        for h2 in soup.select('h2'):
            brand = h2.text.strip()
            if not brand:
                continue

            # Default to the page's category. Setting this before the
            # loop (rather than in a per-iteration else) keeps a match
            # from being overwritten by later non-matching entries, and
            # keeps brand_cat defined when KNOWN_BRANDS is empty.
            brand_cat = cat

            # special case for French's: "<known brand> <rest>" is the
            # known brand, with <rest> as a subcategory of cat
            for kb in KNOWN_BRANDS:
                if brand.startswith(kb + ' '):
                    sub_cat = brand[len(kb) + 1:]
                    yield 'subcategory', dict(category=cat,
                                              subcategory=sub_cat)
                    brand = kb
                    brand_cat = sub_cat  # don't redefine cat
                    break

            yield 'brand', dict(
                company=COMPANY,
                brand=brand,
                category=brand_cat,
                logo_url=sb_to_logo_url.get(smunch(brand)))

Example #6

0

Show file

File: avon.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, MORE_BRANDS, then shop-by-brand link texts.

    Follows each ``div.topmenu a`` link on START_URL whose lowercased
    ``title`` is not in SKIP_CATEGORIES and yields the text of every
    ``div#shopByBrand a`` element on the linked page.
    """
    yield COMPANY
    for extra in MORE_BRANDS:
        yield extra

    start_soup = scrape_soup(START_URL)

    category_urls = []
    for menu_link in start_soup.select('div.topmenu a'):
        if menu_link['title'].lower() in SKIP_CATEGORIES:
            continue
        category_urls.append(menu_link['href'])

    for category_url in category_urls:
        category_soup = scrape_soup(category_url)

        for brand_link in category_soup.select('div#shopByBrand a'):
            yield brand_link.text

Example #7

0

Show file

File: rankabrand.py Project: spendright/scrape-campaigns

def scrape_claims(url, company, brand, soup=None):
    """Yield one claim dict per claim found in the brand's report.

    Reads each ``tr`` of every ``div.brand-report-section`` and yields a
    dict with keys area, question, judgment, claim, company, brand and
    url for every claim extract_claims() finds in the row's remark.

    ``company`` and ``brand`` are supplied by the caller and copied into
    each dict. ``soup`` may be an already-parsed page for ``url``; when
    it is None the page is fetched with scrape_soup().
    """
    if soup is None:
        soup = scrape_soup(url)

    claim_url = url + '#detailed-report'
    area_prefix = 'Questions about '

    for section in soup.select('div.brand-report-section'):
        area = section.h4.text.strip()
        if area.startswith(area_prefix):
            area = area[len(area_prefix):]

        for row in section.select('tr'):
            question = row.select('td.question')[0].text

            status_img = row.select('td.status img')[0]
            judgment = status_img_src_to_judgment(status_img['src'])

            remark = row.select('td.remark')[0].text

            for claim in extract_claims(remark, company, brand, question):
                yield {
                    'area': area,
                    'question': question,
                    'judgment': judgment,
                    'claim': claim,
                    'company': company,
                    'brand': brand,
                    'url': claim_url,
                }

Example #8

0

Show file

File: b_corp.py Project: davidmarin/scrape-campaigns

def scrape_campaign():
    """Yield the campaign record, then every industry's records.

    The first item is ``('campaign', dict)``. After that, one directory
    URL is built for each non-empty ``option`` value of the
    ``edit-field-industry`` select, and everything scrape_industry()
    yields for it is passed through.
    """
    soup = scrape_soup(DIRECTORY_URL)

    campaign = {
        'campaign': CAMPAIGN,
        'url': CAMPAIGN_URL,
        'goal': GOAL,
        'author': AUTHOR,
        'copyright': scrape_copyright(soup),
        'facebook_url': scrape_facebook_url(soup),
        'twitter_handle': scrape_twitter_handle(soup),
    }

    yield 'campaign', campaign

    industry_select = soup.find('select', id='edit-field-industry')

    for option in industry_select.select('option'):
        industry = option.get('value')
        if not industry:
            # skip options with no (or an empty) value
            continue

        industry_url = '{}?{}={}'.format(
            DIRECTORY_URL, industry_select['name'], quote_plus(industry))

        for record in scrape_industry(industry_url, industry):
            yield record

Example #9

0

Show file

File: free2work.py Project: davidmarin/scrape-campaigns

def scrape_rating_ids_for_industry(industry_id):
    """Yield the integer at the end of each score-card link's href.

    The page fetched is INDUSTRY_URL with ``str(industry_id)`` appended.
    """
    industry_url = INDUSTRY_URL + str(industry_id)

    # explicitly empty headers; per the original author's note,
    # "Accepts: text/html leads to a 406"
    industry_soup = scrape_soup(industry_url, headers={})

    for card_link in industry_soup.select('.score-card-button a'):
        last_segment = card_link['href'].rsplit('/', 1)[-1]
        yield int(last_segment)

Example #10

0

Show file

File: rankabrand.py Project: spendright/scrape-campaigns

def scrape_campaign(url=URL):
    """Yield the campaign record, then the records of every sector.

    The goal and campaign name are the last two '|'-separated pieces of
    the page title. Sector links come from ``ul.sectors a`` and are
    handed to scrape_sector().
    """
    log.info('Landing Page')
    soup = scrape_soup(url)

    goal, campaign_name = soup.title.text.split('|')[-2:]

    campaign = {}
    campaign['goal'] = goal.capitalize()  # for consistency
    campaign['campaign'] = campaign_name
    campaign['url'] = url

    # the page has no copyright notice to scrape
    support_link = soup.find('a', text='Support us')
    campaign['donate_url'] = urljoin(url, support_link['href'])
    campaign['facebook_url'] = scrape_facebook_url(soup)

    handle = scrape_twitter_handle(soup)
    campaign['twitter_handle'] = TWITTER_CORRECTIONS.get(
        handle.lower(), handle)

    yield 'campaign', campaign

    for sector_link in soup.select('ul.sectors a'):
        sector_url = urljoin(url, sector_link['href'])
        for record in scrape_sector(sector_url, sector_link.text):
            yield record

Example #11

0

Show file

File: hanesbrands.py Project: spendright/scrape-companies

def scrape_company():
    """Yield the company record, then one brand record per listed brand.

    Brands are the ', '-separated pieces of each ``#CompanyTxt i``
    element's text on URL.
    """
    yield 'company', dict(company=COMPANY, category=CATEGORY)

    page_soup = scrape_soup(URL)
    for italic in page_soup.select('#CompanyTxt i'):
        for name in italic.text.split(', '):
            yield 'brand', dict(company=COMPANY, brand=name)

Example #12

0

Show file

File: steelcase.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, then the text of each ``.ourBrands li a`` link."""
    yield COMPANY

    products_soup = scrape_soup(PRODUCTS_URL)
    brand_links = products_soup.select('.ourBrands li a')

    for brand_name in (link.text for link in brand_links):
        yield brand_name

Example #13

0

Show file

File: rankabrand.py Project: spendright/scrape-campaigns

def scrape_sector(url, sector):
    """Yield the records for one sector page.

    Three page shapes are handled:

    * a ``li.current`` with nested links: each link is a subsector and
      is scraped with scrape_subsector();
    * a ``li.current`` without nested links: the page itself is scraped
      as a subsector, reusing the parsed soup;
    * no ``li.current``: if a ``div.logobox`` is present the page is
      scraped as a single brand; otherwise nothing is yielded.
    """
    log.info(u'Sector: {}'.format(sector))
    soup = scrape_soup(url)

    current_li = soup.find('li', class_='current')

    if not current_li:
        # possible to be one or no brands in sector
        if soup.select('div.logobox'):
            # single brand in sector (e.g. T-Mobile in telecom)
            for record in scrape_brand(url, [sector], soup=soup):
                yield record
        return

    subsector_links = current_li.select('ul li a')

    if not subsector_links:
        # no subsectors
        for record in scrape_subsector(url, [sector], soup=soup):
            yield record
        return

    for link in subsector_links:
        subsector = link.text
        subsector_url = urljoin(url, link['href'])
        for record in scrape_subsector(subsector_url, [sector, subsector]):
            yield record

Example #14

0

Show file

File: kellogg.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield EXTRA_BRANDS, then the text of each ``#navleft-brand a``."""
    for extra in EXTRA_BRANDS:
        yield extra

    page_soup = scrape_soup(URL)
    nav_links = page_soup.select('#navleft-brand a')

    for brand_name in (link.text for link in nav_links):
        yield brand_name

Example #15

0

Show file

File: detox_catwalk.py Project: spendright/scrape-campaigns

def scrape_campaign():
    """Yield ``(record_type, data)`` pairs scraped from the page at URL.

    Yields ``('campaign', CAMPAIGN)`` first. Then, for each ``.page``
    element: a 'company' and a 'category' record per company named in
    the ``.headline2`` text, a 'brand' record per brand listed after a
    ``<b>`` matching BRANDS_OWNED_RE, and finally one 'rating' record.

    Judgment is 0 when the page's first ``.ct-table`` contains a
    ``.negative`` element, 1 when it does not, and -1 when the page has
    no ``.ct-table`` at all.

    Raises IndexError if a page has no ``.headline2`` element, and
    AttributeError if a brand string does not match BRAND_RE.
    """
    log.info('scraping Detox Catwalk page')
    soup = scrape_soup(URL)

    yield 'campaign', CAMPAIGN

    for page in soup.select('.page'):

        company = page.select('.headline2')[0].text

        # handle LVMH Group / Christian Dior Couture, which is two separate
        # but entangled companies. Greenpeace isn't wrong to treat them as
        # single unit, but it makes the data messy.
        if ' / ' in company:
            companies = company.split(' / ')
        else:
            companies = [company]

        # NOTE: this loop rebinds `company`; after it, `company` is the
        # last entry of `companies`
        for company in companies:
            yield 'company', dict(company=company)
            yield 'category', dict(
                company=company, category=CATEGORY)

        for b in page.select('b'):
            # look for "Brands Owned"
            m = BRANDS_OWNED_RE.match(b.text)
            if not m:
                continue

            # for LVMH/Christian Dior, there's a separate brand list for each
            # company
            company = m.group('company') or companies[0]

            # the brand list is read from two nodes after the <b> tag
            # (presumably the text that follows it -- TODO confirm
            # against the page markup)
            brands = b.next.next.strip().split(', ')
            for brand in brands:
                # strip irrelevant crud from brand
                brand = BRAND_RE.match(brand).group('brand')
                yield 'brand', dict(company=company, brand=brand)

        # would like to use the correct fragment for each rating
        # (the rest of the url is the same), but the logic for that is
        # buried deep in JS somewhere.

        ct = page.select('.ct-table')

        # in theory, we'd get this from the class of the rating logo, but
        # that's set by JS
        if ct:
            if ct[0].select('.negative'):
                judgment = 0
            else:
                judgment = 1
        else:
            judgment = -1

        # NOTE(review): `company` here is whatever the loops above last
        # assigned, so a page naming several companies gets only ONE
        # rating record. Confirm whether a rating per entry of
        # `companies` was intended.
        yield 'rating', dict(
            company=company, judgment=judgment,
            description=JUDGMENT_TO_DESCRIPTION[judgment])

Example #16

0

Show file

File: procter_and_gamble.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, EXTRA_BRANDS, then scraped product names.

    Follows the ``#category-navigation a`` links on START_URL whose
    stripped text starts with 'Global' and yields the text of every
    ``.list-prods div.product`` that is not in LICENSED_BRANDS.
    """
    yield COMPANY
    for extra in EXTRA_BRANDS:
        yield extra

    start_soup = scrape_soup(START_URL)

    category_urls = []
    for nav_link in start_soup.select('#category-navigation a'):
        if nav_link.text.strip().startswith('Global'):
            category_urls.append(urljoin(START_URL, nav_link['href']))

    for category_url in category_urls:
        category_soup = scrape_soup(category_url)

        for product in category_soup.select('.list-prods div.product'):
            name = product.text
            if name in LICENSED_BRANDS:
                continue
            yield name

Example #17

0

Show file

File: johnson_and_johnson.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, then a brand/category dict per title link.

    The category is the ``h2`` text of the ``div.gray-container`` that
    encloses the ``.views-field-title a`` link.
    """
    yield COMPANY

    page_soup = scrape_soup(URL)

    for container in page_soup.select('div.gray-container'):
        category = container.h2.text

        for title_link in container.select('.views-field-title a'):
            yield dict(brand=title_link.text, category=category)

Example #18

0

Show file

File: rankabrand.py Project: spendright/scrape-campaigns

def scrape_twitter_handle_from_nudge_url(url):
    """Return the first '@'-prefixed word of the nudge e-mail template.

    Looks at the first ``#email_tpl div p`` element of the page at
    ``url``. Returns None when that paragraph starts with
    'Unfortunately', or when it contains no word starting with '@'.

    Raises IndexError if the page has no ``#email_tpl div p`` element.
    """
    soup = scrape_soup(url)

    twitter_p = soup.select('#email_tpl div p')[0]
    text = twitter_p.text

    # The previous check was `text.find('^Unfortunately')`: str.find()
    # treats '^' as a literal character and returns -1 (truthy) on a
    # miss, so the function bailed out for practically every page.
    # Test the start of the paragraph explicitly instead.
    if text.lstrip().startswith('Unfortunately'):
        return None

    for word in text.split():
        if word.startswith('@'):
            return word

    return None

Example #19

0

Show file

File: climate_counts.py Project: davidmarin/scrape-campaigns

def scrape_product_types():
    """Yield subcategory/category records for every product type.

    For each ``#search_results_results a`` link on PRODUCT_TYPES_URL,
    the linked page is passed to scrape_brand_results(). Each
    (company, brand, sector) triple it returns produces:

    * when the sector contains '-', a 'subcategory' record linking the
      part before the first '-' to the part after it;
    * a 'subcategory' record placing the product type under the sector;
    * a 'category' record assigning the product type to the brand.
    """
    log.info('scraping product types')
    types_soup = scrape_soup(PRODUCT_TYPES_URL)

    for type_link in types_soup.select('#search_results_results a'):
        cat = type_link.text
        cat_url = urljoin(PRODUCT_TYPES_URL, type_link['href'])

        log.info(u'scraping category: {}'.format(cat))
        cat_soup = scrape_soup(cat_url)

        for company, brand, sector in scrape_brand_results(cat_soup):
            # e.g. Beer-Beverages
            parent_sector, dash, child_sector = sector.partition('-')
            if dash:
                sector = child_sector
                yield 'subcategory', {'category': parent_sector,
                                      'subcategory': sector}

            yield 'subcategory', {'category': sector, 'subcategory': cat}
            yield 'category', {'company': company,
                               'brand': brand,
                               'category': cat}

Example #20

0

Show file

File: hasbro.py Project: davidmarin/scrape-companies

def scrape_brands():
    """Yield COMPANY, then each brand in the shop-by-brand list.

    A brand found in LICENSED_BRANDS is yielded as a dict with
    ``is_licensed=True``; any other brand is yielded as a plain string.
    """
    yield COMPANY

    page_soup = scrape_soup(URL)

    for link in page_soup.select('#hsb_shop_bl_container li ul li a'):
        name = link.text
        yield ({'brand': name, 'is_licensed': True}
               if name in LICENSED_BRANDS else name)

Example #21

0

Show file

File: general_mills.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield brand names from every product section.

    Sections are the ``#subnav a`` links on START_URL whose stripped
    text is not in SKIP_SECTIONS. For each ``.productrow h4 a`` link, a
    name containing '/' is yielded as separate stripped parts, a name
    that is a key of DESCRIPTION_TO_BRANDS is replaced by the brands
    mapped to it, and anything else is yielded as-is.
    """
    start_soup = scrape_soup(START_URL)

    section_urls = []
    for section_a in start_soup.select('#subnav a'):
        if section_a.text.strip() in SKIP_SECTIONS:
            continue
        section_urls.append(urljoin(START_URL, section_a['href']))

    for section_url in section_urls:
        section_soup = scrape_soup(section_url)

        for product_a in section_soup.select('.productrow h4 a'):
            name = product_a.text.strip()

            if '/' in name:
                names = [part.strip() for part in name.split('/')]
            elif name in DESCRIPTION_TO_BRANDS:
                # "Monsters" is a family of brands
                names = DESCRIPTION_TO_BRANDS[name]
            else:
                names = [name]

            for real_name in names:
                yield real_name

Example #22

0

Show file

File: l_brands.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield a dict for each licensed, former and current brand.

    LICENSED_BRANDS entries are flagged ``is_licensed``, FORMER_BRANDS
    entries ``is_former``. Each ``#contentTwo a`` link on URL becomes a
    dict with the link text, its href, and the list of categories
    scrape_categories() returns for that href.
    """
    for licensed in LICENSED_BRANDS:
        yield dict(brand=licensed, is_licensed=True)

    for former in FORMER_BRANDS:
        yield dict(brand=former, is_former=True)

    page_soup = scrape_soup(URL)

    for link in page_soup.select("#contentTwo a"):
        name = link.text
        brand_url = link["href"]
        categories = list(scrape_categories(link["href"]))
        yield {"brand": name, "url": brand_url, "categories": categories}

Example #23

0

Show file

File: pepsico.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, then a dict per ``div.brand`` element.

    ``brand`` is the ``alt`` text of the element's image. ``is_licensed``
    is True when the element's paragraph text contains any entry of
    JOINT_VENTURES_TEXT.
    """
    yield COMPANY

    page_soup = scrape_soup(URL)

    for brand_div in page_soup.select('div.brand'):
        name = brand_div.img['alt']

        # "joint venture" brands don't belong to PepsiCo (e.g. Starbucks)
        is_joint_venture = False
        for jv_text in JOINT_VENTURES_TEXT:
            if jv_text in brand_div.p.text:
                is_joint_venture = True
                break

        yield {'brand': name, 'is_licensed': is_joint_venture}

Example #24

0

Show file

File: hrc.py Project: davidmarin/scrape-campaigns

def scrape_landing_page():
    """Return a dict with the campaign and the page's two option lists.

    Keys: ``campaign`` (CAMPAIGN), ``cats`` and ``orgs`` (the results of
    options_to_dict() on the ``category`` and ``orgid`` select options).
    """
    landing = {'campaign': CAMPAIGN}

    page_soup = scrape_soup(URL)

    option_selectors = (
        ('cats', 'select[name=category] option'),
        ('orgs', 'select[name=orgid] option'),
    )
    for key, selector in option_selectors:
        landing[key] = options_to_dict(page_soup.select(selector))

    return landing

Example #25

0

Show file

File: abbott.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, EXTRA_BRANDS, then scraped product names.

    Follows the ``#nav_secondary a`` links on START_URL whose stripped,
    lowercased text is not in SKIP_CATEGORIES. Each
    ``.product-list-item`` text is cut at the first ' (' and yielded
    unless its stripped, lowercased form is in NON_BRANDS.
    """
    yield COMPANY
    for extra in EXTRA_BRANDS:
        yield extra

    start_soup = scrape_soup(START_URL)

    category_urls = []
    for nav_link in start_soup.select('#nav_secondary a'):
        if nav_link.text.strip().lower() in SKIP_CATEGORIES:
            continue
        category_urls.append(urljoin(START_URL, nav_link['href']))

    for category_url in category_urls:
        category_soup = scrape_soup(category_url)

        for item in category_soup.select('.product-list-item'):
            # leave out generic names of drugs: keep what precedes ' ('
            name = item.text.split(' (', 1)[0]

            if name.strip().lower() not in NON_BRANDS:
                yield name

Example #26

0

Show file

File: wwf_palm_oil.py Project: spendright/scrape-campaigns

def scrape_campaign():
    """Yield the campaign record, then the records of each category.

    Navigation is three hops: SOLUTIONS_URL -> campaign page -> scores
    page, each found through a link whose text matches HOW_SCORED_RE.
    On the scores page every ``div.right-column a`` link whose text
    matches SEE_SCORES_RE is handed to scrape_category().

    Raises ValueError if the scores page has no ``div.right-column a``
    links at all.
    """
    log.info('Solutions page')
    solutions_soup = scrape_soup(SOLUTIONS_URL)

    scorecard_link = solutions_soup.find('a', text=HOW_SCORED_RE)
    campaign_url = urljoin(SOLUTIONS_URL, scorecard_link['href'])

    log.info('Campaign page')
    campaign_soup = scrape_soup(campaign_url)

    # CAMPAIGN is applied last, so its entries take precedence
    campaign = dict(url=campaign_url)
    campaign.update(CAMPAIGN)
    yield 'campaign', campaign

    # you have to click twice to see how the companies scored
    right_column = campaign_soup.find('div', class_='right-column')
    scores_link = right_column.find('a', text=HOW_SCORED_RE)
    scores_url = urljoin(campaign_url, scores_link['href'])

    log.info('Scores page')
    scores_soup = scrape_soup(scores_url)

    category_links = scores_soup.select('div.right-column a')
    if not category_links:
        raise ValueError("Can't find links to actual scores.")

    for category_link in category_links:
        match = SEE_SCORES_RE.match(category_link.text)
        if not match:
            continue

        category = match.group(1)
        category_url = urljoin(scores_url, category_link['href'])

        for record in scrape_category(category_url, category):
            yield record

Example #27

0

Show file

File: bang_accord.py Project: spendright/scrape-campaigns

def scrape_landing_page():
    """Return a dict with the signatories link and the campaign record.

    Keys:

    * ``signatories_url``: href of the link whose text is 'Signatories'
      on the page at URL;
    * ``campaign``: a copy of CAMPAIGN extended with ``copyright`` and
      ``twitter_handle`` scraped from the same page.

    Raises TypeError if the page has no 'Signatories' link.
    """
    d = {}

    soup = scrape_soup(URL)

    d['signatories_url'] = soup.find('a', text='Signatories')['href']

    # Work on a shallow copy: assigning CAMPAIGN directly would write
    # the scraped values into the shared module-level constant.
    d['campaign'] = dict(CAMPAIGN)
    d['campaign']['copyright'] = scrape_copyright(soup)
    d['campaign']['twitter_handle'] = scrape_twitter_handle(soup)

    # doesn't accept donations; the whole point is that the garment
    # companies pay

    return d

Example #28

0

Show file

File: greenpeace_electronics.py Project: spendright/scrape-campaigns

def scrape_campaign():
    """Yield the campaign record, then one rating per table row.

    The campaign name and author are the two '|'-separated halves of
    the page title. Each ``tr`` of the first table yields a 'rating'
    record with company, score, max_score, rank, num_ranked, judgment
    and categories.

    Raises ValueError if the company name in a row's heading and the
    one in its link differ (compared case-insensitively).
    """
    soup = scrape_soup(URL)

    # campaign record
    campaign = dict(url=URL, goal=GOAL)

    title_name, title_author = soup.title.text.split('|')
    campaign['campaign'] = title_name
    campaign['author'] = title_author

    # joining the stripped strings removes double spaces
    privacy_li = soup.select('#footer ul.privacy')[0].li
    campaign['copyright'] = ' '.join(privacy_li.stripped_strings)

    campaign['twitter_handle'] = scrape_twitter_handle(soup)
    # TODO: make a method for scraping facebook URLs
    campaign['facebook_url'] = soup.select('a.facebook')[0]['href']
    donate_href = soup.select('a.donate')[0]['href']
    campaign['donate_url'] = urljoin(URL, donate_href)

    yield 'campaign', campaign

    # rating records
    rows = soup.table.findAll('tr')
    num_ranked = len(rows)

    for row in rows:
        heading = row.h2.text.strip()
        company_in_caps, score, max_score = HEADER_RE.match(heading).groups()
        score = Decimal(score)
        max_score = int(max_score)
        judgment = score_to_judgment(score)

        rank_match = IMG_RE.match(row.img['alt'].strip())
        rank = int(rank_match.group(1))

        # get company name not in ALL CAPS
        link_text = row.a.text.strip()
        company = REPORT_CARD_RE.match(link_text).group(1)

        if company.upper() != company_in_caps.upper():
            raise ValueError(u"Non-matching company name: {}".format(company))

        yield 'rating', dict(
            company=company,
            score=score,
            max_score=max_score,
            rank=rank,
            num_ranked=num_ranked,
            judgment=judgment,
            categories=[CATEGORY],
        )

Example #29

0

Show file

File: wwf_palm_oil.py Project: spendright/scrape-campaigns

def scrape_category(url, category):
    """Yield a 'rating' record for each row of the category's table.

    Every ``div.main-column tbody tr`` must hold exactly three cells:
    score, company, country. The rating dict nests a company dict under
    ``company``; a parenthesised aside or a '/' in the company cell
    decides the company name and the optional ``parent_company``,
    ``brands`` and ``scope`` values.
    """
    log.info('{} page'.format(category))
    soup = scrape_soup(url)

    for row in soup.select('div.main-column tbody tr'):
        score_td, company_td, country_td = row.select('td')

        company_rec = {'category': category}
        rating = {'company': company_rec, 'max_score': MAX_SCORE}

        rating['score'] = float(score_td.text)
        color = COLOR_RE.search(score_td['style']).group(0)
        rating['judgment'] = color_to_judgment(color)

        name = company_td.text
        parens = COMPANY_PARENS_RE.match(name)

        if parens:
            # stuff in parentheses... it can mean so much!
            name, aside = parens.groups()
            parent = None
            scope = None

            # branch order matters: the first matching rule wins
            if aside.strip() == 'Subway':
                name, parent = aside, name
            elif aside.startswith('prev.'):
                pass  # a former name; ignore it
            elif name == 'Aldi':
                name = name + ' ' + aside
            elif aside.startswith('UK'):
                scope = aside
            elif aside == 'Global':
                pass
            elif ' of ' in aside:  # e.g. division/subsidiary of
                parent = aside[(aside.index(' of ') + 4):]
            else:
                parent = aside

            company_rec['company'] = name
            if parent is not None:
                company_rec['parent_company'] = parent
            if scope is not None:
                rating['scope'] = scope
        elif '/' in name:
            name, _, brand = name.partition('/')
            company_rec['company'] = name
            company_rec['brands'] = [brand]
        else:
            company_rec['company'] = name

        company_rec['hq_country'] = country_td.text

        yield 'rating', rating

Example #30

0

Show file

File: astrazeneca.py Project: spendright/scrape-companies

def scrape_brands():
    """Yield COMPANY, then every brand in the first cell of each row.

    The first ``td`` of each ``#primary table tbody tr`` is split with
    SEPARATOR_RE and each piece is cut at the first ' ('. A piece whose
    stripped, lowercased form is in OTC_BRANDS is yielded as a plain
    string; every other piece is yielded as a dict flagged
    ``is_prescription=True``.
    """
    page_soup = scrape_soup(URL)

    yield COMPANY

    for row in page_soup.select('#primary table tbody tr'):
        first_td = row.td  # pick first td
        if not first_td:
            continue

        for piece in SEPARATOR_RE.split(first_td.text):
            name = piece.split(' (', 1)[0]

            if name.strip().lower() in OTC_BRANDS:
                yield name
            else:
                yield {'brand': name, 'is_prescription': True}