Example #1
def main():
    opts = parse_args()

    log_to_stderr(verbose=opts.verbose, quiet=opts.quiet)

    if opts.urls:
        all_urls = opts.urls
    elif environ.get('MORPH_URLS'):
        all_urls = filter(None, environ['MORPH_URLS'].split())
    else:
        all_urls = set()

        for db_name in SOURCE_DBS:
            download_db(db_name)
            db = open_db(db_name)
            for table in show_tables(db):
                if table in SKIP_TABLES:
                    continue
                urls = select_urls(db, table)
                if urls:
                    log.info('read {} urls from {}.{}'.format(
                        len(urls), db_name, table))
                all_urls.update(urls)

    create_table_if_not_exists('url', with_scraper_id=False)

    dt = open_dt()
    failures = []  # list of (url, exception) tuples

    for i, url in enumerate(sorted(all_urls)):
        log.info('scraping {} ({} of {})'.format(
            url, i + 1, len(all_urls)))

        try:
            html = scrape(url)

            soup = BeautifulSoup(html)
            row = dict(url=url, last_scraped=iso_now())
            row['twitter_handle'] = scrape_twitter_handle(
                soup, required=False)
            row['facebook_url'] = scrape_facebook_url(
                soup, required=False)

            log.debug('`url`: {}'.format(repr(row)))
            dt.upsert(row, 'url')
        except Exception as e:
            failures.append((url, e))
            print_exc()

    # show a summary of failures
    if failures:
        log.warn('Failed to scrape {} of {} URL{}:'.format(
            len(failures), len(all_urls),
            's' if len(failures) != 1 else ''))
        for url, e in failures:
            log.warn(u'  {}: {}'.format(url, repr(e)))

    if len(failures) > len(all_urls) * MAX_PROPORTION_FAILURES:
        raise Exception('too many failures')
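main() leans on several database helpers that aren't shown. A minimal sketch of what show_tables() and select_urls() might look like, assuming the source databases are sqlite files (as on morph.io) and that each table has a url column; both the bodies and those assumptions are hypothetical, not from the original:

def show_tables(db):
    # list the table names in an open sqlite3 connection
    cursor = db.execute(
        "SELECT name FROM sqlite_master WHERE type = 'table'")
    return [row[0] for row in cursor]

def select_urls(db, table):
    # pull the non-empty values from the table's url column
    # (assumes every table we reach has such a column)
    cursor = db.execute('SELECT url FROM `{}`'.format(table))
    return {row[0] for row in cursor if row[0]}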
Example #2
def scrape_industries():
    industry_list = scrape(INDUSTRIES_URL, headers={})
    match = JSON_CALLBACK_RE.search(industry_list)
    industry_json = json.loads(match.group(1))

    return {
        int(i['Industry']['id']): i['Industry']['name']
        for i in industry_json['Industries']
    }
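JSON_CALLBACK_RE isn't defined in this example. Since the response is apparently JSONP (a JSON payload wrapped in a JavaScript callback), the regex presumably captures the text between the parentheses. A plausible definition, offered as an assumption rather than the original:

import re

# grab the JSON body out of callback({...}); DOTALL lets . span newlines
JSON_CALLBACK_RE = re.compile(r'\w+\((.*)\)\s*;?\s*$', re.DOTALL)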
Example #3
def do_corp(url, industry):
    biz_id = url.split('/')[-1]

    # whitelist of businesses
    if 'MORPH_B_CORP_BIZ_IDS' in environ:
        if biz_id not in environ['MORPH_B_CORP_BIZ_IDS'].split(','):
            return

    log.info('Business page: {}'.format(biz_id))

    try:
        html = scrape(url)
    except HTTPError as e:
        if 'infinite loop' in e.msg:
            log.warn('infinite loop when fetching {}'.format(url))
            return
        elif e.code == 403 and e.geturl() != url:
            log.warn('redirect to bad URL: {}'.format(url))
            return
        else:
            raise

    soup = BeautifulSoup(html)

    c = {}

    # just being in the directory gets you a good judgment
    r = {'judgment': 1, 'company': c, 'url': url}

    # scrape score anyway

    # some pages don't have score (e.g.
    # http://www.bcorporation.net/community/farm-capital-services-llc-0)
    score_div = soup.find('div', class_='field-name-field-overall-b-score')
    if score_div:
        r['score'] = int(score_div.text)
        r['max_score'] = MAX_SCORE

    c['company'] = soup.select('h1#page-title')[0].text

    # use both industry and category on page (industry is more consistent)
    c['categories'] = [industry]
    # *almost* all bizs have their own category description, but not all
    category_h3s = soup.select('.company-desc-inner h3')
    if category_h3s:
        cat = category_h3s[0].text.strip()
        if cat:
            c['categories'].append(cat)

    # social media
    left_col = soup.select('.two-col.last')[0]
    c['twitter_handle'] = scrape_twitter_handle(left_col, required=False)
    c['facebook_url'] = scrape_facebook_url(left_col, required=False)

    homepage_as = soup.select('.company-desc-inner a')
    if homepage_as:
        c['url'] = homepage_as[0]['href']

    # logo not always available; e.g. on
    # http://www.bcorporation.net/community/atayne-llc
    logo_img = soup.find('img', class_='image-style-company-logo-full')
    if logo_img:
        c['logo_url'] = urljoin(url, logo_img['src'])

    # TODO: add store_url. This is in the lower-right box,
    # but not consistently formatted. Examples:
    # http://www.bcorporation.net/community/one-village-coffee-llc
    # http://www.bcorporation.net/community/feelgoodz-llc

    # turn Company Highlights into claims
    ch_section = soup.find(
        'section', class_='field-name-field-company-highlights')
    if ch_section:
        claims = []

        for strong in ch_section.select('strong'):
            if isinstance(strong.nextSibling, unicode):
                # the colon for the heading isn't inside <strong>
                claims.extend(strong.nextSibling.lstrip(':').split(';'))
            elif strong.nextSibling is None:
                claims.extend(strong.stripped_strings)

        for claim in claims:
            claim = claim.strip()
            if claim:
                yield 'claim', dict(
                    company=c['company'],
                    claim=claim,
                    judgment=1)

    yield 'rating', r
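This example and Example #1 both call scrape_twitter_handle() and scrape_facebook_url(), which aren't shown. A minimal sketch of the twitter half, assuming it simply returns the first profile link found in the given soup; the regex, the '@' prefix, and the error message are all guesses:

import re

TWITTER_URL_RE = re.compile(r'^https?://(?:www\.)?twitter\.com/(\w+)/?$')

def scrape_twitter_handle(soup, required=True):
    # return the handle from the first twitter.com profile link, if any
    for a in soup.find_all('a', href=True):
        m = TWITTER_URL_RE.match(a['href'])
        if m:
            return '@' + m.group(1)
    if required:
        raise ValueError('no twitter handle found')
    return None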
Example #4
def scrape_soup_from_bad_html(url):
    """Discard a bad comment that's frustrating BeautifulSoup"""
    return BeautifulSoup(scrape(url)[3:])
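The [3:] slice works around the old default parser choking on a malformed comment at the top of the page. With current BeautifulSoup the more usual fix is to name a lenient parser explicitly rather than trimming characters; a sketch of that alternative, reusing the same scrape() helper:

def scrape_soup(url):
    # pass an explicit, forgiving parser instead of slicing off bytes
    return BeautifulSoup(scrape(url), 'html.parser')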
Example #5
def scrape_rating_page(rating_id):
    url = RATINGS_URL + str(rating_id)
    soup = BeautifulSoup(scrape(url, headers={}), from_encoding='utf-8')

    d = {}
    d['url'] = url

    # handle header field (brand)
    brand = soup.select('.rating-name')[0].text.strip()
    log.info('Rating {}: {}'.format(rating_id, brand))

    # get logo image
    logo_url = None
    brand_logo_img = soup.find('img', alt='brand logo')
    if brand_logo_img and 'src' in brand_logo_img.attrs:
        logo_url = brand_logo_img['src']

    for suffix in SUFFIXES:
        if brand.endswith(suffix):
            brand = brand[:-len(suffix)]
            d.update(SUFFIXES[suffix])
            break
    d['brand'] = brand

    h3_spans = {
        span.text.strip().lower(): span
        for span in soup.select('td h3 span')
    }

    scope_span = h3_spans['scope']
    scope_table = scope_span.find_parent('table')

    scope_tds = scope_table.select('tr td[colspan=3]')

    # handle "Rating applies to these products/ lines" field
    scope = scope_tds[0].text.strip()
    # fix dangling comma on "Woolworths manufactured apparel,"
    scope = scope.rstrip(',')

    if scope in SCOPE_CORRECTIONS:
        d.update(SCOPE_CORRECTIONS[scope])
    elif scope:
        d['scope'] = scope

    # handle "Rating based on assessment of" field
    company = scope_tds[1].text.strip()
    # fix e.g. "Clean Clothes, Inc.: Maggie's Organics"
    if company.endswith(': ' + brand):
        company = company[:-(2 + len(brand))]

    for prefix in COMPANY_PREFIXES:
        if company.startswith(prefix):
            company = company[len(prefix):].rstrip(')')
            d.update(COMPANY_PREFIXES[prefix])
            break
    for suffix in SUFFIXES:
        if company.endswith(suffix):
            company = company[:-len(suffix)]
            d.update(SUFFIXES[suffix])
            break

    # handle empty company field (e.g. Frontier)
    if not company:
        company = brand

    if company in COMPANY_CORRECTIONS:
        d.update(COMPANY_CORRECTIONS[company])
    else:
        d['company'] = company

    # handle "Industries" field
    #
    # in cases where a company is rated, this seems to be attached to
    # the company, not the specific brands, so it's okay to just
    # add this to the rating (whether it's a company or brand rating)
    categories = scope_tds[2].text.strip()
    if categories:
        d['categories'] = [c.strip() for c in categories.split(',')]

    # handle "Date Published" field
    date = to_iso_date(scope_tds[3].text.strip())
    # if no date, guess based on relevant report
    if not date and d.get('categories'):
        for category, year in REPORT_YEARS:
            if category in d['categories']:
                date = str(year)
                break

    if date is not None:
        d['date'] = date

    # handle grades
    gb_span = h3_spans['grade breakdown']
    gb_tr = gb_span.find_parent('tr').find_next_sibling('tr')

    area_to_grade = {}
    for grade_span in gb_tr.select('span.grade_circle'):
        area = grade_span.next_sibling
        if not isinstance(area, unicode):
            area = area.text  # "Overall" is bolded, others are not
        area = area.lower().strip()
        grade = grade_span.text
        area_to_grade[area] = grade

    d['grade'] = area_to_grade['overall']

    # convert to judgment
    d['judgment'] = grade_to_judgment(d['grade'])

    # attach logo_url to brand or company as appropriate
    if logo_url:
        if 'brand' in d and 'rating_brands' not in d:
            yield 'brand', dict(
                company=d['company'], brand=d['brand'], logo_url=logo_url)
        else:
            yield 'company', dict(
                company=d['company'], logo_url=logo_url)

    # work out claims
    claims = []

    about_span = h3_spans.get('about this rating')
    if about_span:  # not all companies have this
        about_text = about_span.find_parent(
            'tbody').find_next_sibling('tbody').text

        # about_text looks like POLICIES: stuff. TRANSPARENCY: more stuff ...
        # need to convert this to area -> claim

        areas = []
        starts = []
        ends = []

        for m in CLAIM_AREA_RE.finditer(about_text):
            areas.append(m.group(1).lower())
            starts.append(m.start())
            ends.append(m.end())

        for area, start, end in zip(areas, ends, starts[1:] + [None]):
            area_claim = about_text[start:end]

            for claim in extract_claims(area_claim):
                judgment = judge_claim(claim)

                claims.append(
                    dict(company=company, claim=claim, judgment=judgment))

    # rate company or brands as appropriate
    if 'rating_brands' in d:
        rating_brands = d.pop('rating_brands')
        for rating_brand in rating_brands:
            rating = d.copy()
            rating['brand'] = rating_brand
            yield 'rating', rating

            for claim in claims:
                claim = claim.copy()
                claim['brand'] = rating_brand
                yield 'claim', claim
    else:
        rating = d.copy()
        if 'brand' in rating:
            rating['brands'] = [rating.pop('brand')]
        yield 'rating', rating
        for claim in claims:
            yield 'claim', claim
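Two helpers here are left to the imagination. CLAIM_AREA_RE evidently matches the upper-case area headings ("POLICIES:", "TRANSPARENCY:") described in the comment, and grade_to_judgment() folds a letter grade into the -1/0/1 judgment scale used elsewhere. Plausible sketches follow; both are assumptions, including the exact grade cutoffs:

import re

# match headings like "POLICIES:" or "CLIMATE CHANGE:" in about_text
CLAIM_AREA_RE = re.compile(r'([A-Z][A-Z /]+):')

def grade_to_judgment(grade):
    # A/B count as good, C as neutral, D/F as bad (cutoffs are a guess)
    letter = grade[0].upper()
    if letter in 'AB':
        return 1
    elif letter == 'C':
        return 0
    else:
        return -1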
Example #6
def scrape_org_page(org_id, org_name=''):
    company = {}
    rating = {}

    url = PROFILE_URL_FMT.format(org_id)
    rating['url'] = url

    html = scrape(url)
    # skip some HTML comments that confuse BeautifulSoup
    soup = BeautifulSoup(html[100:])

    sections = soup.select('div.legislation-box')

    # rating section
    rating_section = sections[1]
    company_h2 = rating_section.h2
    if company_h2.span.small.text != 'RATING':
        raise ValueError('company section not found')

    company['company'] = company_h2.text[:company_h2.text.index('[')].strip()
    if not company['company']:
        # Nestlé Purina had no name on org page as of 2015-04-30
        company['company'] = org_name

    score = company_h2.span.text.split()[-1]
    if score != 'RATING':  # OSI RESTAURANT PARTNERS has no rating (52300)
        rating['score'] = int(score)
        rating['judgment'] = STYLE_TO_JUDGMENT[company_h2.span['style']]

    website_label = rating_section.find('strong', text='Website:')
    if website_label:  # sometimes missing, like on Invesco (1109)
        url_a = website_label.findNextSibling()
        if url_a.name == 'a':
            company['url'] = fix_url(url_a['href'])

    # feedback section
    feedback_section = sections[2]
    if feedback_section.h2.text != 'Customer Feedback':
        raise ValueError('feedback section not found')

    feedback_url_a = feedback_section.find(
        'strong', text='Website:').findNextSibling()
    if feedback_url_a.name == 'a':
        company['feedback_url'] = fix_url(feedback_url_a['href'])

    feedback_dict = list_to_dict(list(feedback_section.p.stripped_strings))
    if feedback_dict['Phone:'] != 'N/A':
        company['phone'] = feedback_dict['Phone:']

    if feedback_dict['Email:'] != 'N/A':
        company['email'] = feedback_dict['Email:']

    # brands section
    brands_section = sections[3]
    if brands_section.h2.text != 'Brands & Products':
        raise ValueError('brands section not found')

    # when there are no brands, HRC helpfully puts this in a
    # second p
    company['brands'] = [
        b for b in brands_section.p.stripped_strings
        if b != 'end While']

    rating['company'] = company
    yield 'rating', rating
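list_to_dict() isn't shown, but from how it's applied to the feedback section it evidently pairs up a flat list of alternating labels and values ('Phone:', '555-0100', 'Email:', ...). A minimal sketch under that assumption:

def list_to_dict(items):
    # ['Phone:', '555-0100', 'Email:', 'a@b.com'] ->
    # {'Phone:': '555-0100', 'Email:': 'a@b.com'}
    return dict(zip(items[::2], items[1::2]))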