Example #1
def scrape_campaign():
    soup = scrape_soup(DIRECTORY_URL)

    c = {
        'campaign': CAMPAIGN,
        'url': CAMPAIGN_URL,
        'goal': GOAL,
        'author': AUTHOR,
    }

    c['copyright'] = scrape_copyright(soup)
    c['facebook_url'] = scrape_facebook_url(soup)
    c['twitter_handle'] = scrape_twitter_handle(soup)

    yield 'campaign', c

    select = soup.find('select', id='edit-field-industry')

    for option in select.select('option'):
        industry = option.get('value')
        if industry:
            industry_url = '{}?{}={}'.format(
                DIRECTORY_URL, select['name'], quote_plus(industry))

            for record in scrape_industry(industry_url, industry):
                yield record
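All of these examples lean on shared helpers (scrape_soup, scrape_copyright, scrape_facebook_url, scrape_twitter_handle) and module-level constants (CAMPAIGN, DIRECTORY_URL, GOAL, AUTHOR) defined elsewhere in each scraper. As a rough idea of what the two social-media helpers might look like, here is a minimal sketch; the names, regexes, and fallback behavior are assumptions, not the project's actual implementation.

import re
from bs4 import BeautifulSoup

def scrape_facebook_url(soup, required=True):
    # assumed behavior: return the first facebook.com link on the page
    a = soup.find('a', href=re.compile(r'https?://(www\.)?facebook\.com/'))
    if a:
        return a['href']
    if required:
        raise ValueError('no Facebook URL found')
    return None

def scrape_twitter_handle(soup, required=True):
    # assumed behavior: derive a handle from the first twitter.com link
    a = soup.find('a', href=re.compile(r'https?://(www\.)?twitter\.com/'))
    if a:
        return '@' + a['href'].rstrip('/').split('/')[-1]
    if required:
        raise ValueError('no Twitter handle found')
    return None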
Example #2
def scrape_campaign(url=URL):
    log.info('Landing Page')
    soup = scrape_soup(url)

    c = {}  # campaign dict

    c['goal'], c['campaign'] = soup.title.text.split('|')[-2:]
    c['goal'] = c['goal'].capitalize()  # for consistency
    c['url'] = url

    # there isn't a copyright notice on the page!
    c['donate_url'] = urljoin(url,
                              soup.find('a', text='Support us')['href'])
    c['facebook_url'] = scrape_facebook_url(soup)

    th = scrape_twitter_handle(soup)
    c['twitter_handle'] = TWITTER_CORRECTIONS.get(th.lower(), th)

    yield 'campaign', c

    for a in soup.select('ul.sectors a'):
        sector = a.text
        sector_url = urljoin(url, a['href'])
        for record in scrape_sector(sector_url, sector):
            yield record
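The TWITTER_CORRECTIONS lookup above fixes handles that the campaign site lists inconsistently. Its contents aren't shown on this page; presumably the mapping keys on the lowercased scraped handle and falls back to the handle unchanged, roughly like this (entries made up for illustration):

# made-up entries; the real mapping is defined elsewhere in the scraper
TWITTER_CORRECTIONS = {
    '@oldcampaignname': '@NewCampaignName',
    '@examplecampain': '@ExampleCampaign',  # typo on the page
}

th = '@ExampleCampain'
th = TWITTER_CORRECTIONS.get(th.lower(), th)  # -> '@ExampleCampaign'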
Example #3
def main():
    opts = parse_args()

    log_to_stderr(verbose=opts.verbose, quiet=opts.quiet)

    if opts.urls:
        all_urls = opts.urls
    elif environ.get('MORPH_URLS'):
        all_urls = filter(None, environ['MORPH_URLS'].split())
    else:
        all_urls = set()

        for db_name in SOURCE_DBS:
            download_db(db_name)
            db = open_db(db_name)
            for table in show_tables(db):
                if table in SKIP_TABLES:
                    continue
                urls = select_urls(db, table)
                if urls:
                    log.info('read {} urls from {}.{}'.format(
                        len(urls), db_name, table))
                all_urls.update(urls)

    create_table_if_not_exists('url', with_scraper_id=False)

    dt = open_dt()
    failures = []  # list of (url, exception) tuples

    for i, url in enumerate(sorted(all_urls)):
        log.info('scraping {} ({} of {})'.format(
            url, i + 1, len(all_urls)))

        try:
            html = scrape(url)

            soup = BeautifulSoup(html)
            row = dict(url=url, last_scraped=iso_now())
            row['twitter_handle'] = scrape_twitter_handle(
                soup, required=False)
            row['facebook_url'] = scrape_facebook_url(
                soup, required=False)

            log.debug('`url`: {}'.format(repr(row)))
            dt.upsert(row, 'url')
        except Exception as e:
            failures.append((url, e))
            print_exc()

    # show a summary of failures
    if failures:
        log.warn('Failed to scrape {} of {} URL{}:'.format(
            len(failures), len(all_urls),
            's' if len(failures) != 1 else ''))
        for url, e in failures:
            log.warn(u'  {}: {}'.format(url, repr(e)))

    if len(failures) > len(all_urls) * MAX_PROPORTION_FAILURES:
        raise Exception('too many failures')
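The final check makes the run fail loudly only when an outsized share of URLs errored out. MAX_PROPORTION_FAILURES is defined elsewhere; assuming, purely for illustration, a value of 0.1, the arithmetic works out like this:

MAX_PROPORTION_FAILURES = 0.1  # illustrative value only

all_urls = ['http://example.com/page/{}'.format(i) for i in range(50)]
failures = [('http://example.com/page/3', IOError('timed out'))]

# 1 failure out of 50 URLs is 2%, under the 10% ceiling, so this does not raise
if len(failures) > len(all_urls) * MAX_PROPORTION_FAILURES:
    raise Exception('too many failures')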
Example #4
def do_corp(url, industry):
    biz_id = url.split('/')[-1]

    # whitelist of businesses
    if 'MORPH_B_CORP_BIZ_IDS' in environ:
        if biz_id not in environ['MORPH_B_CORP_BIZ_IDS'].split(','):
            return

    log.info('Business page: {}'.format(biz_id))

    try:
        html = scrape(url)
    except HTTPError as e:
        if 'infinite loop' in e.msg:
            log.warn('infinite loop when fetching {}'.format(url))
            return
        elif e.code == 403 and e.geturl() != url:
            log.warn('redirect to bad URL: {}'.format(url))
            return
        else:
            raise

    soup = BeautifulSoup(html)

    c = {}

    # just being in the directory gets you a good judgment
    r = {'judgment': 1, 'company': c, 'url': url}

    # scrape score anyway

    # some pages don't have score (e.g.
    # http://www.bcorporation.net/community/farm-capital-services-llc-0)
    score_div = soup.find('div', class_='field-name-field-overall-b-score')
    if score_div:
        r['score'] = int(score_div.text)
        r['max_score'] = MAX_SCORE

    c['company'] = soup.select('h1#page-title')[0].text

    # use both industry and category on page (industry is more consistent)
    c['categories'] = [industry]
    # *almost* all bizs have their own category description, but not all
    category_h3s = soup.select('.company-desc-inner h3')
    if category_h3s:
        cat = category_h3s[0].text.strip()
        if cat:
            c['categories'].append(cat)

    # social media
    left_col = soup.select('.two-col.last')[0]
    c['twitter_handle'] = scrape_twitter_handle(left_col, required=False)
    c['facebook_url'] = scrape_facebook_url(left_col, required=False)

    homepage_as = soup.select('.company-desc-inner a')
    if homepage_as:
        c['url'] = homepage_as[0]['href']

    # logo not always available; e.g. on
    # http://www.bcorporation.net/community/atayne-llc
    logo_img = soup.find('img', class_='image-style-company-logo-full')
    if logo_img:
        c['logo_url'] = urljoin(url, logo_img['src'])

    # TODO: add store_url. This is in the lower-right box,
    # but not consistently formatted. Examples:
    # http://www.bcorporation.net/community/one-village-coffee-llc
    # http://www.bcorporation.net/community/feelgoodz-llc

    # turn Company Highlights into claims
    ch_section = soup.find(
        'section', class_='field-name-field-company-highlights')
    if ch_section:
        claims = []

        for strong in ch_section.select('strong'):
            if isinstance(strong.nextSibling, unicode):
                # the colon for the heading isn't inside <strong>
                claims.extend(strong.nextSibling.lstrip(':').split(';'))
            elif strong.nextSibling is None:
                claims.extend(strong.stripped_strings)

        for claim in claims:
            claim = claim.strip()
            if claim:
                yield 'claim', dict(
                    company=c['company'],
                    claim=claim,
                    judgment=1)

    yield 'rating', r
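The Company Highlights loop above works because each heading sits in a <strong> tag and the claim text is the bare text node immediately after it. A self-contained illustration with made-up markup (bs4's next_sibling is the same node the example reads via the legacy nextSibling spelling):

from bs4 import BeautifulSoup

html = ('<section><strong>Environment</strong>'
        ': uses recycled packaging; offsets shipping emissions</section>')
soup = BeautifulSoup(html, 'html.parser')

strong = soup.find('strong')
print(strong.next_sibling)
# ': uses recycled packaging; offsets shipping emissions'
print(strong.next_sibling.lstrip(':').split(';'))
# [' uses recycled packaging', ' offsets shipping emissions']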
Example #5
def scrape_campaign():
    log.info("Main page")
    soup = scrape_soup(URL)

    # campaign record
    cn = {"url": URL, "goal": GOAL}
    cn["campaign"], cn["author"] = soup.title.text.split("|")
    # remove double spaces

    cn["copyright"] = scrape_copyright(soup)
    cn["facebook_url"] = scrape_facebook_url(soup)
    cn["twitter_handle"] = scrape_twitter_handle(soup)

    # get year
    cn["date"] = INT_RE.search(soup.select("div.content h2")[0].text).group()

    for a in soup.findAll("a"):
        if a.text.strip() == "Donate":
            cn["donate_url"] = urljoin(URL, a["href"])
            break

    if "donate_url" not in cn:
        raise ValueError("Donate URL not found")

    yield "campaign", cn

    rating_divs = soup.select("div#corank div.row")
    if not rating_divs:
        raise ValueError("ratings not found")

    for div in rating_divs:
        c = {}
        r = {"company": c}

        company_a = div.select("a.coname")[0]
        company = company_a.text

        c["company"] = company

        teaser = div.select("span.teaser")[0].text
        r["categories"] = CATEGORIES_SEP.split(CATEGORIES_RE.match(teaser).group(1))

        for rank_class, judgment in RANK_CLASS_TO_JUDGMENT.items():
            if div.select("span.rank." + rank_class):
                r["judgment"] = judgment
                break
        else:
            raise ValueError("rating for {} not found".format(r["company"]))

        r["score"] = int(INT_RE.search(div.select("div.col_score")[0].text).group())

        r["categories"] = [CATEGORY]

        # fetch details
        company_id = company_a["href"].split("#")[-1]
        query = dict(action="getcompany", companyid=company_id)

        # use POST to get details JSON
        log.info("Details for {}".format(company))
        details = scrape_json(DETAILS_URL, data=urlencode(query))
        details = details[0][0]  # wrapped in lists. why?

        c["url"] = details["ext_url"]

        # TODO: details['message'] might be useful too. It's a message
        # that participants are supposed to send to the company:
        # "Thank you for the leadership you have shown in working to..."

        yield "rating", r

        detail_soup = BeautifulSoup(details["detail"])
        claim_lis = detail_soup.select("li")

        # First two bullet points are categories and a description
        # of the company's ranking (reversed for Nokia)
        # Last bullet point is what the company can do to improve its score.
        claim_lis = claim_lis[2:-1]

        for i, claim_li in enumerate(claim_lis):
            claim = claim_li.text

            judgment = claim_to_judgment(claim)

            claim = clarify_claim(claim, CLAIM_CLARIFICATIONS)

            yield "claim", dict(company=company, claim=claim, judgment=judgment)