Code Example #1
def scrape_campaign():
    yield 'campaign', CAMPAIGN

    pdf_path = basename(PDF_URL)
    if not exists(pdf_path):
        log.info('downloading {} -> {}'.format(PDF_URL, pdf_path))
        download(PDF_URL, pdf_path)

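    # convert only page 22: -f/-l set the first/last page, -stdout writes the HTML to stdout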
    args = ['pdftohtml', '-f', '22', '-l', '22', '-stdout', pdf_path]
    proc = Popen(args, stdout=PIPE)
    stdout, _ = proc.communicate()
    if proc.returncode:
        raise CalledProcessError(proc.returncode, args)

    soup = BeautifulSoup(stdout)
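    # the page's stripped strings alternate company, score; the slice drops the text before and after the table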
    companies_and_scores = list(soup.body.stripped_strings)[4:-5]

    companies = companies_and_scores[::2]
    scores = companies_and_scores[1::2]

    for company, score in zip(companies, scores):
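        # drop any parenthesized suffix from the company name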
        if ' (' in company:
            company = company[:company.index(' (')]

        score = fix_score(float(score))
        judgment = score_to_judgment(score)

        yield 'rating', dict(
            company=company,
            score=score,
            max_score=MAX_SCORE,
            judgment=judgment)
Code Example #2
def html_from_pdf():
    pdf_path = basename(PDF_URL)
    if not exists(pdf_path):
        log.info('downloading {} -> {}'.format(PDF_URL, pdf_path))
        download(PDF_URL, pdf_path)

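    # convert only page 2 of the PDF to HTML on stdout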
    args = ['pdftohtml', '-f', '2', '-l', '2', '-stdout', pdf_path]
    proc = Popen(args, stdout=PIPE)
    stdout, _ = proc.communicate()
    if proc.returncode:
        raise CalledProcessError(proc.returncode, args)

    return stdout
Code Example #3
def scrape_campaign():
    yield 'campaign', CAMPAIGN

    # TODO: some sort of PDF to soup method would help
    pdf_path = basename(PDF_URL)
    if not exists(pdf_path):
        log.info('downloading {} -> {}'.format(PDF_URL, pdf_path))
        download(PDF_URL, pdf_path)

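    # convert pages 36 through 39, where the rows parsed below appear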
    args = ['pdftohtml', '-f', '36', '-l', '39', '-stdout', pdf_path]
    proc = Popen(args, stdout=PIPE)
    stdout, _ = proc.communicate()
    if proc.returncode:
        raise CalledProcessError(proc.returncode, args)

    soup = BeautifulSoup(stdout)

    strings = list(soup.body.stripped_strings)

    for i, s in enumerate(strings):
        # look for score
        if not SCORE_RE.match(s):
            continue

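        # the score is the last of six cells, so the five strings before it complete the row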
        row = strings[i - 5:i + 1]

        # asterisk indicates they were in 2014 pilot study
        company = row[0].rstrip('*')
        category = row[1]
        # 2 and 3 are links to SEC filing
        policy_rating = row[4]
        score = float(row[5])

        yield 'category', dict(
            company=company,
            category=category)

        yield 'claim', dict(
            POLICY_RATING_TO_CLAIM[policy_rating],
            company=company)

        yield 'rating', dict(
            company=company,
            description=score_to_description(score),
            judgment=score_to_judgment(score),
            min_score=CAMPAIGN['min_score'],
            max_score=CAMPAIGN['max_score'],
            score=score,
        )
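The TODO in Code Example #3 hints at the refactoring all three snippets share: the pdftohtml-and-parse boilerplate is identical except for the page range. Below is a minimal sketch of such a helper; the name soup_from_pdf, its signature, and the explicit 'html.parser' argument are assumptions for illustration, not code from the original project.

from subprocess import PIPE, CalledProcessError, Popen

from bs4 import BeautifulSoup


def soup_from_pdf(pdf_path, first_page, last_page):
    # hypothetical helper: run pdftohtml over one page range of an
    # already-downloaded PDF and parse the resulting HTML
    args = ['pdftohtml', '-f', str(first_page), '-l', str(last_page),
            '-stdout', pdf_path]
    proc = Popen(args, stdout=PIPE)
    stdout, _ = proc.communicate()
    if proc.returncode:
        raise CalledProcessError(proc.returncode, args)

    # an explicit parser; the examples above rely on bs4's default
    return BeautifulSoup(stdout, 'html.parser')

With a helper like this, Code Example #1 could build its soup with soup_from_pdf(pdf_path, 22, 22) and Code Example #3 with soup_from_pdf(pdf_path, 36, 39), leaving only the download and row-parsing logic in each scraper.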