def scrape_campaign():
    """Yield the campaign record, then one rating record per company.

    Ratings are read from page 22 of the PDF at ``PDF_URL``. The PDF is
    downloaded to a local file named after the URL's base name unless that
    file already exists, and the page is converted to HTML with the
    external ``pdftohtml`` command.

    Yields:
        ``('campaign', CAMPAIGN)`` first, then one ``('rating', dict)``
        tuple per company with the keys ``company``, ``score``,
        ``max_score`` and ``judgment``.

    Raises:
        CalledProcessError: if ``pdftohtml`` exits with a non-zero status.
    """
    yield 'campaign', CAMPAIGN

    local_pdf = basename(PDF_URL)
    if not exists(local_pdf):
        log.info('downloading {} -> {}'.format(PDF_URL, local_pdf))
        download(PDF_URL, local_pdf)

    # -f/-l select the first and last page; -stdout sends the HTML to the pipe.
    command = ['pdftohtml', '-f', '22', '-l', '22', '-stdout', local_pdf]
    converter = Popen(command, stdout=PIPE)
    html = converter.communicate()[0]
    if converter.returncode:
        raise CalledProcessError(converter.returncode, command)

    # NOTE(review): no parser is named here, so the result may depend on
    # which parsers are installed -- confirm against the deployment.
    page = BeautifulSoup(html)

    # Drop the first 4 and last 5 strings of the page; the remainder is read
    # as alternating company name / score entries.
    cells = list(page.body.stripped_strings)[4:-5]
    for name, raw_score in zip(cells[::2], cells[1::2]):
        # Cut the name at its first ' (' so any parenthesised suffix is dropped.
        paren_at = name.find(' (')
        if paren_at != -1:
            name = name[:paren_at]
        fixed_score = fix_score(float(raw_score))
        verdict = score_to_judgment(fixed_score)
        yield 'rating', dict(
            company=name,
            score=fixed_score,
            max_score=MAX_SCORE,
            judgment=verdict)
def html_from_pdf(first_page=2, last_page=2):
    """Return the ``pdftohtml`` output for a page range of the PDF at PDF_URL.

    The PDF is downloaded to a local file named after the URL's base name
    unless that file already exists, then converted with the external
    ``pdftohtml`` command.

    Args:
        first_page: first page to convert (passed to ``pdftohtml -f``).
            Defaults to 2, the page this function always converted before
            the range became configurable.
        last_page: last page to convert (passed to ``pdftohtml -l``).
            Defaults to 2.

    Returns:
        The standard output captured from ``pdftohtml``.

    Raises:
        CalledProcessError: if ``pdftohtml`` exits with a non-zero status.
    """
    pdf_path = basename(PDF_URL)
    if not exists(pdf_path):
        log.info('downloading {} -> {}'.format(PDF_URL, pdf_path))
        download(PDF_URL, pdf_path)

    # Popen needs string arguments, so page numbers given as ints are
    # converted; -stdout makes pdftohtml write the HTML to the pipe.
    args = [
        'pdftohtml',
        '-f', str(first_page),
        '-l', str(last_page),
        '-stdout',
        pdf_path,
    ]
    proc = Popen(args, stdout=PIPE)
    stdout, _ = proc.communicate()
    if proc.returncode:
        raise CalledProcessError(proc.returncode, args)
    return stdout
def _pdf_pages_to_soup(first_page, last_page):
    """Convert a page range of the PDF at PDF_URL into a BeautifulSoup tree.

    The PDF is downloaded to a local file named after the URL's base name
    unless that file already exists, then converted with the external
    ``pdftohtml`` command.

    Raises:
        CalledProcessError: if ``pdftohtml`` exits with a non-zero status.
    """
    pdf_path = basename(PDF_URL)
    if not exists(pdf_path):
        log.info('downloading {} -> {}'.format(PDF_URL, pdf_path))
        download(PDF_URL, pdf_path)

    # -f/-l select the first and last page; -stdout sends the HTML to the pipe.
    args = [
        'pdftohtml',
        '-f', str(first_page),
        '-l', str(last_page),
        '-stdout',
        pdf_path,
    ]
    proc = Popen(args, stdout=PIPE)
    stdout, _ = proc.communicate()
    if proc.returncode:
        raise CalledProcessError(proc.returncode, args)
    return BeautifulSoup(stdout)


def scrape_campaign():
    """Yield the campaign record, then category/claim/rating records.

    Pages 36-39 of the PDF are converted to HTML and flattened into a list
    of strings. Every string matching ``SCORE_RE`` is taken as the last
    cell of a six-cell row: company, category, two link cells, policy
    rating, score.

    Yields:
        ``('campaign', CAMPAIGN)`` first, then for each row a
        ``('category', dict)``, a ``('claim', dict)`` and a
        ``('rating', dict)`` tuple, in that order.

    Raises:
        CalledProcessError: if ``pdftohtml`` exits with a non-zero status.
        KeyError: if a row's policy rating is not a key of
            ``POLICY_RATING_TO_CLAIM``.
    """
    yield 'campaign', CAMPAIGN

    soup = _pdf_pages_to_soup(36, 39)
    strings = list(soup.body.stripped_strings)
    for i, s in enumerate(strings):
        # look for score
        if not SCORE_RE.match(s):
            continue
        if i < 5:
            # Fewer than five strings precede this match, so it cannot end a
            # full row; slicing from a negative start would wrap around to
            # the end of the list and produce an empty or bogus row.
            continue
        row = strings[i - 5:i + 1]
        # asterisk indicates they were in 2014 pilot study
        company = row[0].rstrip('*')
        category = row[1]
        # 2 and 3 are links to SEC filing
        policy_rating = row[4]
        score = float(row[5])
        yield 'category', dict(
            company=company,
            category=category)
        yield 'claim', dict(
            POLICY_RATING_TO_CLAIM[policy_rating],
            company=company)
        yield 'rating', dict(
            company=company,
            description=score_to_description(score),
            judgment=score_to_judgment(score),
            min_score=CAMPAIGN['min_score'],
            max_score=CAMPAIGN['max_score'],
            score=score,
        )