def text_mof(pdf_part, pdf_year, pdf_number, pdf_name):
    """Download a PDF, extract its text and PUT it to the search index.

    The file ``pdf_name`` is fetched from the ``mgax-mof`` S3 URL into a
    temporary directory, converted to plain text with the ``pdftotext``
    command-line tool, and the result is sent as a JSON document via HTTP
    PUT to ``ELASTIC_SEARCH_URL`` (read from the current Flask app config)
    with the document slug appended. Must therefore run inside a Flask
    application context.

    Args:
        pdf_part: Part identifier; converted with ``int()``.
        pdf_year: Year; converted with ``int()``.
        pdf_number: Issue number; converted with ``int()``.
        pdf_name: File name of the PDF in the bucket. The text before the
            first ``.`` is used as the document slug / index id.

    Raises:
        AssertionError: if the download does not return HTTP 200 or the
            index PUT does not return a 2xx status.
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        ValueError: if part, year or number cannot be converted to int.
    """
    s3_url = "https://mgax-mof.s3.amazonaws.com"
    pdf_url = s3_url + "/" + pdf_name
    slug = pdf_name.split('.')[0]

    # NOTE(review): temp_dir is defined elsewhere; presumably it removes the
    # directory on exit -- confirm.
    with temp_dir() as tmp:
        pdf_local_path = tmp / pdf_name
        text_path = tmp / 'plain.txt'

        with pdf_local_path.open('wb') as f:
            resp = requests.get(pdf_url, stream=True)
            try:
                # Explicit raise instead of `assert`: asserts are stripped
                # under `python -O`, which would let an error page be saved
                # as if it were the PDF.
                if resp.status_code != 200:
                    raise AssertionError(repr(resp))
                for chunk in FileWrapper(resp.raw):
                    f.write(chunk)
            finally:
                # Release the streamed connection even when the check fails.
                resp.close()

        # The PDF file is closed (and flushed) before pdftotext reads it.
        subprocess.check_call(['pdftotext', pdf_local_path, text_path])

        with text_path.open('r') as f:
            raw_text = f.read()

    document = {
        'part': int(pdf_part),
        'year': int(pdf_year),
        'number': int(pdf_number),
        'slug': slug,
        'text': raw_text,
    }
    resp = requests.put(
        flask.current_app.config['ELASTIC_SEARCH_URL'] + slug,
        data=flask.json.dumps(document),
    )
    if not 200 <= resp.status_code < 300:
        raise AssertionError(repr(resp))
def get_and_ocr(url):
    """Fetch the image at ``url`` into a temporary file and OCR it.

    The image is written to ``page.jpg`` inside a temporary directory by
    ``download``; when ``download`` reports success (a truthy result) the
    saved file is passed to ``ocr`` and its result is returned.

    Returns:
        Whatever ``ocr`` returns for the saved image, or ``None`` when
        ``download`` returns a falsy value.
    """
    with temp_dir() as workdir:
        page_file = workdir / 'page.jpg'
        with page_file.open('wb') as sink:
            fetched = download(url, sink)
        # The file is closed at this point, so ocr() sees the full image.
        if fetched:
            return ocr(page_file)
        return None