Beispiel #1
0
def text_mof(pdf_part, pdf_year, pdf_number, pdf_name):

    s3_url = "https://mgax-mof.s3.amazonaws.com"

    pdf_url = s3_url + "/" + pdf_name

    with temp_dir() as tmp:
        pdf_local_path = tmp / pdf_name
        text_path = tmp / 'plain.txt'

        with pdf_local_path.open('wb') as f:
            resp = requests.get(pdf_url, stream=True)
            assert resp.status_code == 200
            for chunk in FileWrapper(resp.raw):
                f.write(chunk)

        subprocess.check_call(['pdftotext', pdf_local_path, text_path])

        with text_path.open('r') as f:
            raw_text = f.read()

        json = dict([('part', int(pdf_part)),
                     ('year', int(pdf_year)),
                     ('number', int(pdf_number)),
                     ('slug', pdf_name.split('.')[0]),
                     ('text', raw_text)])

        resp = requests.put(flask.current_app.config['ELASTIC_SEARCH_URL']
                            + pdf_name.split('.')[0],
                            data=flask.json.dumps(json))
        assert 200 <= resp.status_code < 300, repr(resp)
Beispiel #2
0
def text_mof(pdf_part, pdf_year, pdf_number, pdf_name):

    s3_url = "https://mgax-mof.s3.amazonaws.com"

    pdf_url = s3_url + "/" + pdf_name

    with temp_dir() as tmp:
        pdf_local_path = tmp / pdf_name
        text_path = tmp / 'plain.txt'

        with pdf_local_path.open('wb') as f:
            resp = requests.get(pdf_url, stream=True)
            assert resp.status_code == 200
            for chunk in FileWrapper(resp.raw):
                f.write(chunk)

        subprocess.check_call(['pdftotext', pdf_local_path, text_path])

        with text_path.open('r') as f:
            raw_text = f.read()

        json = dict([('part', int(pdf_part)), ('year', int(pdf_year)),
                     ('number', int(pdf_number)),
                     ('slug', pdf_name.split('.')[0]), ('text', raw_text)])

        resp = requests.put(flask.current_app.config['ELASTIC_SEARCH_URL'] +
                            pdf_name.split('.')[0],
                            data=flask.json.dumps(json))
        assert 200 <= resp.status_code < 300, repr(resp)
Beispiel #3
0
def get_and_ocr(url):
    with temp_dir() as tmp:
        image_path = tmp / 'page.jpg'

        with image_path.open('wb') as f:
            if not download(url, f):
                return None

        return ocr(image_path)
Beispiel #4
0
def get_and_ocr(url):
    with temp_dir() as tmp:
        image_path = tmp / 'page.jpg'

        with image_path.open('wb') as f:
            if not download(url, f):
                return None

        return ocr(image_path)