コード例 #1
0
ファイル: online.py プロジェクト: JH27/crawlers
def update(a):
    """Re-crawl detail data and PDFs for every bill of assembly `a`.

    The crawl window is bounded by the module-level globals `bill_s` /
    `bill_e` (presumably start/end bill numbers; None = unbounded —
    confirm against the defining module).
    """
    # Single-argument print() behaves identically on Python 2 and 3;
    # the original `print '...'` statement is a syntax error on Python 3.
    print('## Get specific data')
    # `range=` is the callee's keyword-argument name; it shadows the
    # builtin only inside the callee, which is harmless here.
    specific.get_html(a, range=(bill_s, bill_e))
    specific.html2json(a, range=(bill_s, bill_e))

    print('## Get pdfs')
    pdf.get_pdf(a, range=(bill_s, bill_e))
コード例 #2
0
def update(a):
    """Re-fetch per-bill HTML, regenerate JSON, and download PDFs for
    assembly `a`, limited to the bill range given by the module-level
    `bill_s`/`bill_e` globals (None appears to mean unbounded — verify).
    """
    # print() with one argument works the same on Python 2 and 3,
    # unlike the Python-2-only print statement used originally.
    print('## Get specific data')
    specific.get_html(a, range=(bill_s, bill_e))
    specific.html2json(a, range=(bill_s, bill_e))

    print('## Get pdfs')
    pdf.get_pdf(a, range=(bill_s, bill_e))
コード例 #3
0
ファイル: online.py プロジェクト: JH27/crawlers
def get_new(a):
    """Crawl bills newly added for assembly `a`.

    Discovers new bill ids, enqueues them for DB insertion, then fetches
    detail pages, converts them to JSON, and downloads the PDFs.
    """
    # Single-argument print() is valid and identical on Python 2 and 3;
    # the original used the Python-2-only print statement.
    print('## Get meta data')
    new_bill_ids = fetch_new_bill_ids(a)

    # Hand the fresh ids to the DB-insertion consumer.
    push_to_queue('insert_bills_db', new_bill_ids)

    print('## Get specific data')
    specific.get_html(a, bill_ids=new_bill_ids)
    specific.html2json(a, bill_ids=new_bill_ids)

    print('## Get pdfs')
    pdf.get_pdf(a, bill_ids=new_bill_ids)
コード例 #4
0
def get_new(a):
    """Fetch newly appeared bills for assembly `a`: discover their ids,
    queue them for database insertion, then crawl detail pages, JSON
    conversions, and PDF documents.
    """
    # print() with a single argument behaves the same on Python 2 and 3,
    # replacing the Python-2-only print statement.
    print('## Get meta data')
    new_bill_ids = fetch_new_bill_ids(a)

    push_to_queue('insert_bills_db', new_bill_ids)

    print('## Get specific data')
    specific.get_html(a, bill_ids=new_bill_ids)
    specific.html2json(a, bill_ids=new_bill_ids)

    print('## Get pdfs')
    pdf.get_pdf(a, bill_ids=new_bill_ids)
コード例 #5
0
ファイル: online.py プロジェクト: njbobst/crawlers
def get_new(a):
    """Crawl bills newly added for assembly `a`, broadcasting the new
    ids to every configured work queue before fetching details and PDFs.
    """
    # Single-argument print() is identical on Python 2 and 3; the
    # original Python-2-only print statement breaks on Python 3.
    print('## Get meta data')
    new_bill_ids = fetch_new_bill_ids(a)

    # .values() iterates the same queue names as the original
    # .itervalues() on Python 2 (it merely materializes a list) and,
    # unlike .itervalues(), also works on Python 3.
    for queue_name in QUEUE_NAMES.values():
        push_to_queue(queue_name, new_bill_ids)

    print('## Get specific data')
    specific.get_html(a, bill_ids=new_bill_ids)
    specific.html2json(a, bill_ids=new_bill_ids)

    print('## Get pdfs')
    pdf.get_pdf(a, bill_ids=new_bill_ids)
コード例 #6
0
async def pdf(request):
    """HTTP handler: render the page named in the JSON body to a PDF.

    Expected JSON body keys:
        url      -- page to render (required)
        timeout  -- seconds to wait for rendering (default 120)
        compress -- if truthy, return the PDF zlib-compressed and
                    re-wrapped in base64
    Any remaining keys are forwarded verbatim to get_pdf() and echoed
    back in the response.

    Returns a JSON response {"pdf": <base64 data>, **data} on success,
    or one of the 4xx/5xx helper responses on failure.
    """
    trace = str(uuid4())  # correlation id tying log lines to one request
    try:
        data = await request.json()
    except json.decoder.JSONDecodeError:
        return bad_request("Must provide valid JSON")

    if "url" not in data:
        return bad_request("Must provide 'url'", data)
    try:
        timeout = int(data.pop("timeout", 120))
    except (TypeError, ValueError):
        # Previously a non-numeric timeout raised out of the handler
        # (an unhandled server error); report it as a client error.
        return bad_request("'timeout' must be an integer", data)
    compress = data.pop("compress", False)

    LOG.info(f"{trace} Generating PDF for url {data['url']}")

    try:
        pdf = await asyncio.wait_for(get_pdf(CDP_HOST, **data, trace=trace),
                                     timeout)
    except asyncio.TimeoutError as e:
        # BUG FIX: asyncio.wait_for raises asyncio.TimeoutError, which is
        # NOT the builtin TimeoutError before Python 3.11 -- the original
        # `except TimeoutError` silently missed timeouts on 3.10 and
        # older. On 3.11+ both names are the same class, so this catch
        # is correct everywhere.
        return gateway_timeout(str(e), data)
    except PayloadTooBig as e:
        return payload_too_large(str(e), data)
    except NavigationError as e:
        url = e.url or data["url"]
        return failed_dependency(str(e), url, e.code)
    except Exception:
        # Log with traceback (instead of a bare print) before letting the
        # framework convert this into a server error.
        LOG.exception(f"{trace} Unexpected error while generating PDF")
        raise

    if compress:
        # get_pdf evidently returns base64 text: decode it, compress the
        # raw bytes, then re-encode so the payload stays JSON-safe.
        pdf = b64encode(zlib.compress(b64decode(pdf))).decode("utf8")

    LOG.info(f"{trace} PDF returned successfully")
    return web.json_response(dict(pdf=pdf, **data))
コード例 #7
0
ファイル: main.py プロジェクト: dongx3/crawlers
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
"""Crawl assembly bill data: meta listing pages, per-bill details, PDFs."""

import meta
import specific
import pdf

assembly_s, assembly_e = 17, 19  # first and last assembly id (inclusive)
bill_s, bill_e = None, None      # bill-number window; None = unbounded

for a in range(assembly_s, assembly_e + 1):
    # Single-argument print() behaves identically on Python 2 and 3,
    # unlike the Python-2-only print statements used originally.
    print('\n# Assembly %d' % a)

    print('## Get meta data')
    npages = meta.get_npages(a)
    meta.get_html(a, npages)
    meta.html2csv(a, npages)

    print('## Get specific data')
    specific.get_html(a, range=(bill_s, bill_e))
    specific.html2json(a, range=(bill_s, bill_e))

    print('## Get pdfs')
    pdf.get_pdf(a, range=(bill_s, bill_e))
コード例 #8
0
ファイル: main.py プロジェクト: winnersky/crawlers
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
"""Driver script: crawl bill metadata, detail pages, and PDF documents
for each assembly in the configured range."""

import meta
import specific
import pdf

assembly_s, assembly_e = 17, 19 # start, end id of assembly
bill_s, bill_e = None, None     # start, end number of bill

for a in range(assembly_s, assembly_e + 1):
    # print() with one argument is valid and identical on Python 2 and 3;
    # the original Python-2-only print statements break on Python 3.
    print('\n# Assembly %d' % a)

    print('## Get meta data')
    npages = meta.get_npages(a)
    meta.get_html(a, npages)
    meta.html2csv(a, npages)

    print('## Get specific data')
    specific.get_html(a, range=(bill_s, bill_e))
    specific.html2json(a, range=(bill_s, bill_e))

    print('## Get pdfs')
    pdf.get_pdf(a, range=(bill_s, bill_e))