Example 1
import logging
import os
import random

# Assumes these module-level names are defined elsewhere in the project:
# EXTRACT_DETAIL_DIR (output root), proxies (list of proxy hosts),
# TSCrawler (the site crawler class) and parse_listing (listing HTML parser).
def worker(dept_code, pages):
    logging.info("Crawl Dep=%s, page start=%s, page end=%s, page count=%s" % (dept_code, pages[0], pages[-1], len(pages)))

    # One output sub-directory per department.
    dept_dir = "%s/%s" % (EXTRACT_DETAIL_DIR, dept_code)

    if not os.path.isdir(dept_dir):
        os.makedirs(dept_dir)

    # Each worker uses its own crawler instance, bound to a random proxy.
    ts_crawler = TSCrawler(proxy_host=random.choice(proxies))

    logging.info("Crawl department %s" % dept_code)

    # Submitting the search form returns the first listing page (page 0).
    form_response = ts_crawler.submit_form_by_dept(dept_code)

    for page in pages:
        logging.info("Department=%s, page=%s" % (dept_code, page))

        listing_filename = "%s/listing-%s-%s.html" % (dept_dir, dept_code, page)

        # The listing file doubles as a resume marker: it is written only
        # after all detail pages have been saved (see below), so its
        # presence means this page is fully done.
        if os.path.isfile(listing_filename):
            continue

        # Page 0 comes from the initial form submission; later pages are
        # fetched explicitly.
        if page != 0:
            form_response = ts_crawler.get_listing_page(page)

        form_html = form_response.read()
        data = list(parse_listing(form_html))

        # Crawl the detail page for each listing entry; only the row index
        # is used, via ts_crawler.get_detail(idx).
        for idx, _ in enumerate(data):
            detail_filename = "%s/avantage-%s-%s-%s.html" % (dept_dir, dept_code, page, idx)

            if os.path.isfile(detail_filename):
                continue

            detail_response = ts_crawler.get_detail(idx)

            # Fetch first and write only on success, so a failed request
            # does not leave an empty file behind that the isfile() check
            # above would then skip on every later run.
            if detail_response:
                with open(detail_filename, "w") as detail_file:
                    detail_file.write(detail_response.read())

        # Written last: the listing file marks the page and its details
        # as fully crawled.
        with open(listing_filename, 'w') as tmp_out:
            tmp_out.write(form_html)

    logging.info("Departement=%s is finished" % dept_code)