Example #1
import logging
import os
import random

# TSCrawler, EXTRACT_DETAIL_DIR, proxies and parse_listing are assumed to be
# defined elsewhere in the project.
def worker(dept_code, pages):
    logging.info("Crawl Dep=%s, page start=%s, page end=%s, page count=%s" % (dept_code, pages[0], pages[-1], len(pages)))

    dept_dir = "%s/%s" % (EXTRACT_DETAIL_DIR, dept_code)

    if not os.path.isdir(dept_dir):
        os.makedirs(dept_dir)

    ts_crawler = TSCrawler(proxy_host=random.choice(proxies))

    logging.info("Crawl department %s" % dept_code)

    form_response = ts_crawler.submit_form_by_dept(dept_code)

    for page in pages:
        logging.info("Department=%s, page=%s" % (dept_code, page))

        listing_filename = "%s/%s/listing-%s-%s.html" % (EXTRACT_DETAIL_DIR, dept_code, dept_code, page)

        # Skip listing pages that were already saved, so an interrupted crawl can resume.
        if os.path.isfile(listing_filename):
            continue

        # Page 0 was already fetched by submit_form_by_dept(); later pages go
        # through the paginated listing.
        if page != 0:
            form_response = ts_crawler.get_listing_page(page)

        form_html = form_response.read()
        data = list(parse_listing(form_html))

        # Crawl detail
        for idx, _ in enumerate(data):
            detail_filename = "%s/%s/avantage-%s-%s-%s.html" % (EXTRACT_DETAIL_DIR, dept_code, dept_code, page, idx)

            if os.path.isfile(detail_filename):
                continue

            # Only write the detail file once a response actually came back,
            # so failed fetches are retried on the next run instead of leaving
            # empty files behind.
            detail_response = ts_crawler.get_detail(idx)

            if detail_response:
                with open(detail_filename, "w") as detail_file:
                    detail_file.write(detail_response.read())

        with open(listing_filename, 'w') as tmp_out:
            tmp_out.write(form_html)

    logging.info("Departement=%s is finished" % dept_code)
Example #2
import glob
import os
import re

# TSCrawler, EXTRACT_DETAIL_DIR and parse_listing_count_and_count_per_page are
# assumed to be defined elsewhere in the project.
def get_dept_remaining_tasks(dept_code):
    first_listing_page_filename = "%s/%s/listing-%s-0.html" % (EXTRACT_DETAIL_DIR, dept_code, dept_code)

    # Fetch and cache the first listing page if it is not on disk yet.
    if not os.path.isfile(first_listing_page_filename):
        crawler = TSCrawler()
        response = crawler.submit_form_by_dept(dept_code)
        dept_dir = "%s/%s/" % (EXTRACT_DETAIL_DIR, dept_code)
        if not os.path.isdir(dept_dir):
            os.makedirs(dept_dir)
        with open(first_listing_page_filename, 'w') as output:
            output.write(response.read())

    with open(first_listing_page_filename, 'r') as first_listing_page:
        count, count_per_page = parse_listing_count_and_count_per_page(first_listing_page)

    # Zero-based listing page indices; guard against a division by zero when
    # the department has no results at all.
    if count_per_page == 0:
        pages_to_crawl = []
    else:
        pages_to_crawl = range(0, int(count / count_per_page) + 1)

    print "Dep=%s , total pages to crawl=%s" % (dept_code, len(pages_to_crawl))

    already_crawled_listings = glob.glob("%s/%s/listing-%s-*.html" % (EXTRACT_DETAIL_DIR, dept_code, dept_code))

    if pages_to_crawl:
        for name in already_crawled_listings:
            # Department codes may end in A or B (Corsica), hence the optional [AB].
            page = int(re.search(r"listing-\d{1,3}[AB]?-(\d{1,5})\.html", name).groups()[0])
            try:
                pages_to_crawl.remove(page)
            except ValueError:
                print name

    print "Dep=%s , remaining pages to crawl=%s" % (dept_code, len(pages_to_crawl))

    return pages_to_crawl
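Taken together, the two examples suggest a simple driver: compute the remaining listing pages for each department, then hand each department to worker. The sketch below is only an assumption about how that glue might look; the crawl_department helper, the department list and the pool size are hypothetical and not part of the original project:

from multiprocessing import Pool

def crawl_department(dept_code):
    # Hypothetical glue: figure out what is left for one department, then crawl it.
    pages = get_dept_remaining_tasks(dept_code)
    if pages:
        worker(dept_code, pages)

if __name__ == "__main__":
    # Metropolitan department codes 01-95, with Corsica split into 2A/2B (assumed list).
    departments = ["%02d" % d for d in range(1, 96) if d != 20] + ["2A", "2B"]
    pool = Pool(4)
    pool.map(crawl_department, departments)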